In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import tree
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
import re

### Reading Data

In [2]:
data = pd.read_excel('train_data.xlsx')

In [3]:
data

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,PROPERTY,Sentiment,OLD_DOCUMENT_CLASS,CATEGORY_OC_NAME,Length
0,101539,45565,"hi ralph, considering the enormity of the matt...",unknown,General Complaints,non_relevant,88
1,22024,22024,how do i redeem my cash back on my credit card,neutral,Rewards/Offers,relevant,46
2,54698,54698,thank you for following up,positive,Thank You Messages,non_relevant,26
3,108902,52928,"hey, i had recently booked a flight though my ...",unknown,Others,non_relevant,210
4,70798,14824,can you assist with that,unknown,Assistance Needed,relevant,24
...,...,...,...,...,...,...,...
98479,50493,50493,yarasshoeboutique bmoharrisbank butterflybeach...,neutral,Others,non_relevant,119
98480,38984,38984,sorry it’s not a transfer,negative,Money Transfer,relevant,25
98481,91080,35106,yes it does.,unknown,Others,non_relevant,12
98482,82960,26986,i need help my bank card isn’t working,unknown,Card Failed/Not Working,relevant,38


## Data Cleaning

### Removing unnecessary columns

In [4]:
# columns which should be dropped
data['text'] = data['PROPERTY']
data['class'] = data['CATEGORY_OC_NAME'] 
drop_cols = ['Unnamed: 0.1','Unnamed: 0','Sentiment', 'OLD_DOCUMENT_CLASS','Length', 'PROPERTY', 'CATEGORY_OC_NAME']
data.drop(drop_cols, axis = 1, inplace = True)

### DeEmojifying

In [5]:
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')
data['text'] = data['text'].apply(deEmojify)

### Stop words

In [6]:
stop_words = set(stopwords.words('english'))
stop_words.update(list(punctuation))
abbreviations = {'smh' : 'shaking my head', 'thnq' : 'thank you',  'okey' : 'ok', 'thanks' : 'thank you', 'okay' : 'ok'}  

In [8]:
for i in range(len(data)):
    msg = data['text'][i]
    msg = re.sub('((www\.[^\s]+) | (https?://[^\s]+))', '', msg) # removing links
    msg = re.sub('@[^\s]+', '', msg) # removing usernames, emails
    msg = [i.strip() for i in msg.split(',')] # removing extra spaces and commas
    words = []
    for word in msg:
        for w in word.split(' '):
            w = w.lower()
            w = w.strip()
            if w not in stop_words:
                if w in list(abbreviations.keys()):
                    words.append(abbreviations[w.strip()])
                else: 
                    words.append(w)
    data['text'][i] = " ".join(words) 

In [9]:
# replacing months with date and $ with price
month = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
for j in range(len(data)):
    msg = data['text'][j]
    s = ''
    for i in msg.split(' '):
        if i in month:
            s += 'date '
        elif len(i) > 0 and i[0] == '$':
            s += 'price ' 
        else:
            s += i
            s += ' '
    data['text'][j] = s.strip()

In [10]:
# removing question marks and punctuations
punctuation += '0123456789'
for j in range(len(data)):
    msg = data['text'][j]
    for i in punctuation:
        msg = msg.replace(i, '')
    data['text'][j] = msg

In [43]:
data['text'][14894]

'back canada date'

### Splitting data into train and test

In [12]:
Y = data['class'].values
data.drop(['class'], inplace = True, axis = 1)
X = data['text'].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = 1)

### Tf- Idf Vectorizer

In [13]:
count_vec_using_tf_idf = TfidfVectorizer(ngram_range = (1, 2), max_df = 0.85, min_df = 0.0002, max_features = 5000)
X_train_features = count_vec_using_tf_idf.fit_transform(X_train)
X_test_features = count_vec_using_tf_idf.transform(X_test)

### Score Calculator Function

In [14]:
def get_score(Y_pred, Y_true):
    s = 0
    for i in range(len(Y_true)):
        if Y_true[i] != Y_pred[i]:
            s += 1
    return 1 - (s / len(Y_true))

### Support vector classifier

In [15]:
svc = SVC(kernel = "linear", C = 0.97 , gamma = 'scale')
svc.fit(X_train_features, Y_train)
Y_test_pred = svc.predict(X_test_features)

#### score for Support vector classifier

In [16]:
score = get_score(Y_test_pred, Y_test)
print(score)

0.892679459843639


### Random forest classifier

In [17]:
clf = RandomForestClassifier(n_estimators = 1000, n_jobs = -1)
clf.fit(X_train_features, Y_train)
Y_test_pred1 = clf.predict(X_test_features)

#### score for Random forest classifier

In [18]:
score = get_score(Y_test_pred1, Y_test)
print(score)

0.9007005787389583


### Naive Bayes Classifier

#### MultinomialNB Classifier

In [19]:
clf = MultinomialNB()
clf.fit(X_train_features, Y_train)
Y_test_pred2 = clf.predict(X_test_features)

#### Score for MultinomialNB Classifier

In [20]:
score = get_score(Y_test_pred2, Y_test)
print(score)

0.8627271804244085


#### BernoulliNB Classifier

In [21]:
clf = BernoulliNB()
clf.fit(X_train_features, Y_train)
Y_test_pred2 = clf.predict(X_test_features)

#### Score for BernoulliNB Classifier

In [22]:
score = get_score(Y_test_pred2, Y_test)
print(score)

0.8577520560462991


### Decision Tree Classifier

In [23]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train_features, Y_train)
Y_test_pred3 = clf.predict(X_test_features)

#### Score for Decision Tree Classifier

In [24]:
score = get_score(Y_test_pred3, Y_test)
print(score)

0.8812062138288151


### Logistic Regression Classifier

In [25]:
clf = LogisticRegression(solver = 'saga', max_iter = 700000, C = 4)
clf.fit(X_train_features, Y_train)
Y_test_pred4 = clf.predict(X_test_features)

#### Score for Logistic Regression Classifier

In [26]:
score = get_score(Y_test_pred4, Y_test)
print(score)

0.8903441973804447
