In [127]:
import pandas as pd
from pandas import option_context

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import re
import string

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import NMF
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('training_base.csv')

In [9]:
df.shape

(3576, 10)

In [3]:
def remove_space(text):
    text = text.strip()
    text = re.sub("\s\s+", " ", text)
    return text
    
df['tweet_compound'] = df.tweet_compound.map(remove_space)
df['tweet'] = df.tweet.map(remove_space)

In [4]:
df.to_csv('training_base_new.csv')

In [5]:
df1 = pd.read_csv('training_base_new.csv')

In [7]:
train, test = train_test_split(df1, test_size = 0.2, random_state=10)

In [8]:
stop = stopwords.words('english')

# Intent analysis

## Naive Bayes

In [14]:
X_tr1 = train['tweet_compound']
y_tr1 = train[['intent']]

In [15]:
# creating bag of words model 
cv = CountVectorizer(max_features=1100) 
  
X1 = cv.fit_transform(X_tr1).toarray() 
y1 = y_tr1.iloc[:, 0].values 

# fitting naive bayes to the training set 
from sklearn.naive_bayes import MultinomialNB 

classifier = MultinomialNB();
classifier.fit(X1, y1) 

MultinomialNB()

In [16]:
test_nb1 = test.copy()
X_test1 = test['tweet_compound']
X_t1_cv = cv.fit_transform(X_test1).toarray() 

In [17]:
y_pred1 = classifier.predict(X_t1_cv) 

In [18]:
test_nb1['pred'] =  classifier.predict(X_t1_cv)

## Round 2 Naive Bayes

In [20]:
X_tr2 = train['tweet_compound']
y_tr2 = train[['intent']]

In [21]:
# creating bag of words model 
cv2 = CountVectorizer(max_features=1100) 
  
X2 = cv2.fit_transform(X_tr2).toarray() 
y2 = y_tr1.iloc[:, 0].values 

classifier2 = MultinomialNB();
classifier2.fit(X2, y2) 

MultinomialNB()

In [61]:
test_nb2 = test.copy()
X_test2 = test['tweet']
X_t2_cv = cv2.fit_transform(X_test2).toarray() 

In [62]:
y_pred2 = classifier2.predict(X_t2_cv) 

In [63]:
test_nb2['pred'] =  classifier2.predict(X_t2_cv)

In [64]:
accuracy2 = accuracy_score(test['intent'], y_pred2)
print("Accuracy: %.2f%%" % (accuracy2 * 100.0))

Accuracy: 10.06%


In [66]:
confusion_matrix(test['intent'], test_nb2['pred'])

array([[ 13,   8,  94, 149],
       [ 20,  13, 134, 174],
       [  7,   1,  21,  16],
       [  7,   7,  27,  25]])

Very poor performance - do not pursue any further

## With XGBoost
### No tuning

In [37]:
X_tr3 = train['tweet']
y_tr3 = train[['intent']]

In [38]:
# creating bag of words model 
cv3 = CountVectorizer(stop_words = stop,
                      max_features=880) 
  
X3 = cv3.fit_transform(X_tr3).toarray() 
y3 = y_tr3.iloc[:, 0].values 

# fitting XGB to the training set 
gbm3 = xgb.XGBClassifier(max_depth=5,
                        learning_rate = 0.05,
                        objective = 'multi:softmax',
                        num_class = 4)

In [39]:
gbm3.fit(X3,y3)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_class=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [67]:
test_gbm3 = test.copy()
X_test3 = test['tweet']
X_t3_cv = cv3.fit_transform(X_test3).toarray() 
y_pred3 = gbm3.predict(X_t3_cv)
gbm_pred3 = [round(value) for value in y_pred3]
test_gbm3['pred'] =  gbm_pred3

In [68]:
accuracy3 = accuracy_score(test['intent'], gbm_pred3)
print("Accuracy: %.2f%%" % (accuracy3 * 100.0))

Accuracy: 49.02%


In [69]:
confusion_matrix(test['intent'], test_gbm3['pred'])

array([[ 91, 150,   5,  18],
       [ 69, 257,   3,  12],
       [ 11,  31,   2,   1],
       [  8,  57,   0,   1]])

In [70]:
print(classification_report(test['intent'], test_gbm3['pred']))

              precision    recall  f1-score   support

           0       0.51      0.34      0.41       264
           2       0.52      0.75      0.61       341
           3       0.20      0.04      0.07        45
           4       0.03      0.02      0.02        66

    accuracy                           0.49       716
   macro avg       0.31      0.29      0.28       716
weighted avg       0.45      0.49      0.45       716



In [71]:
test_gbm3.groupby('pred').count()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,username,tweet,tweet_compound,intent,topic,neg,neu,pos,compound
pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,179,179,179,179,179,179,179,179,179,179,179
2,495,495,495,495,495,495,495,495,495,495,495
3,10,10,10,10,10,10,10,10,10,10,10
4,32,32,32,32,32,32,32,32,32,32,32


## Tuned parameters
### Need to update

In [151]:
X_tr4 = train['tweet']
y_tr4 = train[['intent']]

In [196]:
# creating bag of words model 
cv4 = CountVectorizer(stop_words = stop,
                      max_features=880) 
  
X4 = cv4.fit_transform(X_tr4).toarray() 
y4 = y_tr4.iloc[:, 0].values 

# fitting XGB to the training set 
gbm4 = xgb.XGBClassifier(max_depth = 7,
                         learning_rate = 0.05,
                         subsample= 0.9,
                         colsample_bytree = 0.8,
                         objective= 'multi:softmax',
                         num_class = 4)

In [197]:
gbm4.fit(X4,y4)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_class=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [198]:
test_gbm4 = test.copy()
X_test4 = test_gbm4['tweet']
X_t4_cv = cv4.fit_transform(X_test4).toarray() 
y_pred4 = gbm4.predict(X_t4_cv)
gbm_pred4 = [round(value) for value in y_pred4]
test_gbm4['pred'] =  gbm_pred4

In [199]:
accuracy4 = accuracy_score(test['intent'], gbm_pred4)
print("Accuracy: %.2f%%" % (accuracy4 * 100.0))

Accuracy: 50.00%


In [200]:
confusion_matrix(test['intent'], test_gbm4['pred'])

array([[106, 132,   5,  21],
       [ 67, 248,   4,  22],
       [ 10,  29,   3,   3],
       [ 13,  52,   0,   1]])

In [201]:
print(classification_report(test['intent'], test_gbm4['pred']))

              precision    recall  f1-score   support

           0       0.54      0.40      0.46       264
           2       0.54      0.73      0.62       341
           3       0.25      0.07      0.11        45
           4       0.02      0.02      0.02        66

    accuracy                           0.50       716
   macro avg       0.34      0.30      0.30       716
weighted avg       0.47      0.50      0.47       716



Worse than the first model, as expected. Need to redo the gridsearch given now have 4 classes. 

## Random forest

In [129]:
#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X4,y4)

X_test4_rf = cv4.fit_transform(test['tweet']).toarray() 
y_pred4_rf = clf.predict(X_test4_rf)

In [130]:
print("Accuracy:",accuracy_score(test['intent'], y_pred4_rf))

Accuracy: 0.4301675977653631


In [131]:
confusion_matrix(test['intent'], y_pred4_rf)

array([[137,  78,  12,  37],
       [135, 143,  11,  52],
       [ 15,  13,   9,   8],
       [ 16,  28,   3,  19]])

In [132]:
print(classification_report(test['intent'], y_pred4_rf))

              precision    recall  f1-score   support

           0       0.45      0.52      0.48       264
           2       0.55      0.42      0.47       341
           3       0.26      0.20      0.23        45
           4       0.16      0.29      0.21        66

    accuracy                           0.43       716
   macro avg       0.35      0.36      0.35       716
weighted avg       0.46      0.43      0.44       716



Better than previous models in terms of predicting class 3, which is important. However, worse performance overall