In [33]:
import pandas as pd
from pandas import option_context
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score,classification_report

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob
import pickle

In [15]:
stop = stopwords.words('english')

## Sentiment analysis with random forrest

In [16]:
df = pd.read_csv('training_base_sent.csv')
df.dropna(inplace=True)

In [17]:
def remove_space(text):
    text = text.strip()
    text = re.sub("\s\s+", " ", text)
    return text
    
df['tweet_compound'] = df.tweet_compound.map(remove_space)
df['tweet'] = df.tweet.map(remove_space)

In [18]:
df['vader'] = 0
df.loc[(df['compound'] > 0.1),'vader'] = 1
df.loc[(df['compound'] < -0.1),'vader'] = -1

In [19]:
df

Unnamed: 0.1,Unnamed: 0,username,tweet,tweet_compound,sentiment,intent,topic,neg,neu,pos,compound,vader
0,89389,mstothard,can it sustain its evergrowing costs and gener...,can it sustain its evergrowing costs and gener...,0,4,2,0.000,0.880,0.120,0.7430,1
1,65049,saadishtiaq,is premium and metal account subscription base...,is premium and metalaccount subscription based...,0,2,2,0.000,1.000,0.000,0.0000,0
2,82191,loadthepony,cant access my account this morning and hours ...,cant access my account this morning and hours ...,-1,1,0,0.137,0.777,0.086,-0.2789,-1
3,59055,magicroundabout,im guessing that you did not read the tsandcs ...,im guessing that you did not read the tsandcs ...,0,4,4,0.000,0.871,0.129,0.4497,1
4,75253,erickpinos,in light of the currency crisis the global pop...,in light of the currency crisis the global pop...,0,3,2,0.141,0.756,0.103,-0.2732,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
3564,39327,merveartar11,couldnt change my number and nobody helped,couldnt change my number and nobody helped,-1,0,1,0.169,0.831,0.000,-0.0572,0
3565,10748,sportschool,done shall i provide my details per dm,done shall i provide my details per dm,0,4,1,0.000,1.000,0.000,0.0000,0
3566,9863,whatkatidoes,i still dont have the card i ordered i paid on...,i still dont have the card i ordered i paid on...,-1,0,4,0.099,0.793,0.108,-0.1779,-1
3567,52723,mmis2000,happy to raise them here thank you,happy to raise them here thank you,1,3,2,0.000,0.446,0.554,0.7351,1


In [20]:
tr, test = train_test_split(df,test_size = 0.2,random_state = 10)

In [27]:
tr.groupby('vader').count()

Unnamed: 0_level_0,Unnamed: 0,username,tweet,tweet_compound,sentiment,intent,topic,neg,neu,pos,compound
vader,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
-1,830,830,830,830,830,830,830,830,830,830,830
0,830,830,830,830,830,830,830,830,830,830,830
1,1195,1195,1195,1195,1195,1195,1195,1195,1195,1195,1195


In [22]:
# creating bag of words model 
cv = CountVectorizer(max_features=1000) 
  
X = cv.fit_transform(tr['tweet']).toarray() 
y = tr[['sentiment']].iloc[:, 0].values 

In [23]:
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X,y)

X_test = cv.fit_transform(test['tweet']).toarray() 
y_pred=clf.predict(X_test)

In [24]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(test['sentiment'], y_pred))

Accuracy: 0.4943977591036415


In [25]:
confusion_matrix(test['sentiment'], y_pred)

array([[198, 148,  13],
       [ 95, 148,  17],
       [ 35,  53,   7]])

In [26]:
print(classification_report(test['sentiment'], y_pred))

              precision    recall  f1-score   support

          -1       0.60      0.55      0.58       359
           0       0.42      0.57      0.49       260
           1       0.19      0.07      0.11        95

    accuracy                           0.49       714
   macro avg       0.41      0.40      0.39       714
weighted avg       0.48      0.49      0.48       714



In [34]:
# save the model to disk
filename = 'sentimentRF_model.sav'
pickle.dump(clf, open(filename, 'wb'))

## Vader

In [28]:
#Create a Gaussian Classifier
clf2 = RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf2.fit(X,y)

X_test2 = cv.fit_transform(test['tweet']).toarray() 
y_pred2=clf2.predict(X_test)

In [29]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(test['sentiment'], y_pred2))

Accuracy: 0.4943977591036415


In [30]:
confusion_matrix(test['sentiment'], y_pred2)

array([[211, 134,  14],
       [102, 138,  20],
       [ 51,  40,   4]])

In [31]:
print(classification_report(test['sentiment'], y_pred2))

              precision    recall  f1-score   support

          -1       0.58      0.59      0.58       359
           0       0.44      0.53      0.48       260
           1       0.11      0.04      0.06        95

    accuracy                           0.49       714
   macro avg       0.38      0.39      0.38       714
weighted avg       0.47      0.49      0.48       714

