In [1]:
import pandas as pd
import numpy as np

#Text processing libraries
import re
import nltk
from nltk.corpus import stopwords  
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer #feature extraction

#Load data-visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#model building
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

#evaluation metrics
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix

stop_words = set(stopwords.words('english')) 

lemmatizer = WordNetLemmatizer() 

pd.pandas.set_option('display.max_columns',None)

In [2]:
df = pd.read_csv('sentimental_anys_data_cleaned.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Sentiment,Tweet_word_count,Tweet_char_count,Tweet_clean,ApexLegends,AssassinsCreed,Battlefield,Borderlands,CS-GO,CallOfDuty,CallOfDutyBlackopsColdWar,Cyberpunk2077,Dota2,FIFA,Facebook,Fortnite,Google,GrandTheftAuto(GTA),Hearthstone,HomeDepot,LeagueOfLegends,MaddenNFL,Microsoft,NBA2K,Nvidia,Overwatch,PlayStation5(PS5),PlayerUnknownsBattlegrounds(PUBG),RedDeadRedemption(RDR),TomClancysGhostRecon,TomClancysRainbowSix,Verizon,WorldOfCraft,Xbox(Xseries),johnson&johnson
0,0,1,11,43,im getting borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,12,40,coming border kill,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,1,10,41,im getting borderland kill,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,1,10,42,im coming borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,1,12,46,im getting borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [3]:
df = df.drop(columns=['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,Sentiment,Tweet_word_count,Tweet_char_count,Tweet_clean,ApexLegends,AssassinsCreed,Battlefield,Borderlands,CS-GO,CallOfDuty,CallOfDutyBlackopsColdWar,Cyberpunk2077,Dota2,FIFA,Facebook,Fortnite,Google,GrandTheftAuto(GTA),Hearthstone,HomeDepot,LeagueOfLegends,MaddenNFL,Microsoft,NBA2K,Nvidia,Overwatch,PlayStation5(PS5),PlayerUnknownsBattlegrounds(PUBG),RedDeadRedemption(RDR),TomClancysGhostRecon,TomClancysRainbowSix,Verizon,WorldOfCraft,Xbox(Xseries),johnson&johnson
0,1,11,43,im getting borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,12,40,coming border kill,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,10,41,im getting borderland kill,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,10,42,im coming borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,12,46,im getting borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
df.shape

(73996, 35)

In [5]:
(df.isnull().sum()/len(df))*100

Sentiment                            0.000000
Tweet_word_count                     0.000000
Tweet_char_count                     0.000000
Tweet_clean                          2.310936
ApexLegends                          0.000000
AssassinsCreed                       0.000000
Battlefield                          0.000000
Borderlands                          0.000000
CS-GO                                0.000000
CallOfDuty                           0.000000
CallOfDutyBlackopsColdWar            0.000000
Cyberpunk2077                        0.000000
Dota2                                0.000000
FIFA                                 0.000000
Facebook                             0.000000
Fortnite                             0.000000
Google                               0.000000
GrandTheftAuto(GTA)                  0.000000
Hearthstone                          0.000000
HomeDepot                            0.000000
LeagueOfLegends                      0.000000
MaddenNFL                         

In [6]:
df = df.dropna()

In [7]:
df.isnull().sum()

Sentiment                            0
Tweet_word_count                     0
Tweet_char_count                     0
Tweet_clean                          0
ApexLegends                          0
AssassinsCreed                       0
Battlefield                          0
Borderlands                          0
CS-GO                                0
CallOfDuty                           0
CallOfDutyBlackopsColdWar            0
Cyberpunk2077                        0
Dota2                                0
FIFA                                 0
Facebook                             0
Fortnite                             0
Google                               0
GrandTheftAuto(GTA)                  0
Hearthstone                          0
HomeDepot                            0
LeagueOfLegends                      0
MaddenNFL                            0
Microsoft                            0
NBA2K                                0
Nvidia                               0
Overwatch                

In [8]:
df.head()

Unnamed: 0,Sentiment,Tweet_word_count,Tweet_char_count,Tweet_clean,ApexLegends,AssassinsCreed,Battlefield,Borderlands,CS-GO,CallOfDuty,CallOfDutyBlackopsColdWar,Cyberpunk2077,Dota2,FIFA,Facebook,Fortnite,Google,GrandTheftAuto(GTA),Hearthstone,HomeDepot,LeagueOfLegends,MaddenNFL,Microsoft,NBA2K,Nvidia,Overwatch,PlayStation5(PS5),PlayerUnknownsBattlegrounds(PUBG),RedDeadRedemption(RDR),TomClancysGhostRecon,TomClancysRainbowSix,Verizon,WorldOfCraft,Xbox(Xseries),johnson&johnson
0,1,11,43,im getting borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,12,40,coming border kill,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,10,41,im getting borderland kill,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,10,42,im coming borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,12,46,im getting borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
df['Tweet_clean'][0]

'im getting borderland murder'

In [10]:
y = df['Sentiment']
X = df.drop(columns=['Sentiment'], axis=1)

In [11]:
y.value_counts()

Sentiment
 0    30237
-1    21894
 1    20155
Name: count, dtype: int64

In [12]:
# Splitting the data into Training and Test data.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (50600, 34) (50600,)
Test (21686, 34) (21686,)


In [13]:
vectorizer = TfidfVectorizer(stop_words='english') 
X_train= vectorizer.fit_transform(X_train['Tweet_clean'])
X_test= vectorizer.transform(X_test['Tweet_clean'])
X_train.shape, X_test.shape

((50600, 30766), (21686, 30766))

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [15]:
df.head()

Unnamed: 0,Sentiment,Tweet_word_count,Tweet_char_count,Tweet_clean,ApexLegends,AssassinsCreed,Battlefield,Borderlands,CS-GO,CallOfDuty,CallOfDutyBlackopsColdWar,Cyberpunk2077,Dota2,FIFA,Facebook,Fortnite,Google,GrandTheftAuto(GTA),Hearthstone,HomeDepot,LeagueOfLegends,MaddenNFL,Microsoft,NBA2K,Nvidia,Overwatch,PlayStation5(PS5),PlayerUnknownsBattlegrounds(PUBG),RedDeadRedemption(RDR),TomClancysGhostRecon,TomClancysRainbowSix,Verizon,WorldOfCraft,Xbox(Xseries),johnson&johnson
0,1,11,43,im getting borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,12,40,coming border kill,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,10,41,im getting borderland kill,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,10,42,im coming borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,12,46,im getting borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
#Function to fit and apply a model
def model_apply(model):
    #train the model
    model.fit(X_train,y_train)
    #make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test=model.predict(X_test)
    #model evaluation
    
    print("\n")
    print("For training data : ")
    print(model)
    print('Accuracy score: ',accuracy_score(y_train, y_pred_train))
    print('Weighted F1 score: ',f1_score(y_pred=y_pred_train,y_true=y_train,average='weighted'))
    print("Precision Score : ", precision_score(y_pred=y_pred_train,y_true=y_train,average='weighted'))
    print("Recall Score : ",recall_score(y_pred=y_pred_train,y_true=y_train,average='weighted'))
    print('Confusion Matrix: \n',confusion_matrix(y_pred_train,y_train))
    
    print("For Test data : ")
    print(model)
    print('Accuracy score: ',accuracy_score(y_test, y_pred_test))
    print('Weighted F1 score: ',f1_score(y_pred=y_pred_test,y_true=y_test,average='weighted'))
    print("Precision Score : ", precision_score(y_pred=y_pred_test,y_true=y_test,average='weighted'))
    print("Recall Score : ",recall_score(y_pred=y_pred_test,y_true=y_test,average='weighted'))
    print('Confusion Matrix: \n',confusion_matrix(y_pred_test,y_test))
    print("="*55)

In [17]:
lr = LogisticRegression(max_iter=500)
model_apply(lr)

LogisticRegression(max_iter=500)
Accuracy score:  0.78921885087153
Weighted F1 score:  0.7886457801399764
Precision Score :  0.7900492803609667
Recall Score :  0.78921885087153
Confusion Matrix: 
 [[5154  729  464]
 [1023 7579 1201]
 [ 391  763 4382]]


In [18]:
#Multinomial Naive Bayes
nb=MultinomialNB()
model_apply(nb)

MultinomialNB()
Accuracy score:  0.7698515171078115
Weighted F1 score:  0.7677301461657312
Precision Score :  0.7772399611493764
Recall Score :  0.7698515171078115
Confusion Matrix: 
 [[4980  746  568]
 [1315 7821 1585]
 [ 273  504 3894]]


In [19]:
knn = KNeighborsClassifier()
model_apply(knn)

KNeighborsClassifier()
Accuracy score:  0.8408189615420086
Weighted F1 score:  0.8408564621436017
Precision Score :  0.8452448583983011
Recall Score :  0.8408189615420086
Confusion Matrix: 
 [[5886 1048  607]
 [ 462 7624  716]
 [ 220  399 4724]]


In [20]:
adaboost = AdaBoostClassifier()
model_apply(adaboost)

AdaBoostClassifier()
Accuracy score:  0.4463709305542746
Weighted F1 score:  0.31970042976541224
Precision Score :  0.584275597508533
Recall Score :  0.4463709305542746
Confusion Matrix: 
 [[ 590  108   78]
 [5946 8849 5728]
 [  32  114  241]]


In [21]:
gb = GradientBoostingClassifier()
model_apply(gb)

GradientBoostingClassifier()
Accuracy score:  0.5741031079959421
Weighted F1 score:  0.5569143736714569
Precision Score :  0.6142265660591322
Recall Score :  0.5741031079959421
Confusion Matrix: 
 [[2596  711  438]
 [3677 7494 3249]
 [ 295  866 2360]]


In [22]:
xg = XGBClassifier()
model_apply(xg)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [-1  0  1]