## **Data Intialisation**

In [72]:
import nltk
nltk.download('punkt')
import pandas as pd

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [73]:
import pandas as pd

In [74]:
df=pd.read_csv('tweet_emotions.csv')

In [75]:
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [76]:
df.shape

(40000, 3)

In [77]:
df['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [78]:
df['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [79]:
# Dropping rows with other emotion labels
df=df.drop(df[df.sentiment=='anger'].index)
df=df.drop(df[df.sentiment=='boredom'].index)
df=df.drop(df[df.sentiment=='enthusiasm'].index)
df=df.drop(df[df.sentiment=='empty'].index)
df=df.drop(df[df.sentiment=='fun'].index)
df=df.drop(df[df.sentiment=='surprise'].index)
df=df.drop(df[df.sentiment=='relief'].index)
df=df.drop(df[df.sentiment=='hate'].index)
df=df.drop(df[df.sentiment=='neutral'].index)
df=df.drop(df[df.sentiment=='worry'].index)

In [80]:
del df["tweet_id"]

In [81]:
df.head()

Unnamed: 0,sentiment,content
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
6,sadness,"I should be sleep, but im not! thinking about ..."
8,sadness,@charviray Charlene my love. I miss you
9,sadness,@kelcouch I'm sorry at least it's Friday?


In [82]:
df.shape
df['sentiment'].value_counts()

happiness    5209
sadness      5165
love         3842
Name: sentiment, dtype: int64

## **Data Preprocessing**

In [83]:
from sklearn import preprocessing
#Encoding output labels 'sadness' as '1' & 'happiness' as '0'
lbl_enc = preprocessing.LabelEncoder()
df['label']= lbl_enc.fit_transform(df.sentiment.values)

In [84]:
print(df['sentiment'].value_counts())
print(df['label'].value_counts())

happiness    5209
sadness      5165
love         3842
Name: sentiment, dtype: int64
0    5209
2    5165
1    3842
Name: label, dtype: int64


In [85]:
#Encoding output labels 'happiness' as '0' 
#Encoding output labels 'sadness' as '1' 

In [86]:
#Making all review to lowercase
df['content']=df['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [87]:
# Removing Punctuation, Symbols
df['content'] = df['content'].str.replace('[^\w\s]',' ')

  df['content'] = df['content'].str.replace('[^\w\s]',' ')


In [88]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))
stops.remove("not")
stops.remove("but")
stops.remove("no")
df['content'] = df['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stops))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [89]:
import re
#Correcting Letter Repetitions
def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

df['content'] = df['content'].apply(lambda x: " ".join(de_repeat(x) for x in x.split()))

In [90]:
# Code to find the top 10,000 rarest words appearing in the data
freq = pd.Series(' '.join(df['content']).split()).value_counts()[-10000:]

# Removing all those rarely appearing words from the data
freq = list(freq.index)
df['content'] = df['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [91]:
df.head()

Unnamed: 0,sentiment,content,label
1,sadness,n bed headache ughh waitin call,2
2,sadness,funeral ceremony gloomy friday,2
6,sadness,sleep but im not thinking old friend want but ...,2
8,sadness,charviray love miss,2
9,sadness,sorry least friday,2


In [92]:
#Function to split string into tokens
def identify_tokens(row):
    tokens=nltk.word_tokenize(row)
    token_words=[w for w in tokens if w.isalpha()]
    return token_words

In [93]:
#Tokenization of dataframe
df['content']=df["content"].apply(identify_tokens)

In [94]:
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
#Function for Lemmatizing the list of words
def lem_list(row):
    lemmatized_list = [lemmatizer.lemmatize(word) for word in row]
    return(lemmatized_list)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [95]:
# Lemmatize of the dataframe
def rejoin_words(row):
    joined_words = (" ".join(row))
    return joined_words

In [96]:
#Joining the processed words in the data_frame
df['content'] = df["content"].apply(rejoin_words)

In [97]:
df.head()

Unnamed: 0,sentiment,content,label
1,sadness,n bed headache ughh waitin call,2
2,sadness,funeral ceremony gloomy friday,2
6,sadness,sleep but im not thinking old friend want but ...,2
8,sadness,charviray love miss,2
9,sadness,sorry least friday,2


# Splitting dataset 

# Dataframe

In [127]:
from sklearn.model_selection import train_test_split
x1_train,x1_test,y1_train,y1_test = train_test_split(df[['content']],df[['label']])

In [128]:
x1_test.shape

(3554, 1)

In [129]:
x1_train.shape

(10662, 1)

# Feature Extraction

# TFIDF Vectorizer

In [130]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [131]:
#Extracting TF-IDF parameters
tfidf = TfidfVectorizer(max_features=1000, analyzer='word', ngram_range=(1,3))
x_train_tfidf = tfidf.fit_transform(x_train)
x_val_tfidf = tfidf.fit_transform(x_test)

# Count Vectorizer

In [132]:
from sklearn.feature_extraction.text import CountVectorizer

In [133]:
#Extracting Count Vectors Parameters
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(df['content'])
x_train_count = count_vect.transform(x_train)
x_val_count = count_vect.transform(x_test)

# Feature Extracton using Lexical Methods

In [134]:
pip install --upgrade vaderSentiment 




In [135]:

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [136]:
x1_train.head()

Unnamed: 0,content
24157,got alot around today get job app completed fi...
21039,thanks follow loving energy site help but draw...
19130,damnit sucks one ones thought drag back lol
33954,rap second sat moviess haha quot swim lake sex...
1735,wan na go concert thailand really wan na go mu...


In [137]:
x1_train['negative'] = x1_train['content'].apply(lambda x: analyser.polarity_scores(x)["neg"])
x1_test['negative'] = x1_test['content'].apply(lambda x: analyser.polarity_scores(x)["neg"])
x1_train['positive'] = x1_train['content'].apply(lambda x: analyser.polarity_scores(x)["pos"])
x1_test['positive'] = x1_test['content'].apply(lambda x: analyser.polarity_scores(x)["pos"])
x1_train['neutral'] = x1_train['content'].apply(lambda x: analyser.polarity_scores(x)["neu"])
x1_test['neutral'] = x1_test['content'].apply(lambda x: analyser.polarity_scores(x)["neu"])
x1_train['compound'] = x1_train['content'].apply(lambda x: analyser.polarity_scores(x)["compound"])
x1_test['compound'] = x1_test['content'].apply(lambda x: analyser.polarity_scores(x)["compound"])

In [138]:
from textblob import TextBlob
x1_train['subjectivity'] = x1_train['content'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
x1_test['subjectivity'] = x1_test['content'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
x1_train['polarity'] = x1_train['content'].apply(lambda x: TextBlob(x).sentiment.polarity)
x1_test['polarity'] = x1_test['content'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [139]:
x1_train.head

<bound method NDFrame.head of                                                  content  negative  positive  \
24157  got alot around today get job app completed fi...     0.000     0.214   
21039  thanks follow loving energy site help but draw...     0.000     0.527   
19130        damnit sucks one ones thought drag back lol     0.534     0.192   
33954  rap second sat moviess haha quot swim lake sex...     0.000     0.200   
1735   wan na go concert thailand really wan na go mu...     0.147     0.000   
...                                                  ...       ...       ...   
25767  good morning everyone nice see today hope wond...     0.000     0.752   
24072  haha not know work blip apart thanks song nice...     0.000     0.521   
34470                                  heading nadia yee     0.000     0.000   
37631  chris pine zachary quinto leonard nimoy snl to...     0.000     0.216   
37609                         dyed hair back super black     0.000     0.494   

       ne

# **Training of Models**
## **I)Using TF-IDF Vectorizer**

In [142]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [143]:
# Model 1: Multinomial Naive Bayes Classifier
nb = MultinomialNB()
nb.fit(x_train_tfidf, y_train)
y_pred = nb.predict(x_val_tfidf)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.38938618925831203
[[742 510 696]
 [220 289 179]
 [772 488 796]]
              precision    recall  f1-score   support

           0       0.43      0.38      0.40      1948
           1       0.22      0.42      0.29       688
           2       0.48      0.39      0.43      2056

    accuracy                           0.39      4692
   macro avg       0.38      0.40      0.37      4692
weighted avg       0.42      0.39      0.40      4692



In [145]:
# Model 2: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(x_train_tfidf, y_train)
y_pred = lsvm.predict(x_val_tfidf)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

print(classification_report(y_pred, y_test))

0.3998294970161978
[[ 568  362  483]
 [  87  157   37]
 [1079  768 1151]]
              precision    recall  f1-score   support

           0       0.33      0.40      0.36      1413
           1       0.12      0.56      0.20       281
           2       0.69      0.38      0.49      2998

    accuracy                           0.40      4692
   macro avg       0.38      0.45      0.35      4692
weighted avg       0.55      0.40      0.44      4692



In [146]:
# Model 3: Logistic Regression
logreg = LogisticRegression(C=1)
logreg.fit(x_train_tfidf, y_train)
y_pred = logreg.predict(x_val_tfidf)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.3874680306905371
[[748 522 688]
 [184 246 159]
 [802 519 824]]
              precision    recall  f1-score   support

           0       0.43      0.38      0.41      1958
           1       0.19      0.42      0.26       589
           2       0.49      0.38      0.43      2145

    accuracy                           0.39      4692
   macro avg       0.37      0.39      0.37      4692
weighted avg       0.43      0.39      0.40      4692



In [147]:
# Model 4: Random Forest Classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(x_train_tfidf, y_train)
y_pred = rf.predict(x_val_tfidf)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.38491048593350385
[[817 574 764]
 [177 216 134]
 [740 497 773]]
              precision    recall  f1-score   support

           0       0.47      0.38      0.42      2155
           1       0.17      0.41      0.24       527
           2       0.46      0.38      0.42      2010

    accuracy                           0.38      4692
   macro avg       0.37      0.39      0.36      4692
weighted avg       0.43      0.38      0.40      4692



## **II) Using Count Vectorizer**

In [148]:
# Model 1: Multinomial Naive Bayes Classifier
nb = MultinomialNB()
nb.fit(x_train_count, y_train)
y_pred = nb.predict(x_val_count)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.6293691389599319
[[1042  441  266]
 [ 338  609  103]
 [ 354  237 1302]]
              precision    recall  f1-score   support

           0       0.60      0.60      0.60      1749
           1       0.47      0.58      0.52      1050
           2       0.78      0.69      0.73      1893

    accuracy                           0.63      4692
   macro avg       0.62      0.62      0.62      4692
weighted avg       0.64      0.63      0.63      4692



In [149]:
# Model 2: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(x_train_count, y_train)
y_pred = lsvm.predict(x_val_count)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.6413043478260869
[[1122  497  252]
 [ 230  533   65]
 [ 382  257 1354]]
              precision    recall  f1-score   support

           0       0.65      0.60      0.62      1871
           1       0.41      0.64      0.50       828
           2       0.81      0.68      0.74      1993

    accuracy                           0.64      4692
   macro avg       0.62      0.64      0.62      4692
weighted avg       0.68      0.64      0.65      4692



In [150]:
# Model 3: Logistic Regression
logreg = LogisticRegression(C=1)
logreg.fit(x_train_count, y_train)
y_pred = logreg.predict(x_val_count)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.6216965046888321
[[1060  479  296]
 [ 338  596  114]
 [ 336  212 1261]]
              precision    recall  f1-score   support

           0       0.61      0.58      0.59      1835
           1       0.46      0.57      0.51      1048
           2       0.75      0.70      0.72      1809

    accuracy                           0.62      4692
   macro avg       0.61      0.61      0.61      4692
weighted avg       0.63      0.62      0.63      4692



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [151]:
# Model 4: Random Forest Classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(x_train_count, y_train)
y_pred = rf.predict(x_val_count)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.6378942881500427
[[1055  470  240]
 [ 293  576   69]
 [ 386  241 1362]]
              precision    recall  f1-score   support

           0       0.61      0.60      0.60      1765
           1       0.45      0.61      0.52       938
           2       0.82      0.68      0.74      1989

    accuracy                           0.64      4692
   macro avg       0.62      0.63      0.62      4692
weighted avg       0.66      0.64      0.65      4692



## **III) Using Vader Sentiments**

In [153]:
# Model 1: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(x1_train.drop(['content'], axis=1), y1_train)
y_pred = lsvm.predict(x1_test.drop(['content'], axis=1))
print(accuracy_score(y_pred, y1_test))
print(confusion_matrix(y_pred, y1_test))
print(classification_report(y_pred, y1_test))

  return f(*args, **kwargs)


0.5258863252673045
[[ 505  439   91]
 [ 258  246   72]
 [ 522  303 1118]]
              precision    recall  f1-score   support

           0       0.39      0.49      0.44      1035
           1       0.25      0.43      0.31       576
           2       0.87      0.58      0.69      1943

    accuracy                           0.53      3554
   macro avg       0.50      0.50      0.48      3554
weighted avg       0.63      0.53      0.56      3554



In [154]:
# Model 2: Logistic Regression
logreg = LogisticRegression(C=1)
logreg.fit(x1_train.drop(['content'], axis=1), y1_train)
y_pred = logreg.predict(x1_test.drop(['content'], axis=1))
print(accuracy_score(y_pred, y1_test))
print(confusion_matrix(y_pred, y1_test))
print(classification_report(y_pred, y1_test))

  return f(*args, **kwargs)


0.544456949915588
[[874 697 295]
 [ 83 108  33]
 [328 183 953]]
              precision    recall  f1-score   support

           0       0.68      0.47      0.55      1866
           1       0.11      0.48      0.18       224
           2       0.74      0.65      0.69      1464

    accuracy                           0.54      3554
   macro avg       0.51      0.53      0.48      3554
weighted avg       0.67      0.54      0.59      3554



In [155]:
# Model 3: Random Forest Classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(x1_train.drop(['content'], axis=1), y1_train)
y_pred = rf.predict(x1_test.drop(['content'], axis=1))
print(accuracy_score(y_pred, y1_test))
print(confusion_matrix(y_pred, y1_test))
print(classification_report(y_pred, y1_test))

  rf.fit(x1_train.drop(['content'], axis=1), y1_train)


0.5506471581316826
[[761 459 369]
 [293 373  89]
 [231 156 823]]
              precision    recall  f1-score   support

           0       0.59      0.48      0.53      1589
           1       0.38      0.49      0.43       755
           2       0.64      0.68      0.66      1210

    accuracy                           0.55      3554
   macro avg       0.54      0.55      0.54      3554
weighted avg       0.56      0.55      0.55      3554



In [1]:
#Here maximum accuracy is obtained when we use count vectorizer features and Logistic Regression model