In [1]:
import pandas as pd
import numpy as np
import nltk
import os
import re

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection, svm, naive_bayes
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score
from sklearn.svm import LinearSVC

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
path= '/Users/mrams/Downloads'
os.chdir(path)

train = pd.read_csv("train.csv")
train['dataset'] = 'train'
test = pd.read_csv("test.csv")
test['dataset'] = 'test'
val = pd.read_csv("val.csv")
val['dataset'] = 'val'

df = pd.concat([train, test, val], ignore_index=True, axis=0)


def clean_text_remove_stop(df):
    sentences = []
    for i in range(0,len(df)):
        sent=df["sentence"][i]
        sent=re.sub(r'[,.;@#?!&$\-\']+', ' ', sent, flags=re.IGNORECASE)
        sent=re.sub(' +', ' ', sent, flags=re.IGNORECASE)
        sent=re.sub(r'\"', ' ', sent, flags=re.IGNORECASE)
        sent=re.sub(r'[^a-zA-Z]', " ", sent, flags=re.VERBOSE)
        sent=sent.replace(',', '')
        sent=' '.join(sent.split())
        sent=re.sub("\n|\r", "", sent)
        sent = ' '.join([word for word in sent.split() if word not in stopwords.words("english")])
        sentences.append(sent)
    df['clean'] = sentences
    df['token'] = [str(word_tokenize(entry)) for entry in df['sentence']]
    return df

def clean_text_keep_stop(df):
    sentences = []
    for i in range(0,len(df)):
        sent=df["sentence"][i]
        sent=re.sub(r'[,.;@#?!&$\-\']+', ' ', sent, flags=re.IGNORECASE)
        sent=re.sub(' +', ' ', sent, flags=re.IGNORECASE)
        sent=re.sub(r'\"', ' ', sent, flags=re.IGNORECASE)
        sent=re.sub(r'[^a-zA-Z]', " ", sent, flags=re.VERBOSE)
        sent=sent.replace(',', '')
        sent=' '.join(sent.split())
        sent=re.sub("\n|\r", "", sent)
        sentences.append(sent)
    df['clean'] = sentences
    df['token'] = [str(word_tokenize(entry)) for entry in df['sentence']]
    return df

def CountVect(df):
    sent_list=[]
    for i in range(0,len(df)):
        sent_list.append(df['clean'][i])

    MyCountV=CountVectorizer(
        input="content",
        lowercase=True)
    MyDTM = MyCountV.fit_transform(sent_list)  # create a sparse matrix
    MyDTM = MyDTM.toarray()  # convert to a regular array
    ColumnNames=MyCountV.get_feature_names_out()
    MyDTM_DF=pd.DataFrame(MyDTM,columns=ColumnNames)
    return(MyDTM_DF)

def tfidf(df):
    sent_list=[]
    for i in range(0,len(df)):
        sent_list.append(df['clean'][i])

    MyVect_TF=TfidfVectorizer(input='content')
    Vect = MyVect_TF.fit_transform(sent_list)

    ColumnNamesTF=MyVect_TF.get_feature_names_out()
    DF_TF=pd.DataFrame(Vect.toarray(),columns=ColumnNamesTF)

    return (DF_TF)


In [None]:
''' Here is an example of how to use the code above.
Say you want to build a model with the input using tf-idf vectorizer and keeping
stopwords. After running the code above, this is what you would run.'''


clean = clean_text_remove_stop(df)
tf_matrix = tfidf(df)

train_clean = clean[clean['dataset'] == 'train']
train_index = clean[clean['dataset'] == 'train'].index.values.astype(int)
test_clean = clean[clean['dataset'] == 'test']
test_index = clean[clean['dataset'] == 'test'].index.values.astype(int)
val_clean = clean[clean['dataset'] == 'val']
val_index = clean[clean['dataset'] == 'val'].index.values.astype(int)


trainLabel = train_clean['emotion'].astype('category')
testLabel = test_clean['emotion'].astype('category')
valLabel = val_clean['emotion'].astype('category')

train_df = tf_matrix.iloc[train_index]
test_df = tf_matrix.iloc[test_index]
val_df = tf_matrix.iloc[val_index]

''' You would now fit the model with the X being train_df and y being trainLabel.
To compute accuracy, you would predict test_df and compare to testLabel'''

print(train_df.shape)

(16000, 16929)


In [None]:
print(clean['token'])

0                     ['i', 'didnt', 'feel', 'humiliated']
1        ['i', 'can', 'go', 'from', 'feeling', 'so', 'h...
2        ['im', 'grabbing', 'a', 'minute', 'to', 'post'...
3        ['i', 'am', 'ever', 'feeling', 'nostalgic', 'a...
4                        ['i', 'am', 'feeling', 'grouchy']
                               ...                        
19995    ['im', 'having', 'ssa', 'examination', 'tomorr...
19996    ['i', 'constantly', 'worry', 'about', 'their',...
19997    ['i', 'feel', 'its', 'important', 'to', 'share...
19998    ['i', 'truly', 'feel', 'that', 'if', 'you', 'a...
19999    ['i', 'feel', 'like', 'i', 'just', 'wan', 'na'...
Name: token, Length: 20000, dtype: object


In [None]:
X = clean['token']
y = clean['emotion']

In [None]:
X

0                     ['i', 'didnt', 'feel', 'humiliated']
1        ['i', 'can', 'go', 'from', 'feeling', 'so', 'h...
2        ['im', 'grabbing', 'a', 'minute', 'to', 'post'...
3        ['i', 'am', 'ever', 'feeling', 'nostalgic', 'a...
4                        ['i', 'am', 'feeling', 'grouchy']
                               ...                        
19995    ['im', 'having', 'ssa', 'examination', 'tomorr...
19996    ['i', 'constantly', 'worry', 'about', 'their',...
19997    ['i', 'feel', 'its', 'important', 'to', 'share...
19998    ['i', 'truly', 'feel', 'that', 'if', 'you', 'a...
19999    ['i', 'feel', 'like', 'i', 'just', 'wan', 'na'...
Name: token, Length: 20000, dtype: object

In [None]:
y

0        sadness
1        sadness
2          anger
3           love
4          anger
          ...   
19995    sadness
19996        joy
19997        joy
19998        joy
19999        joy
Name: emotion, Length: 20000, dtype: object

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)
print('Training Data Shape: ', X_train.shape)
print('Testing Data Shape: ', X_test.shape)
print('Training Data Shape Labels: ', y_train.shape)
print('Testing Data Shape Labels: ', y_test.shape)

Training Data Shape:  (14000,)
Testing Data Shape:  (6000,)
Training Data Shape Labels:  (14000,)
Testing Data Shape Labels:  (6000,)


In [None]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

In [None]:
Tf_vect = TfidfVectorizer()
Tf_vect.fit(clean['token'])

X_train_idf = Tf_vect.transform(X_train)
X_test_idf = Tf_vect.transform(X_test)
# print(X_train_counts)

In [None]:
print(Tf_vect.vocabulary_)



In [None]:
print(X_train_idf)

  (0, 15101)	0.132076456325213
  (0, 15094)	0.15558326525033414
  (0, 13926)	0.2959070818342496
  (0, 10416)	0.22433885350098404
  (0, 10349)	0.15607629897491995
  (0, 5532)	0.09265492466693272
  (0, 3202)	0.42955118324423214
  (0, 1878)	0.643321740242774
  (0, 486)	0.38241118631881277
  (0, 482)	0.2071245830528201
  (1, 16602)	0.2896009690423873
  (1, 16457)	0.17560964941566465
  (1, 16217)	0.22184892250421262
  (1, 15424)	0.33814317234199653
  (1, 13845)	0.1645212691896104
  (1, 12526)	0.49114368607631365
  (1, 10349)	0.13240102834759812
  (1, 9867)	0.14050974061346075
  (1, 8291)	0.2961101809269895
  (1, 6995)	0.25224416108914577
  (1, 5536)	0.12033121078522978
  (1, 2767)	0.3355336611615007
  (1, 1511)	0.3712163766958757
  (2, 16600)	0.15076688506642566
  (2, 15094)	0.10114598936465737
  :	:
  (13998, 10126)	0.13147783878048364
  (13998, 9867)	0.07164742972731335
  (13998, 8691)	0.15854429086661206
  (13998, 7766)	0.26377877544829126
  (13998, 7458)	0.07641175939228274
  (13998, 69

In [None]:
from sklearn.svm import SVC
model = SVC(gamma='auto')
model.fit(X_train_idf, y_train)

SVC(gamma='auto')

In [None]:
print(X_test_idf.shape)
print(y_test.shape)
predictions = model.predict(X_test_idf)

(6000, 17068)
(6000,)


In [None]:
print(predictions.shape)

(6000,)


In [None]:
print("SVM Accuracy Score -> ",accuracy_score(y_test, predictions)*100)
print("SVM Weighted Precision Scores -> ", precision_score(y_test, predictions, average = 'weighted',zero_division = 0))
print("SVM Macro Precision Scores -> ", precision_score(y_test, predictions, average = 'macro', zero_division = 0))
print("SVM Recall Scores -> ", recall_score(y_test, predictions, average = 'weighted', zero_division = 0))
print("SVM F1 Scores -> ", f1_score(y_test, predictions, average = 'weighted', zero_division = 0))
target_names = clean['emotion'].unique()
print(classification_report(y_test, predictions, target_names=target_names, zero_division = 0))


SVM Accuracy Score ->  33.416666666666664
SVM Weighted Precision Scores ->  0.11166736111111111
SVM Macro Precision Scores ->  0.05569444444444444
SVM Recall Scores ->  0.33416666666666667
SVM F1 Scores ->  0.16739641890485116
              precision    recall  f1-score   support

     sadness       0.00      0.00      0.00       841
       anger       0.00      0.00      0.00       690
        love       0.33      1.00      0.50      2005
    surprise       0.00      0.00      0.00       518
        fear       0.00      0.00      0.00      1746
         joy       0.00      0.00      0.00       200

    accuracy                           0.33      6000
   macro avg       0.06      0.17      0.08      6000
weighted avg       0.11      0.33      0.17      6000



In [None]:
# testing with the naive bayes
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train_idf, y_train)# predict the labels on validation dataset
predictions_NB = Naive.predict(X_test_idf)# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(y_test, predictions_NB)*100)
print("Naive Bayes Weighted Precision Scores -> ", precision_score(y_test, predictions_NB, average = 'weighted',zero_division = 0))
print("Naive Bayes Macro Precision Scores -> ", precision_score(y_test, predictions_NB, average = 'macro', zero_division = 0))
print("Naive Bayes Recall Scores -> ", recall_score(y_test, predictions_NB, average = 'weighted', zero_division = 0))
print("Naive Bayes F1 Scores -> ", f1_score(y_test, predictions_NB, average = 'weighted', zero_division = 0))
target_names = clean['emotion'].unique()
print(classification_report(y_test, predictions_NB, target_names=target_names, zero_division = 0))

Naive Bayes Accuracy Score ->  62.050000000000004
Naive Bayes Weighted Precision Scores ->  0.7131631373942838
Naive Bayes Macro Precision Scores ->  0.6873162890830794
Naive Bayes Recall Scores ->  0.6205
Naive Bayes F1 Scores ->  0.5237925281331177
              precision    recall  f1-score   support

     sadness       0.98      0.16      0.27       841
       anger       0.90      0.09      0.17       690
        love       0.56      0.98      0.71      2005
    surprise       1.00      0.02      0.04       518
        fear       0.68      0.88      0.77      1746
         joy       0.00      0.00      0.00       200

    accuracy                           0.62      6000
   macro avg       0.69      0.36      0.33      6000
weighted avg       0.71      0.62      0.52      6000



In [None]:
model_3 = LinearSVC(verbose=0)
model_3.fit(train_df, trainLabel)

LinearSVC()

In [None]:
predictions2 = model_3.predict(test_df)

In [None]:
print("SVM Accuracy Score -> ",accuracy_score(testLabel, predictions2)*100)
print("SVM Weighted Precision Scores -> ", precision_score(testLabel, predictions2, average = 'weighted',zero_division = 0))
print("SVM Macro Precision Scores -> ", precision_score(testLabel, predictions2, average = 'macro', zero_division = 0))
print("SVM Recall Scores -> ", recall_score(testLabel, predictions2, average = 'weighted', zero_division = 0))
target_names = clean['emotion'].unique()
print(classification_report(testLabel, predictions2, target_names=target_names, zero_division = 0))

SVM Accuracy Score ->  89.2
SVM Weighted Precision Scores ->  0.8910983101735691
SVM Macro Precision Scores ->  0.8660035884953946
SVM Recall Scores ->  0.892
              precision    recall  f1-score   support

     sadness       0.88      0.88      0.88       275
       anger       0.88      0.84      0.86       224
        love       0.90      0.93      0.92       695
    surprise       0.80      0.77      0.78       159
        fear       0.93      0.93      0.93       581
         joy       0.81      0.67      0.73        66

    accuracy                           0.89      2000
   macro avg       0.87      0.84      0.85      2000
weighted avg       0.89      0.89      0.89      2000

