In [3]:
import pandas as pd
import numpy as np
import nltk
import os
import re

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection, svm, naive_bayes
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score
from sklearn.svm import LinearSVC

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mrams\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mrams\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mrams\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
path= '/Users/mrams/Downloads'
os.chdir(path)

train = pd.read_csv("train.csv")
train['dataset'] = 'train'
test = pd.read_csv("test.csv")
test['dataset'] = 'test'
val = pd.read_csv("val.csv")
val['dataset'] = 'val'

df = pd.concat([train, test, val], ignore_index=True, axis=0)


def clean_text_remove_stop(df):
    sentences = []
    for i in range(0,len(df)):
        sent=df["sentence"][i]
        sent=re.sub(r'[,.;@#?!&$\-\']+', ' ', sent, flags=re.IGNORECASE)
        sent=re.sub(' +', ' ', sent, flags=re.IGNORECASE)
        sent=re.sub(r'\"', ' ', sent, flags=re.IGNORECASE)
        sent=re.sub(r'[^a-zA-Z]', " ", sent, flags=re.VERBOSE)
        sent=sent.replace(',', '')
        sent=' '.join(sent.split())
        sent=re.sub("\n|\r", "", sent)
        sent = ' '.join([word for word in sent.split() if word not in stopwords.words("english")])
        sentences.append(sent)
    df['clean'] = sentences
    df['token'] = [str(word_tokenize(entry)) for entry in df['sentence']]
    return df

def clean_text_keep_stop(df):
    sentences = []
    for i in range(0,len(df)):
        sent=df["sentence"][i]
        sent=re.sub(r'[,.;@#?!&$\-\']+', ' ', sent, flags=re.IGNORECASE)
        sent=re.sub(' +', ' ', sent, flags=re.IGNORECASE)
        sent=re.sub(r'\"', ' ', sent, flags=re.IGNORECASE)
        sent=re.sub(r'[^a-zA-Z]', " ", sent, flags=re.VERBOSE)
        sent=sent.replace(',', '')
        sent=' '.join(sent.split())
        sent=re.sub("\n|\r", "", sent)
        sentences.append(sent)
    df['clean'] = sentences
    df['token'] = [str(word_tokenize(entry)) for entry in df['sentence']]
    return df

def CountVect(df):
    sent_list=[]
    for i in range(0,len(df)):
        sent_list.append(df['clean'][i])
        
    MyCountV=CountVectorizer(
        input="content", 
        lowercase=True)
    MyDTM = MyCountV.fit_transform(sent_list)  # create a sparse matrix
    MyDTM = MyDTM.toarray()  # convert to a regular array
    ColumnNames=MyCountV.get_feature_names_out()
    MyDTM_DF=pd.DataFrame(MyDTM,columns=ColumnNames)
    return(MyDTM_DF)

def tfidf(df):
    sent_list=[]
    for i in range(0,len(df)):
        sent_list.append(df['clean'][i])
   
    MyVect_TF=TfidfVectorizer(input='content')
    Vect = MyVect_TF.fit_transform(sent_list)
    
    ColumnNamesTF=MyVect_TF.get_feature_names_out()
    DF_TF=pd.DataFrame(Vect.toarray(),columns=ColumnNamesTF)
     
    return (DF_TF)
    

In [7]:
''' Here is an example of how to use the code above.
Say you want to build a model with the input using tf-idf vectorizer and keeping 
stopwords. After running the code above, this is what you would run.'''


clean = clean_text_keep_stop(df)
tf_matrix = tfidf(df)

train_clean = clean[clean['dataset'] == 'train']
train_index = clean[clean['dataset'] == 'train'].index.values.astype(int)
test_clean = clean[clean['dataset'] == 'test']
test_index = clean[clean['dataset'] == 'test'].index.values.astype(int)
val_clean = clean[clean['dataset'] == 'val']
val_index = clean[clean['dataset'] == 'val'].index.values.astype(int)


trainLabel = train_clean['emotion'].astype('category')
testLabel = test_clean['emotion'].astype('category')
valLabel = val_clean['emotion'].astype('category')

train_df = tf_matrix.iloc[train_index]
test_df = tf_matrix.iloc[test_index]
val_df = tf_matrix.iloc[val_index]

''' You would now fit the model with the X being train_df and y being trainLabel.
To compute accuracy, you would predict test_df and compare to testLabel'''

' You would now fit the model with the X being train_df and y being trainLabel.\nTo compute accuracy, you would predict test_df and compare to testLabel'

In [28]:
print(clean.shape)

(20000, 5)


In [29]:
X = clean['token']
y = clean['emotion']

In [30]:
X

0                     ['i', 'didnt', 'feel', 'humiliated']
1        ['i', 'can', 'go', 'from', 'feeling', 'so', 'h...
2        ['im', 'grabbing', 'a', 'minute', 'to', 'post'...
3        ['i', 'am', 'ever', 'feeling', 'nostalgic', 'a...
4                        ['i', 'am', 'feeling', 'grouchy']
                               ...                        
19995    ['im', 'having', 'ssa', 'examination', 'tomorr...
19996    ['i', 'constantly', 'worry', 'about', 'their',...
19997    ['i', 'feel', 'its', 'important', 'to', 'share...
19998    ['i', 'truly', 'feel', 'that', 'if', 'you', 'a...
19999    ['i', 'feel', 'like', 'i', 'just', 'wan', 'na'...
Name: token, Length: 20000, dtype: object

In [31]:
y

0        sadness
1        sadness
2          anger
3           love
4          anger
          ...   
19995    sadness
19996        joy
19997        joy
19998        joy
19999        joy
Name: emotion, Length: 20000, dtype: object

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)
print('Training Data Shape: ', X_train.shape)
print('Testing Data Shape: ', X_test.shape)
print('Training Data Shape Labels: ', y_train.shape)
print('Testing Data Shape Labels: ', y_test.shape)

Training Data Shape:  (14000,)
Testing Data Shape:  (6000,)
Training Data Shape Labels:  (14000,)
Testing Data Shape Labels:  (6000,)


In [33]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

In [34]:
Tf_vect = TfidfVectorizer()
Tf_vect.fit(clean['token'])

X_train_idf = Tf_vect.transform(X_train)
X_test_idf = Tf_vect.transform(X_test)
# print(X_train_counts)

In [41]:
print(len(Tf_vect.vocabulary_))

# Python3 program for the above approach
from itertools import chain
 
 
# Function to print all unique keys
# present in a list of dictionaries
def UniqueKeys(arr):
 
    # Stores the list of unique keys
    res = list(set(chain.from_iterable(sub.keys() for sub in arr)))
 
    # Print the list
    print(len(str(res)))
 
# Driver Code
arr = [Tf_vect.vocabulary_]
UniqueKeys(arr)
print(tf_matrix.shape)

17068
189149
(20000, 17070)


In [12]:
print(X_train_idf)

  (0, 16553)	0.1981264202604649
  (0, 15312)	0.06565577956563604
  (0, 13822)	0.3249007884669962
  (0, 12118)	0.13315478368316536
  (0, 11586)	0.16146482167187381
  (0, 10421)	0.14839018627080183
  (0, 10349)	0.08120684094156898
  (0, 7916)	0.09585775293047588
  (0, 7373)	0.10236194729244866
  (0, 6725)	0.19983368199073806
  (0, 6394)	0.16306647710159253
  (0, 5651)	0.3597302976025241
  (0, 5532)	0.04820856068024611
  (0, 3894)	0.6694434270579819
  (0, 2037)	0.10232419084270623
  (0, 1593)	0.24989167255481493
  (0, 1498)	0.1522900632794741
  (0, 553)	0.06210454620559892
  (0, 35)	0.11143967201444599
  (1, 15101)	0.16258703711964595
  (1, 13969)	0.49534410523132905
  (1, 13845)	0.23874163282593833
  (1, 10953)	0.34218418903520653
  (1, 5878)	0.24188546999617963
  (1, 5532)	0.11405885723529079
  :	:
  (13998, 165)	0.5266461424658212
  (13999, 16794)	0.20267230070103254
  (13999, 16503)	0.41091184318735813
  (13999, 16492)	0.4330165416756648
  (13999, 15312)	0.1799899660477918
  (13999, 1

In [43]:
from sklearn.svm import SVC
model = SVC(gamma='auto')
model.fit(X_train_idf, y_train)

SVC(gamma='auto')

In [44]:
print(X_test_idf.shape)
print(y_test.shape)
predictions = model.predict(X_test_idf)

(6000, 17068)
(6000,)


In [45]:
print(predictions.shape)

(6000,)


In [58]:
print("SVM Accuracy Score -> ",accuracy_score(y_test, predictions)*100)
print("SVM Weighted Precision Scores -> ", precision_score(y_test, predictions, average = 'weighted',zero_division = 0))
print("SVM Macro Precision Scores -> ", precision_score(y_test, predictions, average = 'macro', zero_division = 0))
print("SVM Recall Scores -> ", recall_score(y_test, predictions, average = 'weighted', zero_division = 0))
print("SVM F1 Scores -> ", f1_score(y_test, predictions, average = 'weighted', zero_division = 0))
target_names = clean['emotion'].unique()
print(classification_report(y_test, predictions, target_names=target_names, zero_division = 0))


SVM Accuracy Score ->  34.050000000000004
SVM Weighted Precision Scores ->  0.11594025000000001
SVM Macro Precision Scores ->  0.05675
SVM Recall Scores ->  0.3405
SVM F1 Scores ->  0.17298060425214473
              precision    recall  f1-score   support

     sadness       0.00      0.00      0.00       758
       anger       0.00      0.00      0.00       755
        love       0.34      1.00      0.51      2043
    surprise       0.00      0.00      0.00       492
        fear       0.00      0.00      0.00      1741
         joy       0.00      0.00      0.00       211

    accuracy                           0.34      6000
   macro avg       0.06      0.17      0.08      6000
weighted avg       0.12      0.34      0.17      6000



In [59]:
# testing with the naive bayes
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train_idf, y_train)# predict the labels on validation dataset
predictions_NB = Naive.predict(X_test_idf)# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(y_test, predictions_NB)*100)
print("Naive Bayes Weighted Precision Scores -> ", precision_score(y_test, predictions_NB, average = 'weighted',zero_division = 0))
print("Naive Bayes Macro Precision Scores -> ", precision_score(y_test, predictions_NB, average = 'macro', zero_division = 0))
print("Naive Bayes Recall Scores -> ", recall_score(y_test, predictions_NB, average = 'weighted', zero_division = 0))
print("Naive Bayes F1 Scores -> ", f1_score(y_test, predictions_NB, average = 'weighted', zero_division = 0))
target_names = clean['emotion'].unique()
print(classification_report(y_test, predictions_NB, target_names=target_names, zero_division = 0))

Naive Bayes Accuracy Score ->  62.8
Naive Bayes Weighted Precision Scores ->  0.7179921158496885
Naive Bayes Macro Precision Scores ->  0.6970658729529419
Naive Bayes Recall Scores ->  0.628
Naive Bayes F1 Scores ->  0.5319638032577784
              precision    recall  f1-score   support

     sadness       0.94      0.16      0.27       758
       anger       0.99      0.11      0.20       755
        love       0.57      0.98      0.72      2043
    surprise       1.00      0.01      0.02       492
        fear       0.69      0.89      0.78      1741
         joy       0.00      0.00      0.00       211

    accuracy                           0.63      6000
   macro avg       0.70      0.36      0.33      6000
weighted avg       0.72      0.63      0.53      6000



In [8]:
model_3 = LinearSVC(verbose=0)
model_3.fit(train_df, trainLabel)

LinearSVC()

In [10]:
predictions2 = model_3.predict(test_df)

In [11]:
print("SVM Accuracy Score -> ",accuracy_score(testLabel, predictions2)*100)
print("SVM Weighted Precision Scores -> ", precision_score(testLabel, predictions2, average = 'weighted',zero_division = 0))
print("SVM Macro Precision Scores -> ", precision_score(testLabel, predictions2, average = 'macro', zero_division = 0))
print("SVM Recall Scores -> ", recall_score(testLabel, predictions2, average = 'weighted', zero_division = 0))
target_names = clean['emotion'].unique()
print(classification_report(testLabel, predictions2, target_names=target_names, zero_division = 0))

SVM Accuracy Score ->  88.85
SVM Weighted Precision Scores ->  0.8869830358661898
SVM Macro Precision Scores ->  0.8530136053037572
SVM Recall Scores ->  0.8885
              precision    recall  f1-score   support

     sadness       0.88      0.88      0.88       275
       anger       0.86      0.84      0.85       224
        love       0.90      0.93      0.92       695
    surprise       0.80      0.75      0.77       159
        fear       0.93      0.93      0.93       581
         joy       0.75      0.61      0.67        66

    accuracy                           0.89      2000
   macro avg       0.85      0.82      0.84      2000
weighted avg       0.89      0.89      0.89      2000

