In [27]:
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [7]:
data = pd.read_csv("data/emotions/train.txt", sep=";")
data.columns = ["text", "emotion"]
data.head()

Unnamed: 0,text,emotion
0,i can go from feeling so hopeless to so damned...,sadness
1,im grabbing a minute to post i feel greedy wrong,anger
2,i am ever feeling nostalgic about the fireplac...,love
3,i am feeling grouchy,anger
4,ive been feeling a little burdened lately wasn...,sadness


In [15]:
data.text[0]

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15999 entries, 0 to 15998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     15999 non-null  object
 1   emotion  15999 non-null  object
dtypes: object(2)
memory usage: 250.1+ KB


In [9]:
data.emotion.value_counts()

emotion
joy         5362
sadness     4665
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [18]:
emotion_data = data.copy()

In [10]:
nlp = spacy.load("en_core_web_sm")

In [11]:
to_del_elements = {"no", "not", "n't"}
nlp.Defaults.stop_words = nlp.Defaults.stop_words - to_del_elements

In [12]:
nlp = spacy.load("en_core_web_sm")

In [13]:
def process(text: str) -> str:
    doc = nlp(text)
    processed_text = [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]
    return " ".join(processed_text) 

In [17]:
process("i can't go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake")

'not feel hopeless damned hopeful care awake'

In [19]:
emotion_data["text_processed"] = emotion_data["text"].apply(process)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(emotion_data["text_processed"],
                                                    emotion_data["emotion"],
                                                    test_size=0.1,
                                                    random_state=42,
                                                    stratify=emotion_data["emotion"])

In [22]:
X_train.shape, X_test.shape

((14399,), (1600,))

In [24]:
y_test[:5]

9963     sadness
10752        joy
11501       fear
4506     sadness
2852       anger
Name: emotion, dtype: object

In [26]:
label_encoder = LabelEncoder()
label_encoder.fit_transform(y_train)

array([2, 2, 2, ..., 2, 2, 5])

#### Naive Bayes

**ngram_range=(1, 2)**

In [29]:
naive_clf = Pipeline([
    ("countvectorize", CountVectorizer(ngram_range=(1, 2))),
    ("naive_bayes", MultinomialNB())
])

In [30]:
naive_clf.fit(X_train, y_train)

In [31]:
y_pred = naive_clf.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       anger       0.55      0.96      0.70       123
        fear       0.58      0.88      0.70       127
         joy       0.95      0.72      0.81       709
        love       0.24      0.89      0.38        35
     sadness       0.95      0.74      0.83       598
    surprise       0.14      1.00      0.25         8

    accuracy                           0.76      1600
   macro avg       0.57      0.86      0.61      1600
weighted avg       0.87      0.76      0.79      1600



**ngram_range=(3, 3)**

In [32]:
naive_clf = Pipeline([
    ("countvectorize", CountVectorizer(ngram_range=(3, 3))),
    ("naive_bayes", MultinomialNB())
])

In [33]:
naive_clf.fit(X_train, y_train)

In [34]:
y_pred = naive_clf.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       anger       0.05      0.67      0.09        15
        fear       0.15      0.73      0.26        41
         joy       0.96      0.38      0.54      1358
        love       0.06      0.57      0.11        14
     sadness       0.26      0.72      0.38       172
    surprise       0.00      0.00      0.00         0

    accuracy                           0.43      1600
   macro avg       0.25      0.51      0.23      1600
weighted avg       0.85      0.43      0.51      1600



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**TF-IDF**

In [35]:
naive_clf = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("naive_bayes", MultinomialNB())
])

In [36]:
naive_clf.fit(X_train, y_train)

In [37]:
y_pred = naive_clf.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       anger       0.36      0.93      0.52        84
        fear       0.33      0.89      0.48        72
         joy       0.96      0.64      0.77       804
        love       0.10      1.00      0.18        13
     sadness       0.92      0.68      0.78       627
    surprise       0.00      0.00      0.00         0

    accuracy                           0.69      1600
   macro avg       0.44      0.69      0.46      1600
weighted avg       0.88      0.69      0.74      1600



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Random Forest Classifier

**ngram_range=(1, 2)**

In [39]:
randomforest_clf = Pipeline([
    ("countvectorize", CountVectorizer(ngram_range=(1, 2))),
    ("random_forest", RandomForestClassifier())
])


randomforest_clf.fit(X_train, y_train)

In [40]:
y_pred = randomforest_clf.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       anger       0.86      0.87      0.87       214
        fear       0.83      0.93      0.88       174
         joy       0.91      0.90      0.90       544
        love       0.71      0.78      0.74       118
     sadness       0.94      0.88      0.91       500
    surprise       0.77      0.88      0.82        50

    accuracy                           0.88      1600
   macro avg       0.84      0.87      0.85      1600
weighted avg       0.89      0.88      0.88      1600



**ngram_range=(3, 3)**

In [41]:
randomforest_clf = Pipeline([
    ("countvectorize", CountVectorizer(ngram_range=(3, 3))),
    ("random_forest", RandomForestClassifier())
])


randomforest_clf.fit(X_train, y_train)

In [42]:
y_pred = randomforest_clf.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       anger       0.09      0.70      0.16        27
        fear       0.15      0.77      0.26        39
         joy       0.20      0.58      0.30       183
        love       0.08      0.58      0.15        19
     sadness       0.92      0.32      0.48      1327
    surprise       0.09      1.00      0.16         5

    accuracy                           0.38      1600
   macro avg       0.26      0.66      0.25      1600
weighted avg       0.79      0.38      0.44      1600



**TF-IDF**

In [43]:
randomforest_clf = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("random_forest", RandomForestClassifier())
])


randomforest_clf.fit(X_train, y_train)

In [44]:
y_pred = randomforest_clf.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       anger       0.82      0.85      0.84       210
        fear       0.81      0.90      0.85       176
         joy       0.90      0.84      0.87       575
        love       0.68      0.81      0.74       109
     sadness       0.90      0.88      0.89       476
    surprise       0.70      0.74      0.72        54

    accuracy                           0.85      1600
   macro avg       0.80      0.84      0.82      1600
weighted avg       0.86      0.85      0.86      1600

