In [1]:
import pandas as pd
import numpy as np


In [3]:
df = pd.read_csv('Emotion_classify_Data.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5937 entries, 0 to 5936
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Comment  5937 non-null   object
 1   Emotion  5937 non-null   object
dtypes: object(2)
memory usage: 92.9+ KB


In [5]:
df.shape

(5937, 2)

In [6]:
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [7]:
df['Emotion'].value_counts()

Emotion
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

In [10]:
df['emotion_label'] = df['Emotion'].map({'anger':0,'fear':1,'joy':2})
df.head()

Unnamed: 0,Comment,Emotion,emotion_label
0,i seriously hate one subject to death but now ...,fear,1
1,im so full of life i feel appalled,anger,0
2,i sit here to write i start to dig out my feel...,fear,1
3,ive been really angry with r and i feel like a...,joy,2
4,i feel suspicious if there is no one outside l...,fear,1


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test,y_train,y_test = train_test_split(df.Comment, df.emotion_label,test_size = 0.2, random_state = 2, stratify=df.emotion_label)

In [13]:
X_train.shape, X_test.shape

((4749,), (1188,))

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([('vectorizer',CountVectorizer(ngram_range = (3, 3))),
                ('rf',RandomForestClassifier())])

In [24]:
clf.fit(X_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer(ngram_range=(3, 3))),
                ('rf', RandomForestClassifier())])

In [25]:
y_pred = clf.predict(X_test)

In [26]:
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.55      0.30      0.39       400
           1       0.39      0.78      0.52       388
           2       0.62      0.29      0.39       400

    accuracy                           0.45      1188
   macro avg       0.52      0.46      0.43      1188
weighted avg       0.52      0.45      0.43      1188



In [31]:
from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([('vectorizer',CountVectorizer(ngram_range = (1,1))),
                ('mnb',MultinomialNB())])
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.91      0.89       400
           1       0.87      0.89      0.88       388
           2       0.94      0.88      0.91       400

    accuracy                           0.89      1188
   macro avg       0.90      0.89      0.89      1188
weighted avg       0.90      0.89      0.89      1188



In [33]:
from sklearn.ensemble import RandomForestClassifier
clf = Pipeline([('vectorizer',CountVectorizer(ngram_range = (1,1))),
                ('rf',RandomForestClassifier())])
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89       400
           1       0.93      0.91      0.92       388
           2       0.88      0.95      0.92       400

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
clf = Pipeline([('vectorizer',TfidfVectorizer()),
                ('rf',RandomForestClassifier())])
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89       400
           1       0.92      0.89      0.91       388
           2       0.86      0.95      0.90       400

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188



In [36]:
## first preprocess the Data:

import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for i in doc:
        if i.is_stop and i.is_punct:
            continue
        filtered_tokens.append(i.lemma_)
    return " ".join(filtered_tokens)    



In [38]:
df['preprocessed_comment'] = df['Comment'].apply(preprocess)

In [39]:
df.head()

Unnamed: 0,Comment,Emotion,emotion_label,preprocessed_comment
0,i seriously hate one subject to death but now ...,fear,1,I seriously hate one subject to death but now ...
1,im so full of life i feel appalled,anger,0,I m so full of life I feel appalled
2,i sit here to write i start to dig out my feel...,fear,1,I sit here to write I start to dig out my feel...
3,ive been really angry with r and i feel like a...,joy,2,I ve be really angry with r and I feel like an...
4,i feel suspicious if there is no one outside l...,fear,1,I feel suspicious if there be no one outside l...


In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test,y_train,y_test = train_test_split(df.preprocessed_comment, df.emotion_label,test_size = 0.2, random_state = 2, stratify=df.emotion_label)

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
clf = Pipeline([('vectorizer',TfidfVectorizer()),
                ('rf',RandomForestClassifier())])
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.84      0.88       400
           1       0.92      0.87      0.90       388
           2       0.83      0.94      0.88       400

    accuracy                           0.88      1188
   macro avg       0.89      0.88      0.88      1188
weighted avg       0.89      0.88      0.88      1188



In [42]:
from sklearn.ensemble import RandomForestClassifier
clf = Pipeline([('vectorizer',CountVectorizer(ngram_range = (1,1))),
                ('rf',RandomForestClassifier())])
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.84      0.88       400
           1       0.93      0.88      0.90       388
           2       0.85      0.95      0.90       400

    accuracy                           0.89      1188
   macro avg       0.90      0.89      0.89      1188
weighted avg       0.90      0.89      0.89      1188



In [46]:
X_test[:5][5026]

'I do say she could but its just a bit annoying and it remind I that I m really unfit and that I have no determination and then I feel really poo and have even less determination so its all a bit of a vicious circle'

In [44]:
y_test[:5]

4845    1
5026    0
238     0
2635    0
586     0
Name: emotion_label, dtype: int64

In [45]:
y_pred[:5]

array([1, 1, 0, 0, 0], dtype=int64)