In [11]:
import pandas as pd

In [12]:
# Load the labelled comments; the path is relative to the working directory.
df = pd.read_csv('Emotion_classify_Data.csv')
# Print (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

(5937, 2)


Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [13]:
# Summarise column names, non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5937 entries, 0 to 5936
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Comment  5937 non-null   object
 1   Emotion  5937 non-null   object
dtypes: object(2)
memory usage: 92.9+ KB


In [14]:
# Descriptive statistics; for object columns this reports count, unique, top and freq.
df.describe()

Unnamed: 0,Comment,Emotion
count,5937,5937
unique,5934,3
top,i feel like a tortured artist when i talk to her,anger
freq,2,2000


In [15]:
# Class balance check: number of comments per emotion label.
df['Emotion'].value_counts()

anger    2000
joy      2000
fear     1937
Name: Emotion, dtype: int64

In [16]:
# Encode each emotion label as an integer class id for the classifiers.
emotion_to_num = {'anger': 0, 'joy': 1, 'fear': 2}
df['Emotion_num'] = df['Emotion'].map(emotion_to_num)

# Series.map turns any label missing from the mapping into NaN, which would
# silently make the target a float column; fail loudly here instead.
unmapped = df.loc[df['Emotion_num'].isna(), 'Emotion'].unique()
assert len(unmapped) == 0, f"Unmapped emotion labels: {list(unmapped)}"

In [17]:
# Preview the frame to confirm the new Emotion_num column lines up with Emotion.
df.head()

Unnamed: 0,Comment,Emotion,Emotion_num
0,i seriously hate one subject to death but now ...,fear,2
1,im so full of life i feel appalled,anger,0
2,i sit here to write i start to dig out my feel...,fear,2
3,ive been really angry with r and i feel like a...,joy,1
4,i feel suspicious if there is no one outside l...,fear,2


In [18]:
# Drop the text label now that Emotion_num carries the target.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run is a no-op instead of a KeyError.
df = df.drop(columns='Emotion', errors='ignore')

In [19]:
# Preview the frame after removing the Emotion column.
df.head()

Unnamed: 0,Comment,Emotion_num
0,i seriously hate one subject to death but now ...,2
1,im so full of life i feel appalled,0
2,i sit here to write i start to dig out my feel...,2
3,ive been really angry with r and i feel like a...,1
4,i feel suspicious if there is no one outside l...,2


In [20]:
import spacy

# Load the small English pipeline used for tokenisation and lemmatisation.
# The dependency parser and named-entity recogniser are not used anywhere in
# this notebook, so they are disabled to avoid running them on every comment.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def preprocess(text):
    """Reduce ``text`` to the space-joined lemmas of its content tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace-only tokens are discarded; every
    remaining token contributes its lemma.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        Lemmas of the kept tokens joined by single spaces; an empty string
        when no token survives the filter.
    """
    # Whitespace tokens (e.g. from repeated spaces) are neither stop words nor
    # punctuation, so they must be excluded explicitly or they leak into the
    # joined output as stray blanks.
    return ' '.join(
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct or token.is_space)
    )

In [21]:
# Clean every comment once and keep the result alongside the raw text.
df['Processed_comment'] = df['Comment'].apply(preprocess)

In [22]:
# Preview raw and processed comments side by side.
df.head()

Unnamed: 0,Comment,Emotion_num,Processed_comment
0,i seriously hate one subject to death but now ...,2,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,0,m life feel appalled
2,i sit here to write i start to dig out my feel...,2,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,1,ve angry r feel like idiot trust place
4,i feel suspicious if there is no one outside l...,2,feel suspicious outside like rapture happen


In [27]:
# Raw text of the row labelled 0, for comparison with its processed form.
df.loc[0, 'Comment']

'i seriously hate one subject to death but now i feel reluctant to drop it'

In [28]:
# Processed text of the row labelled 0 (stop words and punctuation removed, lemmatised).
df.loc[0, 'Processed_comment']

'seriously hate subject death feel reluctant drop'

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is seeded for
# reproducibility and stratified so each class keeps its share in both parts.
features = df['Processed_comment']
labels = df['Emotion_num']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=1,
    stratify=labels,
)

In [32]:
# Sizes of the training and test inputs produced by the split.
X_train.shape, X_test.shape

((4749,), (1188,))

In [33]:
# Per-class counts in the training labels (checks the stratified split).
y_train.value_counts()

1    1600
0    1600
2    1549
Name: Emotion_num, dtype: int64

In [34]:
# Per-class counts in the test labels (checks the stratified split).
y_test.value_counts()

1    400
0    400
2    388
Name: Emotion_num, dtype: int64

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# First model: TF-IDF features feeding a multinomial Naive Bayes classifier.
# Wrapping both in one Pipeline means the vectorizer is fitted only on
# whatever data is passed to clf.fit.
nb_steps = [
    ('tf-idf_Vectorizer', TfidfVectorizer()),
    ('naive bayes', MultinomialNB()),
]
clf = Pipeline(steps=nb_steps)

In [36]:
# Fit the whole pipeline (vectorizer, then classifier) on the training split.
clf.fit(X_train,y_train)

In [37]:
# Predict class ids for the held-out comments.
y_pred = clf.predict(X_test)

In [38]:
# First three held-out comments, for a manual sanity check of the predictions.
X_test[:3]

1812              hope enjoy read feel free leave comment
4553                                   ve feel distressed
4580    lose lot nesting homemaking instinct desire pr...
Name: Processed_comment, dtype: object

In [39]:
# True class ids of those first three held-out comments.
y_test[:3]

1812    1
4553    2
4580    1
Name: Emotion_num, dtype: int64

In [40]:
# Predicted class ids for the same three comments.
y_pred[:3]

array([1, 2, 1], dtype=int64)

In [41]:
# Per-class precision, recall and F1 on the test split (0=anger, 1=joy, 2=fear).
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91       400
           1       0.94      0.89      0.92       400
           2       0.91      0.90      0.91       388

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Second model: TF-IDF features feeding a random forest.
# The forest is seeded (same seed as the train/test split) because its
# bootstrap sampling and feature selection are random; without a seed the
# reported metrics change on every run.
clf = Pipeline([
    ('tf-idf_Vectorizer', TfidfVectorizer()),
    ('random forest', RandomForestClassifier(random_state=1)),
])

# Fit on the training split, then score on the held-out split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       400
           1       0.93      0.92      0.92       400
           2       0.94      0.91      0.92       388

    accuracy                           0.92      1188
   macro avg       0.92      0.92      0.92      1188
weighted avg       0.92      0.92      0.92      1188



In [44]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Third model: raw term counts (bag of words) feeding a random forest.
# The forest is seeded (same seed as the train/test split) because its
# bootstrap sampling and feature selection are random; without a seed the
# reported metrics change on every run.
clf = Pipeline([
    ('count_Vectorizer', CountVectorizer()),
    ('random forest', RandomForestClassifier(random_state=1)),
])

# Fit on the training split, then score on the held-out split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92       400
           1       0.95      0.93      0.94       400
           2       0.95      0.92      0.93       388

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188

