In [10]:
# For a given comment/text, we are going to use classical NLP techniques 
# and classify under which emotion that particular comment belongs!

# We are going to use techniques like Bag of grams, n-grams, TF-IDF, etc. 
# for text representation and apply different classification algorithms.

# Emotion feature tells whether the given comment is fear 😨, Anger 😡, Joy 😂.
# As there are only 3 classes, this problem comes under the Multi-Class Classification.

import pandas as pd
import numpy as np

df = pd.read_csv('Emotion_classify_Data_TF_IDF__17.csv')
df.head()

(5937, 2)

In [13]:
#                                preprocessing

df.Emotion.value_counts()       # all data emotion distribution correct or almost same

# Numeric mapping to Emotion column   # can use Label encoder too, but it gives random value, here we assign number 
df['Emotion_number'] = df.Emotion.map({
    "anger":0,
    "joy":1,
    "fear":2    })




In [14]:
df.head()

Unnamed: 0,Comment,Emotion,Emotion_number
0,i seriously hate one subject to death but now ...,fear,2
1,im so full of life i feel appalled,anger,0
2,i sit here to write i start to dig out my feel...,fear,2
3,ive been really angry with r and i feel like a...,joy,1
4,i feel suspicious if there is no one outside l...,fear,2


In [37]:
#                                      Modelling without Pre-processing Text data
#  splitting data with input as Comment 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.Comment,
    df.Emotion,
    test_size=0.2,
    stratify=df.Emotion_number,
    random_state=2022
)
# X_train.shape, X_test.shape, 

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB


# Using Pipeline to Train the data 
vectorizers = [TfidfVectorizer, CountVectorizer]
models = [RandomForestClassifier, MultinomialNB ]

for each_vectorizers in vectorizers:
    for model in models:
        clf = Pipeline([
            ('each_vectorizers name', each_vectorizers() ),
            ('model name', model())
        ])
        
        clf.fit(X_train, y_train)
        print(f'clf score for model {model()} and vectorizer {each_vectorizers()} is \n',clf.score(X_test,y_test),'\n')


#get the predictions for X_test and store it in y_pred
# y_pred = clf.predict(X_test)
# print(classification_report(y_test, y_pred))


clf score for model RandomForestClassifier() and vectorizer TfidfVectorizer() is 
 0.9065656565656566 

clf score for model MultinomialNB() and vectorizer TfidfVectorizer() is 
 0.8998316498316499 

clf score for model RandomForestClassifier() and vectorizer CountVectorizer() is 
 0.9250841750841751 

clf score for model MultinomialNB() and vectorizer CountVectorizer() is 
 0.882996632996633 

