In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [18]:
data = pd.read_csv('emotion_sentimen_dataset.csv')
data.head(10)

Unnamed: 0.1,Unnamed: 0,text,Emotion
0,0,i seriously hate one subject to death but now ...,hate
1,1,im so full of life i feel appalled,neutral
2,2,i sit here to write i start to dig out my feel...,neutral
3,3,ive been really angry with r and i feel like a...,anger
4,4,i feel suspicious if there is no one outside l...,neutral
5,5,i feel jealous becasue i wanted that kind of l...,love
6,6,when a friend of mine keeps telling me morbid ...,neutral
7,7,i finally fell asleep feeling angry useless an...,worry
8,8,i feel a bit annoyed and antsy in a good way,neutral
9,9,i feel like i ve regained another vital part o...,neutral


In [19]:
data['Emotion'].unique()

array(['hate', 'neutral', 'anger', 'love', 'worry', 'relief', 'happiness',
       'fun', 'empty', 'enthusiasm', 'sadness', 'surprise', 'boredom'],
      dtype=object)

In [20]:
data= data.drop(columns=['Unnamed: 0'])

In [21]:
data.head(20)

Unnamed: 0,text,Emotion
0,i seriously hate one subject to death but now ...,hate
1,im so full of life i feel appalled,neutral
2,i sit here to write i start to dig out my feel...,neutral
3,ive been really angry with r and i feel like a...,anger
4,i feel suspicious if there is no one outside l...,neutral
5,i feel jealous becasue i wanted that kind of l...,love
6,when a friend of mine keeps telling me morbid ...,neutral
7,i finally fell asleep feeling angry useless an...,worry
8,i feel a bit annoyed and antsy in a good way,neutral
9,i feel like i ve regained another vital part o...,neutral


In [22]:
print(data['Emotion'].value_counts())  # Check class distribution 

Emotion
neutral       674538
love           39553
happiness      27175
sadness        17481
relief         16729
hate           15267
anger          12336
fun            10075
enthusiasm      9304
surprise        6954
empty           5542
worry           4475
boredom          126
Name: count, dtype: int64


In [24]:
#THIS PART OF CODE IS USED TO REMOVE 
#STOPWORDS-> THESE ARE WORDS LIKE 'THE','IS' WHICH DO NOT ADD MEANING
#WORDNET LEMMATIZER-> THIS REDUCES WORDS TO THIER BASE MEANING. EG: "RUNNING" INTO "RUN"
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove URLs, mentions, and special characters
    text = re.sub(r'http\S+|@\w+|&amp;|[^A-Za-z\s]', '', text)
    # Lowercase and tokenize
    tokens = text.lower().split()
    # Lemmatize and remove stopwords
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

data['cleaned_content'] = data['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/varunshrivathsa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/varunshrivathsa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
data.head(5)

Unnamed: 0,text,Emotion,cleaned_content
0,i seriously hate one subject to death but now ...,hate,seriously hate one subject death feel reluctan...
1,im so full of life i feel appalled,neutral,im full life feel appalled
2,i sit here to write i start to dig out my feel...,neutral,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,anger,ive really angry r feel like idiot trusting fi...
4,i feel suspicious if there is no one outside l...,neutral,feel suspicious one outside like rapture happe...


In [26]:
#TF-> TERM FREQUENCY(ONLY WITHIN A SPECIFIC DOCUMENT)
#NUMBER OF TIMES A WORD APPEARS IN A DOCUMENT
# COUNT OF T IN D/TOTAL TERMS IN D

#IDF- INVERSE DOCUMENT FREQUENCY(ACROSS COLLECTION OF DOCUMENTS)
#IDF = LOG(TOTAL NUMBER OF DOCUMENTS/NUMBER OF DOCS WITH T)

#FEATURE SELECTION
# X-> INPUT - WORDS 
# X-> OUTPUT - SENTIMENT 

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)  
X = vectorizer.fit_transform(data['cleaned_content'])
y = data['Emotion']

In [27]:
#UNDERSTANDING THE OUTPUT-
#1ST ROW CONTAINS 8 WORDS WITH THE INDEX OF THOSE WORDS 2383,2519 FOLLOWED BY THE TF-IDF = TF*IDF SCORES
#HIGHER THE TF-IDF SCORES-> HIGHER IMPORTANCE THE WORD HAS
print(X[0])

  (0, 3864)	0.40354238034305345
  (0, 2038)	0.3210524084849204
  (0, 3028)	0.23658981976259194
  (0, 4262)	0.4315572698065044
  (0, 1107)	0.39395287970571735
  (0, 1658)	0.0774686085106685
  (0, 3596)	0.37152939190691586
  (0, 1349)	0.43889421567300846


In [28]:
#SPLITTING DATASET INTO TESTING AND TRAINING
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [29]:
#training the model
from sklearn.tree import DecisionTreeClassifier

regmodel = DecisionTreeClassifier(class_weight='balanced',   max_depth=30,
    min_samples_leaf=1,
    min_samples_split=10,
    random_state=42 )
regmodel.fit(X_train, y_train)


In [30]:
#PREDICTION
from sklearn.metrics import accuracy_score, classification_report

y_pred = regmodel.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))


Accuracy: 0.95
              precision    recall  f1-score   support

       anger       0.97      0.64      0.77      2489
     boredom       1.00      0.81      0.89        21
       empty       0.99      0.82      0.90      1096
  enthusiasm       1.00      0.96      0.98      1839
         fun       1.00      0.85      0.92      1977
   happiness       0.99      0.49      0.66      5370
        hate       0.99      0.92      0.95      3018
        love       1.00      0.73      0.84      8001
     neutral       0.94      1.00      0.97    134999
      relief       0.97      0.60      0.74      3396
     sadness       1.00      0.87      0.93      3428
    surprise       0.99      0.88      0.93      1372
       worry       0.94      0.96      0.95       905

    accuracy                           0.95    167911
   macro avg       0.98      0.81      0.88    167911
weighted avg       0.95      0.95      0.94    167911

