In [2]:
import pandas as pd

In [3]:
import spacy

In [14]:
from sklearn.model_selection import train_test_split

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [43]:
from sklearn.naive_bayes import MultinomialNB

In [4]:
df = pd.read_csv('train.txt')

In [5]:
df.sample(7)

Unnamed: 0,i didnt feel humiliated,sadness
15419,i can t fly paulo coelho do you feel useful,joy
7151,i feel i find i felt target blank clasheen by ...,sadness
1036,i feel for this little pound lovely is truly a...,love
15426,i feel irritable and low but i just cannot put...,anger
849,i believe just imagining what it would be like...,joy
12726,i feel like we have so much to be thankful for,joy
11500,i shared previously the tv program and another...,sadness


In [6]:
df.rename(columns={'i didnt feel humiliated':'comment', 'sadness':'emotion'}, inplace = True)

In [7]:
df.head()

Unnamed: 0,comment,emotion
0,i can go from feeling so hopeless to so damned...,sadness
1,im grabbing a minute to post i feel greedy wrong,anger
2,i am ever feeling nostalgic about the fireplac...,love
3,i am feeling grouchy,anger
4,ive been feeling a little burdened lately wasn...,sadness


In [8]:
df.emotion.value_counts()

emotion
joy         5362
sadness     4665
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [9]:
min_samples = 2159
df_1 = df[df.emotion == 'joy'].sample(min_samples, random_state=2022)
df_2 = df[df.emotion == 'sadness'].sample(min_samples, random_state=2022)
df_3 = df[df.emotion == 'anger'].sample(min_samples, random_state=2022)

In [10]:
df_balanced = pd.concat([df_1, df_2, df_3], axis = 0)
df_balanced.emotion.value_counts()

emotion
joy        2159
sadness    2159
anger      2159
Name: count, dtype: int64

In [11]:
df_balanced.head()

Unnamed: 0,comment,emotion
15477,i want to do with my life is an amazing feelin...,joy
551,i checked on you was a long time ago i can say...,joy
4020,i should do but i think it means that i should...,joy
13216,i feel the near and lively presence of the wel...,joy
2783,i am left tonight feeling so hopeful for the f...,joy


In [12]:
df_balanced.emotion = df_balanced.emotion.map({'joy':0, 'sadness':1, 'anger':2})

In [22]:
df_balanced.sample(3)

Unnamed: 0,comment,emotion
14955,i found working out of detroit specialized in ...,2
9274,i mustered up energy to feel christmassy i rem...,2
12959,i went to german class and it made me feel so ...,1


In [15]:
xtrain, xtest, ytrain, ytest = train_test_split(df_balanced.comment, df_balanced.emotion, test_size=0.2, stratify= df_balanced.emotion, random_state = 21)

In [16]:
xtrain.shape

(5181,)

In [23]:
ytrain.shape

(5181,)

In [24]:
xtest.shape

(1296,)

In [37]:
clf = Pipeline([
    ('countvector', CountVectorizer(ngram_range=(1,3))),
    ('model', RandomForestClassifier())
])

clf.fit(xtrain, ytrain)

y_pred = clf.predict(xtest)
print(classification_report(ytest, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92       432
           1       0.87      0.94      0.90       432
           2       0.95      0.94      0.95       432

    accuracy                           0.92      1296
   macro avg       0.93      0.92      0.92      1296
weighted avg       0.93      0.92      0.92      1296



In [25]:
nlp = spacy.load('en_core_web_sm')
def preproces(text):
    processed_list = []
    doc = nlp(text)
    for token in doc:
        if token.is_punct or token.is_stop:
            continue
        processed_list.append(token.lemma_)
    return " ".join(processed_list)

In [29]:
df_balanced['preprocessed_text'] = df_balanced.comment.apply(preproces)

In [30]:
df_balanced.sample(3)

Unnamed: 0,comment,emotion,preprocessed_text
8220,i feel lethargic and crave junk food and pop,1,feel lethargic crave junk food pop
1211,i feel like i have been beaten hard with a bas...,1,feel like beat hard baseball bat arm doctor sa...
4300,i feel so horrible that you had to go through ...,1,feel horrible grow little early friend


In [31]:
xtrain, xtest, ytrain, ytest = train_test_split(df_balanced.preprocessed_text, df_balanced.emotion, test_size=0.2, stratify= df_balanced.emotion, random_state = 21)

In [32]:
xtrain.shape

(5181,)

In [33]:
ytest.shape

(1296,)

In [36]:
clf1 = Pipeline([
    ('countvector', CountVectorizer(ngram_range=(1,3))),
    ('model', RandomForestClassifier(n_estimators=50))
])

clf1.fit(xtrain, ytrain)

y_pred = clf1.predict(xtest)
print(classification_report(ytest, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.88      0.91       432
           1       0.87      0.94      0.90       432
           2       0.95      0.94      0.95       432

    accuracy                           0.92      1296
   macro avg       0.92      0.92      0.92      1296
weighted avg       0.92      0.92      0.92      1296



In [42]:
clf2 = Pipeline([
    ('TFIDFvector', TfidfVectorizer(ngram_range=(1,3))),
    ('model', RandomForestClassifier(n_estimators=100))
])

clf2.fit(xtrain, ytrain)

y_pred = clf2.predict(xtest)
print(classification_report(ytest, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91       432
           1       0.89      0.90      0.89       432
           2       0.94      0.93      0.93       432

    accuracy                           0.91      1296
   macro avg       0.91      0.91      0.91      1296
weighted avg       0.91      0.91      0.91      1296



In [44]:
clf3 = Pipeline([
    ('TFIDFvector', TfidfVectorizer(ngram_range=(1,3))),
    ('model', MultinomialNB())
])

clf3.fit(xtrain, ytrain)

y_pred = clf3.predict(xtest)
print(classification_report(ytest, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.90      0.92       432
           1       0.89      0.91      0.90       432
           2       0.93      0.95      0.94       432

    accuracy                           0.92      1296
   macro avg       0.92      0.92      0.92      1296
weighted avg       0.92      0.92      0.92      1296



### TF-IDF

In [45]:
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [47]:
v = TfidfVectorizer()
v.fit(corpus)
print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [73]:
#tf

mat = v.transform(corpus)
print(mat.toarray())

[[0.24266547 0.         0.         0.         0.         0.
  0.         0.24266547 0.         0.         0.40286636 0.
  0.         0.         0.         0.24266547 0.11527033 0.24266547
  0.         0.         0.         0.         0.72799642 0.
  0.         0.24266547 0.         0.        ]
 [0.         0.         0.         0.         0.30652086 0.5680354
  0.         0.         0.         0.         0.         0.
  0.         0.         0.5680354  0.         0.26982671 0.
  0.         0.         0.30652086 0.         0.         0.
  0.         0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.30652086 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.26982671 0.
  0.         0.5680354  0.30652086 0.         0.         0.
  0.5680354  0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.30652086 0.
  0.         0.         0.         0.         0.         0.
  0.

In [66]:
all_feture_names = v.get_feature_names_out()
all_feture_names


array(['already', 'am', 'amazon', 'and', 'announcing', 'apple', 'are',
       'ate', 'biryani', 'dot', 'eating', 'eco', 'google', 'grapes',
       'iphone', 'ironman', 'is', 'loki', 'microsoft', 'model', 'new',
       'pixel', 'pizza', 'surface', 'tesla', 'thor', 'tomorrow', 'you'],
      dtype=object)

In [72]:
#let's print the idf of each word:

for word in all_feture_names:

    #let's get the index in the vocabulary
    indx = v.vocabulary_.get(word)
    
    #get the score
    idf_score = v.idf_[indx]
    
    print(f'{word} : {idf_score}')

already : 2.386294361119891
am : 2.386294361119891
amazon : 2.386294361119891
and : 2.386294361119891
announcing : 1.2876820724517808
apple : 2.386294361119891
are : 2.386294361119891
ate : 2.386294361119891
biryani : 2.386294361119891
dot : 2.386294361119891
eating : 1.9808292530117262
eco : 2.386294361119891
google : 2.386294361119891
grapes : 2.386294361119891
iphone : 2.386294361119891
ironman : 2.386294361119891
is : 1.1335313926245225
loki : 2.386294361119891
microsoft : 2.386294361119891
model : 2.386294361119891
new : 1.2876820724517808
pixel : 2.386294361119891
pizza : 2.386294361119891
surface : 2.386294361119891
tesla : 2.386294361119891
thor : 2.386294361119891
tomorrow : 1.2876820724517808
you : 2.386294361119891
