## 1. Load dataset

In [50]:
import nltk
from nltk.tokenize import word_tokenize

In [51]:
from google.colab import drive
drive.mount('/content/drive') #Processed_Lemmatization_emotions.json

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
import pandas as pd
df = pd.read_json('/content/drive/MyDrive/DA-23/Pr_8/Processed_Lemmatization_emotions.json')

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19606 entries, 0 to 19998
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  19606 non-null  object
 1   Emotion   19606 non-null  object
dtypes: object(2)
memory usage: 459.5+ KB


In [54]:
df.describe()

Unnamed: 0,Sentence,Emotion
count,19606,19606
unique,19538,6
top,im feeling little stressed,joy
freq,3,6644


In [55]:
df['Emotion'].value_counts()

joy         6644
sadness     5675
anger       2640
fear        2318
love        1619
surprise     710
Name: Emotion, dtype: int64

## 2. Prepocessing

In [None]:
df_balanced = pd.DataFrame(columns=['Sentence','Emotion'])
df_balanced = df_balanced.append(df[df.Emotion=='joy'].sample(n=710, random_state=1, ignore_index=True)) # good
df_balanced = df_balanced.append(df[df.Emotion=='sadness'].sample(n=710, random_state=1, ignore_index=True)) # bad
df_balanced = df_balanced.append(df[df.Emotion=='anger'].sample(n=710, random_state=1, ignore_index=True)) # bad
df_balanced = df_balanced.append(df[df.Emotion=='fear'].sample(n=710, random_state=1, ignore_index=True)) # bad
df_balanced = df_balanced.append(df[df.Emotion=='love'].sample(n=710, random_state=1, ignore_index=True)) # good
df_balanced = df_balanced.append(df[df.Emotion=='surprise'].sample(n=710, random_state=1, ignore_index=True)) # good

In [57]:
#df = df_balanced
df['Emotion'].value_counts()

joy         6644
sadness     5675
anger       2640
fear        2318
love        1619
surprise     710
Name: Emotion, dtype: int64

In [58]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

In [59]:
y = df['Emotion'].values
pd.get_dummies(df).astype('float32').values
enc = LabelEncoder().fit(y)
print(set(y))
y = enc.transform(y)
print("y:", set(y))
one_hot_labels = to_categorical(y)
print(one_hot_labels)

{'anger', 'fear', 'surprise', 'sadness', 'joy', 'love'}
y: {0, 1, 2, 3, 4, 5}
[[1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 ...
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]


In [60]:
corpus = df['Sentence'].values
corpus

array(['boyfriend split came friend house visiting male friend confrontation another room tried find aroused friend feeling part',
       'certain friend tried push seat violent way apparent reason may excited something',
       'father child killed accident', ..., 'stayed vienna class',
       'able afford new sofa leather one cost sek waited year',
       'winning rowing race karapiro regatta friend'], dtype=object)

In [61]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
ml = 10
nw = 1000

In [62]:
tokenizer = Tokenizer(num_words=nw, oov_token='<OOV>')
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
sentences = tokenizer.texts_to_sequences(corpus)

In [63]:
sentences[:2]

[[843, 1, 255, 44, 251, 1, 1, 44, 1, 146, 293, 548, 68, 1, 44, 3, 112],
 [499, 44, 548, 944, 1, 418, 24, 1, 163, 134, 160, 26]]

In [64]:
padded = pad_sequences(sentences, maxlen=ml)
padded[:2]

array([[ 44,   1, 146, 293, 548,  68,   1,  44,   3, 112],
       [548, 944,   1, 418,  24,   1, 163, 134, 160,  26]], dtype=int32)

In [65]:
print(len(one_hot_labels))
print(len(padded))
print(type(one_hot_labels))
print(type(padded))

19606
19606
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


## 3. Model

In [66]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [67]:
#
def create_model(max_features = 10000,emb_dim = 128):
    model = keras.Sequential()
    model.add(layers.Embedding(max_features, emb_dim))
    model.add(layers.GRU(128))
    model.add(layers.Dense(6, activation='softmax'))
    model.compile(optimizer="adam",
                  loss=keras.losses.categorical_crossentropy,
                  metrics=["CategoricalAccuracy"])
    return model


## 4. Prediction

In [68]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix

In [69]:
xtr, xts, ytr, yts = train_test_split(padded, one_hot_labels, test_size=0.2, random_state=42)

In [70]:
print(xtr.shape)
print(ytr.shape)

print(xts.shape)
print(yts.shape)

(15684, 10)
(15684, 6)
(3922, 10)
(3922, 6)


In [71]:
model = create_model(max_features=nw)
print(model.summary())


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 128)         128000    
                                                                 
 gru_1 (GRU)                 (None, 128)               99072     
                                                                 
 dense_1 (Dense)             (None, 6)                 774       
                                                                 
Total params: 227846 (890.02 KB)
Trainable params: 227846 (890.02 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [72]:
history_cv = model.fit(xtr,
                    ytr,
                    epochs=5,
                    batch_size=64,
                    validation_split=0.2,
                    shuffle = True,
                    verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [73]:
predictions = model.predict(xts)#.argmax



In [74]:
predictions=enc.inverse_transform(predictions.argmax(axis=1)).tolist()
predictions[:10]

['sadness',
 'anger',
 'surprise',
 'anger',
 'anger',
 'sadness',
 'joy',
 'anger',
 'joy',
 'joy']

In [75]:
labels_test = enc.inverse_transform(yts.argmax(axis=-1).tolist()).tolist()
labels_test[:10]

['sadness',
 'anger',
 'surprise',
 'anger',
 'anger',
 'sadness',
 'joy',
 'anger',
 'sadness',
 'joy']

In [77]:
print(classification_report(labels_test, predictions))
###  THESE WERE VALUES FOR BALANCED DATASET
#              precision    recall  f1-score   support
#
#       anger       0.60      0.75      0.66       134
#        fear       0.68      0.72      0.70       143
#         joy       0.67      0.60      0.64       161
#        love       0.69      0.82      0.75       123
#     sadness       0.73      0.54      0.62       158
#    surprise       0.86      0.81      0.84       133
#
#    accuracy                           0.70       852
#   macro avg       0.71      0.71      0.70       852
#weighted avg       0.71      0.70      0.70       852

              precision    recall  f1-score   support

       anger       0.74      0.75      0.74       531
        fear       0.81      0.76      0.78       466
         joy       0.86      0.84      0.85      1303
        love       0.76      0.69      0.73       349
     sadness       0.81      0.88      0.85      1146
    surprise       0.86      0.72      0.79       127

    accuracy                           0.81      3922
   macro avg       0.81      0.77      0.79      3922
weighted avg       0.81      0.81      0.81      3922

