In [None]:
#Model architecture inspired by Y. Kim, "Convolutional Neural Networks for Sentence Classification", 2014
import gensim.downloader as api

# Catalogue of the models/datasets known to gensim's downloader. `info` is not
# referenced again in the visible cells; it is only useful for interactive inspection.
info = api.info()  # show info about available models/datasets
# Pretrained word vectors, later looked up word-by-word to fill the 300-column
# embedding matrix that initialises the (frozen) Embedding layer.
w2v = api.load("word2vec-google-news-300") 

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import re
import nltk
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Flatten
from keras.layers.embeddings import Embedding

In [None]:
# Load the dataset from CSV; the path is a placeholder that must be replaced
# with the real file location before running.
df = pd.read_csv('LOCAL_PATH_TO_DATASET')
# Keep only the label column ('Emotion') and the raw text column ('Statement').
df = df[['Emotion','Statement']]
# Preview the first rows to confirm the expected columns are present.
display(df.head())

Unnamed: 0,Emotion,Statement
0,guilt,Once when I was in the cell group (religious a...
1,shame,When I overslept for the second time on the da...
2,shame,I had not punched a ticket in the bus because ...
3,disgust,When a man spoke very sexistly in the company ...
4,shame,About a dozen girls laughed at me and I was su...


In [None]:
def process_text(document, stop_words=None):
    """Normalise one raw statement into a space-separated string of tokens.

    Steps: coerce to ``str``, replace every non-word character with a space,
    drop stand-alone single ASCII letters, lowercase, split on whitespace and
    remove stop words.

    Arguments:
        document -- the raw text. Non-string values are coerced with ``str``;
            missing values (``None`` / float NaN) yield an empty string
            instead of raising or producing the literal token "nan".
        stop_words -- optional collection of lowercase words to remove.
            Defaults to the notebook-level ``en_stop`` set, which must be
            defined before the first call that relies on the default.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    if stop_words is None:
        # Looked up at call time so the function can be defined before the
        # stop-word cell has run.
        stop_words = en_stop

    # Missing cells: NaN is the only float that is not equal to itself.
    if document is None or (isinstance(document, float) and document != document):
        return ''

    # Coerce BEFORE any regex work; re.sub raises TypeError on non-strings.
    document = str(document)

    # Replace all special characters (anything that is not [A-Za-z0-9_] or a
    # Unicode word character) with a space. Whitespace runs are handled by
    # split() below, so no separate collapsing pass is needed.
    document = re.sub(r'\W', ' ', document)

    # Remove every stand-alone single letter. Lookarounds (instead of consuming
    # the surrounding whitespace) make this work for adjacent letters
    # ("x y z") and for letters at the very start or end of the text.
    document = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', document)

    # Lowercase so the stop-word comparison is case-insensitive.
    document = document.lower()

    # Whitespace tokenisation followed by stop-word removal.
    tokens = [token for token in document.split() if token not in stop_words]

    return ' '.join(tokens)

In [None]:
from tqdm import tqdm
import re
from nltk import WordNetLemmatizer

# Fetch the NLTK corpora used by this cell.
nltk.download('stopwords')
# For sentence tokenization
#nltk.download('punkt')
nltk.download('wordnet')

# English stop-word set read by process_text when filtering tokens.
en_stop = set(stopwords.words('english'))

# NOTE(review): despite its name this is a WordNet lemmatizer, and it is not
# used by any of the visible cells.
stemmer = WordNetLemmatizer()

# Clean every statement and keep the result next to the raw text.
df['preprocessedStatement'] = df['Statement'].apply(process_text)
display(df.head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Emotion,Statement,preprocessedStatement
0,guilt,Once when I was in the cell group (religious a...,cell group religious activity found almost eve...
1,shame,When I overslept for the second time on the da...,overslept second time day examination
2,shame,I had not punched a ticket in the bus because ...,punched ticket bus card ticket collector came ...
3,disgust,When a man spoke very sexistly in the company ...,man spoke sexistly company friends mine
4,shame,About a dozen girls laughed at me and I was su...,dozen girls laughed sure nothing wrong


In [None]:
# Fit the Keras tokenizer on the cleaned statements and encode each statement
# as a sequence of integer word ids.
t = Tokenizer()
t.fit_on_texts(df['preprocessedStatement'])
# +1 because the tokenizer's word indices start at 1; row/index 0 is left for padding.
vocab_size = len(t.word_index) + 1
encoded_text = t.texts_to_sequences(df['preprocessedStatement'])

# Pad to the longest ENCODED sequence. Measuring the length with str.split()
# can under-count, because the Tokenizer applies its own filters/splitting
# (e.g. it breaks tokens on underscores); pad_sequences would then silently
# truncate the longest statements.
max_length = max((len(sequence) for sequence in encoded_text), default=0)
X = pad_sequences(encoded_text, maxlen=max_length, padding='post')

In [None]:
# Build the (vocab_size x 300) weight matrix for the Embedding layer: row i holds
# the pretrained word2vec vector of the word whose tokenizer index is i.
# Row 0 (padding) and rows of words missing from word2vec stay all-zero.
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    try:
        embedding_vector = w2v[word]
    except KeyError:
        # Out-of-vocabulary word: skip it explicitly. Falling through here
        # would copy the PREVIOUS word's vector into this row (or raise
        # NameError if the very first word is unknown).
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
import tensorflow as tf
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
# Encode the labels in column 'Emotion' as integer class ids.
# The result is stored in a new Series instead of overwriting/popping the
# column, so `df` keeps its original labels and this cell can be re-run
# without a KeyError.
y = pd.Series(le.fit_transform(df['Emotion']), index=df.index, name='Emotion')
# One-hot targets; the width follows the number of classes actually found
# rather than a hard-coded constant.
y_new = tf.keras.utils.to_categorical(y, num_classes=len(le.classes_))
print(y_new)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]


In [None]:
# Emotion names in encoded-id order (position k is the name of class id k).
# Reads the encoder's fitted classes directly instead of decoding a
# hard-coded list of ids, so it stays correct if the label set changes.
list(le.classes_)

['anger', 'disgust', 'fear', 'guilt', 'joy', 'sadness', 'shame']

In [None]:
# Number of samples per encoded emotion id, to check how balanced the classes are.
y.value_counts()

4    1092
5    1082
0    1079
2    1076
6    1071
1    1066
3    1050
Name: Emotion, dtype: int64

In [None]:
# 2 options to handle imbalanced dataset: class_weight or focal loss
# The weights are derived from the observed label counts using the usual
# "balanced" heuristic, n_samples / (n_classes * class_count), so they always
# reflect this dataset. (The label counts shown above are nearly uniform, so
# the resulting weights are all close to 1.)
label_counts = y.value_counts()
class_weight = {
    int(label): float(len(y) / (len(label_counts) * count))
    for label, count in label_counts.items()
}

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split (and therefore the reported metrics) is reproducible
# across kernel restarts.
SPLIT_SEED = 42

# Hold out 5% of the data as a class-stratified test set.
x_train, x_test, y_train, y_test = train_test_split(
    X, y_new, test_size=0.05, stratify=y_new, random_state=SPLIT_SEED
)

# Carve the first 100 (already shuffled) training samples off as the
# validation set used for early stopping; the rest is the training set.
x_val = x_train[:100]
y_val = y_train[:100]
x_train = x_train[100:]
y_train = y_train[100:]

In [None]:
from tensorflow.keras import activations

def focal_loss(gamma=2., alpha=4., from_logits=False):
    """Create a multi-class focal-loss function usable as a Keras `loss`.

    Focal loss: FL(p_t) = -alpha * (1 - p_t)^gamma * ln(p_t)
    (Focal Loss for Dense Object Detection, https://arxiv.org/abs/1708.02002)

    Keyword Arguments:
        gamma {float} -- exponent of the (1 - p_t) modulating factor (default: {2.0})
        alpha {float} -- constant multiplier applied to the loss (default: {4.0})
        from_logits {bool} -- if True, softmax is applied to y_pred first;
            otherwise y_pred is used as given (default: {False})

    Returns:
        [callable] -- loss function taking (y_true, y_pred).
    """
    focusing = float(gamma)
    scale = float(alpha)

    def focal_loss_fixed(y_true, y_pred):
        """Compute the focal loss for one batch.

        Arguments:
            y_true {tensor} -- ground truth labels, shape of [batch_size, num_cls]
            y_pred {tensor} -- model's output, shape of [batch_size, num_cls]

        Returns:
            [tensor] -- loss (mean over the batch of the per-sample maximum
            over classes).
        """
        tiny = 1.e-9  # added to the predictions before the log
        targets = tf.cast(y_true, dtype=tf.float32)
        probs = tf.cast(y_pred, dtype=tf.float32)
        if from_logits:
            probs = activations.softmax(probs)

        shifted = probs + tiny
        # Per-class cross-entropy term, masked by the targets.
        cross_entropy = targets * (-tf.math.log(shifted))
        # Modulating factor (1 - p)^gamma, also masked by the targets.
        modulator = targets * tf.pow(1. - shifted, focusing)
        per_class = scale * (modulator * cross_entropy)
        per_sample = tf.reduce_max(per_class, axis=1)
        return tf.reduce_mean(per_sample)

    return focal_loss_fixed


In [None]:
# Sizes are read from the data prepared above instead of being hard-coded.
embedding_dim = embedding_matrix.shape[1]  # width of the pretrained vectors
num_classes = y_new.shape[1]               # one output unit per one-hot column

# Stop training once validation loss has not improved for 3 consecutive epochs.
callback = EarlyStopping(monitor='val_loss', patience=3)

# Kim-style text CNN: frozen pretrained embeddings -> 1D convolution ->
# max-over-time pooling -> dense classifier.
model_w2v = Sequential()
model_w2v.add(Embedding(vocab_size, embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False))
model_w2v.add(Conv1D(128, 5, activation='relu'))
# GlobalMaxPooling1D already returns a 2D (batch, filters) tensor, so the
# Flatten layer that used to follow it was a no-op and has been dropped.
model_w2v.add(GlobalMaxPooling1D())
model_w2v.add(Dense(64, activation='relu'))
model_w2v.add(Dense(num_classes, activation='softmax'))
#model_w2v.compile(loss=focal_loss(alpha=1), optimizer='adam', metrics=['accuracy']) #use if you are using focal loss
model_w2v.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #use if you are using class_weight
model_w2v.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 78, 300)           2651700   
_________________________________________________________________
conv1d (Conv1D)              (None, 74, 128)           192128    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 455       
Total params: 2,852,539
Trainable params: 200,839
Non-trainable params: 2,651,700
_________________________________________________________________


In [None]:
from tensorflow.keras.utils import plot_model
# Render the layer graph of the model, annotated with each layer's tensor shapes.
plot_model(model_w2v,show_shapes= True)

In [None]:
# Train for at most 20 epochs, validating on (x_val, y_val) after each epoch; the
# EarlyStopping callback (monitor='val_loss', patience=3) may end training sooner.
model_w2v.fit(x_train, y_train, epochs = 20, validation_data=(x_val, y_val), callbacks=[callback]) #insert class_weight=class_weight if using class_weight

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20


<keras.callbacks.History at 0x7f07d305c190>

In [None]:
# Run the trained model on the held-out test inputs; each row of y_pred is the
# output of the final softmax layer (one score per class).
y_pred = model_w2v.predict(x_test)

In [None]:
# Collapse the per-class score rows (predictions) and the one-hot rows (ground
# truth) to integer class ids by taking the index of the largest entry per row.
y_pred_clean = y_pred.argmax(axis=1)
y_test_clean = y_test.argmax(axis=1)

In [None]:
# Per-class precision / recall / F1 on the test set. `labels` pins the row
# order to the encoder's class ids and `target_names` prints the emotion names
# instead of opaque integers, so the report can be read without a lookup table.
print(classification_report(
    y_test_clean,
    y_pred_clean,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

              precision    recall  f1-score   support

           0       0.37      0.33      0.35        54
           1       0.56      0.57      0.56        53
           2       0.56      0.78      0.65        54
           3       0.43      0.42      0.43        52
           4       0.63      0.82      0.71        55
           5       0.79      0.56      0.65        54
           6       0.61      0.43      0.50        54

    accuracy                           0.56       376
   macro avg       0.56      0.56      0.55       376
weighted avg       0.56      0.56      0.55       376

