In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GlobalAveragePooling1D, Flatten
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelBinarizer

In [28]:
# Root directory that contains the `dataset/` folder read by the loading cell.
# Google Drive location used when running on Colab, kept for reference:
# path = '/content/drive/MyDrive/Capstone/'
path = '.'

In [29]:
def _read_split(file_path):
    """Read one `sentence;label` text file.

    Each non-blank line is split on its LAST `;`, so a semicolon inside the
    sentence does not corrupt the label. Blank lines are skipped.

    Args:
        file_path: Path of the split file to read.

    Returns:
        A `(sentences, labels)` tuple of two equally long lists of strings.

    Raises:
        ValueError: If a non-blank line contains no `;` separator.
    """
    sentences, labels = [], []
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            if not raw_line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline at EOF)
            sentence, sep, label = raw_line.rpartition(";")
            if not sep:
                raise ValueError(
                    f"{file_path}:{line_no}: expected 'sentence;label', got {raw_line!r}"
                )
            sentences.append(sentence)
            labels.append(label.rstrip())  # drop the trailing newline
    return sentences, labels


train_sentences, train_labels = _read_split(f"{path}/dataset/train.txt")
test_sentences, test_labels = _read_split(f"{path}/dataset/test.txt")
val_sentences, val_labels = _read_split(f"{path}/dataset/val.txt")

In [30]:
# One-hot encode the string labels with a single shared encoder.
encoder = LabelBinarizer()
# Learn the class -> column mapping from the training labels only ...
train_labels_encode = encoder.fit_transform(train_labels)
# ... and reuse it unchanged for the other splits. Re-fitting on test/val would
# rebuild the mapping from each split's own label set, so columns could mean
# different classes across splits whenever a class is missing from one of them.
test_labels_encode = encoder.transform(test_labels)
val_labels_encode = encoder.transform(val_labels)

In [33]:
# Spot-check: binarized label row of training example 33.
train_labels_encode[33]

array([1, 0, 0, 0, 0, 0])

In [5]:
# Show the distinct label strings present in the training split.
print(set(train_labels))

{'fear', 'love', 'sadness', 'anger', 'joy', 'surprise'}


In [6]:
# --- Text preprocessing / model hyperparameters ---
vocab_size = 5000        # passed to Tokenizer(num_words=...) and the Embedding input_dim
max_length = 32          # every sequence is padded / truncated to this length
embedding_dim = 64       # embedding width (also reused as the LSTM unit count)
training_size = 20000
oov_tok = "<OOV>"        # token used for out-of-vocabulary words
padding_type = 'post'    # pad at the end of the sequence
trunc_type = 'post'      # truncate at the end of the sequence

# Build the vocabulary from the training sentences only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Convert the training sentences to fixed-length integer sequences.
train_seq = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(
    train_seq,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [8]:
# Encode the validation sentences with the tokenizer fitted on the training set.
val_seq = tokenizer.texts_to_sequences(val_sentences)
val_padded = np.array(
    pad_sequences(
        val_seq,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
)

In [9]:
# Encode the test sentences with the tokenizer fitted on the training set.
test_seq = tokenizer.texts_to_sequences(test_sentences)
test_padded = np.array(
    pad_sequences(
        test_seq,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
)

In [10]:
# Map each label string to an integer id with a second Tokenizer that is
# fitted on the training labels only.
label_tokenizer = Tokenizer()
labels = list(train_labels)
label_tokenizer.fit_on_texts(labels)

# Each label becomes a one-element id list, wrapped into an array per split.
train_label_seq, val_label_seq, test_label_seq = (
    np.array(label_tokenizer.texts_to_sequences(split_labels))
    for split_labels in (train_labels, val_labels, test_labels)
)

In [11]:
# Bidirectional-LSTM text classifier built from the layer classes imported at
# the top of the notebook.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Bidirectional(LSTM(embedding_dim)),
    # A Dropout(.5) layer was tried at this position and is currently disabled.
    Dense(64, activation='relu'),
    # 7 outputs, matching the depth-7 one-hot labels used for training.
    Dense(7, activation='softmax'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          320000    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              66048     
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 7)                 455       
                                                                 
Total params: 394,759
Trainable params: 394,759
Non-trainable params: 0
_________________________________________________________________


In [12]:
# One-hot encode the integer label ids with depth 7 (ids 1..6 plus the unused
# slot 0 that the label tokenizer never assigns).
# tf.one_hot on the (N, 1) id arrays yields (N, 1, 7); squeeze ONLY that middle
# axis. A bare np.squeeze would also drop the batch axis when a split contains
# exactly one example, turning (1, 1, 7) into (7,).
onehot_train_label = np.squeeze(tf.one_hot(train_label_seq, 7).numpy(), axis=1)
onehot_val_label = np.squeeze(tf.one_hot(val_label_seq, 7).numpy(), axis=1)
onehot_test_label = np.squeeze(tf.one_hot(test_label_seq, 7).numpy(), axis=1)

In [13]:
# Spot-check: one-hot label row of test example 14.
onehot_test_label[14]

array([0., 0., 0., 0., 0., 1., 0.], dtype=float32)

In [14]:
# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot labels, accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

In [15]:
num_epochs = 10

# Train on the padded training sequences; the validation split is evaluated
# after every epoch.
history = model.fit(
    train_padded,
    onehot_train_label,
    epochs=num_epochs,
    validation_data=(val_padded, onehot_val_label),
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
def evaluate_preds(y_true, y_preds, average='micro'):
    """
    Compare true and predicted class labels and report classification metrics.

    Prints accuracy, precision, recall and F1, then returns the same values
    rounded to two decimals.

    Args:
        y_true: True class labels.
        y_preds: Predicted class labels, aligned with `y_true`.
        average: Averaging mode forwarded to precision/recall/F1. Defaults to
            'micro' (the previous hard-coded behavior). Note that for
            single-label multiclass data micro-averaged precision, recall and
            F1 all equal accuracy; pass 'macro' or 'weighted' to get metrics
            that reflect per-class performance.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1".
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having already imported these names.
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds, average=average)
    recall = recall_score(y_true, y_preds, average=average)
    f1 = f1_score(y_true, y_preds, average=average)
    metric_dict = {"accuracy": round(accuracy, 2),
                   "precision": round(precision, 2),
                   "recall": round(recall, 2),
                   "f1": round(f1, 2)}
    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 score: {f1:.2f}")

    return metric_dict

In [17]:
# Score the trained model on the held-out test split.
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

predicted = model.predict(test_padded)
# Collapse one-hot targets and predicted probabilities to class indices.
true_class_ids = np.argmax(onehot_test_label, axis=1)
predicted_class_ids = np.argmax(predicted, axis=1)
evaluate_preds(true_class_ids, predicted_class_ids)

Acc: 88.80%
Precision: 0.89
Recall: 0.89
F1 score: 0.89


{'accuracy': 0.89, 'precision': 0.89, 'recall': 0.89, 'f1': 0.89}

In [35]:
# Persist the trained model under 'model_final.model' (the log output below
# shows an assets directory being written there).
# NOTE(review): newer Keras releases may require a `.keras` or `.h5` suffix for
# model.save — confirm against the installed TensorFlow version.
model.save('model_final.model')



INFO:tensorflow:Assets written to: model_final.model\assets


INFO:tensorflow:Assets written to: model_final.model\assets


In [36]:
# Serialize the fitted text tokenizer so the same word index can be reloaded
# alongside the saved model.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(
        tokenizer,
        tokenizer_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )
print("INFO: Saving models done")

INFO: Saving models done


In [18]:
# Spot-check: the model's softmax output vector for test example 6.
predicted[6]

array([4.4330285e-08, 5.5475417e-04, 3.3713162e-03, 8.1147671e-01,
       1.8421654e-01, 1.3171051e-04, 2.4896866e-04], dtype=float32)

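Class index → emotion mapping for the model's output vector (index 0 is never assigned by the label tokenizer):

- 1 = joy
- 2 = sadness
- 3 = anger
- 4 = fear
- 5 = love
- 6 = surprise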

In [19]:
# Encode a single hand-written sentence exactly like the training data so it
# can be passed to the model.
ngetes_seq = tokenizer.texts_to_sequences(
    ['i then feel your tender touch as you enfold me with his love']
)
ngetes_padded = np.array(
    pad_sequences(
        ngetes_seq,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
)

In [20]:
# Run the model on the single padded sample sentence prepared above.
p = model.predict(ngetes_padded)

In [21]:
# Display the model output for the sample sentence.
p

array([[7.5453912e-14, 2.3606457e-05, 1.7429383e-07, 3.4346454e-06,
        1.5609038e-10, 9.9997282e-01, 2.8482113e-08]], dtype=float32)

In [22]:
# Scratch experiment: fit a second tokenizer on one sentence only, then encode
# the training sentences with it.
# These settings repeat the values from the earlier configuration cell.
trunc_type, padding_type = 'post', 'post'
oov_tok = "<OOV>"
training_size = 20000
vocab_size = 5000
max_length = 32
embedding_dim = 64

tokenizer_test = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer_test.fit_on_texts(
    ['i then feel your tender touch as you enfold me with his love i']
)
word_index_test = tokenizer_test.word_index

train_seq_test = tokenizer_test.texts_to_sequences(train_sentences)
train_padded_test = pad_sequences(
    train_seq_test,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)


In [23]:
# Show the word -> index mapping learned by the single-sentence tokenizer.
print(word_index_test)

{'<OOV>': 1, 'i': 2, 'then': 3, 'feel': 4, 'your': 5, 'tender': 6, 'touch': 7, 'as': 8, 'you': 9, 'enfold': 10, 'me': 11, 'with': 12, 'his': 13, 'love': 14}
