# Generative Sentiments NLP Model

This notebook implements a bidirectional LSTM model used to classify an emotional sentence into one of 6 emotions:
- Anger
- Sadness
- Joy
- Fear
- Surprise
- Love

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Each line of the training file is "<sentence>;<label>" with no header row.
train_df = pd.read_csv(
    "data/train.txt",
    sep=';',
    header=None,
    names=['sentence', 'label'],
)

In [None]:
# Preview the first rows to sanity-check the parsed 'sentence' and 'label' columns.
train_df.head()

In [None]:
# Plot how many training sentences fall under each emotion label.
# The column is passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a bare Series is no longer interpreted as `x`.
sns.countplot(x=train_df['label'])

In [None]:
# Maximum length of any input sentence in the data; passed as `maxlen` when
# padding sequences and as `input_length` of the Embedding layer.
MAX_LENGTH = 100
# Vocabulary size handed to the Tokenizer (num_words) and the Embedding layer.
MAX_WORDS = 20_000

## Preprocessing

In [None]:
import string
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus that clean_stopwords reads from.
nltk.download('stopwords')

In [None]:
# Lazily populated cache of NLTK's English stopwords; see _english_stopwords().
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_stopwords(text, stop_words=None):
    """
    Split a sentence on whitespace and remove simple words that don't help
    understand the sentiment all that well.

    Each word is lower-cased before being looked up in the stopword
    collection; the words that survive keep their original casing and order.

    Args:
        text: the sentence to clean.
        stop_words: optional container of lower-case words to drop. When
            omitted, NLTK's English stopword list is used.

    Returns:
        A list of the words in `text` that are not stopwords.
    """
    if stop_words is None:
        # Previously the NLTK list was re-read and linearly scanned for every
        # single word; a cached frozenset makes each lookup O(1).
        stop_words = _english_stopwords()
    return [word for word in text.split() if word.lower() not in stop_words]

In [None]:
# Turn every training sentence into a list of its non-stopword words.
text = train_df['sentence'].apply(clean_stopwords)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Fit the tokenizer's vocabulary on the cleaned training sentences
# (vocabulary size limited via num_words=MAX_WORDS).
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(text) 

In [None]:
# Save the fitted tokenizer to 'tokenizer.pickle' in the working directory.
import pickle

with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def pad_and_tokenize(text):
    """
    Turn texts into fixed-length integer sequences.

    The notebook-level `tokenizer` maps each text to a sequence of word
    indices, and pad_sequences then brings every sequence to MAX_LENGTH.
    """
    return pad_sequences(tokenizer.texts_to_sequences(text), maxlen=MAX_LENGTH)

In [None]:
# Vectorise the cleaned training sentences into the model's input matrix.
X = pad_and_tokenize(text)
X.shape # expecting (16000, 100)

## Generate Y Data

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

In [None]:
def encode_y_data(train_df):
    """
    One-hot encode the 'label' column of train_df.

    Labels are mapped to integer ids with a LabelEncoder and then expanded
    with to_categorical. As a side effect, the encoder's classes are written
    to 'classes.npy' so predictions can be decoded later on.
    """
    label_encoder = LabelEncoder()
    label_ids = label_encoder.fit_transform(train_df['label'])
    one_hot = np_utils.to_categorical(label_ids)

    # save the encoded classes so we can decode later on
    np.save('classes.npy', label_encoder.classes_)

    return one_hot

In [None]:
# Build the one-hot label matrix for the training data (also writes classes.npy).
Y = encode_y_data(train_df)
Y.shape # expecting (16000, 6) because we have 6 labels
Y

## Split train and test sets

In [None]:
# The first 13000 rows are used for training, the remainder for evaluation.
# TODO change ratios
X_train, X_test = X[:13000], X[13000:]
Y_train, Y_test = Y[:13000], Y[13000:]

## Build Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Dense, Bidirectional, LSTM

In [None]:
def model(Y):
    """
    Build the (uncompiled) bidirectional LSTM classifier.

    Architecture: Embedding -> Bidirectional(LSTM) -> Dropout -> Dense(relu)
    -> Dense(softmax), with one softmax unit per column of Y.
    """
    layers = [
        Embedding(MAX_WORDS, 64, input_length=MAX_LENGTH),
        Bidirectional(LSTM(128)),
        Dropout(0.7),
        Dense(32, activation='relu'),
        # output width follows the number of label columns
        Dense(Y.shape[1], activation='softmax'),
    ]
    return Sequential(layers)

In [None]:
# NOTE: this rebinds the name `model` from the builder function to the built
# network, so re-running this cell requires first re-running the cell that
# defines model().
model = model(Y)
model.summary()

In [None]:
# categorical_crossentropy pairs with the one-hot labels produced by encode_y_data.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

## Train the model

In [None]:
# use an early stopper for performance
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Early-stopping callback that watches the validation loss.
early_stopper = EarlyStopping(monitor='val_loss')

# NOTE(review): the held-out test split doubles as the validation set here.
model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=5,
    callbacks=[early_stopper],
)

## Prediction Utils

In [None]:
def pad_and_tokenize(text):
    """
    Convert texts to padded index sequences for the model.

    Same definition as the preprocessing helper earlier in the notebook;
    presumably repeated so the prediction utilities can be run on their own.
    """
    token_ids = tokenizer.texts_to_sequences(text)
    return pad_sequences(token_ids, maxlen=MAX_LENGTH)

In [None]:
def predict(sentence):
    """
    takes an english sentence and uses the trained model to predict the emotion

    Returns a (label, confidence) pair: the decoded class label with the
    highest score and that highest score itself.
    """
    model_input = pad_and_tokenize(np.array([sentence]))

    # pass tokenized sentence to trained model
    scores = model.predict(model_input)

    top_score = np.max(scores)
    top_index = np.argmax(scores)

    # load the saved encoder classes so we can decode the label properly
    decoder = LabelEncoder()
    decoder.classes_ = np.load('classes.npy', allow_pickle=True)

    decoded = decoder.inverse_transform(np.array([top_index]))

    return decoded[0], top_score

## Manual Testing

In [None]:
# Quick smoke test of predict() on a hand-written sentence.
sentence = 'I feel inspired'
label, confidence = predict(sentence)
print(f'Label: {label}')
print(f'Confidence: {confidence}')

## Save the model

In [None]:
# model.save('data/sentiment-model')
# from tensorflow.keras.models import save_model
# VERSION = 1
# save_model(
#     model,
#     f'generative-sentiments/{VERSION}',
#     overwrite=True,
#     include_optimizer=True,
#     save_format=None,
#     signatures=None,
#     options=None
# )
