# LSTM_model

This notebook contains the functions that build the word embeddings (using [GloVe](https://nlp.stanford.edu/projects/glove/)) and the training of the model that classifies the genre of a movie based on its overview.

In [1]:
import time

import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [81]:
import matplotlib.pyplot as plt

In [2]:
# load the preprocessed movie data and discard every row that has a missing value
df = pd.read_csv('datasets/preprocessed.csv').dropna()

In [3]:
df.head()

Unnamed: 0,original_title,overview,genres,genre label
0,Toy Story,led woody andy toys live happily room andy bir...,Animation,2
1,Jumanji,siblings judy peter discover enchanted board g...,Adventure,1
2,Grumpier Old Men,family wedding reignites ancient feud neighbor...,Romance,17
3,Waiting to Exhale,cheated mistreated stepped women holding breat...,Comedy,5
4,Father of the Bride Part II,george banks recovered daughter wedding receiv...,Comedy,5


## Obtain the 100-dimensional GloVe embeddings 

For this project, we keep only the GloVe words that consist entirely of alphabetic characters.

In [4]:
def read_glove_vecs_only_alpha(glove_file):
    """
    Load GloVe vectors from a text file, keeping only purely alphabetic words.

    Every non-empty line is split on whitespace: the first field is the word and
    the remaining fields are the components of its vector. Blank lines are skipped.

    Arguments:
    glove_file -- path to the GloVe text file (opened as UTF-8)

    Returns:
    words_to_index -- dictionary mapping each kept word to its index (words sorted, indices start at 1)
    index_to_words -- dictionary mapping each index back to its word
    word_to_vec_map -- dictionary mapping each kept word to its vector (numpy array of np.float64)
    """
    word_to_vec_map = {}

    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            fields = line.strip().split()

            # a blank line (e.g. a trailing newline) has no word to read
            if not fields:
                continue

            curr_word = fields[0]

            # only consider words containing alphabetical letters
            if curr_word.isalpha():
                word_to_vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}

    # numbering starts at 1, so index 0 is never assigned to a word
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w

    return words_to_index, index_to_words, word_to_vec_map

In [5]:
# obtain the GloVe dataset of dimensionality 100
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs_only_alpha('datasets/glove.6B/glove.6B.100d.txt')

In [6]:
# sanity check to ensure that they are all of same length
print(len(word_to_index), len(index_to_word), len(word_to_vec_map))

327091 327091 327091


## Determine the maximum sequence length

In [7]:
# number of space-separated tokens in every overview, followed by summary statistics
df['overview length'] = [len(str(text).split(' ')) for text in df['overview']]
print(f"mean length of sentence: {df['overview length'].mean()}")
print(f"max length of sentence: {df['overview length'].max()}")
print(f"std dev length of sentence: {df['overview length'].std()}")

mean length of sentence: 30.37972260945585
max length of sentence: 141
std dev length of sentence: 19.037445459264383


In [15]:
# determine the maximum length of a movie overview
max_sequence_length = df["overview length"].max()

## Convert the sentences to their respective indices and define the embedding layer

In [9]:
df.head(5)

Unnamed: 0,original_title,overview,genres,genre label,overview length
0,Toy Story,led woody andy toys live happily room andy bir...,Animation,2,33
1,Jumanji,siblings judy peter discover enchanted board g...,Adventure,1,37
2,Grumpier Old Men,family wedding reignites ancient feud neighbor...,Romance,17,32
3,Waiting to Exhale,cheated mistreated stepped women holding breat...,Comedy,5,25
4,Father of the Bride Part II,george banks recovered daughter wedding receiv...,Comedy,5,22


In [40]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape is such that it can be given to `Embedding()`.

    Words are lower-cased before the lookup. Words that are missing from
    `word_to_index` are dropped, so the remaining indices are packed to the left
    and the rest of the row stays 0. Sentences with more than `max_len` known
    words are truncated to their first `max_len` known words.

    Arguments:
    X -- sequence of sentences (strings), e.g. a numpy array of shape (m,)
    word_to_index -- a dictionary containing each word mapped to its index
    max_len -- number of columns of the output, i.e. maximum number of words kept per sentence

    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """

    # number of sentences
    m = len(X)

    # rows are zero-initialised, so every unused position keeps the value 0
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        # indices of the words present in the vocabulary, in sentence order
        known_indices = [
            word_to_index[w]
            for w in (token.lower() for token in X[i].split())
            if w in word_to_index
        ]

        # never write past the last column
        known_indices = known_indices[:max_len]
        X_indices[i, :len(known_indices)] = known_indices

    return X_indices

In [41]:
# test the function
X1 = np.array(["funny lol", "lets play baseball", "food is ready for you"])
X1_indices = sentences_to_indices(X1,word_to_index, max_len = max_sequence_length)
print("X1 =", X1)
print("X1_indices =", X1_indices)

X1 = ['funny lol' 'lets play baseball' 'food is ready for you']
X1_indices = [[102421. 167414.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.  

In [38]:
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a non-trainable Keras Embedding() layer initialised with the given word vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation (must not be empty).
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    embedding_layer -- Keras Embedding instance whose weights are the embedding matrix
    """

    # one extra row: the matrix has len(word_to_index) + 1 rows, row 0 is left as zeros
    vocab_len = len(word_to_index) + 1

    # dimensionality of the word vectors, read from an arbitrary entry rather than
    # from a specific word that may be absent from the vocabulary
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, emb_dim)
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes; weights are frozen
    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)

    # Build the embedding layer, required before setting the weights of the embedding layer
    embedding_layer.build((None,))

    # Set the weights of the embedding layer to the embedding matrix
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [13]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

W0705 15:37:28.153141 140557313316608 deprecation_wrapper.py:119] From /home/thoo2/anaconda3/envs/tensorflow_gpu/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0705 15:37:28.168717 140557313316608 deprecation_wrapper.py:119] From /home/thoo2/anaconda3/envs/tensorflow_gpu/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0705 15:37:28.176140 140557313316608 deprecation_wrapper.py:119] From /home/thoo2/anaconda3/envs/tensorflow_gpu/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0705 15:37:28.176744 140557313316608 deprecation_wrapper.py:119] From /home/thoo2/anaconda3/envs/tensorflow_gpu/lib/python3.6/site-packages/keras/backend/tensorflow_backend.p

weights[0][1][3] = -0.17395


In [71]:
def GenreClassifier(input_shape, word_to_vec_map, word_to_index, nbClasses):
    """
    Builds the Keras graph of the genre classifier: frozen pretrained embeddings,
    two stacked LSTM layers (each followed by dropout) and a softmax output.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word of the vocabulary to its vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary
    nbClasses -- number of units of the output layer (one per class)

    Returns:
    model -- a model instance in Keras
    """

    # the input holds word indices, hence the 'int32' dtype
    token_ids = Input(shape=input_shape, dtype='int32')

    # embedding layer built by pretrained_embedding_layer, applied to the indices
    glove_embedding = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    hidden = glove_embedding(token_ids)

    # first LSTM (128 units) returns the whole sequence, then dropout with rate 0.5
    hidden = LSTM(128, return_sequences=True)(hidden)
    hidden = Dropout(0.5)(hidden)

    # second LSTM (128 units) returns only its last output, then dropout with rate 0.5
    hidden = LSTM(128, return_sequences=False, return_state=False)(hidden)
    hidden = Dropout(0.5)(hidden)

    # dense projection to nbClasses values followed by a separate softmax activation
    class_scores = Dense(nbClasses)(hidden)
    class_probabilities = Activation('softmax')(class_scores)

    # model mapping the index input to the class probabilities
    return Model(inputs=token_ids, outputs=class_probabilities)

In [74]:
X = df['overview'].values
y = df['genre label'].values

In [63]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [64]:
# report how many samples ended up in each split
print(f"There are {len(X_train)} samples in the training set")
print(f"There are {len(X_test)} samples in the test set")

There are 28356 samples in the training set
There are 13967 samples in the test set


In [65]:
def convert_to_one_hot(Y, C):
    """
    One-hot encodes an array of integer labels.

    Arguments:
    Y -- numpy array of integer labels (flattened before encoding)
    C -- number of columns of the encoding

    Returns:
    array with one row per label, taken from the C x C identity matrix
    """
    identity = np.eye(C)
    flat_labels = Y.reshape(-1)
    return identity[flat_labels]

In [75]:
X_train_indices = sentences_to_indices(X_train, word_to_index, max_sequence_length)
y_train_oh = convert_to_one_hot(y_train, C = len(df["genres"].unique()))

In [76]:
#credits of function to http://parneetk.github.io/blog/neural-networks-in-keras/
def plot_model_history(model_history, save_path='graphs/history_early_stopping_21_11_not_preprocessed.png'):
    """
    Plots training and validation accuracy (left) and loss (right) per epoch.

    Arguments:
    model_history -- object whose `.history` attribute is a dictionary holding the
                     per-epoch lists 'acc' (or 'accuracy'), 'loss' and their 'val_' counterparts
    save_path -- file the figure is written to before it is shown; pass None to skip saving

    Returns:
    None -- the figure is saved (optionally) and displayed
    """
    history = model_history.history

    # the accuracy entry is named 'acc' in this notebook's runs; fall back to
    # 'accuracy' for histories that use the longer name
    acc_key = 'acc' if 'acc' in history else 'accuracy'

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    panels = [
        (axs[0], acc_key, 'val_' + acc_key, 'Model Accuracy', 'Accuracy'),
        (axs[1], 'loss', 'val_loss', 'Model Loss', 'Loss'),
    ]

    for ax, train_key, val_key, title, ylabel in panels:
        n_epochs = len(history[train_key])

        ax.plot(range(1, n_epochs + 1), history[train_key])
        ax.plot(range(1, len(history[val_key]) + 1), history[val_key])
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Epoch')

        # about ten evenly spaced ticks; the spacing is the step of np.arange,
        # not a second argument of set_xticks
        tick_step = max(1, n_epochs // 10)
        ax.set_xticks(np.arange(1, n_epochs + 1, tick_step))
        ax.legend(['train', 'val'], loc='best')

    if save_path:
        plt.savefig(save_path)
    plt.show()

In [87]:
from keras.callbacks import EarlyStopping
import time

def trainModel(X_train_indices, Y_train_oh, word_to_vec_map, word_to_index, max_length, summary = False,
               dropout_rate = 0.5, batch_size = 32, epochs = 50, loss ='categorical_crossentropy',
               optimizer ='adam', use_early_stopping = False):
    """
    Builds, compiles and fits the genre classifier, and reports the training time.

    Arguments:
    X_train_indices -- training inputs as word indices, of shape (m, max_length)
    Y_train_oh -- one-hot encoded training labels, of shape (m, number of classes)
    word_to_vec_map -- dictionary mapping words to their vector representation
    word_to_index -- dictionary mapping words to their indices in the vocabulary
    max_length -- length of every input sequence, used as the model input shape
    summary -- if True, print the model summary before training
    dropout_rate -- accepted for interface compatibility; not used by this function
    batch_size -- batch size passed to `fit`
    epochs -- number of epochs passed to `fit`
    loss -- loss passed to `compile`
    optimizer -- optimizer passed to `compile`
    use_early_stopping -- if True, stop when 'val_acc' has not improved by at least
                          0.0001 for 3 epochs; the default (False) trains without callbacks

    Returns:
    history -- the object returned by `model.fit`
    model -- the trained Keras model
    """

    # the width of the one-hot labels is the number of classes the model must output
    nb_classes = Y_train_oh.shape[1]

    model = GenreClassifier((max_length,), word_to_vec_map, word_to_index, nb_classes)

    if summary:
        model.summary()

    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

    callbacks_list = None
    if use_early_stopping:
        earlystop = EarlyStopping(monitor='val_acc', min_delta=0.0001, patience=3, verbose=1, mode='auto')
        callbacks_list = [earlystop]

    # the last 10% of the training data is held out for validation by `validation_split`
    start = time.time()
    history = model.fit(X_train_indices, Y_train_oh, epochs = epochs,
                        callbacks=callbacks_list, batch_size = batch_size, validation_split = 0.1, shuffle=True)
    end = time.time()
    print("Model took {} seconds (which is {} minutes or {} hours) to train".format((end - start), (end - start)/60, (end - start)/3600))

    return history, model

In [88]:
history, model = trainModel(X_train_indices, y_train_oh, word_to_vec_map, word_to_index, max_length = max_sequence_length)

Train on 25520 samples, validate on 2836 samples
Epoch 1/50
 2144/25520 [=>............................] - ETA: 8:50 - loss: 2.5640 - acc: 0.2351

KeyboardInterrupt: 

In [None]:
model.save_weights("models/Epochs50_Adam_CCloss.h5") 

In [None]:
plot_model_history(history)

In [None]:
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = max_sequence_length)
y_test_oh = convert_to_one_hot(y_test, C = len(df["genres"].unique()))
loss, acc = model.evaluate(X_test_indices, y_test_oh)

print("Test accuracy = ", acc)