# Pre-trained Embeddings notebook


In the previous notebook [2019-07-30_deep_learning_summary](https://github.com/sv650s/sb-capstone/blob/master/2019_07_30_deep_learning_summary.ipynb) we looked at the results for various deep learning models. There were 2 models that came back with the best results:

* 1 layer bi direcitonal GRU with attention
* 3 layer CNN with maxpooling - we train this with 15 epoch without early stopping

Overall, GRU model offered better precision however poor recall

CNN had a good balance for F1 (precision and recall) scores for class 2, 3, 4 - our problem classes, but did poorly for our class 1 and 5

In this notebook, we will use pre-trained word embeddings for both models. Idea is that in our previous notebooks, embeddings were random to start with and as we train the model as well as embeddings.

With pre-trained embeddings, we should see improvements in our model in term of training time since the embedding vectors are pre-trained


Pre-trained word vectors will come from Google's Word2Vec model pre-trained on Google News: https://github.com/mmihaltz/word2vec-GoogleNews-vectors

In [0]:
from google.colab import drive
import sys
drive.mount('/content/drive')
DRIVE_DIR = "drive/My Drive/Springboard/capstone"

# add this to sys patch so we can import utility functions
sys.path.append(DRIVE_DIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, GRU, SpatialDropout1D, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping
from keras.models import load_model
from sklearn.model_selection import train_test_split
from keras.optimizers import SGD
from sklearn.preprocessing import OneHotEncoder
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
import pandas as pd
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import pickle
from datetime import datetime
from sklearn.metrics import confusion_matrix, classification_report
import os
import seaborn as sns
import matplotlib.pyplot as plt


# custom utility functions
import util.dict_util as du
import util.plot_util as pu
import util.file_util as fu
import util.keras_util as ku


sns.set()

import logging
logging.basicConfig(level=logging.INFO)

DATE_FORMAT = '%Y-%m-%d'
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
DATA_FILE = f"{DRIVE_DIR}/data/amazon_reviews_us_Wireless_v1_00-preprocessed-110k.csv"
LABEL_COLUMN = "star_rating"
REVIEW_COLUMN = "review_body"


In [0]:
import tensorflow as tf
# checl to make sure we are using GPU here
tf.test.gpu_device_name()

''

In [0]:
df = pd.read_csv(f"{DATA_FILE}")
ratings = df[LABEL_COLUMN]
reviews = df[REVIEW_COLUMN]

## Preprocessing our Data

In [0]:
from  keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence


# one hot encode our lables so we can pass it to our model later
print("One hot enocde label data...")
y = OneHotEncoder().fit_transform(rating.values.reshape(len(ratings), 1)).toarray()

# split our data into train and test sets
print("Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(reviews, y, random_state=1)

# Pre-process our features (review body)
t = Tokenizer()
# fit the tokenizer on the documents so that we can get word index dict - key = word, value = index of tokenizer
t.fit_on_texts(features_train)

# tokenize both our training and test data
train_sequences = t.texts_to_sequences(features_train)
test_sequences = t.texts_to_sequences(features_test)

print("Vocabulary size={}".format(len(t.word_counts)))
print("Number of Documents={}".format(t.document_count))

# figure out 99% percentile for our max sequence length
df["feature_length"] = df.review_body.apply(lambda x: len(x.split()))
max_sequence_length = int(df.feature_length.quantile([0.99]).values[0])
print(f'Max Sequence Length: {max_sequence_length}')

# pad our reviews to the max sequence length
X_train = sequence.pad_sequences(train_sequences, maxlen=max_sequence_length)
X_test = sequence.pad_sequences(test_sequences, maxlen=max_sequence_length)

One hot enocde label data...
Splitting data into training and test sets...


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Vocabulary size=40788
Number of Documents=84032
Max Sequence Length: 186


In [0]:
import numpy as np

def load_pretrained_embeddings(word_to_index, max_features, embedding_size, embedding_file_path):    
    
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')

    # turns the embedding file into a dictionary key = word, value = word vector
    embeddings_index = dict(get_coefs(*row.split(" ")) 
                                for row in open(embedding_file_path, encoding="utf8", errors='ignore') 
                                    if len(row)>100)

    # convert the values into a array of word arrays
    all_embs = np.stack(embeddings_index.values())
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    nb_words = min(max_features, len(word_to_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    
    for word, idx in word_to_index.items():
        if idx >= max_features: 
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: 
            embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [0]:
# keep number of 
MAX_FEATURES = len(t.word_index)
EMBED_SIZE = 300
pg_embeddings = load_pretrained_embeddings(word_to_index=t.word_index, max_features=MAX_FEATURES, 
                                            embedding_size=EMBED_SIZE, 
                                            embedding_file_path=f'{DRIVE_DIR}/data/paragram_300_sl999.txt')

  


In [0]:
MAX_FEATURES

40788

In [0]:
max_sequence_length

186

In [0]:
from keras.engine.topology import Layer
from keras import backend as K
import keras


class AttentionLayer(Layer):
    
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        """
        
        self.supports_masking = True
        self.init = keras.initializers.get('glorot_uniform')

        self.W_regularizer = keras.regularizers.get(W_regularizer)
        self.b_regularizer = keras.regularizers.get(b_regularizer)

        self.W_constraint = keras.constraints.get(W_constraint)
        self.b_constraint = keras.constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(AttentionLayer, self).__init__(**kwargs)
        

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True
        

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    
    def call(self, x, mask=None):
        # TF backend doesn't support it
        # eij = K.dot(x, self.W) 
        # features_dim = self.W.shape[0]
        # step_dim = x._keras_shape[1]

        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), 
                              K.reshape(self.W, (features_dim, 1))),
                        (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        
        return K.sum(weighted_input, axis=1)

    
    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim
    
    
    def get_config(self):
        config = {'step_dim': self.step_dim}
        base_config = super(AttentionLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [0]:
MODEL_NAME = "biGRU_1layer_attention-paragram"
EPOCHS  = 50
BATCH_SIZE = 128
VOCAB_SIZE = len(t.word_counts)
GRU_DIM = 250 # total GRU units



In [0]:
max_sequence_length

186

In [0]:
pg_embeddings.shape

(40788, 300)

In [0]:
VOCAB_SIZE

40789

In [0]:
EMBED_SIZE

300

In [0]:
from keras.layers import CuDNNGRU
from keras.callbacks import EarlyStopping, ReduceLROnPlateau


reduce_lr = ReduceLROnPlateau(monitor='val_loss', 
                              factor=0.4,
                              patience=2, 
                              min_lr=0.00001,
                             mode='auto')

early_stop = EarlyStopping(monitor='val_loss', 
                           patience=2, 
                           mode='auto', 
                           verbose=1,
                          restore_best_weights=True)


# model = Sequential()
# model.add(Embedding(VOCAB_SIZE, 
#                     EMBED_SIZE, 
#                     input_length=max_sequence_length, 
#                     weights=[pg_embeddings], Trainable=True))
# model.add(Bidirectional(CuDNNGRU(GRU_DIM*2, return_sequences=True)))
# model.add(AttentionLayer(max_sequence_length))
# model.add(Dense(GRU_DIM*2, activation='relu'))
# model.add(Dropout(0.2))
# model.add(Dense(GRU_DIM, activation='relu'))
# model.add(Dense(5, activation='softmax'))
# model.compile(loss='categorical_crossentropy', 
#               optimizer='adam', 
#               metrics=['accuracy'])

inp = keras.layers.Input(shape=(max_sequence_length,))
x = keras.layers.Embedding(VOCAB_SIZE, 
                    EMBED_SIZE, 
                    input_length=max_sequence_length, 
                    weights=[pg_embeddings], trainable=True)(inp)
x = AttentionLayer(max_len)(x)
x = keras.layers.Dense(gru_units*2, activation='relu')(x)
x = keras.layers.Dropout(rate=0.2)(x)
x = keras.layers.Dense(gru_units, activation='relu')(x)
x = keras.layers.Dropout(rate=0.2)(x)

outp = keras.layers.Dense(1, activation='sigmoid')(x)
# initialize the model
model = keras.models.Model(inputs=inp, outputs=outp)
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])


mw = ku.ModelWrapper(model, MODEL_NAME, LABEL_COLUMN, DATA_FILE,
                     embedding=EMBED_SIZE,
                     tokenizer=t, description="Pre-trained Embedding - paragram")


network_history = mw.fit(X_train, y_train,
                      batch_size=BATCH_SIZE,
                      epochs=EPOCHS,
                      verbose=1,
                      validation_split=0.2,
                      callbacks=[reduce_lr, early_stop])


InvalidArgumentError: ignored

array([[-2.37248142e-01, -1.22240219e-01, -2.24036211e-01,
        -1.18400449e-01, -8.95509420e-02,  4.07487098e-01,
        -5.83458725e-01, -5.35285833e-01,  4.05001557e-01,
        -3.92891890e-01, -9.62554805e-01,  1.14978469e-01,
        -2.67563920e-01,  7.76065163e-02, -2.71578070e-01,
         6.45317581e-01, -1.67960207e-01, -2.55414393e-02,
        -3.92467597e-02,  8.91531022e-01, -1.18249568e+00,
        -5.25316941e-01, -5.67054779e-01,  1.28508804e-01,
         6.21470361e-01, -3.26528030e-01, -1.00567880e-01,
         2.32459440e-01,  6.88036270e-01, -6.28223759e-01,
        -2.47412930e-01,  2.31602231e-01,  6.48537292e-02,
         1.95445532e-01, -2.57372761e-01, -1.76315115e-01,
        -6.01665050e-03, -6.25397357e-01,  6.67910767e-01,
        -1.50553274e-01,  5.52504054e-01, -1.02782965e-01,
         2.15866674e-01, -9.64930139e-01,  2.38529526e-01,
        -5.82853130e-01,  1.08787729e+00, -1.50640432e-01,
         7.29183877e-02,  1.18933724e-03, -1.98510079e-0