In [1]:
import os, sys
import glob
import shutil
import zipfile
import numpy as np
import tarfile
import hashlib
from typing import Sequence, Tuple, TypeVar, Union
from pathlib import Path
import yaml
import re
import pickle
import requests
from tqdm import tqdm_notebook as tqdm
from matplotlib import pyplot as plt
import time
from numpy import array
from numpy.random import random, permutation, randn, normal, uniform, choice
from keras import applications
from keras.models import Sequential
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.optimizers import SGD, RMSprop, Adam
from keras.utils.data_utils import get_file
from keras.utils import to_categorical
from keras.preprocessing import image, sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint, EarlyStopping
from matplotlib import pyplot as plt
import sklearn.manifold
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
import tensorflow as tf

In [3]:
print(tf.__version__)

1.15.0


In [4]:
!pip install myutils
from myutils import *

Collecting myutils
  Downloading https://files.pythonhosted.org/packages/0a/1a/8a4890f87e4a3866b9467bf09e4b4aedef15a3c1657436ee44a087ed9c8d/myutils-0.0.21.zip
Building wheels for collected packages: myutils
  Building wheel for myutils (setup.py) ... [?25l[?25hdone
  Created wheel for myutils: filename=myutils-0.0.21-cp36-none-any.whl size=1361 sha256=45fec45ac2c5bb3834ae5d06bfc3554446aafaaae811f214d722cb74bbe0084a
  Stored in directory: /root/.cache/pip/wheels/84/15/15/1b5ef18349eaee0cf7a3bf369ab61667a871e534e9928aa9e0
Successfully built myutils
Installing collected packages: myutils
Successfully installed myutils-0.0.21


### **Upload Data Sets to Google Colab**

In [5]:
#Read from local drive (choose file when running)
from google.colab import files
uploaded = files.upload()


Saving Amazon_Product_Reviews.csv to Amazon_Product_Reviews.csv


In [0]:
# Convert to dataframe
import io
review_df = pd.read_csv(io.BytesIO(uploaded['Amazon_Product_Reviews.csv']))

# **Pre-process Data**

In [0]:
REPLACE_BY_SPACE_RE = re.compile('[!"#$%&()*+,-./:;<=>?@[\]^_`{|}~]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    """
        text: a string
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwords from text
    return text
review_df.Text = review_df.Text.apply(clean_text)

In [8]:
# a. Print the first ten observations
review_df.info()
review_df[:10]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650000 entries, 0 to 649999
Data columns (total 2 columns):
Text     650000 non-null object
Label    650000 non-null object
dtypes: object(2)
memory usage: 9.9+ MB


Unnamed: 0,Text,Label
0,model may ok sedentary types im active get aro...,Bad
1,fast read filled unepected humour profound ins...,Good
2,bought one chargersthe instructions say lights...,Bad
3,ecited find book ostensibly muslim feminism vo...,Bad
4,big jvc fan like model suspiscious saw several...,Bad
5,love style couple years dvd giving problems do...,Bad
6,cannot scroll dvd menu set vertically triangle...,Bad
7,movie animals really keeps grandson occupied i...,Neutral
8,found copy cookbook local used book store mied...,Good
9,book basic book using sourdough author obvious...,Neutral


In [9]:
# b. Create tokens with 50,000 as max number of words and filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(review_df['Text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 612517 unique tokens.


In [10]:
#c. Create padding sequence and limit the length to 250.
X = tokenizer.texts_to_sequences(review_df['Text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (650000, 250)


In [11]:
# d. Create one-hot representation of Rating class column.
Y = pd.get_dummies(review_df['Label']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (650000, 3)


In [12]:
#e. Partition the data into 70% training, 30% for testing datasets. Use Seed=802 for data partitioning.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.30, random_state = 802)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(455000, 250) (455000, 3)
(195000, 250) (195000, 3)


# **Building RRN model**

In [13]:
#f. Build a Recurrent unit neural network with one embedding layer, two LSTM hidden layers, and an output layer
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))         
model.add(LSTM(100, activation='relu', kernel_initializer='glorot_uniform', bias_initializer='glorot_uniform', return_sequences=True))
model.add(LSTM(100, activation='relu', kernel_initializer='glorot_uniform', bias_initializer='glorot_uniform'))
model.add(Dense(Y.shape[1], activation='softmax'))
# Note: The values for MAX_NB_WORDS, EMBEDDING_DIM, input_length were already set in step b. 
# Like kernel_initializer, the bias_initializer is also set to 'Xavier' to improve model performance. 
# (Without bias_initializer='glorot_uniform', the valid accuracy was only 0.39-0.40)
# (Setting bias_initializer='glorot_uniform', the valid accuracy increased to nearly 0.70)






In [14]:
# g. Compile the model 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 250, 100)          80400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 303       
Total params: 5,161,103
Trainable params: 5,161,103
Non-trainable params: 0
_________________________________________________________________


In [15]:
# h. Fit model with 5 epochs, 1,000 batch size, 15% as validation split 
epochs = 5
batch_size = 1000
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.15)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 386750 samples, validate on 68250 samples
Epoch 1/5





Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
#i. Evaluate model with Test data and view the misclassification error
accr = model.evaluate(X_test,Y_test, batch_size=1000)
print('Test set\n  Loss: {:0.9f}\n  Accuracy: {:0.9f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.770831045
  Accuracy: 0.690641025


In [17]:
#j. Regularize the previous LSTM model by building the model again but include a dropout of 0.40 in each LSTM hidden layer. 
model2 = Sequential()
model2.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model2.add(LSTM(100, activation='relu', kernel_initializer='glorot_uniform', bias_initializer='glorot_uniform', return_sequences=True, dropout=0.4))
model2.add(LSTM(100, activation='relu', kernel_initializer='glorot_uniform', bias_initializer='glorot_uniform', dropout=0.4))
model2.add(Dense(Y.shape[1], activation='softmax'))
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.summary()


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 250, 100)          80400     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 5,161,103
Trainable params: 5,161,103
Non-trainable params: 0
_________________________________________________________________


In [18]:
## Train the new model with the same parameters as in previous model.
epochs = 5
batch_size = 1000
history = model2.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.15)

Train on 386750 samples, validate on 68250 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
# k. Score the test data using the LSTM model with regularization
accr2 = model2.evaluate(X_test,Y_test, batch_size=1000)
print('Test set\n  Loss: {:0.9f}\n  Accuracy: {:0.9f}'.format(accr2[0],accr2[1]))

Test set
  Loss: 0.000000119
  Accuracy: 0.400271795


In [20]:
#l. Add a new LSTM layer after second LSTM hidden layer with 50 Neurons
## and recurrent dropout .2 , dropout .3, and activation function Tanh. 
model3 = Sequential()
model3.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model3.add(LSTM(100, activation='relu', kernel_initializer='glorot_uniform', bias_initializer='glorot_uniform', return_sequences=True, dropout=0.4))
model3.add(LSTM(100, activation='relu', kernel_initializer='glorot_uniform', bias_initializer='glorot_uniform', return_sequences=True, dropout=0.4))
model3.add(LSTM(50, activation='tanh', dropout=0.3, recurrent_dropout=0.2))
model3.add(Dense(Y.shape[1], activation='softmax'))
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 250, 100)          80400     
_________________________________________________________________
lstm_6 (LSTM)                (None, 250, 100)          80400     
_________________________________________________________________
lstm_7 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 153       
Total params: 5,191,153
Trainable params: 5,191,153
Non-trainable params: 0
_________________________________________________________________


In [21]:
#m. Compile and fit this model with parameters as above
epochs = 5
batch_size = 1000
history = model3.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.15)

Train on 386750 samples, validate on 68250 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
#n. Score the test data using the LSTM model with regularization
accr3 = model3.evaluate(X_test,Y_test, batch_size=1000)
print('Test set\n  Loss: {:0.9f}\n  Accuracy: {:0.9f}'.format(accr3[0],accr3[1]))

Test set
  Loss: 0.940462362
  Accuracy: 0.573774358


In [23]:
#o. Add new GRU layer before output layers with 70 Neurons and Relu as activation function 
## and Tanh as recurrent activation function.
model4 = Sequential()
model4.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model4.add(LSTM(100, activation='relu', kernel_initializer='glorot_uniform', bias_initializer='glorot_uniform', return_sequences=True, dropout=0.4))
model4.add(LSTM(100, activation='relu', kernel_initializer='glorot_uniform', bias_initializer='glorot_uniform', return_sequences=True, dropout=0.4))
model4.add(LSTM(50, activation='tanh', return_sequences=True, dropout=0.3, recurrent_dropout=0.2))
model4.add(GRU(70, activation='relu', recurrent_activation='tanh'))
model4.add(Dense(Y.shape[1], activation='softmax'))
model4.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model4.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
lstm_8 (LSTM)                (None, 250, 100)          80400     
_________________________________________________________________
lstm_9 (LSTM)                (None, 250, 100)          80400     
_________________________________________________________________
lstm_10 (LSTM)               (None, 250, 50)           30200     
_________________________________________________________________
gru_1 (GRU)                  (None, 70)                25410     
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 213       
Total params: 5,216,623
Trainable params: 5,216,623
Non-trainable params: 0
____________________________________________

In [24]:
#p. Compile and fit this model with parameters as above.
epochs = 5
batch_size = 1000
history = model4.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.15)

Train on 386750 samples, validate on 68250 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
#q. Score the test data using the LSTM model with regularization
accr4 = model4.evaluate(X_test,Y_test, batch_size=1000)
print('Test set\n  Loss: {:0.9f}\n  Accuracy: {:0.9f}'.format(accr4[0],accr4[1]))

Test set
  Loss: 0.000000119
  Accuracy: 0.400271795
