# Sentiment Classifier using an LSTM Neural Network

> Classification of IMDB Reviews by sentiment

In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten, Dropout, Embedding ,SpatialDropout1D
from tensorflow.keras.layers import CuDNNLSTM, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint

In [2]:
import os
from sklearn.metrics import roc_auc_score
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [17]:
# Show the installed TensorFlow version so the run environment is recorded with the notebook.
tf.__version__

'1.13.1'

## Hyperparameter Choices

In [3]:
# Directory that receives everything this run writes (checkpoints and TensorBoard logs).
out_dir = 'model_out/LSTM/'
# Training schedule passed to model.fit.
epochs = 5
batch_size = 128

# Embedding vector size, and vocabulary size (also used as num_words in imdb.load_data).
n_dim = 64
n_unique = 10000

# Every review is padded/truncated to max_review tokens; 'pre' is passed as both
# the padding and the truncating mode of pad_sequences.
max_review = 200
pad_type = trunc_type = 'pre'

## Loading the Data

In [4]:
# Load the IMDB reviews with n_unique as the vocabulary cap (num_words).
# The second split returned here is used as the validation set in model.fit.
(x_train, y_train), (x_valid, y_valid) = imdb.load_data(num_words=n_unique)

In [5]:
# Make sure the training inputs and labels are NumPy arrays before inspecting their shapes.
x_train, y_train = np.asarray(x_train), np.asanyarray(y_train)

In [6]:
# Sanity check: inputs and labels should have the same number of samples.
print(x_train.shape,y_train.shape)

(25000,) (25000,)


## Preprocessing

In [7]:
# Bring every training review to exactly max_review tokens, filling with 0
# and using the configured padding/truncating modes.
x_train = pad_sequences(
    x_train,
    maxlen=max_review,
    padding=pad_type,
    truncating=trunc_type,
    value=0,
)

In [8]:
# Apply the same padding/truncation to the validation reviews so both splits
# share the max_review input length.
x_valid = pad_sequences(
    x_valid,
    maxlen=max_review,
    padding=pad_type,
    truncating=trunc_type,
    value=0,
)

## Bidirectional LSTM Architecture

In [9]:
def modeler(n_drop=0.2,n_drop_em = 0.2,n_lstm = 256,n_dense = 100):
    """Build the stacked bidirectional LSTM sentiment classifier and print its summary.

    The vocabulary size, embedding width and input length are read from the
    notebook-level hyperparameters ``n_unique``, ``n_dim`` and ``max_review``.

    Args:
        n_drop: Dropout rate applied after the recurrent stack and again
            after the hidden dense layer.
        n_drop_em: SpatialDropout1D rate applied to the embedding output.
        n_lstm: Number of units per direction in each CuDNNLSTM layer.
        n_dense: Number of units in the hidden ReLU dense layer. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        An uncompiled ``Sequential`` model ending in a single sigmoid unit.
    """
    model = Sequential([
        Embedding(n_unique, n_dim, input_length=max_review),
        SpatialDropout1D(n_drop_em),
        # The first recurrent layer returns the full sequence so that the
        # second recurrent layer has a sequence to consume.
        Bidirectional(CuDNNLSTM(n_lstm, return_sequences=True)),
        Bidirectional(CuDNNLSTM(n_lstm)),
        Dropout(n_drop),
        Dense(n_dense, activation='relu'),
        Dropout(n_drop),
        Dense(1, activation='sigmoid'),
    ])
    model.summary()
    return model

In [10]:
# Build the network with the default dropout rates and LSTM width.
model = modeler()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 64)           640000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 64)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 512)          659456    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               1576960   
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense (Dense)        

In [11]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as a metric.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# Create the output directory up front. exist_ok=True replaces the separate
# exists() check, so the cell is safe to re-run and has no check-then-create race.
os.makedirs(out_dir, exist_ok=True)
# Checkpoint callback; the epoch number is substituted into the file name.
modelcheckpoint = ModelCheckpoint(filepath=os.path.join(out_dir, "weights.{epoch:02d}.hdf5"))

## Model Training

In [12]:
#model.fit(x_train,y_train,batch_size=batch_size,epochs=2,verbose=1,validation_data=(x_valid,y_valid),callbacks=[modelcheckpoint])

## TensorBoard Support

In [13]:
from tensorflow.keras.callbacks import TensorBoard
from time import time

In [14]:
# Each run logs into its own timestamped subdirectory of <out_dir>/tdb/logs.
# os.path.join is used because out_dir already ends in '/', so plain string
# concatenation produced a doubled separator ("model_out/LSTM//tdb/...").
tensorboard = TensorBoard(log_dir=os.path.join(out_dir, "tdb", "logs", str(time())), histogram_freq=1)

In [None]:
# Pass the checkpoint callback together with TensorBoard. Without it this run
# writes no weights file, and the later load_weights call picks up a stale or
# missing file from out_dir instead of the weights of the model trained here.
model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,verbose=1,validation_data=(x_valid,y_valid),callbacks=[modelcheckpoint, tensorboard])

Train on 25000 samples, validate on 25000 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/5

In [None]:
# Launch the TensorBoard UI on the directory the TensorBoard callback logs into.
# NOTE(review): this path is hard-coded and has to be kept in sync with out_dir by hand.
! tensorboard --logdir=model_out/LSTM/tdb/logs/

## Loading Model Weights

In [15]:
# Restore the checkpoint numbered 01 from out_dir. os.path.join keeps the path
# valid whether or not out_dir ends with a separator.
model.load_weights(os.path.join(out_dir, 'weights.01.hdf5'))

ValueError: You are trying to load a weight file containing 4 layers into a model with 5 layers.

In [None]:
# Confirm the shape of the padded validation inputs before predicting on them.
x_valid.shape

In [None]:
# Score every validation review. predict() is called directly because
# predict_proba is only a deprecated wrapper around it on Sequential models;
# the sigmoid output layer already yields values in [0, 1].
y_hat = model.predict(x_valid,verbose=1)

In [None]:
# Inspect the prediction for the first validation review.
y_hat[0]

In [None]:
# Distribution of the predicted scores, with the 0.5 decision threshold marked in orange.
fig, ax = plt.subplots()
ax.hist(y_hat)
ax.axvline(x=0.5, color='orange');

In [None]:
# Validation ROC AUC expressed as a percentage.
pct = 100.0 * roc_auc_score(y_valid, y_hat)

In [None]:
# Display the ROC AUC percentage with two decimal places.
"{:0.2f}".format(pct)

In [None]:
def dfr(y_hat):
    """Return the first element of every row of ``y_hat`` as a flat list.

    Used in this notebook to turn the model's per-review prediction rows
    into a one-dimensional sequence of scores.
    """
    return [row[0] for row in y_hat]

In [None]:
# Flatten the per-review prediction rows into a plain list of scores.
y_df = dfr(y_hat)

In [None]:
# Pair each predicted score with its true label, one row per validation review.
ydf = pd.DataFrame(
    data=list(zip(y_df, y_valid)),
    columns=['y_pred', 'y_valid'],
)

In [None]:
# Preview the first ten prediction/label pairs.
ydf.head(10)