# Sentiment Analysis using Bidirectional LSTM

<b><i>Importing python libraries</i></b>

In [64]:
import numpy as np
from sen_utils import *
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.optimizers import Adam
np.random.seed(1)
import re
import string
from unicodedata import normalize
from keras.callbacks import ModelCheckpoint

<b><i>Reading dataset in train and test varibles</i></b>

In [28]:
colnames=['Reviews','Labels'] 
df= pd.read_csv('data/Dataset_.csv', names=colnames, header=None)
df = df.sample(frac=1).reset_index(drop=True)
print(df.head())

                                 Reviews  Labels
0                                awesome       5
1        couples are mostly welcome here       5
2  Thanking Mr Abhilek For best service.       5
3                             True treat       5
4                         Cheap and best       4


In [29]:
df = df.values
X = df[:,0]
Y = df[:,1]

In [30]:
X

array(['awesome', 'couples are mostly welcome here',
       'Thanking Mr Abhilek For best service.', 'True treat',
       'Cheap and best', 'Not just a Retreat, a true Treat.',
       'Not a good reflection of the positive reviews', 'Good Stay',
       'Dont visit the resort  by seeing the  pics uploaded in their website',
       'Official visit 03days',
       'i booked the hotel for one night and was my worst experience of life',
       'What a breath of fresh air!', 'awesome grand',
       'Amazing city Varanasi.', 'some problem',
       'excellent hotel to stay', 'Very professional',
       'Wonderful Retreat', 'awesome grand',
       'Not a good reflection of the positive reviews', '89 reviews',
       'Unworthy of Stay. Period.', 'Pleasant stay in beautiful Hotel',
       'Stunning property ', 'Not a place for family ', 'Awesome hotel',
       'Memorable Wedding Reception', 'NGO',
       'Good Hotel adjacent to Railway station',
       'Beautiful hotel located near Taj.', 'Poor F

<b><i>Function to clean our dataset</i></b>

In [31]:
def clean_lines(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)

    for line in lines:
        # normalize unicode characters
        line = normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        # tokenize on white space
        line = line.split()
        # convert to lowercase
        line = [word.lower() for word in line]
        # remove punctuation from each token
        line = [word.translate(table) for word in line]
        # remove non-printable chars form each token
        line = [re_print.sub('', w) for w in line]
        # remove tokens with numbers in them
        line = [word for word in line if word.isalpha()]
        # store as string
        cleaned.append(' '.join(line))
    return np.array(cleaned)


In [32]:
X = clean_lines(X)
X

array(['awesome', 'couples are mostly welcome here',
       'thanking mr abhilek for best service', 'true treat',
       'cheap and best', 'not just a retreat a true treat',
       'not a good reflection of the positive reviews', 'good stay',
       'dont visit the resort by seeing the pics uploaded in their website',
       'official visit',
       'i booked the hotel for one night and was my worst experience of life',
       'what a breath of fresh air', 'awesome grand',
       'amazing city varanasi', 'some problem', 'excellent hotel to stay',
       'very professional', 'wonderful retreat', 'awesome grand',
       'not a good reflection of the positive reviews', 'reviews',
       'unworthy of stay period', 'pleasant stay in beautiful hotel',
       'stunning property', 'not a place for family', 'awesome hotel',
       'memorable wedding reception', 'ngo',
       'good hotel adjacent to railway station',
       'beautiful hotel located near taj', 'poor food', 'worst stay',
       'e

<b><i>Maximum length of the longest sentence in the dataset</i></b>

In [33]:
maxLen = len(max(X, key=len).split())

In [51]:
m = len(df)
train = int(0.8*m)
dev = int(0.1*m)
test = int(0.1*m)


In [52]:
df[:dev,1].shape

(41,)

<b><i>Splitting data into train, dev and test</i></b>

In [53]:
X_train = np.array(list(X[:train]))
Y_train = np.array(list(Y[:train]))
X_dev = np.array(list(X[train:train+dev]))
Y_dev = np.array(list(Y[train:train+dev]))
X_test = np.array(list(X[train+dev:train+dev+test]))
Y_test = np.array(list(Y[train+dev:train+dev+test]))

In [54]:
Y_dev

array([2, 3, 3, 2, 5, 5, 3, 3, 5, 4, 3, 5, 5, 5, 4, 4, 5, 1, 4, 4, 2, 5,
       1, 5, 5, 4, 5, 4, 5, 5, 1, 5, 4, 3, 4, 5, 4, 0, 5, 2, 4])

In [55]:
print(X_train.shape)
print(X_dev.shape)
print(X_test.shape)

(332,)
(41,)
(41,)


<b><i>Converting the train, dev and test label to their one hot vector form matrix</i></b>

In [56]:
Y_train_oh = convert_to_one_hot(Y_train, C = 6)
Y_dev_oh = convert_to_one_hot(Y_dev, C = 6)
Y_test_oh = convert_to_one_hot(Y_test, C = 6)
print(Y_train_oh.shape)

(332, 6)


<b><i>Function to read and store the glove embedding matrix</i></b>

In [57]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r', encoding="utf8") as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

<b><i>Reading the glove embedding matrix</i></b>

In [58]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

<b><i>Function convert every sentence to its corresponding indices vector using word_to_index dictionary; it also pads zero if incase the sentence vector is less than the max length of the sentence </i></b>

In [59]:
def sentences_to_indices(X, word_to_index, max_len):
 
    m = X.shape[0]
    print(m)
    X_indices = np.zeros((m, max_len))
    for i in range(m):                               
        
        sentence_words = X[i].split()
        j = 0
        
        for w in sentence_words:
            if w in word_to_index.keys(): 
                X_indices[i, j] = word_to_index[w]
            else:
                X_indices[i, j] = word_to_index['awesome']
            j = j + 1
    
    return X_indices

In [60]:
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
X_dev_indices = sentences_to_indices(X_dev, word_to_index, maxLen)
X_train_indices.shape

332
41


(332, 21)

<b><i>Function builds the <u>Embedding()</u> layer in Keras. After this layer is built, we can pass the output of <u>sentences_to_indices()</u> to it as an input, and the Embedding() layer will return the word embeddings for that sentence.</i></b>

In [61]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):

    vocab_len = len(word_to_index) + 1                  
    emb_dim = word_to_vec_map["cucumber"].shape[0]
    
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)

    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

<b><i>Model Function</i></b>

In [65]:
def classify(input_shape, word_to_vec_map, word_to_index):

    sentence_indices = Input(shape = input_shape, dtype = 'int32')
    
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state; the returned output should be a batch of sequences.
    X = Bidirectional(LSTM(128, return_sequences=True))(embeddings)
    X = Dropout(0.5)(X)
    
    # Propagate X trough another LSTM layer with 128-dimensional hidden state; the returned output should be a single hidden state, not a batch of sequences.
    X = Bidirectional(LSTM(128, return_sequences=False))(X)
    
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X through a Dense layer with softmax activation to get back a batch of 5-dimensional vectors.
    X = Dense(6)(X)
    # Add a softmax activation
    X = Activation('softmax')(X)
    
    model = Model(inputs = sentence_indices, outputs = X)
    
    return model

<b><i>Calling the Model Function</i></b>

In [66]:
model = classify((maxLen,), word_to_vec_map, word_to_index)

In [67]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

<b><i>Compiling the model</i></b>

In [68]:
opt = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)

model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

<b><i>Fitting our training data to the compiled model</i></b>

In [69]:
model.fit(X_train_indices, Y_train_oh, epochs = 100, batch_size = 20, validation_data=(X_dev_indices, Y_dev_oh), callbacks=[checkpoint])

Train on 332 samples, validate on 41 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 1.38213, saving model to model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 1.38213 to 1.28023, saving model to model.h5
Epoch 3/100

Epoch 00003: val_loss improved from 1.28023 to 1.16659, saving model to model.h5
Epoch 4/100

Epoch 00004: val_loss did not improve from 1.16659
Epoch 5/100

Epoch 00005: val_loss improved from 1.16659 to 0.93584, saving model to model.h5
Epoch 6/100

Epoch 00006: val_loss improved from 0.93584 to 0.89309, saving model to model.h5
Epoch 7/100

Epoch 00007: val_loss improved from 0.89309 to 0.64665, saving model to model.h5
Epoch 8/100

Epoch 00008: val_loss improved from 0.64665 to 0.58391, saving model to model.h5
Epoch 9/100

Epoch 00009: val_loss did not improve from 0.58391
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.58391
Epoch 11/100

Epoch 00011: val_loss did not improve from 0.58391
Epoch 12/100

Epoch 00012: val_loss improve


Epoch 00043: val_loss did not improve from 0.33389
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.33389
Epoch 45/100

Epoch 00045: val_loss did not improve from 0.33389
Epoch 46/100

Epoch 00046: val_loss did not improve from 0.33389
Epoch 47/100

Epoch 00047: val_loss did not improve from 0.33389
Epoch 48/100

Epoch 00048: val_loss did not improve from 0.33389
Epoch 49/100

Epoch 00049: val_loss did not improve from 0.33389
Epoch 50/100

Epoch 00050: val_loss did not improve from 0.33389
Epoch 51/100

Epoch 00051: val_loss did not improve from 0.33389
Epoch 52/100

Epoch 00052: val_loss did not improve from 0.33389
Epoch 53/100

Epoch 00053: val_loss did not improve from 0.33389
Epoch 54/100

Epoch 00054: val_loss did not improve from 0.33389
Epoch 55/100

Epoch 00055: val_loss did not improve from 0.33389
Epoch 56/100

Epoch 00056: val_loss did not improve from 0.33389
Epoch 57/100

Epoch 00057: val_loss did not improve from 0.33389
Epoch 58/100

Epoch 00058: val_loss di


Epoch 00087: val_loss did not improve from 0.33389
Epoch 88/100

Epoch 00088: val_loss did not improve from 0.33389
Epoch 89/100

Epoch 00089: val_loss did not improve from 0.33389
Epoch 90/100

Epoch 00090: val_loss did not improve from 0.33389
Epoch 91/100

Epoch 00091: val_loss did not improve from 0.33389
Epoch 92/100

Epoch 00092: val_loss did not improve from 0.33389
Epoch 93/100

Epoch 00093: val_loss did not improve from 0.33389
Epoch 94/100

Epoch 00094: val_loss did not improve from 0.33389
Epoch 95/100

Epoch 00095: val_loss did not improve from 0.33389
Epoch 96/100

Epoch 00096: val_loss did not improve from 0.33389
Epoch 97/100

Epoch 00097: val_loss did not improve from 0.33389
Epoch 98/100

Epoch 00098: val_loss did not improve from 0.33389
Epoch 99/100

Epoch 00099: val_loss did not improve from 0.33389
Epoch 100/100

Epoch 00100: val_loss did not improve from 0.33389


<keras.callbacks.History at 0x288c3f2be0>

<b><i>Converting our test data to testable form and evaluating our model</i></b>

In [70]:
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxLen)
loss, acc = model.evaluate(X_test_indices, Y_test_oh)
print("Test accuracy = ", acc)

41
Test accuracy =  0.8536585365853658
