# Change from Theano default to TensorFlow

In [1]:
# Select the framework
import os
import sys

# Select the backend. The variable has to be set before `import keras` runs
# for the choice to take effect.
# os.environ["KERAS_BACKEND"] = "theano"
os.environ["KERAS_BACKEND"] = "tensorflow"

# Un-load any keras modules that are already imported so the import below is
# a fresh one. Iterate over a snapshot of the module names: deleting entries
# while looping over sys.modules itself raises
# "RuntimeError: dictionary changed size during iteration".
for mod in list(sys.modules):
    # Match the top-level package as well as all of its sub-modules.
    if mod == 'keras' or mod.startswith('keras.'):
        del sys.modules[mod]

# Import the required modules
import keras
print('Imported tensorflow')

Using TensorFlow backend.


Imported tensorflow


# Import Libraries

In [2]:
#import libraries
import numpy as np
import gensim
import string

from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence



In [4]:
# Fix the seed of NumPy's global random number generator for reproducibility.
# NOTE(review): only NumPy is seeded in this cell; the other libraries used
# later (gensim, the Keras backend) presumably keep their own random state --
# confirm whether they need explicit seeds as well.
np.random.seed(7)

# Split into test and train 

In [5]:
NUM_WORDS = 5000  # vocabulary cap passed to load_data (num_words)
INDEX_FROM = 3    # word index offset; ids below it are reserved for special tokens

# load_data returns (train, test), each being a (reviews, labels) pair.
train, test = keras.datasets.imdb.load_data(
    num_words=NUM_WORDS,
    index_from=INDEX_FROM,
)
(train_x, train_y), (test_x, test_y) = train, test

# Dictionaries to convert IMDB reviews to words for Word2Vec training

In [7]:
# Shift every word id by INDEX_FROM so the mapping matches the ids that
# load_data(index_from=INDEX_FROM) produced.
word_to_id = {
    word: rank + INDEX_FROM
    for word, rank in keras.datasets.imdb.get_word_index().items()
}

# Register the special tokens (padding, start-of-review, unknown word) on the
# ids below INDEX_FROM.
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Reverse lookup: id -> word.
id_to_word = {idx: word for word, idx in word_to_id.items()}

# Preparing review list for Word2Vec training

In [8]:
# Decode every training review from word ids back to a space-separated string.
reviews_list = [
    ' '.join(id_to_word[token_id] for token_id in encoded_review)
    for encoded_review in train_x
]

In [9]:
# Append the decoded test reviews after the training ones.
# NOTE: this cell extends reviews_list in place, so running it twice adds the
# test reviews twice.
reviews_list.extend(
    ' '.join(id_to_word[token_id] for token_id in encoded_review)
    for encoded_review in test_x
)

In [11]:
# Number of decoded reviews collected so far.
print('Length of reviews:', len(reviews_list), sep='')

Length of reviews:50000


In [16]:
# Show the first decoded review; entries of reviews_list are already strings.
print('Review 1 : \n ' + reviews_list[0])

Review 1 : 
 <START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly <UNK> was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little <UNK> that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big <UNK> for the whole film but these children are amazing and should be <UNK> 

# Generate sentences to train Word2Vec

In [17]:
# Maximum number of tokens kept per review.
max_sentence_len = 200

# Translation table that deletes every character in string.punctuation. This
# also strips the angle brackets of the special tokens ("<UNK>" -> "unk") and
# apostrophes ("everyone's" -> "everyones").
punctuation_table = str.maketrans('', '', string.punctuation)

# Lower-case, strip punctuation, split on whitespace, keep the first
# max_sentence_len tokens of each review.
sentences = [
    doc.lower().translate(punctuation_table).split()[:max_sentence_len]
    for doc in reviews_list
]

In [20]:
# Token count of the second tokenised review.
print('Sample Length : ', len(sentences[1]), sep='')

Sample Length : 189


In [21]:
# Drop falsy (empty) tokens from every sentence; the sentences themselves are
# all kept, even if they end up empty.
sentences = [list(filter(None, sentence)) for sentence in sentences]

In [22]:
# Report how many tokenised reviews are held in `sentences`.
print('Num sentences:', len(sentences))

Num sentences: 50000


### Training Word2Vec and checking the words most similar to some commonly occurring words in the reviews

In [23]:
print('Training word2vec...')
# min_count=1 keeps every token that occurs in `sentences` in the vocabulary,
# so each word of each review can later be mapped to an embedding row.
# NOTE(review): no seed/workers arguments are passed, so the trained vectors
# (and the similarities printed below) presumably vary between runs -- confirm
# against the gensim reproducibility notes if exact repeatability matters.
word_model = gensim.models.Word2Vec(sentences, size=100, min_count=1, window=5, iter=100)

# Embedding matrix with one row per vocabulary word; it is used further down
# as the initial weights of the Keras Embedding layer.
pretrained_weights = word_model.wv.syn0
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)

print('Checking similar words:')
for word in ['beautiful', 'worst', 'bad', 'great']:
    # Query through the KeyedVectors object (`.wv`), like the rest of the
    # notebook does, instead of the deprecated model-level shortcut, and ask
    # for exactly the 8 neighbours that get printed.
    neighbours = word_model.wv.most_similar(word, topn=8)
    most_similar = ', '.join('%s (%.2f)' % (similar, score) for similar, score in neighbours)
    print('  %s -> %s' % (word, most_similar))

Training word2vec...
Result embedding shape: (4965, 100)
Checking similar words:
  beautiful -> gorgeous (0.82), lovely (0.77), stunning (0.72), wonderful (0.67), breathtaking (0.65), fabulous (0.62), magnificent (0.62), elegant (0.62)
  worst -> best (0.82), funniest (0.80), greatest (0.76), finest (0.67), weakest (0.61), biggest (0.61), most (0.57), worse (0.45)
  bad -> horrible (0.78), terrible (0.77), good (0.77), lousy (0.74), awful (0.74), lame (0.71), stupid (0.70), cheesy (0.64)
  great -> wonderful (0.83), terrific (0.83), fantastic (0.82), fine (0.76), superb (0.75), good (0.74), brilliant (0.73), marvelous (0.72)


  


In [24]:
# word -> index in the Word2Vec vocabulary
def word2idx(word):
    """Return the vocabulary index that the trained Word2Vec model assigns to `word`.

    The lookup is a plain `[]` access on the model's vocabulary, so a word the
    model has not seen propagates the lookup error to the caller.
    """
    vocab_entry = word_model.wv.vocab[word]
    return vocab_entry.index

# index in the Word2Vec vocabulary -> word
def idx2word(idx):
    """Return the word stored at position `idx` of the Word2Vec vocabulary (inverse of word2idx)."""
    index_to_word = word_model.wv.index2word
    return index_to_word[idx]

#### Convert each review from a list of words to a list of Word2Vec vocabulary indices, because the embedding layer is initialised from the Word2Vec weights

In [39]:
# Decode the training reviews from IMDB word ids back to text.
train_x_list = [
    ' '.join(id_to_word[token_id] for token_id in encoded_review)
    for encoded_review in train_x
]

In [40]:
# Same normalisation as for the Word2Vec sentences: lower-case, delete
# punctuation, split on whitespace, keep at most max_sentence_len tokens.
train_x_list = [
    doc.lower().translate(str.maketrans('', '', string.punctuation)).split()[:max_sentence_len]
    for doc in train_x_list
]

In [41]:
print('\nPreparing the data for LSTM...')
# Replace every token by its Word2Vec vocabulary index.
# NOTE: train_x_list is overwritten in place, so this cell cannot be re-run
# without re-running the two cells that build it.
train_x_list = [[word2idx(token) for token in review] for review in train_x_list]

# Right-pad every review with zeros up to the length of the longest one.
longest_review = max(len(review) for review in train_x_list)
padded = np.zeros([len(train_x_list), longest_review])
for row, indices in enumerate(train_x_list):
    padded[row, :len(indices)] = indices
train_x_list = padded

print('train_x shape:', train_x_list.shape)
print('train_y shape:', train_y.shape)


Preparing the data for LSTM...
train_x shape: (25000, 200)
train_y shape: (25000,)


In [51]:
# Decode the test reviews from IMDB word ids back to text.
# Iterate over test_x itself: the loop used to be bounded by len(train_x),
# which is only correct while both splits happen to have the same size.
test_x_list = [
    ' '.join(id_to_word[token_id] for token_id in encoded_review)
    for encoded_review in test_x
]

In [52]:
# Same normalisation as for the Word2Vec sentences: lower-case, delete
# punctuation, split on whitespace, keep at most max_sentence_len tokens.
test_x_list = [
    doc.lower().translate(str.maketrans('', '', string.punctuation)).split()[:max_sentence_len]
    for doc in test_x_list
]

In [53]:
print('\nPreparing the data for LSTM...')
# Replace every token by its Word2Vec vocabulary index.
# NOTE: test_x_list is overwritten in place, so this cell cannot be re-run
# without re-running the two cells that build it.
test_x_list = [[word2idx(token) for token in review] for review in test_x_list]

# Right-pad every review with zeros. Reviews were truncated to
# max_sentence_len tokens, so that is the padded width. The width used to be
# read from train_x_list, which made this cell depend on the training cell
# having already turned train_x_list into its padded array.
padded = np.zeros([len(test_x_list), max_sentence_len])
for row, indices in enumerate(test_x_list):
    padded[row, :len(indices)] = indices
test_x_list = padded

print('test_x shape:', test_x_list.shape)
print('test_y shape:', test_y.shape)


Preparing the data for LSTM...
test_x shape: (25000, 200)
test_y shape: (25000,)


# Import Libraries for model preparation

In [54]:
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

### checking sizes of embedding matrix to be prepared

In [55]:
# Display the (rows, columns) of the Word2Vec weight matrix as unpacked after
# training; these become input_dim / output_dim of the Embedding layer.
vocab_size, emdedding_size

(4965, 100)

In [56]:
# Display the shape of the weight matrix itself; it should equal the pair above.
pretrained_weights.shape

(4965, 100)

## Model Preparation

In [57]:
# Binary sentiment classifier: Embedding -> Conv1D -> MaxPooling1D -> LSTM -> Dense.
model = Sequential()
# The embedding rows are initialised from the Word2Vec vectors, so the inputs
# must be Word2Vec vocabulary indices (not the original IMDB word ids).
model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit paired with binary cross-entropy for the 0/1 labels.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         496500    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 32)          9632      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 32)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 559,433
Trainable params: 559,433
Non-trainable params: 0
_________________________________________________________________
None


## Model Fitting

In [58]:
# Train on the padded Word2Vec-index matrix for 5 epochs with mini-batches of
# 64 reviews. As the last expression of the cell, the object returned by
# fit() is displayed as the cell output.
model.fit(train_x_list, train_y, epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x69ebf940>

## Model Evaluation

In [59]:
# Final evaluation of the model
scores = model.evaluate(test_x_list, test_y, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 87.26%
