In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.utils.data_utils import get_file
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import Convolution1D, MaxPooling1D, ZeroPadding1D
from keras.utils import np_utils
from keras.optimizers import Adam
import cPickle as pickle
import bcolz
import re
from numpy.random import random, permutation, randn, normal, uniform, choice

Using TensorFlow backend.


In [5]:
idx = imdb.get_word_index()

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [4]:
(x_train, labels_train), (x_test, labels_test) = imdb.load_data()

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [6]:
idx2word = {v:k for k,v in idx.items()}

In [7]:
idx2word[5000]

u'1987'

In [8]:
print x_train.shape

print x_test.shape

print labels_train
' '.join(idx2word[i] for i in x_train[0])

(25000,)
(25000,)
[1 0 0 ..., 0 1 0]


u"the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room titillate it so heart shows to years of every never going villaronga help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but pratfalls to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other tricky in of seen over landed for anyone of gilmore's br show's to whether from than out themselves history he name half some br of 'n odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but w

In [9]:
vocab_size = 5000

trn = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_train]
test = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_test]

In [10]:
lens = np.array(map(len, trn))
print max(lens), min(lens), lens.mean()

2494 11 238.71364


In [11]:
review_len = 500

In [12]:
trn = sequence.pad_sequences(trn, maxlen=review_len, value=0)
test = sequence.pad_sequences(test, maxlen=review_len, value=0)

In [13]:
trn.shape

(25000, 500)

In [14]:
trn[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

# Simple NN model

In [15]:
model = Sequential([
        Embedding(vocab_size, 32, input_length=review_len),
        Flatten(),
        Dense(100, activation='relu'),
        Dropout(0.7),
        Dense(1, activation='sigmoid')
    ])

In [16]:
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               1600100   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 1,760,201.0
Trainable params: 1,760,201.0
Non-trainable params: 0.0
_________________________________________________________________


In [17]:
model.fit(trn, labels_train, nb_epoch=2, batch_size=64, validation_data=(test, labels_test))



Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fee20ed7cd0>

# Using CNN Model

In [18]:
conv_model = Sequential([
        Embedding(vocab_size, 32, input_length=review_len),
        Dropout(0.2),
        ZeroPadding1D(padding=1),
        Convolution1D(64, 5, activation='relu'),
        Dropout(0.2),
        MaxPooling1D(),
        Flatten(),
        Dense(100, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])

In [19]:
conv_model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
conv_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
dropout_2 (Dropout)          (None, 500, 32)           0         
_________________________________________________________________
zero_padding1d_1 (ZeroPaddin (None, 502, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 498, 64)           10304     
_________________________________________________________________
dropout_3 (Dropout)          (None, 498, 64)           0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 249, 64)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 15936)             0         
__________

In [20]:
conv_model.fit(trn, labels_train, nb_epoch=2, batch_size=64, validation_data=(test, labels_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fee2250fd10>

In [21]:
conv_model.evaluate(test, labels_test)



[0.28144591594696045, 0.88224000000000002]

# Using Pre-trained Network - Glove(6Billion-50Dimension) (Recommended)

In [27]:
def load_vectors(loc):
    return (bcolz.open(loc+'.dat')[:],
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))

In [14]:
vecs, words, wordidx = load_vectors('6B.50d')

The glove word ids and imdb word ids use different indexes. So we create a simple function that creates an embedding matrix using the indexes from imdb, and the embeddings from glove (where they exist).

In [15]:
def create_emb():
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i in range(1,len(emb)):
        word = idx2word[i]
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word):
            src_idx = wordidx[word]
            # Embedding of IMDB word is fetched from Glove for that same word. 
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [16]:
emb = create_emb()

In [17]:
conv_model = Sequential([
    Embedding(vocab_size, 50, input_length=review_len, dropout=0.2, weights=[emb], trainable=False),
    Dropout(0.25),
    Convolution1D(64, 5, activation='relu'),
    Dropout(0.25),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
    ])

In [18]:
conv_model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
conv_model.fit(trn, labels_train, nb_epoch=2, batch_size=64, validation_data=(test, labels_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f3c34911390>

In [21]:
conv_model.layers[0].trainable=True

In [23]:
conv_model.optimizer.lr = 1e-4
conv_model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
conv_model.fit(trn, labels_train, nb_epoch=2, batch_size=64, validation_data=(test, labels_test))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f3c33ecb150>

In [24]:
conv_model.fit(trn, labels_train, nb_epoch=5, batch_size=64, validation_data=(test, labels_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3c342c7f10>

In [26]:
conv_model.optimizer.lr = 1e-5
conv_model.fit(trn, labels_train, nb_epoch=2, batch_size=64, validation_data=(test, labels_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f3c59a15990>

# Multi CNN

In [22]:
from keras.layers import Merge, Input
from keras.models import Model

In [23]:
graph_in = Input((vocab_size, 32))
convs = []

for i in range(3,6):
    x = Convolution1D(64, i, activation='relu')(graph_in)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)
out = Merge(mode='concat')(convs)
graph = Model(input=graph_in, output=out)

  '` call to the Keras 2 API: ' + signature)


In [24]:
model = Sequential([
        Embedding(vocab_size, 32, input_length=review_len),
        Dropout(0.2),
        graph,
        Dropout(0.5),
        Dense(100, activation='relu'),
        Dropout(0.7),
        Dense(1, activation='sigmoid')
    ])

In [25]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [26]:
model.fit(trn, labels_train, nb_epoch=3, batch_size=64, validation_data=(test, labels_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fedfc3e7650>

# LSTM

In [28]:
model = Sequential([
        Embedding(vocab_size, 32, input_length=review_len),
        LSTM(100),
        Dense(1, activation='sigmoid')
    ])


In [29]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 213,301.0
Trainable params: 213,301
Non-trainable params: 0.0
_________________________________________________________________


In [31]:
model.fit(trn, labels_train, nb_epoch=2, batch_size=64, validation_data=(test, labels_test))