# IMDB

## Prepare 

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from __future__ import division
import sys
import os
from os.path import join as pjoin
from glob import glob
from matplotlib import pyplot as plt
sys.path.append('..')

In [2]:
import cv2
import csv
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

from utils.commands import unzip, mkdir, call, count_file, KaggleCLI, execute_in, unzip_all, load_array
from utils.plot import plot_images, plot_confusion_matrix

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
from keras import optimizers, initializers, losses, callbacks, regularizers
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import *
from keras.utils.data_utils import get_file
from keras.regularizers import l2
from keras.optimizers import Adam

In [5]:
# Output folders live directly under the current working directory.
# `mkdir` comes from utils.commands; presumably it creates the folder when it
# is missing - TODO confirm against that helper.
model_path, cal_path = (pjoin(os.getcwd(), sub) for sub in ('models', 'cal'))
for p in (model_path, cal_path):
    mkdir(p)

Things to be done:
- Examine the data
- Train and compare sentiment classifiers: a simple NN, CNNs, and CNNs with pre-trained GloVe embeddings


## Examine Data

### Loading data

In [4]:
from keras.datasets import imdb
# word -> integer id mapping shipped with the Keras IMDB dataset.
idx = imdb.get_word_index()
# Reverse lookup (id -> word). `.items()` is used instead of the
# Python-2-only `.iteritems()` so the cell runs on Python 2 and 3 alike.
idx2word = {v: k for k, v in idx.items()}

In [5]:
num_words = 5000
# With start_char=None and index_from=0 the reviews keep the raw ids from
# imdb.get_word_index(). Every id >= num_words is replaced by `oov_char`.
# The OOV id must itself be a valid row of the Embedding(num_words, ...) layers
# used below (valid rows are 0 .. num_words - 1), so it is mapped to the last
# row. That is the same row create_emb() treats as the "rare word" id.
# The previous value (5000) was one past the end of the embedding table.
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=num_words,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=None,
                                                      oov_char=num_words - 1,
                                                      index_from=0)

In [42]:
# Number of tokens per training review. `list(map(...))` keeps this a proper
# 1-D integer array on Python 3 as well (a bare `map` object would be wrapped
# as a 0-d object array there).
lens = np.array(list(map(len, x_train)))
# Longest, shortest and mean review length; formatted explicitly so the line
# prints the same text under both the Python 2 and Python 3 `print`.
print('%s %s %s' % (lens.max(), lens.min(), lens.mean()))

2493 10 237.71364


### Preprocessing Data

In [6]:
maxlen = 500
# Bring every review in both splits to exactly `maxlen` token ids, using 0 as
# the filler value (pad_sequences' default padding/truncating sides apply).
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=maxlen, value=0)
    for split in (x_train, x_test)
)

## Simple NN & CNN

### Single hidden layer NN

In [57]:
def build_simple_nn():
    """Assemble the baseline classifier: embedding -> flatten -> one hidden layer.

    Uses the notebook-level `num_words` and `maxlen` settings, prints the layer
    summary and returns the (not yet compiled) Sequential model.
    """
    net = Sequential()
    net.add(Embedding(num_words, 32, input_length=maxlen))
    net.add(Flatten())
    net.add(BatchNormalization())
    net.add(Dense(300, activation='relu'))
    net.add(BatchNormalization())
    net.add(Dropout(0.7))
    # Single sigmoid unit: binary sentiment output.
    net.add(Dense(1, activation='sigmoid'))
    net.summary()
    return net

single_model = build_simple_nn()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
flatten_4 (Flatten)          (None, 16000)             0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 16000)             64000     
_________________________________________________________________
dense_7 (Dense)              (None, 300)               4800300   
_________________________________________________________________
batch_normalization_2 (Batch (None, 300)               1200      
_________________________________________________________________
dropout_3 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 301       
Total para

In [58]:
# Binary cross-entropy with a default-configured Adam optimizer; track accuracy.
single_model.compile(optimizer=Adam(),
                     loss='binary_crossentropy',
                     metrics=['accuracy'])

In [59]:
# 10 epochs, mini-batches of 64; the IMDB test split is used as validation data.
single_model.fit(x=x_train, y=y_train,
                 batch_size=64, epochs=10,
                 validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f10c0270190>

### Simple CNN

In [79]:
def build_simple_cnn():
    """Assemble a one-convolution text classifier.

    Layout: embedding -> Conv1D(64 filters, width 5) -> max-pool -> dense(100)
    -> sigmoid, with batch normalisation and dropout between the stages.
    Uses the notebook-level `num_words` and `maxlen`, prints the summary and
    returns the uncompiled model.
    """
    net = Sequential()

    # Embedding front-end
    net.add(Embedding(num_words, 32, input_length=maxlen))
    net.add(BatchNormalization())
    net.add(Dropout(0.2))

    # Convolutional feature extractor
    net.add(Conv1D(64, 5, padding='same', activation='relu'))
    net.add(BatchNormalization())
    net.add(Dropout(0.3))
    net.add(MaxPooling1D())

    # Dense classifier head
    net.add(Flatten())
    net.add(Dropout(0.4))
    net.add(Dense(100, activation='relu'))
    net.add(BatchNormalization())
    net.add(Dropout(0.7))
    net.add(Dense(1, activation='sigmoid'))

    net.summary()
    return net

simple_cnn = build_simple_cnn()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 500, 32)           160000    
_________________________________________________________________
batch_normalization_33 (Batc (None, 500, 32)           128       
_________________________________________________________________
dropout_18 (Dropout)         (None, 500, 32)           0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 500, 64)           10304     
_________________________________________________________________
batch_normalization_34 (Batc (None, 500, 64)           256       
_________________________________________________________________
dropout_19 (Dropout)         (None, 500, 64)           0         
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 250, 64)           0         
__________

In [80]:
# Binary cross-entropy with a default-configured Adam optimizer; track accuracy.
simple_cnn.compile(optimizer=Adam(),
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

In [81]:
# 5 epochs, mini-batches of 64; the IMDB test split is used as validation data.
simple_cnn.fit(x=x_train, y=y_train,
               batch_size=64, epochs=5,
               validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0fe3e6bc10>

### VGG-style CNN

In [73]:
def build_vgg_cnn():
    """Assemble a deeper, VGG-like 1-D CNN.

    Three stages, each made of two identical Conv1D + BatchNormalization pairs
    followed by max-pooling. The (filters, kernel width) per stage are
    (16, 5), (32, 10) and (64, 20). Two dropout -> dense(200) -> batch-norm
    blocks and a final dropout feed the sigmoid output. Uses the notebook-level
    `num_words` and `maxlen`, prints the summary and returns the uncompiled model.
    """
    net = Sequential()
    net.add(Embedding(num_words, 32, input_length=maxlen))
    net.add(BatchNormalization())

    # Convolutional stages: conv, BN, conv, BN, pool.
    for n_filters, width in ((16, 5), (32, 10), (64, 20)):
        for _ in range(2):
            net.add(Conv1D(n_filters, width, padding='same', activation='relu'))
            net.add(BatchNormalization())
        net.add(MaxPooling1D())

    net.add(Flatten())

    # Two hidden dense blocks, each preceded by dropout.
    for _ in range(2):
        net.add(Dropout(0.5))
        net.add(Dense(200, activation='relu'))
        net.add(BatchNormalization())

    net.add(Dropout(0.5))
    net.add(Dense(1, activation='sigmoid'))

    net.summary()
    return net

vgg_cnn = build_vgg_cnn()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 500, 32)           160000    
_________________________________________________________________
batch_normalization_21 (Batc (None, 500, 32)           128       
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 500, 16)           2576      
_________________________________________________________________
batch_normalization_22 (Batc (None, 500, 16)           64        
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 500, 16)           1296      
_________________________________________________________________
batch_normalization_23 (Batc (None, 500, 16)           64        
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 250, 16)           0         
__________

In [74]:
# Binary cross-entropy with a default-configured Adam optimizer; track accuracy.
vgg_cnn.compile(optimizer=Adam(),
                loss='binary_crossentropy',
                metrics=['accuracy'])

In [75]:
# 5 epochs, mini-batches of 64; the IMDB test split is used as validation data.
vgg_cnn.fit(x=x_train, y=y_train,
            batch_size=64, epochs=5,
            validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f10a04eb450>

### Transfer learning with GloVe

In [7]:
def get_glove_dataset(dataset):
    """Fetch a pre-processed GloVe archive from files.fast.ai.

    `dataset` is the archive stem, e.g. '6B.50d'. The value returned by
    `get_file` is handed back unchanged so that it can be passed straight to
    `load_vectors`.
    """
    # wordvectors.ipynb explains how these archives were generated from the
    # original GloVe data. Checksums are only known for the four 6B variants;
    # any other name is requested with md5_hash=None.
    known_md5 = {
        '6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
        '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
        '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
        '6B.300d': '30290210376887dcc6d0a5a6374d8255',
    }
    url = 'http://files.fast.ai/models/glove/' + dataset + '.tgz'
    checksum = known_md5.get(dataset, None)
    return get_file(dataset, url, md5_hash=checksum, untar=True)

def load_vectors(loc):
    """Load the three GloVe artefacts that share the path prefix `loc`.

    Reads `loc + '.dat'` through `load_array` and unpickles
    `loc + '_words.pkl'` and `loc + '_idx.pkl'`.

    Returns:
        A 3-tuple `(vectors, words, word_index)` in that order.
    """
    vectors = load_array(loc + '.dat')
    # NOTE(security): pickle.load can execute arbitrary code - only point this
    # at archives obtained from a trusted source.
    # The context managers close the file handles, which the previous inline
    # open() calls never did.
    with open(loc + '_words.pkl', 'rb') as words_file:
        words = pickle.load(words_file)
    with open(loc + '_idx.pkl', 'rb') as idx_file:
        word_index = pickle.load(idx_file)
    return (vectors, words, word_index)

# Fetch the '6B.50d' GloVe set and load its vectors, word list and word index.
glove_loc = get_glove_dataset('6B.50d')
vecs, words, wordidx = load_vectors(glove_loc)

In [8]:
import re
def create_emb():
    """Build the initial embedding matrix for the IMDB vocabulary from GloVe.

    Reads the notebook-level `vecs`, `wordidx`, `idx2word` and `num_words`.

    Returns:
        A `(num_words, vecs.shape[1])` float array. Row 0 stays all zeros.
        Row `i` holds the GloVe vector of `idx2word[i]` when that word is
        purely alphanumeric/hyphen and present in `wordidx`; otherwise it is
        drawn from N(0, 0.6). The last row (the "rare word" id) is always
        random. The whole matrix is divided by 3 before it is returned.
    """
    n_fact = vecs.shape[1]
    emb = np.zeros((num_words, n_fact))
    for i in range(1, len(emb)):
        # .get(): an id without a vocabulary entry falls through to the random
        # branch instead of raising KeyError.
        word = idx2word.get(i)
        if word and word in wordidx and re.match(r"^[a-zA-Z0-9\-]*$", word):
            emb[i] = vecs[wordidx[word]]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = np.random.normal(scale=0.6, size=(n_fact,))
    emb /= 3
    return emb


In [18]:
from keras.layers import SpatialDropout1D

def build_glove_cnn():
    """Assemble the single-convolution CNN on top of GloVe-initialised embeddings.

    The embedding layer is created frozen (`trainable=False`) and is seeded
    with the matrix from `create_emb()`. Its dimensions are taken from that
    matrix rather than hard-coded, so switching to another GloVe set does not
    require editing this function. Uses the notebook-level `maxlen`, prints
    the summary and returns the uncompiled model.
    """
    emb = create_emb()
    vocab_size, emb_dim = emb.shape

    model = Sequential([
        Embedding(vocab_size, emb_dim, input_length=maxlen, weights=[emb], trainable=False),
        SpatialDropout1D(0.2),
        Conv1D(64, 5, padding='same', activation='relu'),
        Dropout(0.2),
        MaxPooling1D(),
        Flatten(),
        Dense(100, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    model.summary()
    return model

glove_cnn = build_glove_cnn()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 50)           250000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 500, 50)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 500, 64)           16064     
_________________________________________________________________
dropout_8 (Dropout)          (None, 500, 64)           0         
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 250, 64)           0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 100)               1600100   
__________

In [22]:
# Unfreeze the GloVe embedding *before* compiling. Keras fixes the set of
# trainable weights at compile() time, so setting `trainable` afterwards (as
# the previous cell order did) has no effect on training.
glove_cnn.layers[0].trainable = True

# Fine-tune with a small learning rate. It is passed to the optimizer directly
# instead of overwriting the `optimizer.lr` backend variable with a Python float.
glove_cnn.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0001), metrics=['accuracy'])

In [24]:
# 3 epochs, mini-batches of 64; the IMDB test split is used as validation data.
glove_cnn.fit(x=x_train, y=y_train,
              batch_size=64, epochs=3,
              validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fc6081477d0>

### Multi-Size CNN with GloVe

In [29]:
def build_mutisize_cnn_glove():
    """Assemble the multi-filter-size CNN on top of GloVe-initialised embeddings.

    An inner functional model runs three parallel Conv1D branches (kernel
    widths 3, 4 and 5, 64 filters each) over the embedded sequence, max-pools
    and flattens each branch, and concatenates the results. That sub-model is
    placed between a frozen GloVe embedding and a small dense head.

    The inner graph's input is declared as `(maxlen, emb_dim)`, the shape the
    embedding layer actually emits. It was previously declared with
    `num_words` as the sequence length, which only worked because Conv1D
    weights do not depend on sequence length. Embedding dimensions come from
    the matrix returned by `create_emb()` instead of a hard-coded 50.

    Prints the summary and returns the uncompiled model. (The function name
    keeps its original spelling so existing callers are unaffected.)
    """
    emb = create_emb()
    vocab_size, emb_dim = emb.shape

    graph_in = Input((maxlen, emb_dim))
    convs = []
    for fsz in range(3, 6):
        x = Conv1D(64, fsz, padding='same', activation='relu')(graph_in)
        x = MaxPooling1D()(x)
        x = Flatten()(x)
        convs.append(x)

    out = Concatenate()(convs)
    graph = Model(graph_in, out)

    model = Sequential([
        Embedding(vocab_size, emb_dim, input_length=maxlen, weights=[emb], trainable=False),
        SpatialDropout1D(0.2),
        graph,
        Dropout(0.2),
        Dense(100, activation='relu'),
        Dropout(0.4),
        Dense(1, activation='sigmoid')
    ])

    model.summary()
    return model

multisize_cnn_glove = build_mutisize_cnn_glove()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 500, 50)           250000    
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 500, 50)           0         
_________________________________________________________________
model_3 (Model)              multiple                  38592     
_________________________________________________________________
dropout_14 (Dropout)         (None, 48000)             0         
_________________________________________________________________
dense_13 (Dense)             (None, 100)               4800100   
_________________________________________________________________
dropout_15 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 101       
Total para

In [33]:
# Unfreeze the GloVe embedding *before* compiling. Keras fixes the set of
# trainable weights at compile() time, so setting `trainable` afterwards (as
# the previous cell order did) has no effect on training.
multisize_cnn_glove.layers[0].trainable = True

# Fine-tune with a small learning rate. It is passed to the optimizer directly
# instead of overwriting the `optimizer.lr` backend variable with a Python float.
multisize_cnn_glove.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0001), metrics=['accuracy'])

In [35]:
# 5 epochs, mini-batches of 64; the IMDB test split is used as validation data.
multisize_cnn_glove.fit(x=x_train, y=y_train,
                        batch_size=64, epochs=5,
                        validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc58c2c4f10>