# Multi-Convolutional Sentiment Classifier for IMDB Reviews

#### Load dependencies

In [8]:
import keras
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, concatenate
from keras.layers import Dense, Flatten, Dropout
from keras.layers import Embedding # new!
from keras.layers import Conv1D, SpatialDropout1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint # new! 
import os # new! 
from sklearn.metrics import roc_auc_score, roc_curve # new!
import pandas as pd
import matplotlib.pyplot as plt # new!
%matplotlib inline

In [2]:
# Where the per-epoch checkpoint files are written.
output_dir = 'model_output/multiconv'

# Training schedule.
epochs = 4
batch_size = 128

# Vocabulary, sequence and embedding settings.
n_unique_words = 5000
n_words_to_skip = 50
max_review_length = 400
n_dim = 64
drop_embed = 0.2

# Reviews are both padded and truncated at the front.
pad_type = 'pre'
trunc_type = 'pre'

# Convolutional branches: identical filter counts, different kernel widths.
n_conv_1 = 256
n_conv_2 = 256
n_conv_3 = 256
k_conv_1 = 3
k_conv_2 = 2
k_conv_3 = 4

# Dense head.
n_dense = 256
dropout = 0.2

#### Load data

For a given data set: 

* the Keras text utilities [here](https://keras.io/preprocessing/text/) quickly preprocess natural language and convert it into an index
* the `keras.preprocessing.text.Tokenizer` class may do everything you need in one line:
    * tokenize into words or characters
    * `num_words`: maximum unique tokens
    * filter out punctuation
    * lower case
    * convert words to an integer index

In [3]:
# Load the IMDB reviews as sequences of integer word ids, passing
# n_unique_words as the vocabulary cap (`num_words`). The second split
# returned by Keras is used as the validation set throughout this notebook.
(x_train, y_train), (x_valid, y_valid) = imdb.load_data(num_words=n_unique_words)

In [4]:
# Number of training reviews.
x_train.shape

(25000,)

In [None]:
# word_index = keras.datasets.imdb.get_word_index()
# word_index = {k:(v+3) for k,v in word_index.items()}
# word_index["PAD"] = 0
# word_index["START"] = 1
# word_index["UNK"] = 2

In [None]:
# index_word = {v:k for k,v in word_index.items()}

In [None]:
# (all_x_train,_),(all_x_valid,_) = imdb.load_data()

In [5]:
# Bring both splits to exactly max_review_length tokens, padding with 0 and
# padding/truncating on the side selected by pad_type / trunc_type.
x_train, x_valid = (
    pad_sequences(
        reviews,
        maxlen=max_review_length,
        padding=pad_type,
        truncating=trunc_type,
        value=0,
    )
    for reviews in (x_train, x_valid)
)

In [6]:
# Peek at the first six padded reviews.
x_train[:6]

array([[   0,    0,    0, ...,   19,  178,   32],
       [   0,    0,    0, ...,   16,  145,   95],
       [   0,    0,    0, ...,    7,  129,  113],
       [ 263, 2504,  182, ...,   21,   64, 2574],
       [   0,    0,    0, ...,    7,   61,  113],
       [   0,    0,    0, ...,    2,   10,   10]], dtype=int32)

In [7]:
# Confirm that the first six reviews all have the same length after padding.
# print() with a single argument behaves the same on Python 2 and Python 3.
for review in x_train[:6]:
    print(len(review))

400
400
400
400
400
400


#### Design neural network architecture

In [9]:
# Input: one padded review of word ids. int16 covers +/- 32.7k, which is
# enough for the vocabulary size used here.
input_layer = Input(shape=(max_review_length,), dtype='int16', name='input')
embedding_layer = Embedding(
    input_dim=n_unique_words,
    output_dim=n_dim,
    input_length=max_review_length,
    name='embedding',
)(input_layer)
drop_embed_layer = SpatialDropout1D(rate=drop_embed, name='drop_embed')(embedding_layer)

# Three parallel convolutional branches read the same dropped-out embedding;
# they differ only in kernel width. Each is followed by global max pooling.
conv_1 = Conv1D(
    filters=n_conv_1, kernel_size=k_conv_1, activation='relu', name='conv_1',
)(drop_embed_layer)
maxp_1 = GlobalMaxPooling1D(name='maxp_1')(conv_1)

conv_2 = Conv1D(
    filters=n_conv_2, kernel_size=k_conv_2, activation='relu', name='conv_2',
)(drop_embed_layer)
maxp_2 = GlobalMaxPooling1D(name='maxp_2')(conv_2)

conv_3 = Conv1D(
    filters=n_conv_3, kernel_size=k_conv_3, activation='relu', name='conv_3',
)(drop_embed_layer)
maxp_3 = GlobalMaxPooling1D(name='maxp_3')(conv_3)

# Merge the pooled branch outputs into a single feature vector.
concat = concatenate([maxp_1, maxp_2, maxp_3])

# Dense head: two ReLU layers, each followed by dropout.
dense_layer = Dense(units=n_dense, activation='relu', name='dense')(concat)
drop_dense_layer = Dropout(rate=dropout, name='drop_dense')(dense_layer)
dense_2 = Dense(units=64, activation='relu', name='dense_2')(drop_dense_layer)
dropout_2 = Dropout(rate=dropout, name='drop_dense_2')(dense_2)

# Single sigmoid unit for the binary sentiment prediction.
predictions = Dense(units=1, activation='sigmoid', name='output')(dropout_2)

model = Model(inputs=input_layer, outputs=predictions)

In [10]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, 400)          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 400, 64)      320000      input[0][0]                      
__________________________________________________________________________________________________
drop_embed (SpatialDropout1D)   (None, 400, 64)      0           embedding[0][0]                  
__________________________________________________________________________________________________
conv_1 (Conv1D)                 (None, 398, 256)     49408       drop_embed[0][0]                 
__________________________________________________________________________________________________
conv_2 (Co

In [11]:
# Embedding parameter count: one n_dim-dimensional vector per vocabulary word.
n_dim, n_unique_words, n_dim * n_unique_words

(64, 5000, 320000)

In [12]:
# Values per review after embedding: max_review_length positions x n_dim dimensions.
max_review_length, n_dim, n_dim * max_review_length

(400, 64, 25600)

In [13]:
# Weights plus biases of an n_dense-unit dense layer fed by the flattened
# embedding output (n_dim * max_review_length inputs).
# NOTE(review): the `dense` layer built in this notebook is fed by `concat`,
# not by a flattened embedding, so this figure may not describe this model —
# confirm whether the cell is still relevant.
n_dense, n_dim * max_review_length * n_dense + n_dense

(256, 6553856)

#### Configure Model

In [14]:
# Binary cross-entropy loss, Adam optimizer, accuracy reported as a metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Checkpoint callback; the `{epoch:02d}` placeholder in the path gives each
# epoch its own file under output_dir.
modelcheckpoint = ModelCheckpoint(filepath=output_dir+"/weights.{epoch:02d}.hdf5")

In [16]:
# Create the checkpoint directory (including parents) if it is not there yet.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [None]:
# Train on the padded reviews, evaluating on the validation split and
# invoking the checkpoint callback during training.
model.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_valid, y_valid),
    callbacks=[modelcheckpoint],
    verbose=1,
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4

#### Evaluate

In [None]:
# Restore the weights from one of the per-epoch checkpoint files.
# NOTE(review): which training epoch "01" refers to depends on how the
# installed Keras version numbers `{epoch}` in ModelCheckpoint — confirm
# before relying on the "zero-indexed" remark.
model.load_weights(output_dir+"/weights.01.hdf5") # zero-indexed

In [None]:
# `model` is a functional-API `Model`; `predict_proba` was a helper on
# `Sequential` only. The output layer is a sigmoid unit, so `predict`
# already returns the per-review probabilities.
y_hat = model.predict(x_valid)

In [None]:
# Number of predictions produced for the validation split.
len(y_hat)

In [None]:
# Prediction for the first validation review.
y_hat[0]

In [None]:
# Histogram of the predicted values, with a vertical line marking 0.5.
plt.hist(y_hat)
# Assigning to `_` keeps the notebook from echoing the call's return value.
_ = plt.axvline(x=0.5, color='orange')

In [None]:
pct_auc = roc_auc_score(y_valid, y_hat)*100.0

In [None]:
"{:0.2f}".format(pct_auc)

In [None]:
# Take the first (only indexed) value out of each prediction row so the
# predictions form a flat list.
float_y_hat = [y[0] for y in y_hat]

In [None]:
# Pair each prediction with its true label for side-by-side inspection.
ydf = pd.DataFrame(list(zip(float_y_hat, y_valid)), columns=['y_hat', 'y'])

In [None]:
# First ten prediction/label pairs.
ydf.head(10)

In [None]:
# Decode validation review 0 from word ids back into text.
' '.join(index_word[word_id] for word_id in all_x_valid[0])

In [None]:
# Decode validation review 6 from word ids back into text.
' '.join(index_word[word_id] for word_id in all_x_valid[6])

In [None]:
# Confident misses in one direction: label 0 (presumably negative sentiment —
# confirm) but predicted score above 0.9.
ydf[(ydf.y == 0) & (ydf.y_hat > 0.9)].head(10)

In [None]:
# Decode validation review 489 from word ids back into text.
' '.join(index_word[word_id] for word_id in all_x_valid[489])

In [None]:
# Confident misses in the other direction: label 1 (presumably positive
# sentiment — confirm) but predicted score below 0.1.
ydf[(ydf.y == 1) & (ydf.y_hat < 0.1)].head(10)