In [1]:
import numpy as np
import data_helpers
from w2v import train_word2vec

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate
from keras.datasets import imdb
from keras.preprocessing import sequence
np.random.seed(0)

Using TensorFlow backend.


### -------------------------- Parameter section ------------------------

In [2]:
model_type = "CNN-static"  # CNN-rand|CNN-non-static|CNN-static

embedding_dim = 50

# Training parameters
batch_size = 64
num_epochs = 10

# Prepossessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10

### ------------------------------ Loading data -------------------------------

In [23]:
print("Load data...")
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words)

x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

Load data...
x_train shape: (25000, 400)
x_test shape: (25000, 400)


In [24]:
# x_train/test contains index of words
np.unique(x_train)

array([   0,    1,    2, ..., 4997, 4998, 4999], dtype=int32)

In [25]:
vocabulary = imdb.get_word_index()
vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
vocabulary_inv[0] = "<PAD/>"

In [26]:
# Maps word to index
print (vocabulary['pistol'])

# Maps index to word
print(vocabulary_inv[8957])

8957
pistol


In [27]:
x_train2 = x_train[1:100, :]

In [28]:
x_train2.shape

(99, 400)

### Model Variations

**CNN-rand:** All words are randomly initialized  
**CNN-static:** use pretrained vectors from word2vec. Keep word vectors static during training  
**CNN-nonstatic:** use pretrained vectors from word2vec. Allow word vectors to change during training  

In [29]:
# Prepare embedding layer weights and convert inputs for static model
print("Model type is", model_type)
if model_type == "CNN-non-static" or model_type == "CNN-static":
    embedding_weights = train_word2vec(np.vstack((x_train, x_test)), vocabulary_inv, num_features=embedding_dim,
                                       min_word_count=min_word_count, context=context)
    if model_type == "CNN-static":
        x_train2 = embedding_weights[0][x_train2]
#         x_test = embedding_weights[0][x_test]
        
elif model_type == "CNN-rand":
    embedding_weights = None
    
else:
    raise ValueError("Unknown model type")

Model type is CNN-static
Load existing Word2Vec model '50features_1minwords_10context'


In [33]:
x_train2[1].shape

(400, 50)

In [31]:
x_train[1]

array([   1,  194, 1153,  194,    2,   78,  228,    5,    6, 1463, 4369,
          2,  134,   26,    4,  715,    8,  118, 1634,   14,  394,   20,
         13,  119,  954,  189,  102,    5,  207,  110, 3103,   21,   14,
         69,  188,    8,   30,   23,    7,    4,  249,  126,   93,    4,
        114,    9, 2300, 1523,    5,  647,    4,  116,    9,   35,    2,
          4,  229,    9,  340, 1322,    4,  118,    9,    4,  130, 4901,
         19,    4, 1002,    5,   89,   29,  952,   46,   37,    4,  455,
          9,   45,   43,   38, 1543, 1905,  398,    4, 1649,   26,    2,
          5,  163,   11, 3215,    2,    4, 1153,    9,  194,  775,    7,
          2,    2,  349, 2637,  148,  605,    2,    2,   15,  123,  125,
         68,    2,    2,   15,  349,  165, 4362,   98,    5,    4,  228,
          9,   43,    2, 1157,   15,  299,  120,    5,  120,  174,   11,
        220,  175,  136,   50,    9, 4373,  228,    2,    5,    2,  656,
        245, 2350,    5,    4,    2,  131,  152,  4

In [14]:
len(vocabulary_inv)

88585

In [13]:
embedding_weights[0].shape

(88585, 50)

In [29]:
# Just a vector representation used for the word (currently we are using random representation for words )
embedding_wts = train_word2vec(np.vstack((x_train, x_test)), vocabulary_inv, num_features=embedding_dim,
                                       min_word_count=min_word_count, context=context)
print(len(vocabulary))
embedding_wts[0].shape

Load existing Word2Vec model '50features_1minwords_10context'
88584


(88585, 50)

### --------------------------- Creating model  ------------------------------------

In [24]:
sequence_length

400

In [26]:
input_shape

(400,)

In [25]:
model_input1 = Input(shape=input_shape)

In [30]:
len(vocabulary_inv)

88585

In [9]:
# Model Hyperparameters
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50


input_shape = (sequence_length, embedding_dim) if model_type == "CNN-static" else (sequence_length,)
model_input = Input(shape=input_shape)

# Static model do not have embedding layer
if model_type == "CNN-static":
    z = Dropout(dropout_prob[0])(model_input)
else:
    z = Embedding(len(vocabulary_inv), embedding_dim, input_length=sequence_length, name="embedding")(model_input)
    z = Dropout(dropout_prob[0])(z)

# Convolutional block
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters, # 10 in this case
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [28]:
z

<tf.Tensor 'dense_1/Relu:0' shape=(?, 50) dtype=float32>

In [22]:
model_input.graph

<tensorflow.python.framework.ops.Graph at 0x7fc058bbe4e0>

In [10]:
# Initialize weights with word2vec
if model_type == "CNN-non-static":
    embedding_layer = model.get_layer("embedding")
    embedding_layer.set_weights(embedding_weights)

# Train the model
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
53s - loss: 0.5204 - acc: 0.7026 - val_loss: 0.3068 - val_acc: 0.8720
Epoch 2/10
39s - loss: 0.3002 - acc: 0.8741 - val_loss: 0.2894 - val_acc: 0.8794
Epoch 3/10
39s - loss: 0.2585 - acc: 0.8949 - val_loss: 0.2791 - val_acc: 0.8826
Epoch 4/10
41s - loss: 0.2404 - acc: 0.9045 - val_loss: 0.2771 - val_acc: 0.8854
Epoch 5/10
39s - loss: 0.2276 - acc: 0.9110 - val_loss: 0.2872 - val_acc: 0.8818
Epoch 6/10
38s - loss: 0.2193 - acc: 0.9129 - val_loss: 0.2848 - val_acc: 0.8808
Epoch 7/10
39s - loss: 0.2125 - acc: 0.9157 - val_loss: 0.2953 - val_acc: 0.8784
Epoch 8/10
39s - loss: 0.2013 - acc: 0.9216 - val_loss: 0.3063 - val_acc: 0.8780
Epoch 9/10
39s - loss: 0.1981 - acc: 0.9223 - val_loss: 0.3096 - val_acc: 0.8751
Epoch 10/10
39s - loss: 0.1932 - acc: 0.9240 - val_loss: 0.3146 - val_acc: 0.8756


<keras.callbacks.History at 0x7fc058572198>