In [1]:
import cPickle
import numpy as np

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape, Merge
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD, Adadelta
from keras.constraints import unitnorm, maxnorm
from keras.regularizers import l2
from keras.layers.advanced_activations import PReLU

from sklearn.metrics import roc_auc_score

Using gpu device 0: GeForce GT 750M
DEBUG: nvcc STDOUT mod.cu
   Creating library C:/Users/Vitaly/AppData/Local/Theano/compiledir_Windows-8-6.2.9200-Intel64_Family_6_Model_69_Stepping_1_GenuineIntel-2.7.6-64/tmpu6mgch/f90a7a397ede0965c66895e171db1485.lib and object C:/Users/Vitaly/AppData/Local/Theano/compiledir_Windows-8-6.2.9200-Intel64_Family_6_Model_69_Stepping_1_GenuineIntel-2.7.6-64/tmpu6mgch/f90a7a397ede0965c66895e171db1485.exp



Load train, validation and test data

In [2]:
def get_idx_from_sent(sent, word_idx_map, max_l=51, kernel_size=5):
    """
    Transforms sentence into a fixed-width list of word indices. Pad with zeroes.

    The row starts with kernel_size - 1 zeroes, followed by the indices of the
    words of `sent` that are present in `word_idx_map` (unknown words are
    skipped), and is filled with trailing zeroes up to max_l + 2*(kernel_size - 1)
    entries.

    At most `max_l` word indices are kept, so the returned row always has the
    same width; without the cap a longer sentence would produce a wider row
    and the rows could no longer be stacked into one matrix.

    :param sent: whitespace-separated sentence (string)
    :param word_idx_map: mapping from word to its integer index
    :param max_l: maximum number of word indices kept per sentence
    :param kernel_size: convolution kernel size that determines the padding
    :return: list of integer indices
    """
    pad = kernel_size - 1
    # Words missing from the map are dropped rather than replaced by a placeholder
    indices = [word_idx_map[word] for word in sent.split() if word in word_idx_map]
    # Cap the sentence so the trailing padding is never squeezed out
    indices = indices[:max(max_l, 0)]

    x = [0] * pad + indices
    # A negative count yields an empty list, so an already-full row is left untouched
    x.extend([0] * (max_l + 2 * pad - len(x)))
    return x

def make_idx_data(revs, word_idx_map, max_l=51, kernel_size=5):
    """
    Transforms sentences into a 2-d matrix.

    Every review in `revs` is converted to a padded row of word indices by
    get_idx_from_sent, its label rev['y'] is appended as the last column, and
    the row is routed by rev['split']: 1 -> train, 0 -> validation, any other
    value -> test.

    :param revs: iterable of dicts with the keys 'text', 'y' and 'split'
    :param word_idx_map: mapping from word to its integer index
    :param max_l: maximum sentence length, forwarded to get_idx_from_sent
    :param kernel_size: convolution kernel size, forwarded to get_idx_from_sent
    :return: [train, val, test], each an integer NumPy array built from the
             rows of that split
    """
    train, val, test = [], [], []
    for rev in revs:
        sent = get_idx_from_sent(rev['text'], word_idx_map, max_l, kernel_size)
        sent.append(rev['y'])
        if rev['split'] == 1:
            train.append(sent)
        elif rev['split'] == 0:
            val.append(sent)
        else:
            test.append(sent)
    # Builtin int is what the removed np.int alias pointed to, so the dtype is unchanged
    return [np.array(rows, dtype=int) for rows in (train, val, test)]


print("loading data...")
# NOTE: unpickling can execute arbitrary code -- only load a pickle file you created yourself.
# The context manager closes the file handle as soon as the data is read.
with open("imdb-train-val-test.pickle", "rb") as pickle_file:
    x = cPickle.load(pickle_file)
# revs: reviews (dicts with 'text', 'y', 'split'); W: 2-d matrix used as the initial
# Embedding weights; word_idx_map: word -> index mapping; vocab: not used in this notebook
revs, W, word_idx_map, vocab = x[0], x[1], x[2], x[3]
print("data loaded!")


# Each row holds max_l + 2*(kernel_size-1) word indices followed by the label column
datasets = make_idx_data(revs, word_idx_map, max_l=2633, kernel_size=5)

loading data...
data loaded!


Put train data in separate NumPy arrays

In [3]:
# Train data preparation
N = datasets[0].shape[0]
conv_input_width = W.shape[1]
# The last column of every row is the label, so it is not part of the network input
conv_input_height = int(datasets[0].shape[1]-1)

# For each word write a word index (not vector) to X tensor
# (sliced in one step instead of copying element by element in a Python loop)
train_X = datasets[0][:, :conv_input_height].astype(int)
# One-hot encode the label column: row i gets a 1 in the column given by its label
train_Y = np.zeros((N, 2), dtype=int)
train_Y[np.arange(N), datasets[0][:, -1]] = 1

print('train_X.shape = {}'.format(train_X.shape))
print('train_Y.shape = {}'.format(train_Y.shape))

train_X.shape = (20033L, 2641L)
train_Y.shape = (20033L, 2L)


Put validation data in separate NumPy arrays

In [4]:
# Validation data preparation
Nv = datasets[1].shape[0]

# For each word write a word index (not vector) to X tensor
# (conv_input_height comes from the train data cell; the label column is left out)
val_X = datasets[1][:, :conv_input_height].astype(int)
# One-hot encode the label column: row i gets a 1 in the column given by its label
val_Y = np.zeros((Nv, 2), dtype=int)
val_Y[np.arange(Nv), datasets[1][:, -1]] = 1

print('val_X.shape = {}'.format(val_X.shape))
print('val_Y.shape = {}'.format(val_Y.shape))

val_X.shape = (4967L, 2641L)
val_Y.shape = (4967L, 2L)


Let's define and compile the CNN model with Keras

In [5]:
# Number of feature maps (outputs of convolutional layer)
N_fm = 300
# kernel size of convolutional layer
# NOTE(review): the index rows were padded for kernel_size=5 when make_idx_data was called,
# while the convolution here uses 8 -- confirm the mismatch is intended.
kernel_size = 8

model = Sequential()
for layer in (
    # Embedding layer (lookup table of trainable word vectors), initialised from W
    Embedding(input_dim=W.shape[0], output_dim=W.shape[1], weights=[W], W_constraint=unitnorm()),
    # Reshape word vectors from Embedding to tensor format suitable for Convolutional layer
    Reshape(1, conv_input_height, conv_input_width),
    # first convolutional layer; the kernel is as wide as a word vector
    Convolution2D(N_fm, 1, kernel_size, conv_input_width, border_mode='valid', W_regularizer=l2(0.0001)),
    # ReLU activation
    Activation('relu'),
    # aggregate data in every feature map to scalar using MAX operation
    MaxPooling2D(poolsize=(conv_input_height-kernel_size+1, 1), ignore_border=True),
    Flatten(),
    Dropout(0.5),
    # Inner Product layer (as in regular neural network, but without non-linear activation function)
    Dense(N_fm, 2),
    # SoftMax activation; actually, Dense+SoftMax works as Multinomial Logistic Regression
    Activation('softmax'),
):
    model.add(layer)

# Custom optimizers could be used, though right now standard adadelta is employed
model.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [6]:
# Epoch counter and per-epoch validation metrics; the training cell increments and appends to these
epoch, val_acc, val_auc = 0, [], []

Train the model for N_epoch epochs (this cell can be re-run as many times as needed; the epoch counter and metric lists keep accumulating)

In [7]:
N_epoch = 3

for i in range(N_epoch):
    # Train one epoch at a time so the validation metrics can be recorded after each epoch
    model.fit(train_X, train_Y, batch_size=50, nb_epoch=1, verbose=1, show_accuracy=True)
    output = model.predict_proba(val_X, batch_size=10, verbose=1)
    # find validation accuracy using the best threshold value t
    # NOTE: t is chosen on the validation set itself, so this is an optimistic estimate
    vacc = np.max([np.mean((output[:,1]>t)==(val_Y[:,1]>0.5)) for t in np.arange(0.0, 1.0, 0.01)])
    # find validation AUC
    # Pass the 1-d class-0 indicator that matches the class-0 scores,
    # not the whole two-column one-hot matrix
    vauc = roc_auc_score(val_Y[:,0], output[:,0])
    val_acc.append(vacc)
    val_auc.append(vauc)
    print('Epoch {}: validation accuracy = {:.3%}, validation AUC = {:.3%}'.format(epoch, vacc, vauc))
    epoch += 1

print('{} epochs passed'.format(epoch))
print('Accuracy on validation dataset:')
print(val_acc)
print('AUC on validation dataset:')
print(val_auc)

Epoch 0
Epoch 0: validation accuracy = 88.041%, validation AUC = 94.758%
Epoch 0
Epoch 1: validation accuracy = 90.195%, validation AUC = 96.278%
Epoch 0
Epoch 2: validation accuracy = 90.799%, validation AUC = 96.634%
3 epochs passed
Accuracy on validation dataset:
[0.88041071069055765, 0.90195288906784776, 0.90799275216428432]
AUC on validation dataset:
[0.94757691949701806, 0.96277742049411519, 0.9663356090302716]


Save model

In [8]:
# Write the trained weights to disk.
# NOTE(review): the file name hard-codes 3 epochs -- rename it if the training cell was run more than once.
model.save_weights('cnn_3epochs.model')

Put test data in separate NumPy array

In [9]:
# Test data preparation
Nt = datasets[2].shape[0]

# For each word write a word index (not vector) to X tensor
# (conv_input_height comes from the train data cell; the trailing label column is left out)
test_X = datasets[2][:, :conv_input_height].astype(int)

print('test_X.shape = {}'.format(test_X.shape))

test_X.shape = (25000L, 2641L)


In [10]:
# Predict on the test reviews in batches of 10; presumably one row per review and
# one column per class, as the result is indexed with p[:,0] when the submission is built
p = model.predict_proba(test_X, batch_size=10)



Prepare submission file for Kaggle

In [12]:
import pandas as pd
# The original test file is read only to reuse its 'id' column.
# NOTE(review): assumes its rows are in the same order as the test rows in datasets[2] -- confirm.
data = pd.read_csv('DATA/testData.tsv', sep='\t')
# NOTE(review): p[:,0] is the score of class 0, while the validation accuracy above thresholds
# column 1 -- confirm which label the 'sentiment' column must score; if it is label 1, use p[:,1].
d = pd.DataFrame({'id': data['id'], 'sentiment': p[:,0]})
# index=False keeps the pandas row index out of the CSV
d.to_csv('cnn_3epochs.csv', index=False)