In [1]:
import numpy as np 
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D

Using TensorFlow backend.


## Loading dataset

In [2]:
import glob
import os
from random import shuffle


def preprocess_data(file_path):
    pos_path = os.path.join(file_path, "pos")
    neg_path = os.path.join(file_path, "neg")
    pos_label = 1
    neg_label = 0
    dataset = []
    
    for filename in glob.glob(os.path.join(pos_path, "*.txt")):
        with open(filename, "r") as f:
            dataset.append((pos_label, f.read()))

    for filename in glob.glob(os.path.join(pos_path, "*.txt")):
        with open(filename, "r") as f:
            dataset.append((pos_label, f.read()))
    shuffle(dataset)
    
    return dataset

In [3]:
dataset = preprocess_data('../Datasets/aclimdb/train')
dataset[0]

(1,
 "First of all for this movie I just have one word: 'wow'. This is probably, one of the best movies that touched me, from it's story to it's performances, so wonderfully played by Sophia Loren and Marcello Mastroianni. I was very impressed with this last one, because he really brought depth to the character, as it was a very hard role. Still, the two of them formed a pair, that surprised me, from the beginning until the end, showing in the way, a friendship filled with love, that develops during the entire day, settled in the movie. The story takes some time to roll, as the introduction of the characters is long, but finally we are compensated with a wonderful tale about love and humanity. If you have the chance, see it, because it's a movie that will stay in your mind for many time. Simply amazing - 9/10.")

## Vectorizer and tokenizer

In [4]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)

In [5]:
def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens: 
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass
        vectorized_data.append(sample_vecs)
    return vectorized_data

## Target labels

In [6]:
def collect_expected(dataset):
    '''
    Grab the target values from the dataset we created
    '''
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected

## Execute Preprocessing

In [7]:
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

## Train/Test Split

In [8]:
split_point = int(len(vectorized_data)*0.8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

## CNN Parameters

In [9]:
maxlen = 400
# How many samples to show the net before backpropagating the error and updating the weights
batch_size = 32
# length of the token vectors you'll create for passing into the convent
embedding_dims = 300
filters = 250
# embedding_dims * kernel_size 
kernel_size = 3
hidden_dims = 250
epochs = 3

##  Padding and Truncating Token Sequence

In [10]:
def pad_trunc(data, maxlen):
    '''
    For a given dataset, pad with zero vectors or truncate to maxlen
    '''
    new_data = []
    
    # vector of 0 the length of the word vectors
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)
    
    #Iterate through rows, truncate if too big, add zero vectors if too small
    for sample in data:
        temp = []
        if len(sample) > maxlen:
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            temp = sample
            additional_elems = maxlen-len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)
    return new_data
        

In [11]:
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

 ## Construct 1D CNN

In [12]:
print('Build model...')
model = Sequential()

# learns word group filters
model.add(Conv1D(
    filters, 
    kernel_size,
    padding='valid',
    activation='relu',
    strides=1,
    input_shape=(maxlen, embedding_dims)))

model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Build model...
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [13]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))


Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7fd6e4722910>

In [14]:
model_structure = model.to_json()
with open("cnn_model.json", "w") as json_file: 
    json_file.write(model_structure)
model.save_weights("cnn_weights.h5")

In [15]:
from keras.models import model_from_json
with open("cnn_model.json", "r") as json_file:
    json_string = json_file.read()

model = model_from_json(json_string)
model.load_weights("cnn_weights.h5")

In [18]:
sample_1 = "I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."

In [19]:
vec_list = tokenize_and_vectorize([(1, sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))
model.predict(test_vec)

array([[1.]], dtype=float32)