In [2]:
import numpy as np 
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D

## Loading dataset

In [7]:
import glob
import os
from random import shuffle


def preprocess_data(file_path):
    pos_path = os.path.join(file_path, "pos")
    neg_path = os.path.join(file_path, "neg")
    pos_label = 1
    neg_label = 0
    dataset = []
    
    for filename in glob.glob(os.path.join(pos_path, "*.txt")):
        with open(filename, "r") as f:
            dataset.append((pos_label, f.read()))

    for filename in glob.glob(os.path.join(pos_path, "*.txt")):
        with open(filename, "r") as f:
            dataset.append((pos_label, f.read()))
    shuffle(dataset)
    
    return dataset

In [8]:
dataset = preprocess_data('../Datasets/aclimdb/train')
dataset[0]

(1,
 "Hold Your Man finds Jean Harlow, working class girl from Brooklyn falling for con man Clark Gable and getting in all kinds of trouble. The film starts out as his film, but by the time it's over the emphasis definitely switches to her character.<br /><br />The film opens with Gable pulling a street con game with partner, Garry Owen and the mark yelling for the cops. As he's being chased Gable ducks into Harlow's apartment and being he's such a charming fellow, she shields him.<br /><br />Before long she's involved with him and unfortunately with his rackets. Gable, Harlow, and Owen try pulling a badger game on a drunken Paul Hurst, but then Gable won't go through with it. Of course when Hurst realizes it was a con, he's still sore and gets belligerent and Gable has to punch him out. But then he winds up dead outside Harlow's apartment and that platinum blond hair makes her easy to identify. She goes up on an accomplice to manslaughter.<br /><br />The rest of the film is her's and 

## Vectorizer and tokenizer

In [9]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)

In [14]:
def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens: 
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass
        vectorized_data.append(sample_vecs)
    return vectorized_data

## Target labels

In [11]:
def collect_expected(dataset):
    '''
    Grab the target values from the dataset we created
    '''
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected

## Execute Preprocessing

In [15]:
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

## Train/Test Split

In [30]:
split_point = int(len(vectorized_data)*0.8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

## CNN Parameters

In [31]:
maxlen = 400
# How many samples to show the net before backpropagating the error and updating the weights
batch_size = 32
# length of the token vectors you'll create for passing into the convent
embedding_dims = 300
filters = 250
# embedding_dims * kernel_size 
kernel_size = 3
hidden_dims = 250
epochs = 2

##  Padding and Truncating Token Sequence

In [18]:
def pad_trunc(data, maxlen):
    '''
    For a given dataset, pad with zero vectors or truncate to maxlen
    '''
    new_data = []
    
    # vector of 0 the length of the word vectors
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)
    
    #Iterate through rows, truncate if too big, add zero vectors if too small
    for sample in data:
        temp = []
        if len(sample) > maxlen:
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            temp = sample
            additional_elems = maxlen-len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)
    return new_data
        

In [21]:
X_train = pad_trunc(X_train, maxlen)
X_test = pad_trunc(X_test, maxlen)

X_train = np.reshape(X_train, (len(X_train), maxlen, embedding_dims))
y_train = np.array(y_train)
X_test = np.reshape(X_test, (len(X_test, maxlen, embeddings_dims)))
y_test = np.array(y_test)

[ 0.13769531 -0.06542969  0.00628662  0.08496094 -0.22167969 -0.13964844
  0.0267334   0.14160156  0.05126953  0.3359375  -0.36914062 -0.13964844
 -0.01855469  0.15039062 -0.4140625   0.20410156  0.00369263  0.07763672
 -0.06835938  0.0612793   0.26171875 -0.0456543   0.25        0.24316406
  0.03857422 -0.05737305  0.33984375  0.08007812 -0.00579834 -0.15917969
  0.23535156  0.00866699 -0.41601562 -0.22460938  0.14453125  0.08398438
  0.14746094  0.16992188 -0.02258301 -0.23925781 -0.30859375 -0.16796875
  0.05200195 -0.10546875  0.12109375 -0.33007812 -0.11035156 -0.29492188
  0.01647949 -0.05297852 -0.25976562  0.17089844  0.0859375   0.24023438
  0.05615234 -0.06689453 -0.16992188  0.18847656 -0.13574219  0.24609375
 -0.20996094  0.00823975  0.11230469  0.0267334   0.13476562  0.00952148
 -0.078125    0.31640625  0.09326172  0.35546875 -0.01599121 -0.2265625
  0.05859375  0.3515625  -0.16699219 -0.32617188  0.18457031 -0.08447266
  0.28320312  0.17480469  0.16601562 -0.34179688  0.

NameError: name 'embeddings_dims' is not defined

 ## Construct 1D CNN

In [24]:
print('Build model...')
model = Sequential()

# learns word group filters
model.add(Conv1D(
    filters, 
    kernel_size,
    padding='valid',
    activation='relu',
    strides=1,
    input_shape=(maxlen, embedding_dims)))

model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Build model...
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [25]:
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_test, y_test))

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 5000 arrays: [array([[-0.16210938,  0.08691406,  0.20117188, ..., -0.08398438,
         0.04492188, -0.1796875 ],
       [ 0.20410156,  0.01318359,  0.07568359, ..., -0.21191406,
        -0.1328125 ,  0.10839844],...

In [26]:
model_structure = model.to_json()
with open("cnn_model.json", "w") as json_file: 
    json_file.write(model_structure)
model.save_weights("cnn_weights.h5")

In [27]:
from keras.models import model_from_json
with open("cnn_model.json", "r") as json_file:
    json_string = json_file.read()

model = model_from_json(json_string)
model.load_weights("cnn_weights.h5")

In [28]:
sample_1 = "I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."

In [29]:
vec_list = tokenize_and_vectorize([(1, sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))
model.predict(test_vec)

[ 0.07910156 -0.0050354   0.11181641  0.21289062  0.13085938 -0.01470947
 -0.03540039 -0.07763672  0.04077148  0.11474609  0.00147247 -0.29101562
  0.00457764 -0.20019531 -0.19238281  0.08007812  0.10107422  0.04858398
  0.15722656 -0.09521484 -0.05004883  0.25        0.33007812 -0.09716797
 -0.05566406 -0.0071106  -0.16796875 -0.13574219  0.05102539 -0.00598145
  0.10791016  0.16503906 -0.03955078 -0.03955078  0.04321289  0.12060547
  0.13476562  0.09375     0.00909424  0.1640625   0.21289062 -0.05322266
  0.33398438  0.01586914  0.10449219  0.24121094 -0.0189209  -0.04199219
  0.05834961  0.03271484  0.09863281  0.18945312  0.04125977  0.01501465
 -0.05883789  0.10253906  0.01538086  0.03198242  0.02722168 -0.13769531
  0.12695312  0.06396484 -0.13574219 -0.012146    0.07617188 -0.02319336
 -0.21191406  0.20996094 -0.01953125  0.02038574  0.16113281 -0.00897217
  0.04663086  0.03881836 -0.4609375  -0.1796875   0.12792969 -0.00564575
  0.24121094  0.21777344 -0.02600098 -0.1171875   0

array([[0.5245353]], dtype=float32)