# Recurrent Neural Network

In [1]:
import glob
import os
from random import shuffle
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
# Load the pretrained embeddings from a binary word2vec-format file once for the
# whole notebook; tokenize_and_vectorize looks tokens up in this module-level object.
# NOTE(review): the file name suggests 300-dimensional GoogleNews vectors, which
# would match embedding_dims = 300 used later — confirm.
word_vectors = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin.gz', binary=True)

## Data Preprocessor

In [3]:
def preprocess_data(filepath):
    """
    Load labelled text samples from ``filepath`` and return them shuffled.

    Every ``*.txt`` file under ``<filepath>/pos`` is labelled 1 and every
    ``*.txt`` file under ``<filepath>/neg`` is labelled 0.

    Args:
        filepath: Directory that contains the ``pos`` and ``neg`` sub-directories.

    Returns:
        A list of ``(label, text)`` tuples in random order (shuffled in place
        with ``random.shuffle``, so the order differs between runs).
    """
    dataset = []
    # (sub-directory, label) pairs; positives are read first, as before.
    for subdir, label in (("pos", 1), ("neg", 0)):
        pattern = os.path.join(filepath, subdir, "*.txt")
        for filename in glob.glob(pattern):
            # Decode explicitly as UTF-8: relying on the platform default encoding
            # makes the read fail or garble non-ASCII text on non-UTF-8 locales.
            with open(filename, "r", encoding="utf-8") as f:
                dataset.append((label, f.read()))
    shuffle(dataset)
    return dataset

## Tokenizer and Vectorizer

In [5]:
def tokenize_and_vectorize(dataset):
    """
    Turn each sample's text into a list of word vectors.

    Args:
        dataset: Iterable of samples whose element at index 1 is the text.

    Returns:
        One list per sample, holding ``word_vectors[token]`` for every token
        that has an entry; tokens that raise ``KeyError`` are dropped.
    """
    tokenizer = TreebankWordTokenizer()

    def embed(text):
        # Look up each token; tokens with no embedding are skipped.
        vectors = []
        for word in tokenizer.tokenize(text):
            try:
                vectors.append(word_vectors[word])
            except KeyError:
                continue
        return vectors

    return [embed(sample[1]) for sample in dataset]

## Pull Expected Values

In [6]:
def collect_expected(dataset):
    """Return the label (element 0) of every sample in ``dataset``, in order."""
    return [sample[0] for sample in dataset]

## Load and Prepare Data

In [12]:
# Read the labelled reviews, embed their tokens, and pull out the labels.
dataset = preprocess_data("../Datasets/aclimdb/train")
vectorized_data, expected = tokenize_and_vectorize(dataset), collect_expected(dataset)

# First 80% of the (already shuffled) samples train the model; the rest validate it.
split_point = int(len(vectorized_data) * 0.8)
x_train, x_test = vectorized_data[:split_point], vectorized_data[split_point:]
y_train, y_test = expected[:split_point], expected[split_point:]

## Network Parameters

In [13]:
# Number of token vectors every sample is padded or truncated to (see pad_trunc).
maxlen = 400
# Passed to model.fit as batch_size.
batch_size = 32
# Length of a single token vector; used for the reshape and the RNN input_shape.
embedding_dims = 300
# Passed to model.fit as epochs for the first model.
epochs = 2

##  Padding and Truncating Token Sequence

In [14]:
def pad_trunc(data, maxlen):
    '''
    Return a copy of data in which every sample holds exactly maxlen vectors.

    Samples longer than maxlen are truncated; shorter ones are padded at the
    end with zero vectors. The input samples are never modified: each output
    sample is a new list.

    Args:
        data: sequence of samples, each a sequence of equal-length vectors.
        maxlen: number of vectors every returned sample must contain.

    Returns:
        A new list with one list of maxlen vectors per input sample. All
        padding slots reference one shared zero-vector list (as the original
        did), so treat the padding as read-only.

    Raises:
        ValueError: if padding is required but no sample contains a vector
            from which the vector length could be inferred.
    '''
    # Infer the vector length from the first sample that has any vector at all,
    # so an empty leading sample (or empty data) no longer raises IndexError.
    zero_vector = None
    for sample in data:
        if len(sample) > 0:
            zero_vector = [0.0] * len(sample[0])
            break

    new_data = []
    for sample in data:
        # Slicing + list() always yields a fresh list, so the padding below
        # cannot leak back into the caller's sample.
        temp = list(sample[:maxlen])
        additional_elems = maxlen - len(temp)
        if additional_elems > 0:
            if zero_vector is None:
                raise ValueError(
                    "cannot pad: no sample contains a vector to infer the vector length from")
            temp.extend([zero_vector] * additional_elems)
        new_data.append(temp)
    return new_data
        

## Load test and training data

In [15]:
import numpy as np

# Training split: fix every sample at maxlen vectors, then stack into a
# (samples, maxlen, embedding_dims) array with a matching label array.
x_train = pad_trunc(x_train, maxlen)
x_train = np.asarray(x_train).reshape(len(x_train), maxlen, embedding_dims)
y_train = np.array(y_train)

# Held-out split: same treatment.
x_test = pad_trunc(x_test, maxlen)
x_test = np.asarray(x_test).reshape(len(x_test), maxlen, embedding_dims)
y_test = np.array(y_test)

## Recurrent Neural Network Model

In [17]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, SimpleRNN
num_neurons = 50

# Recurrent layer -> dropout -> flatten -> single sigmoid unit.
model = Sequential([
    # return_sequences=True keeps the output of every time step.
    SimpleRNN(num_neurons, return_sequences=True,
              input_shape=(maxlen, embedding_dims)),
    Dropout(0.2),
    # The Dense layer expects one flat vector per sample.
    Flatten(),
    Dense(1, activation="sigmoid"),
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_2 (SimpleRNN)     (None, 400, 50)           17550     
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 50)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 20001     
Total params: 37,551
Trainable params: 37,551
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Train the first model, validating on the held-out split after each epoch.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)



Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7fc3d0b17f50>

In [20]:
# Persist the first model: weights as HDF5, architecture as JSON.
model.save_weights("simplernn_weights1.h5")
model_structure = model.to_json()
with open("simplernn_model1.json", "w") as json_file:
    json_file.write(model_structure)

## Making improvements to the model

In [21]:
# Train the second, larger model for more epochs than the first one.
epochs = 5

In [22]:
num_neurons = 100

# Same architecture as the first model, with a wider recurrent layer.
model = Sequential([
    # return_sequences=True keeps the output of every time step.
    SimpleRNN(num_neurons, return_sequences=True,
              input_shape=(maxlen, embedding_dims)),
    Dropout(0.2),
    # The Dense layer expects one flat vector per sample.
    Flatten(),
    Dense(1, activation="sigmoid"),
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train, validating on the held-out split after each epoch.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_3 (SimpleRNN)     (None, 400, 100)          40100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 400, 100)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 40000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 40001     
Total params: 80,101
Trainable params: 80,101
Non-trainable params: 0
_________________________________________________________________
Train on 20000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7fc3d1415ad0>

In [24]:
# Persist the second model: architecture as JSON, weights as HDF5.
# Create the output directory first; on a fresh checkout it may not exist and
# both writes below would otherwise fail.
os.makedirs("models_and_json", exist_ok=True)
model_structure = model.to_json()
with open("models_and_json/simplernn_model2.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("models_and_json/simplernn_weights2.h5")

## Prediction

In [23]:
# Free-text sample that the prediction cell tokenizes, vectorizes and classifies.
sample_1 = "I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."

In [26]:
from keras.models import model_from_json

# Rebuild the second model from its saved architecture and weights.
with open("models_and_json/simplernn_model2.json", "r") as json_file:
    model_json = json_file.read()
model = model_from_json(model_json)
model.load_weights('models_and_json/simplernn_weights2.h5')

# tokenize_and_vectorize expects (label, text) samples; the label 1 here is a
# placeholder and is not used for prediction.
vec_list = tokenize_and_vectorize([(1, sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

# Sequential.predict_classes is gone from newer Keras/TensorFlow releases.
# For a single sigmoid output it was a 0.5 threshold on predict(), so apply
# that threshold directly; the result is the same int32 class array.
(model.predict(test_vec) > 0.5).astype("int32")

array([[0]], dtype=int32)