# LSTM

In [1]:
import glob
import os
from random import shuffle
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin.gz', binary=True)

## Data Preprocessor

In [2]:
def preprocess_data(filepath):
    """
    Load the labelled text files stored under ``filepath``.

    Every ``*.txt`` file in ``<filepath>/pos`` is labelled 1 and every
    ``*.txt`` file in ``<filepath>/neg`` is labelled 0.

    Args:
        filepath: Directory containing the ``pos`` and ``neg`` sub-directories.

    Returns:
        A list of ``(label, text)`` tuples in random order (shuffled in place
        with ``random.shuffle``).
    """
    labelled_dirs = ((1, "pos"), (0, "neg"))
    dataset = []
    for label, dirname in labelled_dirs:
        pattern = os.path.join(filepath, dirname, "*.txt")
        for filename in glob.glob(pattern):
            # Decode explicitly so the result does not depend on the platform's
            # default encoding.
            # NOTE(review): assumes the review files are UTF-8 -- confirm for
            # other corpora.
            with open(filename, "r", encoding="utf-8") as f:
                dataset.append((label, f.read()))
    shuffle(dataset)
    return dataset

## Tokenizer and Vectorizer

In [3]:
def tokenize_and_vectorize(dataset):
    """
    Convert the text of every sample into a list of word vectors.

    Each sample's text (element 1 of the sample) is split with NLTK's
    ``TreebankWordTokenizer`` and every token is looked up in the module-level
    ``word_vectors``. Tokens that raise ``KeyError`` on lookup are skipped.

    Args:
        dataset: Iterable of samples whose element 1 is the text.

    Returns:
        A list with one entry per sample; each entry is the list of vectors
        found for that sample's tokens, in token order.
    """
    tokenizer = TreebankWordTokenizer()

    def lookup_vectors(tokens):
        # Collect the vectors of the tokens that are present in the vocabulary.
        found = []
        for word in tokens:
            try:
                found.append(word_vectors[word])
            except KeyError:
                # Out-of-vocabulary token: deliberately dropped.
                continue
        return found

    return [lookup_vectors(tokenizer.tokenize(sample[1])) for sample in dataset]

## Pull Expected Values

In [10]:
def collect_expected(dataset):
    """
    Extract the target label (element 0) of every sample.

    Args:
        dataset: Iterable of samples whose element 0 is the label.

    Returns:
        A list of labels in the same order as ``dataset``.
    """
    return [sample[0] for sample in dataset]

## Load and Prepare Data

In [5]:
# Load the labelled reviews, turn each review into a list of word vectors and
# pull out the matching labels.
dataset = preprocess_data("../Datasets/aclimdb/train")
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)
# Hold out the last 20% of the samples for validation; the first 80% is used
# for training.
split_point = int(len(vectorized_data)*0.8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

NameError: name 'preprocess_data' is not defined

## Network Parameters

In [6]:
# Number of token vectors per sample after padding/truncating.
maxlen = 400
# Samples per gradient update passed to model.fit.
batch_size = 32
# Width of one word vector; used when reshaping the inputs.
# NOTE(review): presumably matches the 300-d GoogleNews vectors loaded above -- confirm.
embedding_dims = 300
# Number of passes over the training data passed to model.fit.
epochs = 2

##  Padding and Truncating Token Sequence

In [11]:
def pad_trunc(data, maxlen):
    '''
    For a given dataset, pad with zero vectors or truncate to maxlen.

    Args:
        data: Sequence of samples, each a sequence of equal-length vectors.
        maxlen: Number of vectors every returned sample must contain.

    Returns:
        A new list with one new list per sample, each exactly ``maxlen``
        vectors long. The input samples are never modified.

    Notes:
        The zero-vector width is taken from the first non-empty sample, so an
        empty ``data`` or an empty first sample no longer raises IndexError.
        If no sample contains any vector the width cannot be inferred and the
        padding vectors are empty lists.
    '''
    # Width of one vector, inferred from the first sample that has any vectors.
    vector_len = next((len(sample[0]) for sample in data if len(sample) > 0), 0)
    zero_vector = [0.0] * vector_len

    new_data = []
    for sample in data:
        # Slice-and-copy so the caller's sample is not extended in place.
        padded = list(sample[:maxlen])
        # The same zero_vector object is shared by every padding slot, which
        # keeps memory use low.
        padded.extend([zero_vector] * (maxlen - len(padded)))
        new_data.append(padded)
    return new_data
        

## Load test and training data

In [8]:
import numpy as np

# Bring every sample to exactly maxlen vectors.
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

# Convert the nested lists into arrays of shape (samples, maxlen, embedding_dims)
# and the labels into 1-D arrays.
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

## Build LSTM Model

In [10]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM

# Size of the LSTM layer.
num_neurons = 50
model = Sequential()
# return_sequences=True keeps the LSTM output for every time step, so the
# layer emits (maxlen, num_neurons) values per sample.
model.add(LSTM(num_neurons,
               return_sequences=True,
               input_shape=(maxlen, embedding_dims)))
model.add(Dropout(0.2))
# Flatten the per-time-step outputs into one vector for the classifier.
model.add(Flatten())
# Single sigmoid unit for the binary label.
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# emitted a stray "None" line after the table.
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 400, 50)           70200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 50)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 20001     
Total params: 90,201
Trainable params: 90,201
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
# Train on the first 80% of the data; the held-out 20% is only used to report
# validation metrics after each epoch.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test,y_test))


Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7fc67bd28e90>

In [12]:
# Persist the architecture (JSON) and the trained weights (HDF5) separately.
# NOTE(review): the models_and_json directory is not created here -- presumably
# it must already exist; confirm.
model_structure = model.to_json()
with open("models_and_json/lstm_model1.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("models_and_json/lstm_weights1.h5")

In [14]:
from keras.models import model_from_json
# Rebuild the model from the saved JSON architecture, then load the saved
# weights into it. This replaces the in-memory `model`.
with open("models_and_json/lstm_model1.json", "r") as json_file:
    json_model = json_file.read()
model = model_from_json(json_model)
model.load_weights("models_and_json/lstm_weights1.h5")

In [15]:
# Hand-written review used to try out the trained model.
# NOTE(review): the leading "..." on the continuation lines is inside the string
# literal, so it is part of the text that gets tokenized -- looks like a pasted
# REPL prompt; confirm whether it is intended.
sample_1 = """I hate that the dismal weather had me down for so long,
... when will it break! Ugh, when does happiness return? The sun is
... blinding and the puffy clouds are too thin. I can't wait for the
... weekend."""

In [16]:
# Run the sample through the same pipeline as the training data. The label in
# the tuple is a placeholder; only the text is used by tokenize_and_vectorize.
vec_list = tokenize_and_vectorize([(1,sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))
# The training labels are 1 for positive and 0 for negative (see
# preprocess_data), so the legend must say "0 - neg", not "2 - neg".
print("Sample's sentiment, 1 - pos, 0 - neg : {}".format(model.predict_classes(test_vec)))

Sample's sentiment, 1 - pos, 2 - neg : [[0]]


In [17]:
# Show the un-thresholded output of the final sigmoid unit for the sample.
print("Raw output of sigmoid function: {}".format(model.predict(test_vec)))

Raw output of sigmoid function: [[0.2482055]]


## Test Data for Embedding Length

In [8]:
def test_len(data, maxlen):
    total_len = truncated = padded = exact = 0 
    for sample in data:
        total_len+=len(sample)
        if len(sample) > maxlen: 
            truncated += 1
        elif len(sample) < maxlen: 
            padded += 1
        else: 
            exact += 1
    print("Padded: {}".format(padded))
    print("Equal: {}".format(exact))
    print("Truncated: {}".format(truncated))
    print("Avg length: {}".format(total_len/len(data)))
            

In [9]:
# Reload and vectorize the training set, then report how the sample lengths
# compare with a maxlen of 400.
dataset = preprocess_data('../Datasets/aclimdb/train')
vectorized_data = tokenize_and_vectorize(dataset)
test_len(vectorized_data, 400)

Padded: 22458
Equal: 20
Truncated: 2522
Avg length: 205.2144


## Optimize LSTM Hyperparameters

In [16]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM

# Same hyperparameters as the first experiment except for the shorter
# sequence length (200 instead of 400).
maxlen = 200
batch_size = 32
embedding_dims = 300
epochs = 2
num_neurons = 50

# Reload from disk so the samples are padded/truncated from their original
# lengths rather than from the earlier 400-step arrays.
dataset = preprocess_data('../Datasets/aclimdb/train')
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)
# 80/20 train/validation split.
split_point = int(len(vectorized_data)*0.8)

x_train = vectorized_data[:split_point]
x_train = pad_trunc(x_train, maxlen)
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))

y_train = expected[:split_point]
y_train = np.array(y_train)

x_test = vectorized_data[split_point:]
x_test = pad_trunc(x_test, maxlen)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))

y_test = expected[split_point:]
y_test = np.array(y_test)

In [17]:
# Same architecture as the first model; only the input length (maxlen) differs.
model = Sequential()
# return_sequences=True keeps the LSTM output for every time step.
model.add(LSTM(num_neurons, return_sequences=True,
               input_shape=(maxlen, embedding_dims)))
model.add(Dropout(0.2))
# Flatten the per-time-step outputs into one vector for the classifier.
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 200, 50)           70200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200, 50)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 10001     
Total params: 80,201
Trainable params: 80,201
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Train the shorter-sequence model; the held-out split is used for validation only.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))


Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7ffa81c19f90>

In [20]:
# Persist the second model's architecture (JSON) and weights (HDF5).
model_structure = model.to_json()
with open("models_and_json/lstm_model2.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("models_and_json/lstm_weights2.h5")

# Shakespeare Chatbot

In [2]:
import nltk
# Fetch the Gutenberg corpus into the local NLTK data directory.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/tomjoshi/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [3]:
from nltk.corpus import gutenberg
# List the file ids available in the corpus.
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

## Preprocess Shakespeare Plays

In [4]:
# Concatenate the lower-cased raw text of every corpus file whose id contains
# 'shakespeare'.
shakespeare_ids = [fid for fid in gutenberg.fileids() if 'shakespeare' in fid]
text = ''.join(gutenberg.raw(fid).lower() for fid in shakespeare_ids)
# Sorted character vocabulary plus lookup tables in both directions.
chars = sorted(set(text))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}
'corpus length:{} total chars:{}'.format(len(text), len(chars)) 

'corpus length:375542 total chars:50'

In [5]:
# Preview the first 500 characters of the concatenated corpus.
print(text[:500])

[the tragedie of julius caesar by william shakespeare 1599]


actus primus. scoena prima.

enter flauius, murellus, and certaine commoners ouer the stage.

  flauius. hence: home you idle creatures, get you home:
is this a holiday? what, know you not
(being mechanicall) you ought not walke
vpon a labouring day, without the signe
of your profession? speake, what trade art thou?
  car. why sir, a carpenter

   mur. where is thy leather apron, and thy rule?
what dost thou with thy best apparrell on


##  Assemble Training Set

In [6]:
# Length of each input window, in characters.
maxlen = 40
# Distance between the starts of consecutive windows.
step = 3
# Every window start for which both the window and the character after it exist.
window_starts = range(0, len(text) - maxlen, step)
# Input windows and, for each one, the character that immediately follows it.
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
# get number of training samples
print('nb sequences: ', len(sentences))

nb sequences:  125168


## One Hot Encode Training Examples

In [8]:
import numpy as np
# One-hot encode the windows: X[i, t, c] is set when character c occurs at
# position t of window i, and y[i, c] is set when c is the character that
# follows window i.
# Use the builtin ``bool``: the ``np.bool`` alias was deprecated in NumPy 1.20
# and removed in 1.24.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
# Character-level model: one LSTM layer followed by a softmax over the vocabulary.
model = Sequential()
# input_shape = (time steps, features): maxlen characters per window, each
# one-hot encoded over len(chars) characters.
model.add(LSTM(128,
               input_shape=(maxlen, len(chars))))
# One output per character in the vocabulary.
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6450      
_________________________________________________________________
activation_1 (Activation)    (None, 50)                0         
Total params: 98,098
Trainable params: 98,098
Non-trainable params: 0
_________________________________________________________________


In [13]:
epochs = 6
batch_size = 128
# Save the architecture once, before training starts.
model_structure = model.to_json()
with open("models_and_json/shakespeare_lstm_model.json", "w") as json_file:
    json_file.write(model_structure)

# Train in 5 rounds of `epochs` epochs each (30 epochs in total) and write a
# numbered weights checkpoint after every round.
for i in range(5):
    model.fit(X, y, batch_size=batch_size, epochs=epochs)
    model.save_weights("models_and_json/{}_shakespeare_lstm_weights.h5".format(i+1))


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [14]:
import random
def sample(preds, temperature=1.0):
    """
    Draw one index at random from a temperature-rescaled distribution.

    Args:
        preds: Sequence of probabilities, one per candidate index.
        temperature: Divisor applied to the log-probabilities before they are
            re-normalised; values below 1 sharpen the distribution and values
            above 1 flatten it.

    Returns:
        The index selected by a single multinomial draw.
    """
    # Work in float64 before handing the probabilities to the multinomial draw.
    probs = np.asarray(preds).astype('float64')
    # Rescale in log space, then exponentiate and re-normalise so the values sum to 1.
    weights = np.exp(np.log(probs) / temperature)
    weights = weights / np.sum(weights)
    # One trial, one experiment: a one-hot row marking the chosen index.
    draw = np.random.multinomial(1, weights, 1)
    return np.argmax(draw)

## Generate Text With Diversity Levels

1. Take a random piece of 40 characters from the original text and predict what character will come next.
2. Append the predicted character to the input sentence, drop the first character, and predict again on those 40 characters as input.
3. Flush so that characters immediately go to the console.

In [16]:
import sys
# Pick the seed position once so every diversity level starts from the same
# maxlen-character seed.
start_index = random.randint(0, len(text) - maxlen - 1)
for diversity in [0.2, 0.5, 1.0]:
    print()
    print("----- diversity: ", diversity)
    # `generated` accumulates the full output; only its initial value (the
    # seed) is written out below, later characters are written one at a time.
    generated = ''
    sentence = text[start_index: start_index+maxlen]
    generated += sentence
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)
    # Generate 400 characters, one prediction per character.
    for i in range(400):
        # One-hot encode the current window as a batch of one.
        x = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x[0, t, char_indices[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        # `diversity` is passed as the sampling temperature.
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        generated += next_char
        # Drop first character to maintain same length
        sentence = sentence[1:] + next_char
        sys.stdout.write(next_char)
        # Flush so each character shows up immediately.
        sys.stdout.flush()
    print()
            


----- diversity:  0.2
----- Generating with seed: "aue't.
oh good horatio, what a wounded n"
aue't.
oh good horatio, what a wounded no more of his lady

   cassi. thou know the strangers that stration, and that haue beares by his lady

   cassi. thou know the sounder of the strangers beare
the commis'd to my selfe of the words of the sould,
that i shall be so strong the season on the commistion
by the strangers blood,
enter king with the soule, and make once,
and with a such a shall be a brutus, and a man,
then they shall haue 

----- diversity:  0.5
----- Generating with seed: "aue't.
oh good horatio, what a wounded n"
aue't.
oh good horatio, what a wounded no more more and to the more out,
with more pittion browes many like a deed a tent
do strong this burres my lord

   ham. thou hast day with the stands?
  ham. what sweare my lord

   ham. in the true: and his owne commandmoke and death,
and more happy to the drinke byds
murther of this caesar shall he words in fortune,
and i will