In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import keras

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
def pick_random(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``softmax(log(preds) / temperature)``
    and a single index is drawn from the resulting distribution.

    Args:
        preds: array-like of probabilities of any shape. It is flattened, so
            the returned index refers to the flattened array.
        temperature: divisor applied to the log-probabilities. Smaller values
            concentrate the mass on the most likely entries; ``0`` selects the
            most probable index deterministically (no random draw).

    Returns:
        The sampled index into the flattened ``preds``.

    Raises:
        ValueError: if ``preds`` is empty, or if the scaled log-probabilities
            have no finite maximum (for example, every probability is zero).
    """
    # asarray first so plain lists/tuples are accepted as well as ndarrays
    preds = np.asarray(preds, dtype='float64').flatten()
    if preds.size == 0:
        raise ValueError("preds must contain at least one probability")
    if temperature == 0:
        # limit of the tempered distribution: all mass on the largest entry
        return np.argmax(preds)

    # log(0) == -inf is expected for zero-probability entries; silence the warning
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    peak = np.max(logits)
    if not np.isfinite(peak):
        raise ValueError("preds has no usable probability mass to sample from")

    # Shift by the maximum before exponentiating: mathematically a no-op for the
    # softmax, but it keeps exp() from underflowing to all zeros (0/0 -> NaN)
    # when the temperature is small.
    exp_preds = np.exp(logits - peak)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [3]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, randomness, n_words):
    """Extend ``seed_text`` by repeatedly sampling the next word from ``model``.

    Args:
        model: object whose ``predict(encoded, verbose=0)`` returns next-word
            probabilities for the encoded context.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (word -> integer id).
        max_length: number of context tokens fed to the model; the running
            text is pre-padded to this length.
        seed_text: initial text to continue.
        randomness: sampling temperature forwarded to ``pick_random``.
        n_words: number of sampling steps to perform.

    Returns:
        ``seed_text`` followed by the sampled words, separated by single
        spaces. A step whose sampled id has no word in the vocabulary adds
        nothing, so the result may contain fewer than ``n_words`` new words.
    """
    # Build the id -> word table once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict probabilities for each word
        yhat = model.predict(encoded, verbose=0)
        yhat = pick_random(yhat, randomness)
        # map predicted word index to word
        out_word = index_to_word.get(yhat, '')
        # An id without a vocabulary entry yields no word; skip it rather
        # than appending a stray space to the text.
        if out_word:
            in_text += ' ' + out_word
    return in_text

In [4]:
# source text: only the first N_LINES lines of the corpus are used
# (replace the join below with fin.read() to load the whole file)
N_LINES = 2000
# Explicit encoding so decoding does not depend on the machine's locale.
# NOTE(review): assumes the corpus is UTF-8 - adjust if the file uses another encoding.
with open("../../WAFiles/blogs.txt", 'r', encoding='utf-8') as fin:
    # readline() returns '' once the file is exhausted, so a shorter file is fine;
    # join avoids rebuilding the string on every line
    data = "".join(fin.readline() for _ in range(N_LINES))

print(len(data))

2341607


In [5]:
# fit a word-level tokenizer on the corpus and turn the text into integer ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# one text goes in, so exactly one id sequence comes back
(encoded,) = tokenizer.texts_to_sequences([data])
# vocabulary size is one more than the number of distinct words
# (presumably to leave room for index 0, which Keras' Tokenizer reserves - confirm)
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 21434


In [6]:
# build sliding windows of three consecutive token ids:
# two context words followed by the word to predict

total_tokens = len(encoded)
sequences = []
for pos in range(2, total_tokens):
    if pos % 100000 == 0:
        # periodic progress report, as a percentage of tokens processed
        print("%.2f" % (pos / total_tokens * 100))
    sequences.append(encoded[pos - 2:pos + 1])
print('Total Sequences: %d' % len(sequences))

23.09
46.19
69.28
92.38
Total Sequences: 433014


In [7]:
# bring every sequence to the length of the longest one (zeros are added in front)

max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, padding='pre', maxlen=max_length)
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 3


In [8]:
# separate each window into its context (all but the last id) and its target (the last id)

sequences = array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]
# one-hot encode the target ids across the whole vocabulary
y = to_categorical(y, num_classes=vocab_size, dtype=np.int16)

In [9]:
# define model: embedding over (max_length-1) context ids -> LSTM -> dense -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(vocab_size, 64, input_length=max_length-1))
model.add(LSTM(1024))
model.add(Dense(512, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output
model.summary()
# compile network
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 64)             1371776   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1024)              4460544   
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_2 (Dense)              (None, 21434)             10995642  
Total params: 17,352,762
Trainable params: 17,352,762
Non-trainable params: 0
_________________________________________________________________
None




In [10]:
# train on the (context, next-word) pairs; the returned History object is the cell's output
model.fit(
    X,
    y,
    epochs=10,
    batch_size=64,
    verbose=1,
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa3ce9b7410>

In [16]:
# write the trained model to 'model.h5' (path is relative to the working directory)
model.save('model.h5')

In [20]:
# qualitative check: sample 100 words at temperature 0.3, starting from a short seed phrase
generated_text = generate_seq(model, tokenizer, max_length-1, "today i", 0.3, 100)
print(generated_text)

today i went to the point of course i don't know i just want to go to the doctor and i was just a little bit i got to go to the dean is great i think i might have to go to sleep but i don't know i don't know what i think i need to go to the park was a lot of fun i think i may have been a little bit i have a lot of fun i went to the owner design visitors and drew all over again and i have to go to sleep in the


In [17]:
import pickle
# Persist the fitted tokenizer for reuse outside this notebook.
# The context manager ensures the file is flushed and closed; a bare open()
# passed straight to dump() leaves the handle open until garbage collection.
with open('tokenizer_216.dat', 'wb') as fout:
    pickle.dump(tokenizer, fout)