In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import keras

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.callbacks import *

Using TensorFlow backend.


In [2]:
def pick_random(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    Parameters
    ----------
    preds : array-like
        Non-negative probabilities. Any shape is accepted; the array is
        flattened before sampling.
    temperature : float
        Values below 1 sharpen the distribution, values above 1 flatten it.
        A temperature <= 0 returns the most probable index deterministically.

    Returns
    -------
    int-like
        Index (into the flattened array) of the sampled entry.

    Raises
    ------
    ValueError
        If ``preds`` contains no positive probability (or contains NaN /
        negative values), since no distribution can be built from it.
    """
    preds = np.asarray(preds, dtype='float64').flatten()
    if temperature <= 0:
        # the limit of temperature -> 0 is greedy selection
        return np.argmax(preds)

    # log(0) = -inf is fine here (it becomes probability 0), so silence the warning
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    peak = np.max(logits)
    if not np.isfinite(peak):
        raise ValueError("preds must contain at least one positive probability")

    # Subtracting the largest logit keeps exp() from underflowing to all zeros
    # at low temperatures, which would otherwise produce 0/0 = NaN below.
    exp_preds = np.exp(logits - peak)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, randomness, n_words):
    """Print ``seed_text`` followed by ``n_words`` words sampled from ``model``.

    Parameters
    ----------
    model : object with ``predict(encoded, verbose=0)`` returning per-word
        probabilities for the next word.
    tokenizer : fitted tokenizer exposing ``texts_to_sequences`` and ``word_index``.
    max_length : int
        Length the encoded context is padded (on the left) or cut to before
        it is passed to the model.
    seed_text : str
        Text the generation starts from.
    randomness : float
        Sampling temperature forwarded to ``pick_random``.
    n_words : int
        Number of words to generate.

    The text is written to stdout word by word; nothing is returned.
    """
    # Build the index -> word table once. Scanning word_index for every
    # generated word costs a full pass over the vocabulary per step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = seed_text
    print(in_text, end=" ")
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text generated so far as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict probabilities for each word, then sample one index
        yhat = model.predict(encoded, verbose=0)
        yhat = pick_random(yhat[0], randomness)

        # indices without a vocabulary entry map to an empty word
        out_word = index_to_word.get(int(yhat), '')
        # append to input
        in_text += ' ' + out_word
        print(out_word, end=" ")

In [3]:
# source text: the first 10000 lines become the training split,
# the following 1000 lines become the test split
with open("../../WAFiles/blogs.txt", 'r') as blog_file:
    train = "".join(blog_file.readline() for _ in range(10000))
    test = "".join(blog_file.readline() for _ in range(1000))

# character counts of both splits
print(len(train))
print(len(test))

10786475
853180


In [4]:
# build the word -> integer vocabulary from the training text only
# (the commented-out num_words / vocab_size values are an alternative that
# would cap the vocabulary at 10000 entries)
tokenizer = Tokenizer()#num_words=10000)
tokenizer.fit_on_texts([train])

# retrieve vocabulary size; +1 because Keras word indices start at 1 and
# index 0 is reserved, so the output layer needs len(word_index) + 1 slots
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# vocab_size = 10000

Vocabulary Size: 54299


In [5]:
# Turn each split into one flat array of word indices.
# NOTE(review): the tokenizer was created without an oov_token, so test words
# that never occur in `train` are presumably dropped here — confirm.
train_data = np.array(tokenizer.texts_to_sequences([train])[0])
test_data = np.array(tokenizer.texts_to_sequences([test])[0])

In [6]:
# number of tokens in each split (both arrays are 1-D)
print(train_data.shape)
print(test_data.shape)

(1979523,)
(155571,)


In [7]:
class BatchGenerator(object):
    """Endless source of (context window, one-hot next token) batches.

    Parameters
    ----------
    data : 1-D sequence of integer token ids.
    num_steps : int
        Number of consecutive tokens in each input window.
    batch_size : int
        Number of windows per yielded batch.
    total_words : int
        Width of the one-hot target rows (number of classes).
    skip_step : int
        How far the window start advances after each sample.
    """

    def __init__(self, data, num_steps, batch_size, total_words, skip_step=5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.total_words = total_words
        # start of the next window; kept on the instance so it survives across batches
        self.current_idx = 0
        self.skip_step = skip_step

    def generate(self):
        """Yield ``(x, y)`` batches forever.

        ``x`` has shape ``(batch_size, num_steps)`` and holds token windows;
        ``y`` has shape ``(batch_size, total_words)`` and one-hot encodes the
        token that follows each window.

        Every batch is written into freshly allocated arrays. Reusing one
        buffer would let later iterations overwrite batches a consumer (for
        example a prefetch queue) still holds references to.

        Raises
        ------
        ValueError
            If ``data`` is too short to hold one window plus its target.
        IndexError
            If a target token id is outside ``[0, total_words)``.
        """
        if len(self.data) <= self.num_steps:
            raise ValueError(
                "data must contain more than num_steps (%d) tokens, got %d"
                % (self.num_steps, len(self.data)))
        while True:
            x = np.zeros((self.batch_size, self.num_steps))
            y = np.zeros((self.batch_size, self.total_words))
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps + 1 >= len(self.data):
                    # wrap around near the end of the data
                    self.current_idx = (self.current_idx + self.num_steps + 1) % len(self.data)
                    # on short data the wrapped position can still overrun; restart then
                    if self.current_idx + self.num_steps >= len(self.data):
                        self.current_idx = 0
                x[i, :] = self.data[self.current_idx:self.current_idx + self.num_steps]
                target = int(self.data[self.current_idx + self.num_steps])
                if not 0 <= target < self.total_words:
                    raise IndexError(
                        "token id %d is outside the range [0, %d)"
                        % (target, self.total_words))
                # the row is all zeros, so setting one cell gives the one-hot target
                y[i, target] = 1.0
                self.current_idx += self.skip_step
            yield x, y

In [8]:
# samples per batch; shared by both generators and the step counts passed to fit_generator
batch_size = 64

In [9]:
# Windows of 10 tokens, advancing 2 tokens per sample (consecutive windows overlap).
# The 10 must match the Embedding input_length in the model definition and the
# max_length passed to generate_seq.
train_data_generator = BatchGenerator(train_data, 10, batch_size, vocab_size, skip_step=2)
test_data_generator = BatchGenerator(test_data, 10, batch_size, vocab_size, skip_step=2)

In [10]:
# x, y = next(train_data_generator.generate())
# for i in range(len(x)):
#     out_word = ''
#     for j in x[i]:
#         for word, index in tokenizer.word_index.items():
#             if index == j:
#                 out_word = word
#                 break
#         print(out_word, end=" ")
#     for word, index in tokenizer.word_index.items():
#         if index == np.argmax(y[i]):
#             print(" - " + word)
#             break

In [11]:
# x = np.array(next(train_data_generator.generate())[0])
# y = model.predict(x)
# for i in range(len(x)):
#     out_word = ''
#     for j in x[i]:
#         for word, index in tokenizer.word_index.items():
#             if index == j:
#                 out_word = word
#                 break
#         print(out_word, end=" ")
#     for word, index in tokenizer.word_index.items():
#         if index == np.argmax(y[i]):
#             print(" - " + word)
#             break

In [12]:
# define model: 10-token context -> embedding -> single LSTM -> softmax over the vocabulary
model = Sequential([
    Embedding(vocab_size, 64, input_length=10),
    LSTM(1024, return_sequences=False),
    BatchNormalization(),
    Dropout(0.05),
    # optional second recurrent stage, currently disabled:
    # LSTM(512, return_sequences=False),
    # BatchNormalization(),
    # Dropout(0.05),
    Dense(vocab_size, activation='softmax'),
])
model.summary()

# one-hot targets from the batch generator pair with categorical cross-entropy
optimizer = Adam(lr=0.001)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 64)            3475136   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1024)              4460544   
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024)              4096      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 54299)             55656475  
Total params: 63,596,251
Trainable params: 63,594,203
Non-trainable params: 2,048
_________________________________________________________________




In [13]:
# save the weights whenever the validation loss improves; the file name
# records the epoch number and the validation loss
filepath="checkpoints/weights-{epoch:02d}-{val_loss:.3f}.hdf5"
# Create the target directory up front: depending on the Keras version the
# callback may not create it, and the first save would fail only after a
# full epoch of training.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
    save_best_only=True, mode='auto', period=1)

In [14]:
# fit network
# Each generator is created once here and its position (BatchGenerator.current_idx)
# is not reset between epochs, so every epoch continues where the previous one
# stopped. This also means each epoch validates on a different slice of test_data.
model.fit_generator(
    generator=train_data_generator.generate(),
    # the extra //32 makes one "epoch" a fraction of a full pass — presumably so
    # validation and checkpointing happen more often; TODO confirm
    steps_per_epoch=len(train_data)//(batch_size)//32,
    epochs=100,
    validation_data=test_data_generator.generate(),
    validation_steps = len(test_data)//(batch_size)//32,
    callbacks=[checkpoint]
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/100

Epoch 00001: val_loss improved from inf to 7.47873, saving model to checkpoints/weights-01-7.479.hdf5
Epoch 2/100

Epoch 00002: val_loss did not improve from 7.47873
Epoch 3/100

Epoch 00003: val_loss improved from 7.47873 to 7.13620, saving model to checkpoints/weights-03-7.136.hdf5
Epoch 4/100

Epoch 00004: val_loss improved from 7.13620 to 6.58281, saving model to checkpoints/weights-04-6.583.hdf5
Epoch 5/100

Epoch 00005: val_loss improved from 6.58281 to 6.41376, saving model to checkpoints/weights-05-6.414.hdf5
Epoch 6/100

Epoch 00006: val_loss did not improve from 6.41376
Epoch 7/100

Epoch 00007: val_loss did not improve from 6.41376
Epoch 8/100

Epoch 00008: val_loss improved from 6.41376 to 5.86768, saving model to checkpoints/weights-08-5.868.hdf5
Epoch 9/100

Epoch 00009: val_loss did not improve from 5.86768
Epoch 10/100

Epoch 00010: val_loss did not improve from 5.

KeyboardInterrupt: 

In [15]:
# Save the model in its current in-memory state. This is the state at the point
# training was interrupted, which is not necessarily the best checkpoint
# written by the ModelCheckpoint callback.
model.save('model.h5')

In [19]:
# qualitative check: sample 100 words from the model, seeded with "today i",
# using a context length of 10 (same as the model's input_length) and a
# sampling temperature of 0.5
generate_seq(model, tokenizer, 10, "today i", 0.5, 100)

today i hope you have to go to the site of the way it was really like you don't want to see it the way of the best one of the way i think he was really surprised by the way i can wait for a few of the rest of the one of the day and i hope that only call this is a lot of the way i could think the rest of my life i hope it's not even though but i don't want to try to find it as you don't know what i had a little bit 

In [20]:
import pickle

# Persist the fitted tokenizer so the same word -> index mapping can be reused
# with the saved model. The context manager guarantees the file is flushed and
# closed even if pickling fails.
with open('tokenizer_311.dat', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)