In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import keras
import string
import collections

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.callbacks import *

Using TensorFlow backend.


In [2]:
def pick_random(preds, temperature=1.0):
    """Sample one index from a probability array after temperature scaling.

    The probabilities are flattened, re-weighted as ``p ** (1 / temperature)``
    and renormalised, and a single index is drawn from the result.
    Temperatures below 1 sharpen the distribution towards the most likely
    entry; temperatures above 1 flatten it.

    Args:
        preds: Array-like of probabilities; any shape, it is flattened first.
        temperature: Scaling factor the log-probabilities are divided by.

    Returns:
        The sampled index into the flattened ``preds`` (a NumPy integer).
    """
    preds = np.asarray(preds, dtype='float64').flatten()
    # log(0) is -inf, which correctly turns into probability 0 below, so the
    # divide-by-zero warning is just noise.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0 before exponentiating. Without this,
    # a small temperature makes every exp() underflow to 0 and the
    # normalisation below becomes 0/0 = NaN.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # One multinomial trial yields a one-hot row; its argmax is the sample.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Characters treated as punctuation when deciding whether to print a space.
punc = string.punctuation

# generate a sequence from a language model
def generate_seq(model, max_length, seed_text, randomness, n_words):
    """Print ``seed_text`` followed by ``n_words`` words sampled from ``model``.

    At every step the most recent ``max_length`` token ids are pre-padded,
    fed to ``model.predict``, and the next word is sampled from the predicted
    distribution with ``pick_random``. Output is written to stdout as it is
    generated; nothing is returned.

    Args:
        model: Object whose ``predict`` returns next-word probabilities.
        max_length: Number of context tokens passed to the model.
        seed_text: Text used as the initial context; printed unchanged.
        randomness: Sampling temperature passed to ``pick_random``.
        n_words: Number of words to generate.
    """
    print(seed_text, end="")
    # The vocabulary is built from lower-cased text, so the seed must be
    # lower-cased before encoding or capitalised words map to the unknown id.
    token_ids = list(tokenNL(seed_text.lower()))
    # generate a fixed number of words
    for _ in range(n_words):
        # Only the last max_length ids can influence the prediction, so keep
        # a running id list instead of re-tokenising the whole text each step.
        window = token_ids[-max_length:]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([window], maxlen=max_length, padding='pre')
        # predict probabilities for each word, then sample one index
        yhat = model.predict(encoded, verbose=0)
        next_id = pick_random(yhat[0], randomness)

        out_word = tokenW(next_id)
        # tokenW returns '' for ids without a word; such ids add nothing to
        # the context, matching what re-tokenising the text would do.
        if out_word:
            token_ids.append(next_id)

        # Pure-punctuation tokens are printed without a leading space.
        ispunc = all(c in punc for c in out_word)
        print(('' if ispunc else ' ') + out_word, end='')

In [3]:
# source text: the first 100000 lines of the file become the training text,
# the following 10000 lines the test text; both are lower-cased.
with open("../../WAFiles/blogs.txt", 'r') as fin:
    # readline() returns '' at end of file, so a short file just yields
    # shorter strings rather than an error.
    train = "".join(fin.readline() for _ in range(100000)).lower()
    test = "".join(fin.readline() for _ in range(10000)).lower()

# Character counts of the two splits.
print(len(train), len(test), sep="\n")

98295231
10655598


In [4]:
# Count whitespace-separated tokens and keep those seen at least 3 times;
# the kept tokens form the vocabulary.
trainl = train.split()
trainc = collections.Counter(trainl)
trainlist = [word for word, count in trainc.items() if count >= 3]

In [5]:
# for i in trainc.keys():
#     if trainc[i] == 2:
#         print(i)   

In [6]:
trainsplit = train.split()
# Word ids start at 1; id 0 is left for words that are not in the vocabulary.
d1 = dict(enumerate(trainlist, start=1))                          # id -> word
d2 = {word: idx for idx, word in enumerate(trainlist, start=1)}   # word -> id
# +1 accounts for the reserved id 0.
vocab_size = len(d1) + 1
print(vocab_size)

79113


In [7]:
def tokenW(n):
    """Return the vocabulary word stored in ``d1`` for id ``n``.

    Returns the empty string when ``n`` has no entry in ``d1`` or cannot be
    used as a dictionary key.
    """
    try:
        return d1[n]
    except (KeyError, TypeError):
        # KeyError: id not in the vocabulary; TypeError: unhashable ``n``.
        # Anything else (e.g. a missing ``d1``) is a real error and propagates.
        return ''
def tokenN(s):
    """Return the integer id stored in ``d2`` for word ``s``.

    Returns 0 when ``s`` has no entry in ``d2`` or cannot be used as a
    dictionary key.
    """
    try:
        return d2[s]
    except (KeyError, TypeError):
        # KeyError: word not in the vocabulary; TypeError: unhashable ``s``.
        # Anything else (e.g. a missing ``d2``) is a real error and propagates.
        return 0
def tokenWL(nums):
    """Decode a sequence of ids into text.

    Each id is converted with ``tokenW`` and followed by a single space, so
    a non-empty result always ends with a trailing space.
    """
    return "".join(tokenW(idx) + " " for idx in nums)
def tokenNL(words):
    """Encode text as a 1-D float64 array of ids.

    The text is split on whitespace and each piece is converted with
    ``tokenN``; the result has one entry per piece.
    """
    pieces = words.split()
    return np.fromiter(
        (tokenN(piece) for piece in pieces),
        dtype=np.float64,
        count=len(pieces),
    )
        

In [8]:
# Encode both corpora as id arrays (training text first, then test text).
train_data, test_data = map(tokenNL, (train, test))

In [9]:
# Number of tokens in each encoded split, one shape per line.
print(train_data.shape, test_data.shape, sep="\n")

(19026786,)
(2071994,)


In [10]:
class BatchGenerator(object):
    """Endless source of (context window, one-hot next token) batches.

    Walks over ``data`` with a moving start index. Each sample is
    ``num_steps`` consecutive ids as input and the id that follows them as
    the target. The start index advances by ``skip_step`` after every sample
    and wraps around near the end of ``data``.

    Args:
        data: 1-D sequence of token ids.
        num_steps: Length of each input window.
        batch_size: Number of samples per yielded batch.
        total_words: Width of the one-hot target (number of classes).
        skip_step: How far the start index moves between samples.
    """

    def __init__(self, data, num_steps, batch_size, total_words, skip_step=5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.total_words = total_words
        # Start position of the next window; shared by every generator
        # created from this object.
        self.current_idx = 0
        self.skip_step = skip_step

    def generate(self):
        """Yield ``(x, y)`` batches forever.

        ``x`` has shape ``(batch_size, num_steps)`` and holds the input ids;
        ``y`` has shape ``(batch_size, total_words)`` and is one-hot on the
        id that follows each window.

        Raises:
            ValueError: if ``data`` is too short to hold one window plus
                its target.
        """
        if len(self.data) <= self.num_steps:
            raise ValueError(
                "data must contain more than num_steps elements "
                "(got %d, num_steps=%d)" % (len(self.data), self.num_steps)
            )
        while True:
            # Allocate fresh arrays for every batch. Re-yielding one shared
            # buffer lets any consumer that holds several batches at once
            # (e.g. a prefetch queue) see earlier batches overwritten.
            x = np.zeros((self.batch_size, self.num_steps))
            y = np.zeros((self.batch_size, self.total_words))
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps + 10 >= len(self.data):
                    # Wrap around when fewer than 10 spare elements remain.
                    self.current_idx = (self.current_idx + self.num_steps + 10) % len(self.data)
                if self.current_idx + self.num_steps >= len(self.data):
                    # The wrapped index can still land too close to the end
                    # when data is short; restart from the beginning.
                    self.current_idx = 0
                x[i, :] = self.data[self.current_idx:self.current_idx + self.num_steps]
                # One-hot encode the id that follows the window.
                target = int(self.data[self.current_idx + self.num_steps])
                y[i, target] = 1.0
                self.current_idx += self.skip_step
            yield x, y

In [11]:
# Samples per batch; used for both data generators and for the
# steps-per-epoch computation when fitting.
batch_size = 64

In [12]:
# Both generators use 10-token context windows and move the window start
# by 1000 tokens between samples.
train_data_generator = BatchGenerator(
    data=train_data,
    num_steps=10,
    batch_size=batch_size,
    total_words=vocab_size,
    skip_step=1000,
)
test_data_generator = BatchGenerator(
    data=test_data,
    num_steps=10,
    batch_size=batch_size,
    total_words=vocab_size,
    skip_step=1000,
)

In [13]:
# Sanity check: decode the first sample of one training batch as
# "<context words> - <target word>". Note this advances the generator's
# shared position by one batch.
x, y = next(train_data_generator.generate())
print(tokenWL(x[0]), tokenW(np.argmax(y[0])), sep="- ")

well everyone got up and going this morning it's still - raining


In [14]:
# define model: embedding -> LSTM -> batch norm -> dropout -> softmax over
# the vocabulary. The input length of 10 is the context window size.
model = Sequential([
    Embedding(vocab_size, 128, input_length=10),
    LSTM(1024, return_sequences=False),
    BatchNormalization(),
    Dropout(0.2),
    Dense(vocab_size, activation='softmax'),
])
model.summary()

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(lr=0.001),
    metrics=['accuracy'],
)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 128)           10126464  
_________________________________________________________________
lstm_1 (LSTM)                (None, 1024)              4722688   
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024)              4096      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 79113)             81090825  
Total params: 95,944,073
Trainable params: 95,942,025
Non-trainable params: 2,048
_________________________________________________________________




In [15]:
# Checkpoint files are named after the epoch number and validation loss.
filepath="checkpoints/weights-{epoch:02d}-{val_loss:.3f}.hdf5"
# Pick the "best" model by validation loss, matching the value in the file
# name; monitoring the training loss would favour over-fitted weights.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
    save_best_only=True, mode='auto', period=1)

In [16]:
# fit network
# Each epoch runs 1/256 of the batches a full pass over the token array
# would need; validation is scaled down the same way.
model.fit_generator(
    train_data_generator.generate(),
    steps_per_epoch=len(train_data) // (batch_size * 256),
    epochs=3,
    validation_data=test_data_generator.generate(),
    validation_steps=len(test_data) // (batch_size * 256),
    # checkpointing is disabled; to enable it pass: callbacks=[checkpoint]
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f7c2f85f590>

In [None]:
# Write the trained model to 'model.h5' in the current working directory.
model.save('model.h5')

In [19]:
# Qualitative check: print 100 generated words continuing the seed "I wish",
# using a 10-token context and sampling temperature 0.4.
generate_seq(model, 10, "I wish", 0.4, 100)

I wish you are not sure you want to get my own life so much like i want to do you want to get a much time i want to get it but i have heard of my life i want to get like this i want to get to work and i like have to be able to get out of a new time i would have been much of this morning i would have like i'm glad i want to see the time i just want to do about what i want to get a lot of days and i'm