In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import keras
import string
import collections

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.callbacks import *

Using TensorFlow backend.


In [2]:
def pick_random(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    Args:
        preds: Array-like of probabilities. Any shape is accepted; it is
            flattened before sampling.
        temperature: Divisor applied to the log-probabilities before they are
            re-normalised. Values below 1 sharpen the distribution, values
            above 1 flatten it. Exactly 0 means greedy decoding (the arg-max
            is returned without sampling).

    Returns:
        Index, into the flattened array, of the sampled entry.
    """
    preds = np.asarray(preds, dtype='float64').flatten()
    if temperature == 0:
        # Limit of the softmax as temperature -> 0; avoids a division by zero.
        return np.argmax(preds)
    # Zero probabilities legitimately map to -inf logits; silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0: without this, small temperatures make
    # every exp() underflow to 0 and the normalisation becomes 0/0 = NaN.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

punc = string.punctuation


def generate_seq(model, max_length, seed_text, randomness, n_words):
    """Generate text word by word from a language model, printing as it goes.

    Args:
        model: Object with a `predict(encoded, verbose=0)` method returning a
            batch of probability vectors over the vocabulary.
        max_length: Number of trailing words fed to the model at each step.
        seed_text: Text the generation starts from; printed first.
        randomness: Temperature forwarded to `pick_random`.
        n_words: Number of sampling steps to run.

    Returns:
        The seed text followed by every generated word, space separated.
        (The printed form glues punctuation-only tokens to the previous word.)
    """
    in_text = seed_text
    print(in_text, end="")
    # pad_sequences keeps only the trailing `max_length` tokens, so track just
    # those instead of re-tokenising the whole, ever-growing text each step.
    context = seed_text.split()[-max_length:]
    for _ in range(n_words):
        # encode the current context as integers and pre-pad to a fixed length
        encoded = tokenNL(' '.join(context))
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict probabilities for each word, then sample one of them
        yhat = model.predict(encoded, verbose=0)
        out_word = tokenW(pick_random(yhat[0], randomness))

        if not out_word:
            # Id without a word (e.g. the padding/unknown id): nothing to emit.
            continue

        # Tokens made only of punctuation are printed without a leading space.
        ispunc = all(c in punc for c in out_word)

        # append to input
        in_text += ' ' + out_word
        context = (context + [out_word])[-max_length:]
        print(('' if ispunc else ' ') + out_word, end='')
    return in_text

In [3]:
# Source text: the first 400000 lines of the blog corpus form the training
# split, the 40000 lines that follow form the held-out split.
train, test = "", ""
with open("../../WAFiles/blogs.txt", 'r') as fin:
    # readline() returns '' once the file is exhausted, so a short file simply
    # yields shorter splits.
    train = "".join(fin.readline() for _ in range(400000))
    test = "".join(fin.readline() for _ in range(40000))

# Case-fold both splits so the vocabulary is case-insensitive.
train, test = train.lower(), test.lower()

print(len(train))
print(len(test))

421553136
39727072


In [4]:
# Whitespace-tokenise the training text and count every distinct token.
trainl = train.split()
trainc = collections.Counter(trainl)
# Vocabulary: tokens seen at least 18 times, in first-seen order.
trainlist = [word for word, count in trainc.items() if count >= 18]

In [5]:
# for i in trainc.keys():
#     if trainc[i] == 2:
#         print(i)   

In [6]:
# NOTE(review): same expression as `trainl`; no visible cell reads `trainsplit`.
trainsplit = train.split()
# d1: id -> word, d2: word -> id. Ids start at 1; 0 is left unassigned.
d1 = dict(enumerate(trainlist, start=1))
d2 = {word: idx for idx, word in d1.items()}
# +1 accounts for the unassigned id 0.
vocab_size = 1 + len(d1)
print(vocab_size)

59787


In [7]:
def tokenW(n):
    """Return the word stored in `d1` for token id `n`.

    Returns the empty string when `n` has no entry (for example the id 0 that
    `tokenN` produces for unknown words) or when `n` is not hashable.
    """
    try:
        return d1[n]
    except (KeyError, TypeError):
        # Only "not in the table" is treated as a miss; interrupts and a
        # missing `d1` are allowed to propagate instead of yielding ''.
        return ''
def tokenN(s):
    """Return the token id stored in `d2` for word `s`.

    Returns 0 when `s` has no entry or is not hashable.
    """
    try:
        return d2[s]
    except (KeyError, TypeError):
        # Only "not in the table" is treated as a miss; interrupts and a
        # missing `d2` are allowed to propagate instead of yielding 0.
        return 0
def tokenWL(nums):
    """Decode a sequence of token ids into text, each word followed by a space."""
    return "".join(tokenW(token_id) + " " for token_id in nums)
def tokenNL(words):
    """Encode whitespace-separated text as a 1-D float array of token ids."""
    pieces = words.split()
    # Fill a pre-sized array straight from the per-word lookups.
    return np.fromiter(
        (tokenN(piece) for piece in pieces),
        dtype=np.float64,
        count=len(pieces),
    )
        

In [8]:
# Encode both splits as arrays of token ids (0 stands for words outside d2).
train_data, test_data = tokenNL(train), tokenNL(test)

In [9]:
# Number of tokens in the encoded training and held-out arrays.
for encoded_split in (train_data, test_data):
    print(encoded_split.shape)

(81867883,)
(7778659,)


In [10]:
class BatchGenerator(object):
    """Endless source of (masked window, one-hot middle word) batches.

    Every sample is a window of `num_steps` consecutive token ids. The id at
    the middle position (`num_steps // 2`) is zeroed in the input and becomes
    the prediction target.
    """

    def __init__(self, data, num_steps, batch_size, total_words, skip_step=5):
        """
        Args:
            data: 1-D sequence of token ids.
            num_steps: Length of each window.
            batch_size: Number of windows per yielded batch.
            total_words: Width of the one-hot target (number of classes).
            skip_step: How far the window start advances between samples.
        """
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.total_words = total_words
        # Start of the next window; persists across generate() calls.
        self.current_idx = 0
        self.skip_step = skip_step

    def generate(self):
        """Yield `(x, y)` forever.

        `x` has shape (batch_size, num_steps) and `y` has shape
        (batch_size, total_words); both are float arrays. Each batch is a
        fresh pair of arrays, so batches already handed out (for instance
        still waiting in a training queue) are never overwritten.
        """
        mid = self.num_steps // 2
        while True:
            x = np.zeros((self.batch_size, self.num_steps))
            y = np.zeros((self.batch_size, self.total_words))
            for i in range(self.batch_size):
                # Wrap to the beginning once the window (plus the original
                # 10-token margin) would run past the end of the data.
                if self.current_idx + self.num_steps + 10 >= len(self.data):
                    self.current_idx = (self.current_idx + self.num_steps + 10) % len(self.data)
                start = self.current_idx
                x[i, :] = self.data[start:start + self.num_steps]
                x[i, mid] = 0
                # One-hot target written in place (y starts out all zeros).
                y[i, int(self.data[start + mid])] = 1.0
                self.current_idx += self.skip_step
            yield x, y

In [11]:
# Samples per batch; used for both generators and for the steps-per-epoch math.
batch_size = 64

In [12]:
# Both generators use 10-token windows and start a new window every 1000 tokens.
_generator_args = dict(
    num_steps=10,
    batch_size=batch_size,
    total_words=vocab_size,
    skip_step=1000,
)
train_data_generator = BatchGenerator(train_data, **_generator_args)
test_data_generator = BatchGenerator(test_data, **_generator_args)

In [13]:
# Sanity check: show the first sample of one batch as "<masked window> - <target>".
# Drawing the batch advances train_data_generator's position.
x, y = next(train_data_generator.generate())
print(tokenWL(x[0]), end="- ")
# Row 0 holds the first sample's one-hot target.
print(tokenW(np.argmax(y[0])))

well everyone got up and  this morning it's still - going


In [14]:
# Model: embedding -> bidirectional LSTM -> batch norm -> dropout -> softmax
# over the vocabulary. input_length=10 matches the generators' window length.
model = Sequential([
    Embedding(vocab_size, 128, input_length=10),
    Bidirectional(LSTM(1024)),
    BatchNormalization(),
    Dropout(0.2),
    Dense(vocab_size, activation='softmax'),
])
model.summary()

model.compile(
    optimizer=Adam(lr=0.0005),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 128)           7652736   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 2048)              9445376   
_________________________________________________________________
batch_normalization_1 (Batch (None, 2048)              8192      
_________________________________________________________________
dropout_1 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 59787)             122503563 
Total params: 139,609,867
Trainable params: 139,605,771
Non-trainable params: 4,096
_________________________________________________________________




In [15]:
# Write a checkpoint whenever the validation loss reaches a new best; the file
# name records the epoch number and that loss.
filepath = "checkpoints/weights-{epoch:02d}-{val_loss:.3f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
    period=1,
)

In [17]:
# Fit the network from the endless batch generators.
# Each step consumes batch_size windows spaced 1000 tokens apart, so
# len(data) // batch_size // 1024 steps is roughly one sweep over the array.
train_steps = len(train_data) // batch_size // 1024
validation_steps = len(test_data) // batch_size // 1024

model.fit_generator(
    generator=train_data_generator.generate(),
    steps_per_epoch=train_steps,
    epochs=80,
    validation_data=test_data_generator.generate(),
    validation_steps=validation_steps,
    callbacks=[checkpoint],
)

Epoch 1/80

Epoch 00001: val_loss did not improve from 4.70603
Epoch 2/80

Epoch 00002: val_loss did not improve from 4.70603
Epoch 3/80

Epoch 00003: val_loss improved from 4.70603 to 4.70546, saving model to checkpoints/weights-03-4.705.hdf5
Epoch 4/80

Epoch 00004: val_loss did not improve from 4.70546
Epoch 5/80

Epoch 00005: val_loss did not improve from 4.70546
Epoch 6/80

Epoch 00006: val_loss did not improve from 4.70546
Epoch 7/80

Epoch 00007: val_loss did not improve from 4.70546
Epoch 8/80

Epoch 00008: val_loss improved from 4.70546 to 4.65361, saving model to checkpoints/weights-08-4.654.hdf5
Epoch 9/80

Epoch 00009: val_loss did not improve from 4.65361
Epoch 10/80

Epoch 00010: val_loss did not improve from 4.65361
Epoch 11/80

Epoch 00011: val_loss did not improve from 4.65361
Epoch 12/80

Epoch 00012: val_loss did not improve from 4.65361
Epoch 13/80

Epoch 00013: val_loss did not improve from 4.65361
Epoch 14/80

Epoch 00014: val_loss did not improve from 4.65361
Epo

KeyboardInterrupt: 

In [18]:
# Snapshot the model as it stands after the (interrupted) training run.
model.save('model_tmp.hdf5')

In [28]:
# Qualitative check: print windows from one training batch with the model's
# guess for the masked middle word shown between tildes.
x = np.array(next(train_data_generator.generate())[0])
y = model.predict(x)

printNum = 20

mid = x.shape[1] // 2
# Never index past the batch, whatever printNum is set to.
for b in range(min(printNum, len(x))):
    guess = "~" + tokenW(np.argmax(y[b])) + "~"
    # Place the guess directly at the masked slot. The previous placeholder +
    # str.replace approach rewrote any context word containing the placeholder.
    pieces = [guess if i == mid else tokenW(tok) for i, tok in enumerate(x[b])]
    print("".join(piece + " " for piece in pieces))

full of craziness now unable ~to~ focus he seemed on 
the pressure my attorney has ~not~ been able to accept 
butte jacket booming through the ~~ island tunnel at the 
thing to get these days ~so~ i figured well just 
a savage invitation from the ~~ suddenly i felt guilty 
south towards l a but ~at~ all deliberate speed keep 
morning rush of pimps and ~the~ hustlers with a huge 
the devil keep that in ~and~ buy the ticket take 
advantage of that rest area ~should~ i tell you how 
the chp and then with ~the~ filthy phantom hitchhiker plunging 
straight out to his car ~and~ start abusing those drugs 
esoteric lights dials meters that ~i~ would never understand but 
to the desk clerk i ~really~ hate to interrupt but 
the famous journalist pairing for ~the~ suite lucy on our 
herself into a towering jesus ~of~ rage at the hazy 
to the airport saying we ~are~ going to trade the 
to that woman she was ~in~  i think she 
other one met us at ~the~ hotel he was sweating 
he was back in his ~car

In [29]:
import pickle

# Persist both lookup tables (id -> word, word -> id) next to the saved model.
# The context manager closes the file, so the pickle is flushed to disk.
with open('tokenizer_tmp.dat', 'wb') as fout:
    pickle.dump([d1, d2], fout)