In [1]:
import numpy as np
from collections import Counter
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras import optimizers
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

Using TensorFlow backend.


# Reading Corpus

In [2]:
corpus_file = open('corpus.txt', mode='r', encoding="utf8")
corpus = corpus_file.read()
corpus = corpus.lower()
print(corpus[:45])

﻿the black cat.

by edgar allan poe.


for th


# Wordwise

In [3]:
dict_punctuation = {
        '.':' ||Period|| ',
        ',':' ||Comma|| ',
        '"':' ||Quotation_Mark|| ',
        ';':' ||Semicolon|| ',
        '!':' ||Exclamation_Mark|| ',
        '?':' ||Question_Mark|| ',
        '(':' ||Left_Parenthesis|| ',
        ')':' ||Right_Parenthesis|| ',
        '--':' ||Double_Dash|| ',
        '-':' ||Dash|| ',
        '_':' ||Underscore|| ',
        '*':' ||Star|| ',
        '\n':' ||Return|| ',
        '’' :' ||Left_Quote|| ',
        '“' :' ||Right_Quotation|| ',
        '”' :' ||Left_Quotation|| ',
        '‘' :' ||Right_Quote|| '
    }

for key, token in dict_punctuation.items():
    corpus = corpus.replace(key, token)
    
word_corpus = corpus.split(' ')
word_corpus[1:15]

['black',
 'cat',
 '||Period||',
 '',
 '||Return||',
 '',
 '||Return||',
 'by',
 'edgar',
 'allan',
 'poe',
 '||Period||',
 '',
 '||Return||']

In [4]:
vocab = set(word_corpus)
num_classes = len(vocab)
print(num_classes)

vocab_to_int = {c:i for i,c in enumerate(vocab)}
int_to_vocab = {i:c for i,c in enumerate(vocab)}
print(int_to_vocab.get(vocab_to_int.get('||Period||')))

encoded = [vocab_to_int.get(i) for i in word_corpus]
print(encoded[1:10])

2392
||Period||
[1759, 936, 606, 0, 479, 0, 479, 1503, 2128]


In [5]:
steps = 10

In [6]:
X = []
y = []

for i in range(0, len(encoded) - steps, 1):
    X.append(encoded[i : i + steps])
    y.append(encoded[i + steps])

X = np.reshape(X, (len(X), steps))
X = X/float(num_classes)

X_train = X
y_train = np.eye(num_classes)[y]

print(X_train.shape)
print(y_train.shape)


(14208, 10)
(14208, 2392)


In [7]:
Counter(y)
# print(X[1,:,:])

Counter({0: 1864,
         1: 5,
         2: 1,
         3: 1,
         4: 154,
         5: 13,
         6: 1,
         7: 1,
         8: 1,
         9: 8,
         10: 2,
         11: 3,
         12: 1,
         13: 3,
         14: 1,
         15: 1,
         16: 1,
         17: 4,
         18: 1,
         19: 1,
         20: 1,
         21: 1,
         22: 1,
         23: 1,
         24: 1,
         25: 1,
         26: 7,
         27: 3,
         28: 1,
         29: 1,
         30: 1,
         31: 1,
         32: 3,
         33: 1,
         34: 2,
         35: 1,
         36: 1,
         37: 1,
         38: 1,
         39: 1,
         40: 1,
         41: 66,
         42: 2,
         43: 1,
         44: 1,
         45: 1,
         46: 1,
         47: 1,
         48: 1,
         49: 1,
         50: 3,
         51: 2,
         52: 2,
         53: 3,
         54: 25,
         55: 1,
         56: 1,
         57: 1,
         58: 1,
         59: 2,
         60: 2,
         61: 6,
         6

In [8]:
# Hyperparams
# samples = 1300
steps = 10
dropout = 0.2
epochs = 250
batch_size = 512
embed_dim = 512

In [12]:
model = Sequential()
model.add(Embedding(input_dim=num_classes, output_dim=embed_dim, input_length=steps))
model.add(LSTM(256, return_sequences = True))
model.add(Dropout(dropout))
model.add(LSTM(128, return_sequences = True))
model.add(Dropout(dropout))
model.add(Flatten())
adam = optimizers.Adam(lr=0.01)
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss="categorical_crossentropy", optimizer=adam)

In [13]:
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=0, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [14]:
model.fit(X_train, y_train, batch_size = batch_size, epochs=epochs, callbacks=callbacks_list)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250

KeyboardInterrupt: 

In [58]:
y_pred = model.predict(np.reshape(X_train[102,:],(1,steps)))
print(y_pred)
y_transformed = np.argmax(y_pred)
print(y_transformed)
print(int_to_vocab[y_transformed])

[[  1.34227455e-01   1.44253527e-05   1.20604573e-05 ...,   1.02025828e-04
    1.26223258e-05   1.27296735e-05]]
0



In [59]:
seed = np.random.randint(len(encoded)-steps)
x_seed = corpus[seed:seed + steps]
x_seed = [vocab_to_int[char] for char in x_seed]
x_seed = np.reshape(x_seed, (1, steps, 1))
x_seed = x_seed/float(num_classes)
print(x_seed)

KeyError: ' '

In [60]:
x = x_seed
out = []
charsize = 500

for i in range(charsize):
    y_pred = model.predict(x)
    y_transformed = np.argmax(y_pred)
    output = int_to_vocab[y_transformed]
    out.append(output)
    x_new = y_transformed/float(num_classes)
    x = np.append(x[:,1:100,:], np.reshape(x_new, (1,1,1)), axis = 1)

print('completed')

TypeError: Error when checking model : data should be a Numpy array, or list/dict of Numpy arrays. Found: e lost no opportunity of procuring those of the mo...

In [14]:
print(len(out))
print(''.join(out))

500
ot tead the sout and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the
