In [10]:
import numpy as np
from collections import Counter
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

# Reading Corpus

In [11]:
corpus_file = open('corpus.txt', mode='r', encoding="utf8")
corpus = corpus_file.read()
corpus = corpus.lower()
print(corpus[:45])

the black cat.  by edgar allan poe.   for the


# Wordwise

In [12]:
dict_punctuation = {
        '.':' ||Period|| ',
        ',':' ||Comma|| ',
        '"':' ||Quotation_Mark|| ',
        ';':' ||Semicolon|| ',
        '!':' ||Exclamation_Mark|| ',
        '?':' ||Question_Mark|| ',
        '(':' ||Left_Parenthesis|| ',
        ')':' ||Right_Parenthesis|| ',
        '--':' ||Double_Dash|| ',
        '-':' ||Dash|| ',
        '_':' ||Underscore|| ',
        '*':' ||Star|| ',
        '\n':' ||Return|| ',
        '’' :' ||Left_Quote|| ',
        '“' :' ||Right_Quotation|| ',
        '”' :' ||Left_Quotation|| ',
        '‘' :' ||Right_Quote|| '
    }

for key, token in dict_punctuation.items():
    corpus = corpus.replace(key, token)
    
word_corpus = corpus.split(' ')
word_corpus[1:15]

['black',
 'cat',
 '||Period||',
 '',
 '',
 'by',
 'edgar',
 'allan',
 'poe',
 '||Period||',
 '',
 '',
 '',
 'for']

In [16]:
vocab = set(word_corpus)
num_classes = len(vocab)
print(num_classes)

vocab_to_int = {c:i for i,c in enumerate(vocab)}
int_to_vocab = {i:c for i,c in enumerate(vocab)}
print(int_to_vocab.get(vocab_to_int.get('||Period||')))

encoded = [vocab_to_int.get(i) for i in word_corpus]
print(encoded[1:10])

6800
||Period||
[1911, 1622, 1162, 0, 0, 4355, 264, 5972, 2091]


In [17]:
# Hyperparams
# samples = 1300
steps = 25
dropout = 0.5
epochs = 50
batch_size = 1024
embed_dim = 256

In [38]:
X = []
y = []

for i in range(0, len(encoded) - steps, 1):
    X.append(encoded[i : i + steps])
    y.append(encoded[i + steps])

X = np.reshape(X, (len(X), steps))
X = X/float(num_classes)

X_train = X
y_train = np.eye(num_classes)[y]

print(X_train.shape)
print(y_train.shape)


(77966, 25)
(77966, 6800)


In [39]:
Counter(y)
# print(X[1,:,:])

Counter({0: 10936,
         1: 3,
         2: 1,
         3: 1,
         4: 1,
         5: 5,
         6: 1,
         7: 1,
         8: 2,
         9: 1,
         10: 2,
         11: 1,
         12: 6,
         13: 8,
         14: 1,
         15: 1,
         16: 2,
         17: 1,
         18: 5,
         19: 16,
         20: 1,
         21: 17,
         22: 1,
         23: 13,
         24: 1,
         25: 8,
         26: 31,
         27: 1,
         28: 1,
         29: 7,
         30: 1,
         31: 1,
         32: 2,
         33: 4,
         34: 2,
         35: 1,
         36: 6,
         37: 1,
         38: 1,
         39: 2,
         40: 1,
         41: 1,
         42: 1,
         43: 1,
         44: 4,
         45: 2,
         46: 2,
         47: 1,
         48: 1,
         49: 3,
         50: 2,
         51: 38,
         52: 1,
         53: 1,
         54: 2,
         55: 1,
         56: 1,
         57: 1,
         58: 2,
         59: 9,
         60: 2,
         61: 2,
         

In [67]:
model = Sequential()
model.add(Embedding(input_dim=num_classes, output_dim=embed_dim, input_length=steps))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(dropout))
model.add(Flatten())
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss="categorical_crossentropy", optimizer="adam")

In [68]:
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=0, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [69]:
model.fit(X_train, y_train, batch_size = batch_size, epochs=epochs, callbacks=callbacks_list)

Epoch 1/50
 7168/77966 [=>............................] - ETA: 475s - loss: 7.7085

KeyboardInterrupt: 

In [54]:
y_pred = model.predict(np.reshape(X_train[102,:,:],(1,steps,1)))
print(y_pred)
y_transformed = np.argmax(y_pred)
print(y_transformed)
print(int_to_vocab[y_transformed])

IndexError: too many indices for array

In [12]:
seed = np.random.randint(textsize-steps)
x_seed = corpus[seed:seed + steps]
x_seed = [vocab_to_int[char] for char in x_seed]
x_seed = np.reshape(x_seed, (1, steps, 1))
x_seed = x_seed/float(num_classes)
print(x_seed)

[[[ 0.5308642 ]
  [ 0.86419753]
  [ 0.60493827]
  [ 0.96296296]
  [ 0.5308642 ]
  [ 0.79012346]
  [ 0.86419753]
  [ 0.18518519]
  [ 0.27160494]
  [ 0.5308642 ]
  [ 0.86419753]
  [ 0.5308642 ]
  [ 0.69135802]
  [ 0.27160494]
  [ 0.88888889]
  [ 0.88888889]
  [ 0.65432099]
  [ 0.86419753]
  [ 0.85185185]
  [ 0.54320988]
  [ 0.72839506]
  [ 0.86419753]
  [ 0.27160494]
  [ 0.32098765]
  [ 0.32098765]
  [ 0.75308642]
  [ 0.79012346]
  [ 0.5308642 ]
  [ 0.5308642 ]
  [ 0.04938272]
  [ 0.0617284 ]
  [ 0.79012346]
  [ 0.24691358]
  [ 0.86419753]
  [ 0.13580247]
  [ 0.04938272]
  [ 0.5308642 ]
  [ 0.86419753]
  [ 0.79012346]
  [ 0.51851852]
  [ 0.79012346]
  [ 0.5308642 ]
  [ 0.86419753]
  [ 0.18518519]
  [ 0.79012346]
  [ 0.75308642]
  [ 0.79012346]
  [ 0.86419753]
  [ 0.88888889]
  [ 0.04938272]
  [ 0.72839506]
  [ 0.72839506]
  [ 0.88888889]
  [ 0.79012346]
  [ 0.86419753]
  [ 0.27160494]
  [ 0.60493827]
  [ 0.7654321 ]
  [ 0.86419753]
  [ 0.96296296]
  [ 0.0617284 ]
  [ 0.79012346]
  [ 0.75

In [13]:
x = x_seed
out = []
charsize = 500

for i in range(charsize):
    y_pred = model.predict(x)
    y_transformed = np.argmax(y_pred)
    output = int_to_vocab[y_transformed]
    out.append(output)
    x_new = y_transformed/float(num_classes)
    x = np.append(x[:,1:100,:], np.reshape(x_new, (1,1,1)), axis = 1)

print('completed')

completed


In [14]:
print(len(out))
print(''.join(out))

500
ot tead the sout and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the soutd and the
