### RNN for text generation

In [1]:
#some useful imports 
import nltk

from sklearn.utils import shuffle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, TimeDistributed, Bidirectional
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, Dropout

Using TensorFlow backend.


In [2]:
#reading the data in
names = [name.strip().lower() for name in open('dinos.txt').readlines()]
print(names[:10])

['aachenosaurus', 'aardonyx', 'abdallahsaurus', 'abelisaurus', 'abrictosaurus', 'abrosaurus', 'abydosaurus', 'acanthopholis', 'achelousaurus', 'acheroraptor']


In [3]:
#alphabet is a list of unique characters
chars = [char for name in names for char in name] + ['bos','eos']
alphabet = list(set(chars))

In [4]:
#data preporation
from keras.utils import to_categorical
import numpy as np

X_names = ['bos ' + ' '.join(name) for name in names]
Y_names = [' '.join(name) + ' eos' for name in names]
maxlen = max([len(name) for name in names])+1

Teacher forcing:

In [5]:
print(X_names[0])
print(Y_names[0])

bos a a c h e n o s a u r u s
a a c h e n o s a u r u s eos


In [6]:
tokenizer = Tokenizer(num_words=len(alphabet)+2)
tokenizer.fit_on_texts(X_names+Y_names)

# Adds padding samples not available in the alphabet
sequences = tokenizer.texts_to_sequences(X_names)
X_train = pad_sequences(sequences, maxlen=maxlen, padding='post')

sequences = tokenizer.texts_to_sequences(Y_names)
Y_train = pad_sequences(sequences, padding='post')

Y_train_cat  = [to_categorical(sent, num_classes=len(alphabet)+2) for sent in Y_train]
Y_train =  np.asarray(Y_train_cat)

In [7]:
print(X_names[0])
print(Y_names[0])


print(X_train.shape)
print(Y_train.shape)


print(tokenizer.word_index['bos'])
print(tokenizer.word_index['eos'])

bos a a c h e n o s a u r u s
a a c h e n o s a u r u s eos
(1536, 27)
(1536, 27, 30)
10
11


In [8]:
char_index = tokenizer.word_index
index_char = {i: c for c, i in char_index.items()}

In [9]:
#the RNN language model
model = Sequential()

model.add(Embedding(len(alphabet)+2, 30, input_length=maxlen))
model.add(LSTM(128, return_sequences = True))

model.add(Dense(len(alphabet)+2, activation = 'softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [10]:
for iteration in range(1, 20):
    X_train_shuffled, y_train_shuffled = shuffle(X_train, Y_train)
    model.fit(X_train_shuffled, y_train_shuffled, batch_size=len(X_train), epochs=1, verbose = 1)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [11]:
# helper function to sample an index from a probability array
def sample(preds):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) #/ temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.choice(range(len(alphabet)+2), p = preds)
    return probas

In [18]:
#generation 
generated = ''
seed = 'bos'
generated += seed + ' '
print('----- Generating with seed: "' + seed + '"')
print(generated)


for i in range(12): 
    sequences = tokenizer.texts_to_sequences([seed])
    X_pred = pad_sequences(sequences, maxlen=maxlen, padding = 'post')

    preds = model.predict(X_pred, verbose=0)[0]
    samples = [sample(p) for p in preds]
    next_index = samples[i]
    while next_index == 0 or next_index == 10:
        samples = [sample(p) for p in preds]
        next_index = samples[i]
    next_char = index_char[next_index+1]
    generated += next_char + ' '
    print(generated)
    seed += next_char
    if next_char == 'eos':
        break
    

----- Generating with seed: "bos"
bos 
bos l 
bos l p 
bos l p g 
bos l p g t 
bos l p g t s 
bos l p g t s k 
bos l p g t s k b 
bos l p g t s k b z 
bos l p g t s k b z u 
bos l p g t s k b z u c 
bos l p g t s k b z u c p 
bos l p g t s k b z u c p s 
