In [1]:
import keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense, Activation
from keras.utils.data_utils import get_file
from keras.optimizers import RMSprop
import numpy as np
import random
import sys

from utils import sample

In [2]:
# Step 1 - get your data
# Fetch the Nietzsche corpus and keep the path that the next cell opens.
path = get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt',
)

Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt


In [3]:
# Read the whole corpus and convert it to lowercase.
# - The explicit UTF-8 encoding keeps the accented characters in the corpus
#   (e.g. 'ä', 'é') decoding the same way regardless of the platform locale.
# - The context manager closes the file handle as soon as the text is read.
with open(path, encoding='utf-8') as f:
    text = f.read().lower()

In [4]:
len(text)

600893

In [5]:
text[0]

'p'

In [6]:
text[0:100]

'preface\n\n\nsupposing that truth is a woman--what then? is there not ground\nfor suspecting that all ph'

In [7]:
# Vocabulary: every distinct character in the corpus, in sorted order.
# sorted() accepts any iterable and always returns a new list.

chars = sorted(set(text))

In [8]:
print(chars)

['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'æ', 'é', 'ë']


In [9]:
len(chars)

57

In [10]:
# Lookup tables between each character and its position in `chars`.

# character -> index
char_indices = {ch: idx for idx, ch in enumerate(chars)}

# index -> character
indices_char = dict(enumerate(chars))

In [11]:
char_indices["a"]

27

In [12]:
indices_char[27]

'a'

In [13]:
# Cut the text into overlapping windows of `maxlen` characters, taking a new
# window every 3 characters. For each window, also record the single
# character that immediately follows it (the prediction target).

maxlen = 40
starts = range(0, len(text) - maxlen, 3)

sentences = [text[begin:begin + maxlen] for begin in starts]
next_chars = [text[begin + maxlen] for begin in starts]



In [14]:
sentences[0]

'preface\n\n\nsupposing that truth is a woma'

In [15]:
next_chars[0]

'n'

In [16]:
sentences[1]

'face\n\n\nsupposing that truth is a woman--'

In [17]:
next_chars[1]

'w'

In [18]:
len(sentences)

200285

In [19]:
# vectorization
# Allocate all-False one-hot tensors:
#   X[i, t, c] marks that character c sits at position t of window i
#   y[i, c]    marks that character c follows window i
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.

X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)

y = np.zeros((len(sentences), len(chars)), dtype=bool)

In [20]:
X.shape

(200285, 40, 57)

In [21]:
# Fill the one-hot tensors: one row per window, one flag per character slot.
for row, (window, target) in enumerate(zip(sentences, next_chars)):
    for pos, ch in enumerate(window):
        X[row, pos, char_indices[ch]] = True
    # the target is the character that follows this window
    y[row, char_indices[target]] = True

In [27]:
X[0,:,0]

array([False, False, False, False, False, False, False,  True,  True,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False], dtype=bool)

In [28]:
y.shape

(200285, 57)

In [29]:
# build LSTM Model
# A single 128-unit LSTM reads a (maxlen, len(chars)) one-hot window; a dense
# layer followed by softmax produces one probability per vocabulary character.

model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

# The learning rate (0.01) is passed positionally on purpose: older Keras
# names this first argument `lr`, newer releases name it `learning_rate` and
# deprecate/reject the `lr=` keyword. Positional use works with both.
optimizer = RMSprop(0.01)
# categorical cross-entropy matches the one-hot targets in `y`
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
# train your model
# Each iteration fits one more epoch, then prints 400 generated characters
# for each diversity value, all starting from the same random seed window.

for iteration in range(1, 20):

    print('Iteration', iteration)
    model.fit(X, y, batch_size=128, epochs=1)

    # randint is inclusive on both ends; this upper bound still leaves a full
    # maxlen-character window after the chosen start.
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed = text[start_index: start_index + maxlen]  # randomly picked 40-char seed

    for diversity in [0.2, 0.5, 1.0, 1.2]:

        # restart from the same seed for every diversity setting
        sentence = seed
        generated = sentence
        print('MY SEED IS', sentence)
        sys.stdout.write(generated)

        for i in range(400):

            # one-hot encode the current window as a batch of one
            x = np.zeros((1, maxlen, len(chars)))
            positions = np.arange(len(sentence))
            codes = [char_indices[ch] for ch in sentence]
            x[0, positions, codes] = 1

            preds = model.predict(x, verbose=0)[0]
            # `sample` comes from utils (not shown here); presumably it draws a
            # character index from `preds` adjusted by `diversity` -- confirm there.
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            # append the new character and slide the window forward by one
            generated += next_char
            sentence = sentence[1:] + next_char
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()