# Download and parse initial text file

In [1]:
import keras
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
harry_path = 'harrypotter.txt'

In [3]:
text = open(harry_path,).read().lower()

In [4]:
#clean text
def clean_str(text):
    raw_tokens = text.split()
    clean_t = [word.lower() for word in raw_tokens if word.isalpha()]
    return ' '.join(clean_t)

In [5]:
print('Corpus length:{}'.format(len(text)))

Corpus length:442745


In [6]:
type(text)

str

In [7]:
print(text[:300])

harry potter and the sorcerer's stone 

chapter one 

the boy who lived 

mr. and mrs. dursley, of number four, privet drive, were proud to say that they were perfectly normal, thank you very much. they were the last people you'd expect to be involved in anything strange or mysterious, because they 


## Vectorize sequences of characters

Partially extract overlapping sequences of length maxlen,
one-hot encode them,
pack them in a 3D numpy array 'x' of shape(sequences, maxlen, unique_characters).

Simultaneously prepare an array 'y' containing the corresponding targers: one-hot encoded
characters that come after each extract sequence.

In [8]:
# extract sequences of 60 characters, and sample a new sequence every three characters
maxlen = 60
step = 3

In [9]:
# hold the extracted sequences and targets (follow up characters)
sentences = []
next_chars = []

In [10]:
# extract sequences
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])


In [11]:
# print number of sentences in corpus
print('Number of sequences', len(sentences))

Number of sequences 147562


In [12]:
# list of unique characters in corpus
chars = sorted(list(set(text)))
print('Unique characters:', len(chars))
print(chars)

Unique characters: 54
['\n', ' ', '!', '"', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~', 'ì', 'ñ']


In [13]:
# create dictionary mapping unique characters to their index in the list of 'chars'
char_indices = dict(
    (char, chars.index(char)) for char in chars
)

In [14]:
print(char_indices)

{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, '*': 7, ',': 8, '-': 9, '.': 10, '0': 11, '1': 12, '2': 13, '3': 14, '4': 15, '5': 16, '6': 17, '7': 18, '8': 19, '9': 20, ':': 21, ';': 22, '?': 23, '\\': 24, 'a': 25, 'b': 26, 'c': 27, 'd': 28, 'e': 29, 'f': 30, 'g': 31, 'h': 32, 'i': 33, 'j': 34, 'k': 35, 'l': 36, 'm': 37, 'n': 38, 'o': 39, 'p': 40, 'q': 41, 'r': 42, 's': 43, 't': 44, 'u': 45, 'v': 46, 'w': 47, 'x': 48, 'y': 49, 'z': 50, '~': 51, 'ì': 52, 'ñ': 53}


## Vectorization

In [15]:
# one-hot encode characters into binary arrays
x = np.zeros(
    (len(sentences), maxlen, len(chars)), dtype=np.bool
)

In [16]:
y = np.zeros(
    (len(sentences), len(chars)), dtype=np.bool
)

In [17]:
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

# Build Network Architecture

Define a network with a single LSTM layer, followed by a Dense classifier, and a softmax over all possible characters

In [18]:
# Single layer LSTM model for next-character prediction
from keras import layers

In [19]:
# define model architecture
model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(len(chars), activation='softmax'))

In [20]:
# use categorical crossentropy for loss function as targets are one-hot encoded
optimizer = keras.optimizers.RMSprop(lr=0.01)

In [21]:
# compile model with loss function and optimizer
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

## Training the language model and sampling from it

### Given a trained model and a seed text snippet, generate new text by repeating the following:
1 - Draw from the model of a prob. distrib. for the next character, given the generated text available so far

2 - Reweight the distribution to a certain temperature

3 - Sample the next character at random according to the reweighted distribution

4 - Add a new character at the end of the available text

In [22]:
# define reweighting function to apply to the original prob.distrib coming out of the model
# and draw a character index from it (sampling function) 
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

### Training and text generation loop

Repeatedly train and generate text.

Begin by generating text using a range of different temperatures after every epoch, to see how the generated text evolves as the model beings to converge, as well as the impact of the temperature in the sampling strategy

In [23]:
import random, sys

In [25]:
# train model for 10 epochs
for epoch in range(1, 10):
    print('Epoch: ', epoch)
    # fit model for one iteration on the data
    model.fit(x, y, batch_size=128, epochs=1)
    # select a text seed at random
    start_index = random.randint(0, len(text) - maxlen - 1)
    # generate text
    generated_text = text[start_index: start_index + maxlen]
    print('--- Generating with seed: "{}"\n '.format(generated_text))
    
    # try a range of different sampling temperatures
    for temperature in [0.3, 0.5, 0.7, 0.8]:
        print('--- temperature: {} \n\n'.format(temperature))
        sys.stdout.write(generated_text)
        print()

        # We generate 400 characters
        for i in range(400):
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

Epoch:  1
Epoch 1/1
--- Generating with seed: "on that secret? i set myself against what is lurking in this"
 
--- temperature: 0.3 


on that secret? i set myself against what is lurking in this
 say the stared and harry had a beated and the and the bears to the harry the the didn't for her in the for the beating and harry was a man a car a who had and he had and he got to harry was the for the was to the stark to the face and harry had and a ter the harry down here when the beard on the still toward out of the never had a said and a stand and a back to have a beard to be and and last on 
--- temperature: 0.5 


nd a stand and a back to have a beard to be and and last on 
the last of here was a all, and be make of the never the to one to the ground could say something book and out of the tersted and comething have furst to was cauld to be never paies. 

"i and hermione told clost been a parted to one and harry had been stand he from and can't lest and the carried to a pane a parce of 

 and were flameling harry had ant me. all them fried the the forcher. "you'd say, and quirrell's transing out and he know, hahry. 

"an' he revery of the merre had depthan notting to do you got strain, where hadre, harry back to all seem got house were book, and you letter ears. very tooking in harry. snape was madar. whe not a like the scrrawn at the hell. brang to the broom and under here it?" 

Epoch:  5
Epoch 1/1
--- Generating with seed: "n't be. there's loads of people who come from muggle familie"
 
--- temperature: 0.3 


n't be. there's loads of people who come from muggle familie
s, but he was straing the stone to the starters was more and harry had and to the end on the ground, the door out of the stone of the more was they were startered as they had a start and harry told the more on the corridn't be and they still himself started at his peacing the that he was sorrid last on the potter. 

"what harry still a stood for the car how down the first of the starter to pull th
--

 keopen to the an of the slither was malfoy, i should have got to do that they had something of the room, and they shook a man and swike sickly read so plant and at the forgst, wereus in. 

"the only manist to him. 

"for the hatle -- ead the tall bad. what though the still in his house, and the chanded out of the touch and a surple caught sawhelp he was all if that's it and say him still was cree
--- temperature: 0.8 


t sawhelp he was all if that's it and say him still was cree
n door and crabbe and her stune and hander that he bote with the other shise to a man counting and cramp the looked toed was the might tone the way at harry. he for a kill hagrid. "thno thing to not a reast had looked more in his pleased in the knees on the kitching to what carcalveran filch of the beain staulder professor dumbledore stiod. 

he tall, professor in until about everything. "they're 
Epoch:  9
Epoch 1/1
--- Generating with seed: "ought i had the right to face voldemort if i could...." 

"y"
 
--