Example model training for text generation

In [0]:
import re
import datetime
import os
import json

import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, LSTM, Dense, Activation, Lambda, Dropout
from keras.models import Model

# Load and prepare data

In [0]:
# Directory holding the raw text files that make up the training corpus.
DATA = './data/'

In [0]:
# Read every regular file found in DATA and collect all of its lines in one flat list.
corpus = []
# os.listdir() returns entries in arbitrary order; sorting keeps the corpus (and
# everything derived from it) reproducible from one run to the next.
for file in sorted(os.listdir(DATA)):
    path = os.path.join(DATA, file)
    # Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
    if not os.path.isfile(path):
        continue
    # NOTE(review): open() uses the platform default encoding here; pass an explicit
    # encoding (e.g. encoding='utf-8') once the encoding of the data files is confirmed.
    with open(path) as f:
        corpus += f.readlines()

Filter the strings to keep only a reduced set of characters, remove extraneous whitespace, and change everything to lowercase.

In [0]:
# Matches every character that is NOT kept. Kept characters are: word characters,
# space, and the punctuation . , ' - / :
# The pattern is a raw string so that \w is not parsed as an (invalid) string escape,
# and the hyphen is escaped so that it is a literal '-' instead of forming the
# range "'-/", which silently also kept the characters ( ) * +
_unwanted_chars = re.compile(r"[^\w .,'\-/:]")

# Drop blank lines, replace unwanted characters with spaces, collapse every run of
# whitespace into a single space and lowercase the result.
corpus = [' '.join(_unwanted_chars.sub(' ', s).split()).lower() for s in corpus if s.strip()]

Number of entries

In [0]:
# Number of non-blank lines kept in the corpus.
len(corpus)

Total number of characters

In [0]:
# Add up the length of every corpus entry.
sum(map(len, corpus))

### Tokenize and reshape our corpus

Prepare tokenizer and "reverse tokenizer"

In [0]:
# Character-level tokenizer: every distinct character of the corpus gets an integer id.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(corpus)

# Inverse lookup (id -> character), used to turn encoded data back into text.
reverse_word_map = {index: char for char, index in tokenizer.word_index.items()}

# Size of the one-hot vectors: word_index starts at 1, hence one extra slot for 0.
nb_index = len(tokenizer.word_index) + 1
space_index = tokenizer.word_index[' ']

# Encode every entry, then flatten our corpus into a big list of encoded characters.
# note: we can do it this way because our whole corpus fits in memory
X_raw = np.hstack(tokenizer.texts_to_sequences(corpus))

We define the size of our segments with the `seg_size` variable.

In [0]:
# Number of consecutive characters in one training window; the cells below use it
# to pad the encoded corpus and to slice it into input sequences.
seg_size = 30

Pad our data with space characters so its length will be a multiple of `seg_size`

In [0]:
# Number of characters missing to reach the next multiple of seg_size.
# (-n) % seg_size is 0 when the length is already a multiple, so no needless extra
# segment of spaces is appended and re-running this cell leaves X_raw unchanged.
pad_length = (-X_raw.shape[0]) % seg_size
X_raw = np.append(X_raw, np.full(pad_length, space_index))
X_raw.shape

Now we reshape our data for training:
- X: overlapping sequences of `seg_size` characters
- Y: the following value for each sequence

Note that we one-hot encode everything

In [0]:
# Every window of seg_size characters that is followed by at least one more character
# yields one training sample. Only that many rows are allocated, so X and Y contain
# no trailing all-zero samples/targets.
n_samples = max(X_raw.shape[0] - seg_size, 0)

# Integer view of the encoded corpus, usable directly as an index array.
codes = np.asarray(X_raw, dtype=np.intp)

# The builtin bool is used as dtype: the np.bool alias was removed in NumPy 1.24.
X = np.zeros((n_samples, seg_size, nb_index), dtype=bool)
Y = np.zeros((n_samples, nb_index), dtype=bool)

# Fill one window position at a time: for position j, sample i holds the character
# codes[i + j], i.e. the slice codes[j:j + n_samples] taken as a whole.
sample_ids = np.arange(n_samples)
for j in range(seg_size):
    X[sample_ids, j, codes[j:j + n_samples]] = True

# The target of sample i is the character that follows its window: codes[i + seg_size].
Y[sample_ids, codes[seg_size:seg_size + n_samples]] = True

In [0]:
# Shape of the one-hot inputs: (rows, seg_size, nb_index).
X.shape

In [0]:
# Shape of the one-hot targets: (rows, nb_index).
Y.shape

---

# Create our model

The following model is heavily inspired by http://karpathy.github.io/2015/05/21/rnn-effectiveness/ and https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py

In [0]:
# TimeDistributed is not used in this cell; the name is kept in scope in case other
# cells rely on it. It is imported from keras.layers (which exposes it) rather than
# from the keras.layers.wrappers module path, which recent Keras releases no longer
# provide and which therefore makes this cell fail on import.
from keras.layers import TimeDistributed

# You can change these to play with different network shapes
RNN_SIZE = 64      # number of units in each LSTM layer
NUM_LAYERS = 2     # number of stacked LSTM layers
BATCH_SIZE = 256   # batch size handed to fit() in the training section
DROPOUT = 0.2      # dropout rate applied to the output of the last LSTM layer

# One input sample is a window of seg_size one-hot encoded characters.
inputl = Input(shape=(seg_size, nb_index))
x = inputl

# Every LSTM layer but the last returns its full output sequence so that the next
# LSTM layer can consume it; the last one returns only its final output.
for _ in range(NUM_LAYERS - 1):
    x = LSTM(RNN_SIZE, return_sequences=True)(x)
x = LSTM(RNN_SIZE, return_sequences=False)(x)

x = Dropout(DROPOUT)(x)

# Project onto one score per character class and normalise the scores with softmax.
x = Dense(nb_index)(x)
x = Activation('softmax')(x)

In [0]:
# Build the model from the input tensor to the softmax output, then configure
# training with the RMSprop optimizer and a categorical cross-entropy loss.
autoencoder = Model(inputs=inputl, outputs=x)
autoencoder.compile(loss='categorical_crossentropy', optimizer='RMSprop')

The following functions can be used to switch back and forth between a string and its encoded form

In [0]:
def encode_string(s, segsize, num_classes):
    """One-hot encode the string `s` as a single window of `segsize` characters.

    The string is converted to character ids with the notebook-level `tokenizer`.
    If it yields fewer than `segsize` ids it is left-padded with `space_index`;
    if it yields more, only the last `segsize` ids are kept. (The previous
    implementation raised a reshape error as soon as `s` produced `segsize` ids
    or more.)

    Parameters:
        s: the text to encode.
        segsize: number of characters in the returned window.
        num_classes: width of each one-hot vector.

    Returns:
        An array of shape (segsize, num_classes).
    """
    # texts_to_sequences works on a list of texts; we pass, and read back, a single one.
    codes = list(tokenizer.texts_to_sequences([s])[0])

    # Keep only the most recent characters when the text is longer than one window.
    if len(codes) > segsize:
        codes = codes[len(codes) - segsize:]

    # Start from a window full of spaces and write the text at its end, which
    # pads the beginning with spaces.
    padded = np.full(segsize, space_index)
    if codes:
        padded[segsize - len(codes):] = codes

    Xt = keras.utils.to_categorical(padded, num_classes=num_classes)
    return Xt.reshape(segsize, num_classes)
    
def decode_string(s):
    """Turn a 2-D array of one-hot (or probability) rows back into characters.

    Parameters:
        s: array with one row per character; the position of the largest value
           in a row is taken as that character's id.

    Returns:
        A list with one string per row: the character registered for the id in
        `reverse_word_map`, or '' when the id has no character.
    """
    # Id 0 has no entry in reverse_word_map (word_index starts at 1) but can still
    # be the argmax of a row, e.g. a model prediction or an all-zero row, so fall
    # back to an empty string instead of raising KeyError.
    return [reverse_word_map.get(x, '') for x in np.argmax(s, axis=1)]

Define a utility function to export the model to the filesystem. The model can be saved as a single file or with its structure and weights separated.
The function also exports a dictionary to convert model outputs to characters.

In [0]:
def export(model, basepath, split=False):
    """Write `model` and the tokenizer dictionary into the directory `basepath`.

    Parameters:
        model: the model to save; its save(), save_weights() and to_json()
               methods are used.
        basepath: destination directory, created (with parents) if missing.
        split: when False (default) the whole model goes into 'model.h5';
               when True the weights go into 'model.hdf5' and the structure
               into 'model.json'.

    In both cases the notebook-level `tokenizer.word_index` mapping is written
    to 'dict.json'.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(basepath, exist_ok=True)

    if split:
        model.save_weights(os.path.join(basepath, 'model.hdf5'))
        with open(os.path.join(basepath, 'model.json'), 'w') as f:
            f.write(model.to_json())
    else:
        model.save(os.path.join(basepath, 'model.h5'))

    with open(os.path.join(basepath, 'dict.json'), 'w') as fp:
        json.dump(tokenizer.word_index, fp)

If you want to load an already existing model instead of starting from scratch, you can do it here, for example by uncommenting the following line and setting the adequate path.

In [0]:
#autoencoder = keras.models.load_model('mymodel.h5')

---

# Training

Here we define some utility methods to help monitor the progress of the training process

In [0]:
def sample(model, base, size):
    """Extend the one-hot sequence `base` with `size` characters predicted by `model`.

    At each step the last `seg_size` rows of the sequence are given to
    `model.predict`, the class with the highest score is appended to the
    sequence as a one-hot row, and the process repeats.

    Parameters:
        model: object whose predict() takes a batch of windows and returns one
               row of class scores per window.
        base: 2-D array of one-hot rows used as the seed.
        size: number of characters to generate.

    Returns:
        The seed followed by the generated characters, decoded by decode_string().
    """
    for _ in range(size):
        # Use the model that was passed in (not a global one), and a window length
        # that follows seg_size instead of a hard-coded 30.
        window = base[-seg_size:]
        d = np.argmax(model.predict(np.array([window]))[0])
        # One-hot row for the predicted class, as wide as the rows already in base.
        r = np.zeros(base.shape[1])
        r[d] = 1
        base = np.vstack((base, r))
    return decode_string(base)

def sample_string(model, base, size):
    """Generate `size` characters with `model`, starting from the plain string `base`.

    The string is first encoded with encode_string() using the notebook-level
    `seg_size` and `nb_index`, then handed to sample(), whose result is returned.
    """
    return sample(model, encode_string(base, seg_size, nb_index), size)

Train the model for 10 epochs, saving the current version and printing some generated text after each one.

In [0]:
NUM_EPOCHS = 10              # number of train / preview / export rounds
PREVIEW_LENGTH = 20          # number of characters generated for each preview
SEED_TEXT = 'ceci est un '   # plain-text seed used for the second preview


def _print_previews(model):
    """Print text generated by `model` from the first training window and from SEED_TEXT."""
    print('-' * 50)

    print(*decode_string(X[0]), sep='')
    print(*sample(model, X[0], PREVIEW_LENGTH), sep='')

    print('-' * 50)

    print(SEED_TEXT)
    print(*sample_string(model, SEED_TEXT, PREVIEW_LENGTH), sep='')

    print()


# Show what the model generates before this training run starts.
_print_previews(autoencoder)

for iteration in range(NUM_EPOCHS):
    print('=' * 50)
    print('Iteration', iteration)

    autoencoder.fit(X, Y,
                    epochs=1,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    verbose=1)

    # let's generate some sample strings to see how training is going
    _print_previews(autoencoder)

    # Build the directory name from a compact timestamp: str(datetime.now()) contains
    # a space and colons, and colons are not allowed in file names on Windows.
    stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S-%f')
    export(autoencoder, './model_' + stamp, split=True)

---