In [None]:
from keras.utils.data_utils import get_file
import string 
import numpy as np
import tensorflow as tf 

## Data

In [2]:
SRC = "https://s3.amazonaws.com/text-datasets/nietzsche.txt"
DST = "/Users/joshua.newnham/Documents/Shared Playground Data/RNN_Char_Data/nietzsche.txt"

dl_path = get_file(fname=DST, origin=SRC)

with open(dl_path, 'r') as f:
    text = f.read()

print("Loaded text file with {} characters".format(len(text)))

Loaded text file with 600893 characters


Create a vocabulary (which we will turn into a one-hot encoding vector and used to encode the inputs and outputs) 

In [3]:
vocab = sorted(set(text))

print("Vocabulary size {}".format(len(vocab)))

Vocabulary size 84


In [4]:
print(vocab)

['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Æ', 'ä', 'æ', 'é', 'ë']


Remove character that are not deemed useful 

In [5]:
vocab = [c for c in vocab if c in string.printable]

print("Filtered vocabulary size {}".format(len(vocab)))

Filtered vocabulary size 79


Add **unknown** token to our vocab (which will be used to replace anything we don't have) 

In [6]:
UNKNOWN_CHAR = "|"
vocab.insert(0, UNKNOWN_CHAR)

In [7]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

Add **padding** token to our vocab (which is used to *pad* out a sequence, if required)

In [8]:
# PADDING_CHAR = '\f'
# vocab.insert(0, PADDING_CHAR)

Create lookup dictionaries 

In [9]:
idx_2_char = {idx : char for idx, char in enumerate(vocab)}
char_2_idx = {char : idx for idx, char in enumerate(vocab)}

In [10]:
def encode_char(c, char_2_idx):
    if c not in char_2_idx:
        c = UNKNOWN_CHAR
    
    encoding = char_2_idx[c]
    vec = np.zeros((len(char_2_idx), ), dtype=np.int32)
    vec[encoding] = 1 
    return vec

One-hot encode the whole dataset 

In [11]:
data = np.array([encode_char(c, char_2_idx) for c in text])

In [12]:
data.shape

(600893, 80)

Let's now create our training set (X, y); where X is our sequence length (that we feed into the model) and y represents the last character that follows from X (what we want our model to predict).  
For example: 

**Sample 1:** X = "The quick bro" and y = "w"   
**Sample 2:** X = "he quick brow" and y = "n"  
**...** 

In [68]:
SEQ_LEN = 25
STRIDE = 1 

X = []
Y = []

for i in range(0, data.shape[0] - (SEQ_LEN + 1), STRIDE):
    data_x = data[i:i + SEQ_LEN,:]
    data_y = data[i+SEQ_LEN:i+SEQ_LEN+1,:]
    
    X.append(data_x)
    Y.append(data_y)
        
X = np.stack(X)
Y = np.stack(Y)

print("X shape {}, Y shape {}".format(X.shape, Y.shape))

X shape (600867, 25, 80), Y shape (600867, 1, 80)


In [69]:
Y = Y.reshape((Y.shape[0], Y.shape[-1]))
Y.shape

(600867, 80)

Split our data into a training and validation set 

In [70]:
train_count = int(X.shape[0] * 0.8) 
valid_count = X.shape[0] - train_count

train_X = X[:train_count]
train_y = Y[:train_count]

valid_X = X[train_count:]
valid_y = Y[train_count:]

print("Train {} {}, Valid {} {}".format(train_X.shape, valid_y.shape, valid_X.shape, valid_y.shape))

Train (480693, 25, 80) (120174, 80), Valid (120174, 25, 80) (120174, 80)


## Create and train our model 

In [112]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(256, input_shape=(X.shape[1], X.shape[2]), activation=None))
model.add(tf.keras.layers.Activation(activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(Y.shape[-1], activation=None))
model.add(tf.keras.layers.Activation(activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 256)               345088    
_________________________________________________________________
activation_8 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 80)                20560     
_________________________________________________________________
activation_9 (Activation)    (None, 80)                0         
Total params: 365,648
Trainable params: 365,648
Non-trainable params: 0
_________________________________________________________________


In [113]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [114]:
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'lstm_checkpoint.h5', 
    monitor='val_loss', 
    verbose=0, 
    save_best_only=True, 
    save_weights_only=True, 
    mode='auto', 
    period=3)
    
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    patience=5)

model.fit(
    valid_X, valid_y, 
    batch_size=64, 
    validation_data=(valid_X, valid_y), 
    epochs=1000, 
    callbacks=[checkpoint, early_stopping])

Train on 120174 samples, validate on 120174 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000


<tensorflow.python.keras.callbacks.History at 0xb41223dd8>

In [None]:
model.save_weights('lstm_256_20190118.h5')

## Test 

In [115]:
def prepare_input(text):
    # Vectorise text 
    x = [encode_char(c, char_2_idx) for c in text]
    # Apply left padding 
    if len(x) > SEQ_LEN:
        x = x[:SEQ_LEN]
    elif len(x) < SEQ_LEN:
        diff = SEQ_LEN - len(x)         
        padding = [np.zeros_like(x[0])] * diff 
        x = padding + x
    
    x = np.array(x) 
    return x

In [116]:
x = prepare_input("hello ther")

In [117]:
x.shape

(25, 80)

In [118]:
def predict_next_word(text):
    x = prepare_input(text) 
    x = np.expand_dims(x, axis=0)
    prediction = model.predict(x)
    vec = prediction[0] 
    vocab_idx = vec.argmax()
    c = idx_2_char[vocab_idx]
    return c 

In [125]:
predict_next_word("th")

'i'