# Model 10

This model is the kitchen sink: a 1D Conv layer, then a GRU layer, then another GRU layer, then a Dense layer with a softmax activation, trained to predict the next character from the preceding characters.  The dataset is Pride and Prejudice by Jane Austen, which contains around 680,000 characters.  The sequence length is short (20 chars) and the training set size is large (150k samples).  No dropout is used.

## Imports and Constants, etc.

In [None]:
import datetime
import random
import projd
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import importlib
from keras.layers import Dense, SimpleRNN, Input, Conv1D, LSTM, GRU
from keras.models import Model
import keras
import numpy as np

from IPython.display import SVG # visualize model
from keras.utils.vis_utils import model_to_dot # visualize model


from pathlib import Path

# Make the project's local modules importable from this notebook.
src_dir = str(Path(projd.cwd_token_dir('notebooks')) / 'src') # $PROJECT_ROOT/src
if src_dir not in sys.path:
    sys.path.append(src_dir)

import datagen
import load
# Reload so that edits to the local modules are picked up when this cell is re-run.
importlib.reload(datagen)
importlib.reload(load)
%matplotlib inline
sns.set()

# Root directory for the data, with sub-directories for saved models and training logs.
data_dir = Path('/data2/uvm_deep_learning_homework2')
models_dir = data_dir / 'models'
logs_dir = data_dir / 'logs'

GEN_STRIDE = 3 # for generation of overlapping text substrings
EPOCHS=100 # number of epochs passed to model.fit
BATCH_SIZE=128 # batch size passed to model.fit
NUM_GEN_TEXT_SAMPLES = 1 # number of generated text samples to create per epoch sampled
GEN_SAMPLE_LEN = 80 # length of generated text samples
SEQ_LEN = 20 # sequence length
NUM_SEQS = 150000 # maximum number of sequences used for training
VOCAB_SIZE = 256 # size of the one-hot character encoding

n_a = 128 # number of hidden units in each GRU layer
n_a2 = 64 # number of filters in the convolutional layer

# Prefix of the checkpoint and log file names.
# NOTE(review): the name says "lstm" although build_model uses GRU layers; the
# saved file names depend on this string, so confirm before renaming.
model_name = f'model_10_conv_lstm_{n_a}_{SEQ_LEN}_{GEN_STRIDE}'


## Train and Validation Sets

- Load Preprocessed Datasets
- Divide into training and validation sets


### Load Vectorized Datasets

The jokes, names, and book datasets are preprocessed in the following manner:

- The text of the dataset is combined, lowercased, and white-space normalized.
- The cleaned text is split into overlapping strings of length SEQ_LEN.  
  They overlap by (SEQ_LEN - GEN_STRIDE) characters.
- The characters are converted to integers (via ISO Latin 1 encoding) and then 1-hot encoded
- The y value corresponding to every sequence in x is the one-hot encoded character immediately 
  following the sequence in the text 
- The result is an X shape of (m, SEQ_LEN, VOCAB_SIZE) and a Y shape of (m, VOCAB_SIZE).

In [None]:
# Load the normalized text of the 'pride' dataset from data_dir.
text = datagen.get_normalized_text(choice='pride', data_dir=data_dir)
# Convert the text into input/target tensors.
# Presumably x is (m, SEQ_LEN, VOCAB_SIZE) and y is (m, VOCAB_SIZE), as described
# in the notes above -- TODO confirm against datagen.text_to_tensors.
x, y = datagen.text_to_tensors(text, SEQ_LEN, GEN_STRIDE, VOCAB_SIZE, num_seqs=NUM_SEQS)

# Confirm that the shape looks right.
print(x.shape, y.shape)

## Build Model


In [None]:
def build_model(n_x=VOCAB_SIZE, n_y=VOCAB_SIZE, n_a=n_a, n_a2=n_a2, n_t=SEQ_LEN):
    '''
    Build and compile the Conv1D -> GRU -> GRU -> Dense(softmax) network.

    n_x: number of input features per time step (the one-hot vocabulary size).
    n_y: number of output classes.  Equal to n_x for next character prediction.
    n_a: number of hidden units in each of the two GRU layers.
    n_a2: number of filters in the Conv1D layer (kernel width 3, relu activation).
    n_t: number of time steps (characters) in each input sequence.
    returns: the compiled Model (adam optimizer, categorical cross-entropy loss,
        accuracy metric).
    '''
    # one-hot encoded character sequence
    inputs = Input(shape=(n_t, n_x))
    conv_features = Conv1D(n_a2, (3,), activation='relu')(inputs)
    # the first GRU emits its output at every step so the second GRU receives a sequence
    gru_sequence = GRU(n_a, return_sequences=True)(conv_features)
    gru_final = GRU(n_a)(gru_sequence)
    # distribution over the next character
    next_char_probs = Dense(n_y, activation='softmax')(gru_final)

    net = Model(inputs=inputs, outputs=next_char_probs)
    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return net
    
model = build_model()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()
# Last expression of the cell: renders the model graph as an inline SVG.
SVG(model_to_dot(model).create(prog='dot', format='svg'))

## Train and Evaluate Model

- Add callbacks to save the model after every epoch (the checkpoint callback uses `period=1`) and to log performance stats every epoch, so we have the results saved somewhere for charting.


In [None]:
# Callbacks configured in this cell: ModelCheckpoint, EarlyStopping and CSVLogger.
# Save the full model (not only the weights) with the epoch number in the file name.
model_path = models_dir  /  (model_name +'_{epoch:02d}.h5')
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    str(model_path), monitor='val_loss', verbose=1, save_best_only=False, save_weights_only=False, 
    mode='auto', period=1)
# Stop when validation loss stops improving
# NOTE(review): early_cb is created but is not in the callbacks list passed to
# model.fit below, so early stopping is not active -- confirm this is intended.
early_cb = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1, mode='auto')
# Save per-epoch metrics to a CSV file whose name includes the current timestamp
log_path = logs_dir / (model_name + '_' + datetime.datetime.now().isoformat() + '_log.csv')
log_cb = keras.callbacks.CSVLogger(str(log_path), separator=',', append=False)

# Train with a quarter of the samples held out for validation (validation_split=0.25).
history = model.fit(x, y, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.25, 
                    callbacks=[checkpoint_cb, log_cb])


## Visualize Training Progress

In [None]:
# Read the per-epoch metrics back from the CSV file that log_cb wrote to log_path.
metrics = pd.read_csv(log_path)

In [None]:
# Show every 10th logged row followed by the final row.
every_tenth = metrics.iloc[::10]
final_row = metrics.tail(1)
print(pd.concat([every_tenth, final_row]))

In [None]:
# Accuracy curves (training, then validation) with the y-axis pinned to 0..1
axes = plt.gca()
axes.set_ylim([0.0, 1.0])
for column in ("acc", "val_acc"):
    plt.plot(metrics[column])
plt.legend(['Training Accuracy', "Validation Accuracy"])
plt.show()

# Loss curves (training, then validation) with default axis limits
for column in ("loss", "val_loss"):
    plt.plot(metrics[column])
plt.legend(['Training Loss', "Validation Loss"])
plt.show()



## Show Effect of Training on Text Generation

Use models from different training epochs to generate text.


In [None]:
def get_model_path(model_name, epoch):
    '''
    Return the path under models_dir of the checkpoint file for the given epoch.

    model_name: prefix of the checkpoint file name.
    epoch: epoch number; zero-padded to at least two digits in the file name.
    '''
    filename = '{}_{:02d}.h5'.format(model_name, epoch)
    return models_dir / filename


def weighted_sample(probs):
    '''
    Draw one random column index per row, weighted by that row's probabilities.

    probs: 2d array-like where each row is a separate probability distribution
        for the next character.  Example:
        [[0.8, 0.1, 0.1],
         [0.2, 0.5, 0.3]]
        Rows whose total is not exactly 1 are rescaled by their total.
    returns: 1d integer array with one sampled column index per row.
    '''
    # np.random.choice(len(preds), p=preds) has no axis argument, so all rows are
    # sampled at once with a cumulative-distribution lookup instead:
    # https://stackoverflow.com/questions/40474436/how-to-apply-numpy-random-choice-to-a-matrix-of-probability-values-vectorized-s
    # Accept nested lists as well as arrays, and accumulate in float64.
    probs = np.asarray(probs, dtype=np.float64)
    # c holds the cumulative distribution of each row.
    c = probs.cumsum(axis=1)
    # Rescale each row so that its final cumulative value is exactly 1.  If the
    # total falls short of 1 (e.g. through rounding), a draw landing in the gap
    # matches no column and argmax would silently return index 0.
    # Rows that sum to 0 are left untouched.
    totals = c[:, -1:].copy()
    np.divide(c, totals, out=c, where=totals > 0)
    # Generate one uniformly distributed sample per row...
    u = np.random.rand(len(c), 1)
    # ...and pick the first column whose cumulative probability exceeds it.
    choices = (u < c).argmax(axis=1)
    return choices
        
    
def max_sample(probs):
    '''
    Return the index of the largest value along the last axis of probs,
    i.e. the greedy (most probable) choice for each distribution.
    '''
    best = np.argmax(probs, axis=-1)
    return best


def seed_text(text, seq_len):
    '''
    Return a randomly positioned substring of text that is seq_len characters long.

    text: the string to sample from.
    seq_len: length of the substring to return.
    raises: ValueError if text is shorter than seq_len.
    '''
    last_start = len(text) - seq_len
    if last_start < 0:
        raise ValueError(
            'text has {} characters, which is shorter than seq_len={}'.format(len(text), seq_len))
    # randint's upper bound is exclusive; add 1 so that the final window can be
    # chosen and so that a text of exactly seq_len characters is accepted.
    start = np.random.randint(0, last_start + 1)
    return text[start:(start + seq_len)]


def generate_text_for_epochs(model_name, epochs, text, seq_len, vocab_size, num_samples, sample_len):
    '''
    Print generated text samples using the model checkpoint saved at each given epoch.

    model_name: prefix of the checkpoint file names, as used by get_model_path.
    epochs: iterable of epoch numbers; a checkpoint file must exist for each one.
    text: source text from which generate_text draws a random seed sequence.
    seq_len: length of the seed sequence.
    vocab_size: vocabulary size, passed through to generate_text.
    num_samples: number of samples to generate and print per epoch.
    sample_len: number of characters to generate per sample.
    '''
    for epoch in epochs:
        # Pass a plain str to load_model, matching how the checkpoint path is
        # converted with str() when it is handed to ModelCheckpoint for saving.
        path = str(get_model_path(model_name, epoch))
        model = keras.models.load_model(path)
        print('Epoch {}:'.format(epoch))
        for _ in range(num_samples):
            # only the generated text is printed; the seed is not shown
            _seed, sample = generate_text(model, text, seq_len, vocab_size, sample_len)
            print(sample)


def generate_text(model, text, seq_len, vocab_size, output_len):
    '''
    Generate output_len characters with model, one character at a time.

    model: object whose predict() returns next-character probabilities for a
        batch of encoded sequences.
    text: source text from which a random seed of seq_len characters is drawn.
    seq_len: length of the sliding input window.
    vocab_size: vocabulary size used to look up the char <-> int mappings.
    output_len: number of characters to generate.
    returns: (seed, output) -- the seed string and the generated string.
    '''
    int_to_char = datagen.get_int_to_char(vocab_size)
    char_to_int = datagen.get_char_to_int(vocab_size)
    # random slice of the text used to prime the generation of next characters
    seed = seed_text(text, seq_len)
    window = seed
    generated = []
    for _ in range(output_len):
        # encode the current window as a batch of one sequence
        # (presumably shape (1, seq_len, vocab_size) -- confirm against datagen)
        batch = datagen.sequences_to_tensor([window], seq_len, char_to_int)
        # probabilities for the single sequence in the batch
        next_probs = model.predict(batch)[0]
        # sample the next character index according to those probabilities
        next_idx = np.random.choice(len(next_probs), p=next_probs)
        next_char = int_to_char[next_idx]
        generated.append(next_char)
        # slide the window: drop the oldest character, append the new one
        window = window[1:] + next_char
    return seed, ''.join(generated)


In [None]:
# Print generated samples from the checkpoints saved at epochs 20, 40, 60, 80 and 100;
# a checkpoint file must exist under models_dir for each of these epochs.
generate_text_for_epochs(model_name, [20, 40, 60, 80, 100], text, SEQ_LEN, VOCAB_SIZE, NUM_GEN_TEXT_SAMPLES, GEN_SAMPLE_LEN)