# Model 1

The initial model is a basic RNN.

## Imports and Constants, etc.

In [None]:
import projd
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import importlib
from keras.layers import Dense, SimpleRNN, Input
from keras.models import Model
import keras

from IPython.display import SVG # visualize model
from keras.utils.vis_utils import model_to_dot # visualize model


from pathlib import Path

# for importing local code
src_dir = str(Path(projd.cwd_token_dir('notebooks')) / 'src') # $PROJECT_ROOT/src
if src_dir not in sys.path:
    sys.path.append(src_dir)

import config
import datagen
importlib.reload(config)
%matplotlib inline
sns.set()

# --- Data generation ---
# Step between the starts of consecutive (overlapping) text substrings.
GEN_STRIDE = 20
# Size of the one-hot character vocabulary.
VOCAB_SIZE = 256

# --- Training ---
EPOCHS = 10
BATCH_SIZE = 32
# Sample model output (generated text) every N epochs.
SAMPLE_MODEL_EPOCHS = 20
# Strength of the L2 weight penalty.
ALPHA_REGULARIZER = 0.05

# --- Model dimensions ---
n_x = VOCAB_SIZE  # input features: the current character
n_y = VOCAB_SIZE  # output features: the predicted next character
n_a = 128  # hidden units
n_t = 40  # sequence length

# Encodes the hyper-parameters that distinguish saved models and logs.
model_name = f'model01_rnn_{n_a}_{n_t}_{GEN_STRIDE}'


## Train and Validation Sets

- Load Preprocessed Datasets
- Divide into training and validation sets


### Load Vectorized Datasets

The jokes, names, and book datasets are preprocessed in the following manner:

- The text of the dataset is combined, lowercased, and white-space normalized.
- The cleaned text is split into overlapping strings of length n_t.  
  They overlap by (n_t - GEN_STRIDE) characters.
- The characters are converted to integers (via ISO Latin 1 encoding) and then one-hot encoded.
- The y/output sequences are the x sequences shifted left by one character, with a space character
  appended to the end so that x and y have the same sequence length.
- The result is an X shape of (m, n_t, n_x) and a Y shape of (m, n_t, n_y).

In [None]:
# Load the vectorized inputs (x) and next-character targets (y).
# Expected shapes per the dataset notes: x is (m, n_t, n_x), y is (m, n_t, n_y).
x, y = datagen.get_tensors(n_t, VOCAB_SIZE, GEN_STRIDE)

In [None]:
# Sanity-check the shapes of the loaded tensors.
print(x.shape, y.shape)

### Divide Datasets Into Train and Validation

In [None]:
# Hold out 25% of the sequences for validation; the fixed seed keeps the
# shuffled split reproducible between runs.
train_seed = 1
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=train_seed,
    shuffle=True,
)

In [None]:
# Confirm the sizes of the training and validation splits.
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

## Build Model


In [None]:
def build_model(n_x=n_x, n_y=n_y, n_a=n_a, n_t=n_t, alpha=None):
    '''
    Build and compile a single-layer SimpleRNN model that emits a softmax
    over the vocabulary at every timestep.

    n_x: number of input features.  The size of the vocabulary.  Each char is one-hot encoded
    n_y: number of output features.  The same as n_x for next character prediction.
    n_a: number of hidden units in rnn layer
    n_t: the length of each sequence.
    alpha: L2 penalty applied to the kernel of the RNN layer and of the output layer.
           When None (the default), the module-level ALPHA_REGULARIZER is used,
           looked up at call time.

    Returns the compiled keras Model (adam optimizer, categorical cross-entropy
    loss, accuracy metric).
    '''
    if alpha is None:
        alpha = ALPHA_REGULARIZER

    ## the input is a sequence of characters that have been one-hot encoded.
    x_input = Input(shape=(n_t, n_x))
    # return_sequences=True keeps one hidden state per timestep, so the Dense
    # layer below produces a prediction for every position in the sequence.
    x = SimpleRNN(n_a, return_sequences=True, kernel_regularizer=keras.regularizers.l2(alpha))(x_input)
    y = Dense(n_y, activation='softmax', kernel_regularizer=keras.regularizers.l2(alpha))(x)

    model = Model(inputs=x_input, outputs=y)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
    
model = build_model()
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" after the table.
model.summary()
# Keep the SVG as the cell's last expression so the notebook renders the graph.
SVG(model_to_dot(model).create(prog='dot', format='svg'))

## Train and Evaluate Model

- Add callbacks to save the model after every epoch (the checkpoint callback below uses `period=1`) and to log performance stats every epoch, so the results are saved somewhere for charting.


In [None]:
# Filename template for the checkpoint callback; the '{epoch:02d}' placeholder
# is left unformatted here so it can be filled in per epoch.
model_path = config.models_dir  /  (model_name +'_{epoch:02d}.h5')


def get_model_path(model_name, epoch):
    '''
    Return the checkpoint path for `model_name` at the given epoch.

    The filename mirrors the `model_path` template: '<model_name>_<epoch>.h5'
    with the epoch zero-padded to two digits.  The underscore separator is
    required for the result to match the files written from that template.
    '''
    return config.models_dir / f'{model_name}_{epoch:02d}.h5'

# Callbacks include ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping
# Save the full model (not just weights) after every epoch, whether or not
# val_loss improved (save_best_only=False, period=1).
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    str(model_path), monitor='val_loss', verbose=1, save_best_only=False, save_weights_only=False,
    mode='auto', period=1)
# Stop when validation loss stops improving
# NOTE(review): early_cb is constructed but is not passed to fit() below, so
# early stopping is currently inactive -- confirm whether that is intentional.
early_cb = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1, mode='auto')
# Save logs to logfile
log_path = config.logs_dir / (model_name + '_log.csv')
log_cb = keras.callbacks.CSVLogger(str(log_path), separator=',', append=False)

# Train against y_train (the targets paired with x_train by the split above),
# not the inputs themselves, and take the epoch count from the EPOCHS constant.
history = model.fit(x_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data=(x_val, y_val),
                    callbacks=[checkpoint_cb, log_cb])
#model.evaluate()

## Visualize Training Progress

In [None]:
# Reload the per-epoch training log from log_path (the file the CSVLogger
# callback was configured to write).
metrics = pd.read_csv(log_path)

In [None]:
# Show the last ten logged rows.
metrics.tail(10)

In [None]:
def _plot_metric_pair(train_col, val_col, legend_labels):
    # Draw the training and validation curves for one metric in a single figure.
    plt.plot(metrics[train_col])
    plt.plot(metrics[val_col])
    plt.legend(legend_labels)
    plt.show()


# One figure for accuracy, then one for loss.
_plot_metric_pair("acc", "val_acc", ['Training Accuracy', "Validation Accuracy"])
_plot_metric_pair("loss", "val_loss", ['Training Loss', "Validation Loss"])



## Generate Text Periodically During Training

In [None]:
def generate_text_for_epochs(model_name, epochs):
    '''
    Load the saved model for each requested epoch and print text generated by it.

    model_name: name prefix of the saved model files.
    epochs: iterable of epoch numbers whose saved models should be sampled.
    '''
    for epoch in epochs:
        # model_path is a Path template, not a callable; get_model_path builds
        # the concrete per-epoch path.
        path = get_model_path(model_name, epoch)
        # Pass a plain string, as the rest of this file does when handing paths to keras.
        model = keras.models.load_model(str(path))
        text = generate_text(model)
        print('Epoch:', epoch)
        print('Generated Text:')
        print(text)

def generate_text(model):
    # NOTE(review): as far as the visible source shows, this looks like an
    # unfinished stub -- `length` is not defined anywhere visible and
    # predict() is called without any input.  Confirm and complete before use.
    for i in range(length):
        model.predict()