Reference:
https://stackabuse.com/text-generation-with-python-and-tensorflow-keras/

### Section 0: Import packages

In [1]:
import numpy as np
import sys
import re
import nltk
# from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
# from keras.utils import np_utils
# from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters


In [2]:
# Will not remove stopwords in this exercise
# import nltk
# nltk.download('stopwords')

### Section 1: Select Training/Validaiton/Test Files

_Modeler Input:_  
* Select a president to build models on  
* Select the percentages of files in training, validation and test sets

In [39]:
from os import listdir
from os.path import isfile, join
# Select a president to build models on
dir_president = "CorpusOfPresidentialSpeeches/obama"
# split_pct = [training_pct, validation_pct, test_pct]
split_pct = [.4, .4, .3]
# Use x number of characters/digits to predict the next character
seq_length = 100
# Set sed number
np.random.seed(266)

Select training/validaiton/test files

In [40]:
# onlyfiles contains a list of files (not directories) under path_president
# Reference: https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
onlyfiles_lst = [f for f in listdir(dir_president) if isfile(join(dir_president, f))]
num_of_files = len(onlyfiles_lst)
# Reference: https://stackoverflow.com/questions/15511349/select-50-items-from-list-at-random-to-write-to-file/39585770
files_train_arr = np.random.choice(onlyfiles_lst, round(num_of_files*split_pct[0]), replace=False)
# Set substraction: https://stackoverflow.com/questions/3428536/python-list-subtraction-operation
files_val_test_lst = list(set(onlyfiles_lst) - set(files_train_arr))
files_val_arr = np.random.choice(files_val_test_lst, round(len(files_val_test_lst)*split_pct[1]/(split_pct[1]+split_pct[2])), replace=False)
files_test_arr = np.array(list((set(files_val_test_lst) - set(files_val_arr))))

print('Training set:')
print(files_train_arr)
print('Validation set:')
print(files_val_arr)
print('Test set:')
print(files_test_arr)

Training set:
['obama_speeches_028.txt' 'obama_speeches_015.txt'
 'obama_speeches_018.txt' 'obama_speeches_045.txt'
 'obama_speeches_035.txt' 'obama_speeches_000.txt'
 'obama_speeches_041.txt' 'obama_speeches_039.txt'
 'obama_speeches_019.txt' 'obama_speeches_001.txt'
 'obama_speeches_003.txt' 'obama_speeches_032.txt'
 'obama_speeches_040.txt' 'obama_speeches_029.txt'
 'obama_speeches_002.txt' 'obama_speeches_014.txt'
 'obama_speeches_005.txt' 'obama_speeches_033.txt'
 'obama_speeches_016.txt']
Validation set:
['obama_speeches_031.txt' 'obama_speeches_043.txt'
 'obama_speeches_021.txt' 'obama_speeches_012.txt'
 'obama_speeches_030.txt' 'obama_speeches_026.txt'
 'obama_speeches_038.txt' 'obama_speeches_047.txt'
 'obama_speeches_022.txt' 'obama_speeches_009.txt'
 'obama_speeches_023.txt' 'obama_speeches_020.txt'
 'obama_speeches_036.txt' 'obama_speeches_017.txt'
 'obama_speeches_008.txt' 'obama_speeches_034.txt'
 'obama_speeches_049.txt']
Test set:
['obama_speeches_027.txt' 'obama_speech

### Section 2: Pre-processing Data so that It Can Be Consumed by _tensorflow.keras.layers.LSTM_

_**Questions**_:
* Why remove special characters?  

In [41]:
def tokenize_words(input_file):
    """
    This function accomplishes four purposes:
    1. Remove the title and date (the first two rows) from the input file
    2. Remove all special characters except for . and ,
    3. Convert all characters to lower case
    4. Tokenize words
    
    Args:
        input_file (str): input file
        
    Returns:
        output_file (str): tokenized strings separated by space
    """
    # Remove the title and date (the first two rows)
    startChar = [word.end() for word in re.finditer("\n",file)][1]
    input2 = input_file[startChar:]
    
    # lowercase everything so that we have less tokens to predict
    #     i.e., no need to distinguish a vs. A
    input2 = input2.lower()

    # Keep all the words and digitis
    # Keep only two special characters: . and ,
    # If we want to keep carriage return, add |\n
    tokenizer = RegexpTokenizer(r'\w+|[\.\,]')
    tokens = tokenizer.tokenize(input2)
    output_file = " ".join(tokens)
    
    return output_file

Define a list of all possible characters and digits in the data

In [42]:
# It's possible that digits in the validation/test sets are not training set
# To make sure every character/digit can be converted to a number 
#     and subsequently scored appropriately for validation/test sets,
# We define chars_lst as all possible characters/digits we can observe from training/validaiton/test sets
# The code below only captures characters/digits in the training set and thus inappropriate
#     chars_lst = sorted(list(set(tokenized_file)))
# Reference: https://stackoverflow.com/questions/16060899/alphabet-range-on-python
chars_lst = [' ',',','.'] + [str(i) for i in range(10)] + [chr(i) for i in range(ord('a'),ord('z')+1)]

_**Question**_:
* I don't understand the logic of converting `X` to float or divided by vocab_len so that all Xs are smaller than 1

In [43]:
def create_x_y_num(input_file, chars_lst, seq_length):
    """
    This function creates raw input data and raw target character.
    
    Args:
        input_file (str): tokenized file
        chars_list (list): a list of all possible characters and digits in the data
        seq_length (int): the number of characters/digits as input
        
    Returns:
        x_data (list): a list of rolling ?-character sequences converted to floats
            number of elements (i.e., sequences) in the list = input_len - seq_length
            every element is an array with dimension (seq_length x 1)
        y_data (list): the next character for every rolling sequence
            number of elements = input_len - seq_length
    """
    
    # input_len - seq_length = the beginning character of the last row of input data
    # vocab_len is used to standardized the input data
    input_len = len(input_file)
    vocab_len = len(chars_lst)
    # print ("Total number of characters:", input_len)
    # print ("Total vocab:", vocab_len)
    
    # Define the dictionary that map characters/digits to numbers
    char_to_num = dict((c, i) for i, c in enumerate(chars_lst))

    # Initialize the data
    x_data_temp = []
    y_data = []
    
    # loop through inputs, start at the beginning and go until we hit
    # the final character we can create a sequence out of
    for i in range(0, input_len - seq_length, 1):
        # Define input and output sequences
        # Input is the current character plus desired sequence length
        in_seq = input_file[i:i + seq_length]

        # Out sequence is the initial character plus total sequence length
        out_seq = input_file[i + seq_length]

        # Convert list of characters to integers 
        x_data_temp.append([char_to_num[char] for char in in_seq])
        y_data.append(char_to_num[out_seq])
        
    # Convert the input sequences 
    #     (a list containning sublists, with each sublist represent a 100-character sequence)
    #     into a processed numpy array that our network can use
    n_patterns = len(x_data_temp)
    x_data_reshape = np.reshape(x_data_temp, (n_patterns, seq_length, 1))

    # Convert intergers into floats 
    # so that the sigmoid activation function our network uses can interpret them and output probabilities from 0 to 1
    x_data = list(x_data_reshape/float(vocab_len))
        
    return x_data, y_data

In [44]:
def combine_x_y(dir_president, files_arr):
    """
    """
    X_temp = []
    Y_temp = []
    for i in range(files_arr.shape[0]):
        file = open(join(dir_president, files_arr[i])).read()
        
        # Tokenize the file
        tokenized_file = tokenize_words(file)
        
        # Create raw x and y for a given file in a format that can be merged with other files
        x_data, y_data = create_x_y_num(tokenized_file, chars_lst, seq_length)
        
        # Use extend not append
        #     append adds an element that's a list itself
        #     extend adds elements from the new list to the existing list
        # Reference: https://stackabuse.com/append-vs-extend-in-python-lists/
        X_temp.extend(x_data)
        Y_temp.extend(y_data)
    
    x = np.array(X_temp)
    # One-hot encode the label data
    y = keras.utils.to_categorical(Y_temp)
    
    return x, y

In [45]:
train_X, train_Y = combine_x_y(dir_president, files_train_arr)
print(train_X.shape)
print(train_Y.shape)

val_X, val_Y = combine_x_y(dir_president, files_val_arr)
print(val_X.shape)
print(val_Y.shape)

test_X, test_Y = combine_x_y(dir_president, files_test_arr)
print(test_X.shape)
print(test_Y.shape)

(422133, 100, 1)
(422133, 39)
(380467, 100, 1)
(380467, 39)
(345024, 100, 1)
(345024, 39)


### Section 3: LSTM

In [49]:
model = keras.Sequential()
model.add(LSTM(256, input_shape=(train_X.shape[1], train_X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(train_Y.shape[1], activation='softmax'))

The default learning rate for adam optimizer is 0.001.  
(Reference: https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam)  
To change the learning rate, see https://www.tensorflow.org/guide/keras/train_and_evaluate (tensor), https://keras.io/optimizers/ (keras)  

_**Note**_: maybe research on the optimizer to use??

In [50]:
# model.compile(loss='categorical_crossentropy', optimizer='adam'(learning_rate=1e-3))
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=0.01))

In [None]:
# Capture fit history
# Reference: https://chrisalbon.com/deep_learning/keras/visualize_loss_history/
history = model.fit(train_X, train_Y, epochs=2, batch_size=256, validation_data=(val_X,val_Y))

Train on 422133 samples, validate on 380467 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/2
 23040/422133 [>.............................] - ETA: 2:04:53 - loss: 3.0399

Reference on `keras.callbacks.ModelCheckpoint`:
https://machinelearningmastery.com/check-point-deep-learning-models-keras/

In [41]:
checkpoint_path = "model_weights_LSTM_character_Angela_attemp2.hdf5"
checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [47]:
# Capture fit history
# Reference: https://chrisalbon.com/deep_learning/keras/visualize_loss_history/
history = model.fit(X, y, epochs=2, batch_size=256, validation_data=(), callbacks=desired_callbacks)

Epoch 1/2
Epoch 00001: loss improved from 2.88151 to 2.87522, saving model to model_weights_LSTM_Angela_attemp2.hdf5
Epoch 2/2
Epoch 00002: loss improved from 2.87522 to 2.87276, saving model to model_weights_LSTM_Angela_attemp2.hdf5


In [43]:
training_loss = history.history['loss']

In [46]:
history.history

{'loss': [2.9931623756142103, 2.8815136720348082]}

In [None]:
# Reference: https://chrisalbon.com/deep_learning/keras/visualize_loss_history/
# Get training and test loss histories
training_loss = history.history['loss']
test_loss = history.history['val_loss']

# Create count of the number of epochs
epoch_count = range(1, len(training_loss) + 1)

# Visualize loss history
plt.plot(epoch_count, training_loss, 'r--')
plt.plot(epoch_count, test_loss, 'b-')
plt.legend(['Training Loss', 'Test Loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show();


loss_history_arr = np.array(loss_history)
np.savetxt("loss_history.txt", loss_history_arr, delimiter=",")

Under the same session, I can continue to train using another `fit`. If the session was restarted or interrupted, to continue the fit see  
* https://stackoverflow.com/questions/45393429/keras-how-to-save-model-and-continue-training  
* https://www.mikulskibartosz.name/save-and-restore-a-tensorflow-model-using-keras-for-continuous-model-training/  
(Haven't implemented it yet).

In [22]:
# Reference: https://www.tensorflow.org/tutorials/keras/save_and_load
# No checkpoint needed to save the model
checkpoint_path = "model_weights_LSTM_character_Angela_attemp2.hdf5"
checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='loss', verbose=1, save_best_only=True, mode='min')
model.load_weights(checkpoint_path)
desired_callbacks = [checkpoint]
history = model.fit(X, y, epochs=1, batch_size=256, callbacks=desired_callbacks)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 00001: loss improved from inf to 2.87206, saving model to model_weights_LSTM_Angela_attemp2.hdf5


In [23]:
model.save('model_LSTM_character_Angela_attemp2.hdf5') 

In [46]:
# Recreate the exact same model, including its weights and the optimizer
new_model = keras.models.load_model('model_LSTM_character_Angela_attemp2.hdf5')

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [47]:
history = new_model.fit(X, y, epochs=2, batch_size=256, callbacks=desired_callbacks)

Epoch 1/2
Epoch 00001: loss improved from 2.87206 to 2.87085, saving model to model_weights_LSTM_Angela_attemp2.hdf5
Epoch 2/2
Epoch 00002: loss improved from 2.87085 to 2.86386, saving model to model_weights_LSTM_Angela_attemp2.hdf5


In [21]:
model.fit(X, y, epochs=20, batch_size=256, callbacks=desired_callbacks)

Epoch 1/20

Epoch 00001: loss improved from 2.45532 to 2.43682, saving model to model_weights_saved.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.43682 to 2.40654, saving model to model_weights_saved.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.40654 to 2.37607, saving model to model_weights_saved.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.37607 to 2.35417, saving model to model_weights_saved.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.35417 to 2.31597, saving model to model_weights_saved.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.31597 to 2.27857, saving model to model_weights_saved.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.27857 to 2.24322, saving model to model_weights_saved.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.24322 to 2.20737, saving model to model_weights_saved.hdf5
Epoch 9/20

Epoch 00009: loss improved from 2.20737 to 2.17077, saving model to model_weights_saved.hdf5
Epoch 10/20

Epoch 00010: loss improved from 2.17077 to

<tensorflow.python.keras.callbacks.History at 0x19f56d00080>

In [22]:
model.fit(X, y, epochs=20, batch_size=256, callbacks=desired_callbacks)

Epoch 1/20

Epoch 00001: loss improved from 1.68699 to 1.62925, saving model to model_weights_saved.hdf5
Epoch 2/20

Epoch 00002: loss improved from 1.62925 to 1.58411, saving model to model_weights_saved.hdf5
Epoch 3/20

Epoch 00003: loss improved from 1.58411 to 1.55259, saving model to model_weights_saved.hdf5
Epoch 4/20

Epoch 00004: loss improved from 1.55259 to 1.48358, saving model to model_weights_saved.hdf5
Epoch 5/20

Epoch 00005: loss improved from 1.48358 to 1.43740, saving model to model_weights_saved.hdf5
Epoch 6/20

Epoch 00006: loss improved from 1.43740 to 1.40468, saving model to model_weights_saved.hdf5
Epoch 7/20

Epoch 00007: loss improved from 1.40468 to 1.37305, saving model to model_weights_saved.hdf5
Epoch 8/20

Epoch 00008: loss improved from 1.37305 to 1.30448, saving model to model_weights_saved.hdf5
Epoch 9/20

Epoch 00009: loss improved from 1.30448 to 1.28089, saving model to model_weights_saved.hdf5
Epoch 10/20

Epoch 00010: loss improved from 1.28089 to

<tensorflow.python.keras.callbacks.History at 0x19f02f97978>

In [None]:
model.fit(X, y, epochs=40, batch_size=256, callbacks=desired_callbacks)

Epoch 1/40

Epoch 00001: loss improved from 0.86370 to 0.83903, saving model to model_weights_saved.hdf5
Epoch 2/40

Epoch 00002: loss improved from 0.83903 to 0.80507, saving model to model_weights_saved.hdf5
Epoch 3/40

Epoch 00003: loss improved from 0.80507 to 0.76631, saving model to model_weights_saved.hdf5
Epoch 4/40

Epoch 00004: loss improved from 0.76631 to 0.73677, saving model to model_weights_saved.hdf5
Epoch 5/40

Epoch 00005: loss improved from 0.73677 to 0.71900, saving model to model_weights_saved.hdf5
Epoch 6/40

Epoch 00006: loss improved from 0.71900 to 0.68887, saving model to model_weights_saved.hdf5
Epoch 7/40

Epoch 00007: loss improved from 0.68887 to 0.65578, saving model to model_weights_saved.hdf5
Epoch 8/40

Epoch 00008: loss improved from 0.65578 to 0.62784, saving model to model_weights_saved.hdf5
Epoch 9/40

Epoch 00009: loss improved from 0.62784 to 0.61905, saving model to model_weights_saved.hdf5
Epoch 10/40

Epoch 00010: loss improved from 0.61905 to

<tensorflow.python.keras.callbacks.History at 0x19f02f97668>

In [None]:
model.fit(X, y, epochs=40, batch_size=256, callbacks=desired_callbacks)

Epoch 1/40

Epoch 00001: loss improved from 0.22224 to 0.21859, saving model to model_weights_saved.hdf5
Epoch 2/40

Epoch 00002: loss improved from 0.21859 to 0.20716, saving model to model_weights_saved.hdf5
Epoch 3/40

Epoch 00003: loss improved from 0.20716 to 0.19415, saving model to model_weights_saved.hdf5
Epoch 4/40

Epoch 00004: loss improved from 0.19415 to 0.19152, saving model to model_weights_saved.hdf5
Epoch 5/40

Epoch 00005: loss improved from 0.19152 to 0.17600, saving model to model_weights_saved.hdf5
Epoch 6/40

Epoch 00006: loss improved from 0.17600 to 0.17551, saving model to model_weights_saved.hdf5
Epoch 7/40

Epoch 00007: loss did not improve from 0.17551
Epoch 8/40

Epoch 00008: loss improved from 0.17551 to 0.17171, saving model to model_weights_saved.hdf5
Epoch 9/40

Epoch 00009: loss improved from 0.17171 to 0.15912, saving model to model_weights_saved.hdf5
Epoch 10/40

Epoch 00010: loss did not improve from 0.15912
Epoch 11/40

Epoch 00011: loss improved f

<tensorflow.python.keras.callbacks.History at 0x19f5b6074a8>

To capture loss history, see  
* https://stackoverflow.com/questions/38445982/how-to-log-keras-loss-output-to-a-file
* https://forums.fast.ai/t/passing-multiple-callbacks-in-keras-early-stopping-modelcheckpoint-lrratescheduler/5477  
(Haven't implemented it yet)

In [25]:
checkpoint_path = "model_weights_LSTM_character_Angela_attemp2.hdf5"
model.load_weights(checkpoint_path)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [26]:
num_to_char = dict((i, c) for i, c in enumerate(chars_lst))

In [27]:
start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
"  defeat seek peace security support wondered america beacon still burns bright tonight proved true s "


In [28]:
# pattern

In [29]:
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]

    sys.stdout.write(result)

    pattern.append(index)
    pattern = pattern[1:len(pattern)]

trength nation comes might arms scale wealth enduring power ideals democracy liberty opportunity unyielding hope true genius america america change union perfected already achieved gives us hope must achieve tomorrow election many firsts many stories told generations one mind tonight woman cast ballot atlanta lot like millions others stood line make voice heard election except one thing ann nixon cooper 106 years old born generation past slavery time cars road planes sky someone like vote two reasons woman color skin tonight think seen throughout century america heartache hope struggle progress times told people pressed american creed yes time women voices silenced hopes dismissed lived see stand speak reach ballot yes despair dust bowl depression across land saw nation conquer fear new deal new jobs new sense common purpose yes bombs fell harbor tyranny threatened world witness generation rise greatness democracy saved yes buses montgomery hoses birmingham bridge selma preacher atlant

_**Note**_: Should look into Tensorboard