Reference:
https://stackabuse.com/text-generation-with-python-and-tensorflow-keras/

### Section 0: Import packages

In [1]:
import numpy
import sys
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow import keras
# from keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
# from keras.utils import np_utils
# from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Angela\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Num GPUs Available:  0


In [2]:
tf.test.is_gpu_available()

False

In [4]:
import tensorflow-gpu

SyntaxError: invalid syntax (<ipython-input-4-9405d16d83c1>, line 1)

### Section 1: Load data

In [3]:
file = open("CorpusOfPresidentialSpeeches/obama/obama_speeches_000.txt").read()

### Section 2: Create input data to LSTM

_**Questions**_:
* Why convert all to lower case and removing special characters?  
* Why remove stop words?  

In [4]:
def tokenize_words(input):
    # Remove the title and date (the first two row)
    startChar = [word.end() for word in re.finditer("\n",file)][1]
    input2 = input[startChar:]
    
    # lowercase everything to standardize it
    input2 = input2.lower()

    # instantiate the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input2)

    # if the created token isn't in the stop words, make it part of "filtered"
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(filtered)

In [5]:
# preprocess the input data, make tokens
processed_inputs = tokenize_words(file)

In [6]:
processed_inputs

'anyone still doubts america place things possible still wonders dream founders alive time still questions power democracy tonight answer answer told lines stretched around schools churches numbers nation never seen people waited three hours four hours many first time lives believed time must different voice could difference answer spoken young old rich poor democrat republican black white latino asian native american gay straight disabled disabled americans sent message world never collection red states blue states always united states america answer led told long many cynical fearful doubtful achieve put hands arc history bend toward hope better day long time coming tonight day election defining moment change come america received gracious call senator mccain fought long hard campaign fought even longer harder country loves endured sacrifices america us cannot begin imagine better service rendered brave selfless leader congratulate governor palin achieved look forward working renew n

_**Question**_:
* Shall we add all other numbers to the dictionary below?  

_**Note**_:
* Need to find paper to justify choosing character-level generation over word-level generation. OR we can try both

In [7]:
# convert the characters in our input to number
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c, i) for i, c in enumerate(chars))
char_to_num

{' ': 0,
 '0': 1,
 '1': 2,
 '6': 3,
 'a': 4,
 'b': 5,
 'c': 6,
 'd': 7,
 'e': 8,
 'f': 9,
 'g': 10,
 'h': 11,
 'i': 12,
 'j': 13,
 'k': 14,
 'l': 15,
 'm': 16,
 'n': 17,
 'o': 18,
 'p': 19,
 'q': 20,
 'r': 21,
 's': 22,
 't': 23,
 'u': 24,
 'v': 25,
 'w': 26,
 'x': 27,
 'y': 28,
 'z': 29}

In [8]:
# We need the total length of our inputs and total length of our set of characters 
# for later data prep, so we'll store these in a variable.
input_len = len(processed_inputs)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 6029
Total vocab: 30


In [9]:
# Initialize the data
seq_length = 100
x_data = []
y_data = []

In [10]:
# Go through the entire list of inputs and convert the characters to numbers

# loop through inputs, start at the beginning and go until we hit
# the final character we can create a sequence out of
for i in range(0, input_len - seq_length, 1):
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = processed_inputs[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = processed_inputs[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [11]:
# x_data

In [12]:
# Check my understanding
i = 10
in_seq = processed_inputs[i:i + seq_length]
out_seq = processed_inputs[i + seq_length]
print(in_seq)
print('-----')
print(out_seq)

ll doubts america place things possible still wonders dream founders alive time still questions powe
-----
r


In [13]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 5929


_**Question**_:
* I don't understand the logic of converting `X` to float

In [14]:
# convert our input sequences into a processed numpy array that our network can use
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
# convert the numpy array values into floats so that the sigmoid activation function our network uses can interpret them and output probabilities from 0 to 1
X = X/float(vocab_len)

In [15]:
# one-hot encode our label data
y = keras.utils.to_categorical(y_data)

In [18]:
list(X)[1]

array([[0.56666667],
       [0.93333333],
       [0.6       ],
       [0.56666667],
       [0.26666667],
       [0.        ],
       [0.73333333],
       [0.76666667],
       [0.4       ],
       [0.5       ],
       [0.5       ],
       [0.        ],
       [0.23333333],
       [0.6       ],
       [0.8       ],
       [0.16666667],
       [0.76666667],
       [0.73333333],
       [0.        ],
       [0.13333333],
       [0.53333333],
       [0.26666667],
       [0.7       ],
       [0.4       ],
       [0.2       ],
       [0.13333333],
       [0.        ],
       [0.63333333],
       [0.5       ],
       [0.13333333],
       [0.2       ],
       [0.26666667],
       [0.        ],
       [0.76666667],
       [0.36666667],
       [0.4       ],
       [0.56666667],
       [0.33333333],
       [0.73333333],
       [0.        ],
       [0.63333333],
       [0.6       ],
       [0.73333333],
       [0.73333333],
       [0.4       ],
       [0.16666667],
       [0.5       ],
       [0.266

### Section 3: LSTM

In [17]:
model = keras.Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

The default learning rate for adam optimizer is 0.001.  
(Reference: https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam)  
To change the learning rate, see https://keras.io/optimizers/)  

_**Note**_: maybe research on the optimizer to use??

In [18]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

Reference on `keras.callbacks.ModelCheckpoint`:
https://machinelearningmastery.com/check-point-deep-learning-models-keras/

In [19]:
filepath = "model_weights_LSTM_character_Angela_attemp1.hdf5"
checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [20]:
model.fit(X, y, epochs=40, batch_size=256, callbacks=desired_callbacks)

Epoch 1/40

Epoch 00001: loss improved from inf to 3.06778, saving model to model_weights_saved.hdf5
Epoch 2/40

Epoch 00002: loss improved from 3.06778 to 2.93814, saving model to model_weights_saved.hdf5
Epoch 3/40

Epoch 00003: loss improved from 2.93814 to 2.93622, saving model to model_weights_saved.hdf5
Epoch 4/40

Epoch 00004: loss improved from 2.93622 to 2.92090, saving model to model_weights_saved.hdf5
Epoch 5/40

Epoch 00005: loss improved from 2.92090 to 2.91584, saving model to model_weights_saved.hdf5
Epoch 6/40

Epoch 00006: loss did not improve from 2.91584
Epoch 7/40

Epoch 00007: loss did not improve from 2.91584
Epoch 8/40

Epoch 00008: loss improved from 2.91584 to 2.91199, saving model to model_weights_saved.hdf5
Epoch 9/40

Epoch 00009: loss did not improve from 2.91199
Epoch 10/40

Epoch 00010: loss improved from 2.91199 to 2.91104, saving model to model_weights_saved.hdf5
Epoch 11/40

Epoch 00011: loss did not improve from 2.91104
Epoch 12/40

Epoch 00012: loss 

<tensorflow.python.keras.callbacks.History at 0x19f56d00048>

Under the same session, I can continue to train using another `fit`. If the session was restarted or interrupted, see  
* https://stackoverflow.com/questions/45393429/keras-how-to-save-model-and-continue-training  
* https://www.mikulskibartosz.name/save-and-restore-a-tensorflow-model-using-keras-for-continuous-model-training/  
(Haven't implemented it yet).

In [21]:
model.fit(X, y, epochs=20, batch_size=256, callbacks=desired_callbacks)

Epoch 1/20

Epoch 00001: loss improved from 2.45532 to 2.43682, saving model to model_weights_saved.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.43682 to 2.40654, saving model to model_weights_saved.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.40654 to 2.37607, saving model to model_weights_saved.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.37607 to 2.35417, saving model to model_weights_saved.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.35417 to 2.31597, saving model to model_weights_saved.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.31597 to 2.27857, saving model to model_weights_saved.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.27857 to 2.24322, saving model to model_weights_saved.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.24322 to 2.20737, saving model to model_weights_saved.hdf5
Epoch 9/20

Epoch 00009: loss improved from 2.20737 to 2.17077, saving model to model_weights_saved.hdf5
Epoch 10/20

Epoch 00010: loss improved from 2.17077 to

<tensorflow.python.keras.callbacks.History at 0x19f56d00080>

In [22]:
model.fit(X, y, epochs=20, batch_size=256, callbacks=desired_callbacks)

Epoch 1/20

Epoch 00001: loss improved from 1.68699 to 1.62925, saving model to model_weights_saved.hdf5
Epoch 2/20

Epoch 00002: loss improved from 1.62925 to 1.58411, saving model to model_weights_saved.hdf5
Epoch 3/20

Epoch 00003: loss improved from 1.58411 to 1.55259, saving model to model_weights_saved.hdf5
Epoch 4/20

Epoch 00004: loss improved from 1.55259 to 1.48358, saving model to model_weights_saved.hdf5
Epoch 5/20

Epoch 00005: loss improved from 1.48358 to 1.43740, saving model to model_weights_saved.hdf5
Epoch 6/20

Epoch 00006: loss improved from 1.43740 to 1.40468, saving model to model_weights_saved.hdf5
Epoch 7/20

Epoch 00007: loss improved from 1.40468 to 1.37305, saving model to model_weights_saved.hdf5
Epoch 8/20

Epoch 00008: loss improved from 1.37305 to 1.30448, saving model to model_weights_saved.hdf5
Epoch 9/20

Epoch 00009: loss improved from 1.30448 to 1.28089, saving model to model_weights_saved.hdf5
Epoch 10/20

Epoch 00010: loss improved from 1.28089 to

<tensorflow.python.keras.callbacks.History at 0x19f02f97978>

In [None]:
model.fit(X, y, epochs=40, batch_size=256, callbacks=desired_callbacks)

Epoch 1/40

Epoch 00001: loss improved from 0.86370 to 0.83903, saving model to model_weights_saved.hdf5
Epoch 2/40

Epoch 00002: loss improved from 0.83903 to 0.80507, saving model to model_weights_saved.hdf5
Epoch 3/40

Epoch 00003: loss improved from 0.80507 to 0.76631, saving model to model_weights_saved.hdf5
Epoch 4/40

Epoch 00004: loss improved from 0.76631 to 0.73677, saving model to model_weights_saved.hdf5
Epoch 5/40

Epoch 00005: loss improved from 0.73677 to 0.71900, saving model to model_weights_saved.hdf5
Epoch 6/40

Epoch 00006: loss improved from 0.71900 to 0.68887, saving model to model_weights_saved.hdf5
Epoch 7/40

Epoch 00007: loss improved from 0.68887 to 0.65578, saving model to model_weights_saved.hdf5
Epoch 8/40

Epoch 00008: loss improved from 0.65578 to 0.62784, saving model to model_weights_saved.hdf5
Epoch 9/40

Epoch 00009: loss improved from 0.62784 to 0.61905, saving model to model_weights_saved.hdf5
Epoch 10/40

Epoch 00010: loss improved from 0.61905 to

<tensorflow.python.keras.callbacks.History at 0x19f02f97668>

In [None]:
model.fit(X, y, epochs=40, batch_size=256, callbacks=desired_callbacks)

Epoch 1/40

Epoch 00001: loss improved from 0.22224 to 0.21859, saving model to model_weights_saved.hdf5
Epoch 2/40

Epoch 00002: loss improved from 0.21859 to 0.20716, saving model to model_weights_saved.hdf5
Epoch 3/40

Epoch 00003: loss improved from 0.20716 to 0.19415, saving model to model_weights_saved.hdf5
Epoch 4/40

Epoch 00004: loss improved from 0.19415 to 0.19152, saving model to model_weights_saved.hdf5
Epoch 5/40

Epoch 00005: loss improved from 0.19152 to 0.17600, saving model to model_weights_saved.hdf5
Epoch 6/40

Epoch 00006: loss improved from 0.17600 to 0.17551, saving model to model_weights_saved.hdf5
Epoch 7/40

Epoch 00007: loss did not improve from 0.17551
Epoch 8/40

Epoch 00008: loss improved from 0.17551 to 0.17171, saving model to model_weights_saved.hdf5
Epoch 9/40

Epoch 00009: loss improved from 0.17171 to 0.15912, saving model to model_weights_saved.hdf5
Epoch 10/40

Epoch 00010: loss did not improve from 0.15912
Epoch 11/40

Epoch 00011: loss improved f

<tensorflow.python.keras.callbacks.History at 0x19f5b6074a8>

To capture loss history, see  
* https://stackoverflow.com/questions/38445982/how-to-log-keras-loss-output-to-a-file
* https://forums.fast.ai/t/passing-multiple-callbacks-in-keras-early-stopping-modelcheckpoint-lrratescheduler/5477  
(Haven't implemented it yet)

In [25]:
filename = "model_weights_LSTM_character_Angela_attemp1.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [26]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [27]:
start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
"  defeat seek peace security support wondered america beacon still burns bright tonight proved true s "


In [28]:
# pattern

In [29]:
# Below are the generated after 160 epoches
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]

    sys.stdout.write(result)

    pattern.append(index)
    pattern = pattern[1:len(pattern)]

trength nation comes might arms scale wealth enduring power ideals democracy liberty opportunity unyielding hope true genius america america change union perfected already achieved gives us hope must achieve tomorrow election many firsts many stories told generations one mind tonight woman cast ballot atlanta lot like millions others stood line make voice heard election except one thing ann nixon cooper 106 years old born generation past slavery time cars road planes sky someone like vote two reasons woman color skin tonight think seen throughout century america heartache hope struggle progress times told people pressed american creed yes time women voices silenced hopes dismissed lived see stand speak reach ballot yes despair dust bowl depression across land saw nation conquer fear new deal new jobs new sense common purpose yes bombs fell harbor tyranny threatened world witness generation rise greatness democracy saved yes buses montgomery hoses birmingham bridge selma preacher atlant