Reference:
https://stackabuse.com/text-generation-with-python-and-tensorflow-keras/

### Section 0: Import packages

In [2]:
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow import keras
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, LSTM
# from keras.utils import np_utils
# from keras.callbacks import ModelCheckpoint

In [33]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/angela/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Section 1: Load data

In [30]:
file = open("CorpusOfPresidentialSpeeches/obama/obama_speeches_000.txt").read()

### Section 2: Create input data to LSTM

Questions:
* Why convert all to lower case and removing special characters?  
* Why remove stop words?

In [31]:
def tokenize_words(input):
    # lowercase everything to standardize it
    input = input.lower()

    # instantiate the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # if the created token isn't in the stop words, make it part of "filtered"
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(filtered)

In [34]:
# preprocess the input data, make tokens
processed_inputs = tokenize_words(file)

In [35]:
processed_inputs

'title remarks election night date november 4 2008 anyone still doubts america place things possible still wonders dream founders alive time still questions power democracy tonight answer answer told lines stretched around schools churches numbers nation never seen people waited three hours four hours many first time lives believed time must different voice could difference answer spoken young old rich poor democrat republican black white latino asian native american gay straight disabled disabled americans sent message world never collection red states blue states always united states america answer led told long many cynical fearful doubtful achieve put hands arc history bend toward hope better day long time coming tonight day election defining moment change come america received gracious call senator mccain fought long hard campaign fought even longer harder country loves endured sacrifices america us cannot begin imagine better service rendered brave selfless leader congratulate go

Question:
* Shall we add all other numbers to the dictionary below?  

In [37]:
# convert the characters in our input to number
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c, i) for i, c in enumerate(chars))
char_to_num

{' ': 0,
 '0': 1,
 '1': 2,
 '2': 3,
 '4': 4,
 '6': 5,
 '8': 6,
 'a': 7,
 'b': 8,
 'c': 9,
 'd': 10,
 'e': 11,
 'f': 12,
 'g': 13,
 'h': 14,
 'i': 15,
 'j': 16,
 'k': 17,
 'l': 18,
 'm': 19,
 'n': 20,
 'o': 21,
 'p': 22,
 'q': 23,
 'r': 24,
 's': 25,
 't': 26,
 'u': 27,
 'v': 28,
 'w': 29,
 'x': 30,
 'y': 31,
 'z': 32}

In [38]:
# We need the total length of our inputs and total length of our set of characters 
# for later data prep, so we'll store these in a variable.
input_len = len(processed_inputs)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 6079
Total vocab: 33


In [39]:
# Initialize the data
seq_length = 100
x_data = []
y_data = []

In [40]:
# Go through the entire list of inputs and convert the characters to numbers

# loop through inputs, start at the beginning and go until we hit
# the final character we can create a sequence out of
for i in range(0, input_len - seq_length, 1):
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = processed_inputs[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = processed_inputs[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [41]:
x_data

[[26,
  15,
  26,
  18,
  11,
  0,
  24,
  11,
  19,
  7,
  24,
  17,
  25,
  0,
  11,
  18,
  11,
  9,
  26,
  15,
  21,
  20,
  0,
  20,
  15,
  13,
  14,
  26,
  0,
  10,
  7,
  26,
  11,
  0,
  20,
  21,
  28,
  11,
  19,
  8,
  11,
  24,
  0,
  4,
  0,
  3,
  1,
  1,
  6,
  0,
  7,
  20,
  31,
  21,
  20,
  11,
  0,
  25,
  26,
  15,
  18,
  18,
  0,
  10,
  21,
  27,
  8,
  26,
  25,
  0,
  7,
  19,
  11,
  24,
  15,
  9,
  7,
  0,
  22,
  18,
  7,
  9,
  11,
  0,
  26,
  14,
  15,
  20,
  13,
  25,
  0,
  22,
  21,
  25,
  25,
  15,
  8,
  18,
  11,
  0],
 [15,
  26,
  18,
  11,
  0,
  24,
  11,
  19,
  7,
  24,
  17,
  25,
  0,
  11,
  18,
  11,
  9,
  26,
  15,
  21,
  20,
  0,
  20,
  15,
  13,
  14,
  26,
  0,
  10,
  7,
  26,
  11,
  0,
  20,
  21,
  28,
  11,
  19,
  8,
  11,
  24,
  0,
  4,
  0,
  3,
  1,
  1,
  6,
  0,
  7,
  20,
  31,
  21,
  20,
  11,
  0,
  25,
  26,
  15,
  18,
  18,
  0,
  10,
  21,
  27,
  8,
  26,
  25,
  0,
  7,
  19,
  11,
  24,
  15,
  9,
  7,


In [44]:
# Check my understanding
i = 10
in_seq = processed_inputs[i:i + seq_length]
out_seq = processed_inputs[i + seq_length]
print(in_seq)
print('-----')
print(out_seq)

rks election night date november 4 2008 anyone still doubts america place things possible still wond
-----
e


In [45]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 5979


Question:
* I don't understand the logic of converting `X` to float

In [46]:
# convert our input sequences into a processed numpy array that our network can use
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
# convert the numpy array values into floats so that the sigmoid activation function our network uses can interpret them and output probabilities from 0 to 1
X = X/float(vocab_len)

In [48]:
# one-hot encode our label data
y = keras.utils.to_categorical(y_data)

In [49]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)