In [1]:
import pandas as pd
import numpy
import sys

import tensorflow as tf

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

# nltk.download('stopwords')

In [2]:
# Report the installed TensorFlow version and how many GPUs it can see.
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2.8.0
Num GPUs Available:  1


In [3]:
# Load the pickled speech corpus and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
corpus = pd.read_pickle('./pickled_files/clean_sotu_speeches_corpus.pkl')
corpus.head()

Unnamed: 0,sotu_date,president_name,clean_text,president_party,year
0,1793-12-03,George Washington,fellow citizens of the senate and house of re...,federalist,1793
1,1794-11-19,George Washington,fellow citizens of the senate and house of re...,federalist,1794
2,1795-12-08,George Washington,fellow citizens of the senate and house of re...,federalist,1795
3,1796-12-07,George Washington,fellow citizens of the senate and house of re...,federalist,1796
4,1797-11-22,John Adams,gentlemen of the senate and gentlemen of the ...,federalist,1797


In [4]:
# List the distinct party labels present in the corpus (missing values included).
corpus['president_party'].unique()

array(['federalist', 'democratic_republican', 'democrat', 'republican',
       'union', nan], dtype=object)

In [5]:
# Keep only the speeches given by Democratic and by Republican presidents.
democrat = corpus.loc[corpus['president_party'].eq('democrat')]
republican = corpus.loc[corpus['president_party'].eq('republican')]

In [6]:
# Concatenate every speech of each party into one long string per party.
democrat_speeches = ' '.join(democrat['clean_text'])
# Fixed: this previously joined the Democratic frame a second time, so the
# "republican" text was a copy of the Democratic corpus.
republican_speeches = ' '.join(republican['clean_text'])

In [7]:
def tokenize_words(input):
    """Lowercase ``input``, split it into word tokens and drop English stop words.

    Parameters
    ----------
    input : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Tokenize on runs of word characters, which also discards punctuation.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input.lower())

    # Build the stop-word set once. The original evaluated
    # stopwords.words('english') inside the filter for every single token,
    # rebuilding the list and scanning it linearly each time.
    stop_words = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in stop_words)

In [8]:
# Clean both party corpora: lowercase, keep word tokens only, drop English
# stop words (see tokenize_words).
tokenized_words_dem = tokenize_words(democrat_speeches)
tokenized_words_rep = tokenize_words(republican_speeches)

In [9]:
# Build the sorted character vocabulary and map each character to an integer id.
chars_dem = sorted(set(tokenized_words_dem))
char_to_num = {char: idx for idx, char in enumerate(chars_dem)}

In [10]:
# Size of the cleaned corpus in characters and of its character vocabulary.
input_len_dem, vocab_len_dem = len(tokenized_words_dem), len(chars_dem)
print("Total number of characters:", input_len_dem)
print("Total vocab:", vocab_len_dem)

Total number of characters: 2825323
Total vocab: 29


In [11]:
# Length of each training window, plus the containers for the encoded input
# windows (x_data) and their next-character targets (y_data).
seq_length = 100
x_data, y_data = [], []

In [12]:
# Slide a window of `seq_length` characters over the corpus: each window is one
# input sequence and the character right after it is the target to predict.

# Start from empty lists so re-running this cell does not append a second copy
# of every pattern to x_data / y_data.
x_data = []
y_data = []

# Encode the whole corpus once rather than re-encoding every overlapping window.
encoded_dem = [char_to_num[char] for char in tokenized_words_dem]

# Stop at the last position that still has a following character to predict.
for i in range(0, input_len_dem - seq_length, 1):
    # Input: the seq_length character codes starting at position i.
    x_data.append(encoded_dem[i:i + seq_length])
    # Target: the code of the character that immediately follows the window.
    y_data.append(encoded_dem[i + seq_length])

In [13]:
# Number of (window, next character) training examples.
n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 2825223


In [14]:
# Shape the patterns as (samples, time steps, features) for the LSTM and scale
# the integer character codes by the vocabulary size.
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len_dem)

In [15]:
# One-hot encode the target characters. num_classes is pinned to the vocabulary
# size so y always has one column per character; when it is omitted the width
# is inferred as max(y_data) + 1, which is too narrow if the highest-numbered
# character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len_dem)

In [16]:
# Stacked character-level LSTM: three recurrent layers, each followed by
# dropout, ending in a softmax over the possible next characters.
model = Sequential([
    # The first two LSTMs return the full sequence so the next LSTM can consume it.
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    # The last LSTM returns only its final output, which feeds the classifier.
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

Metal device set to: Apple M1 Max

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2023-02-11 23:42:39.574166: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-02-11 23:42:39.574403: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [17]:
# Multi-class next-character prediction: cross-entropy loss with the Adam optimizer.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [18]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 100, 256)          264192    
                                                                 
 dropout (Dropout)           (None, 100, 256)          0         
                                                                 
 lstm_1 (LSTM)               (None, 100, 256)          525312    
                                                                 
 dropout_1 (Dropout)         (None, 100, 256)          0         
                                                                 
 lstm_2 (LSTM)               (None, 128)               197120    
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 29)                3

In [19]:
# Save a checkpoint whenever the training loss reaches a new minimum.
filepath = "dem_model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [20]:
# Train for 2 epochs in batches of 256, passing the callbacks configured in
# the previous cell.
model.fit(X, y, epochs=2, batch_size=256, callbacks=desired_callbacks)

Epoch 1/2


2023-02-11 23:42:41.676291: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-02-11 23:42:42.750225: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-11 23:42:43.169500: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-11 23:42:44.335952: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-11 23:42:44.586641: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-11 23:42:45.902429: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-11 23:42:46.800734: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113

  939/11037 [=>............................] - ETA: 14:49 - loss: 2.8866

KeyboardInterrupt: 

In [21]:
# Restore the checkpointed weights into the model and compile it again.
# NOTE(review): the recorded output of the training cell ends in a
# KeyboardInterrupt during epoch 1 — confirm this file holds weights from the
# intended run before trusting the generated text.
filename = "dem_model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [22]:
# Inverse lookup: integer id back to its character.
num_to_char = dict(enumerate(chars_dem))

In [23]:
# Pick a random training window to seed text generation.
# numpy.random.randint excludes its upper bound, so len(x_data) (not
# len(x_data) - 1) is needed for the last pattern to be selectable.
start = numpy.random.randint(0, len(x_data))
# Copy the window so later changes to `pattern` cannot modify the training
# data stored in x_data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" one adjustment satisfaction claims citizens fix tariff imports exports regulate transit duties trade "


In [24]:
# Generate 1000 characters one at a time, feeding each prediction back in.
for i in range(1000):
    # Shape and scale the window exactly like the training inputs.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len_dem)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character.
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Slide the window by building a new list: drop the oldest character and
    # add the predicted one. The original used pattern.append(), which on the
    # first iteration also appended to the list inside x_data that `pattern`
    # was taken from.
    pattern = list(pattern[1:]) + [index]

2023-02-11 23:44:17.941909: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-11 23:44:18.071819: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-11 23:44:18.890131: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-11 23:44:19.058784: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [25]:
# Show the last character produced by the generation loop.
result

' '