In [1]:
# base
import re, pickle
import numpy as np

# tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
with open('trump_raw_text.txt', 'r', encoding='utf8') as myfile:
    raw_text = myfile.read()

In [3]:
def clean_text(t):
    # to lower
    t = t.lower()
    # remove quotes
    t = re.sub(r'"@.*', '', t)
    t = re.sub(r'^“.*”$', '', t)
    # remove URLs
    t = re.sub(r'https*:\/\/\S*', '', t)
    t = re.sub(r'pic\.twitter\.com\/\S*', '', t)
    # remove \n
    t = re.sub('\n', ' ', t)
    # remove extra whitespaces
    t = re.sub(r'\s+', ' ', t)
    # replace '&amp' with 'and'
    t = re.sub('&amp;', 'and', t)     
    # replace abbreviations
    t = re.sub("'ll", ' will', t)
    t = re.sub("won't", 'will not', t)
    t = re.sub("n't", ' not', t) 
    # remove @mention
    t = re.sub(r'@[A-Za-z0-9_]+', '', t) 
    # remove #tag
    t = re.sub(r'#[A-Za-z0-9_]+', '', t) 
    # remove special characters
    t = re.sub(r'[^a-zA-Z ]', '', t) 
    # remove multiple spaces 
    t = re.sub("\s\s+", " ", t) 
    return t

raw_text = clean_text(raw_text)

In [5]:
raw_text = raw_text.lower()
# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  920369
Total Vocab:  27


In [6]:
with open('chars.txt', 'wb') as fp:
    pickle.dump(chars, fp)

In [7]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataX, dataY, test_size=0.1, random_state=42)

print("Total Patterns: ", len(X_train))

Total Patterns:  828242


In [9]:
# reshape X to be [samples, time steps, features]
X = np.reshape(X_train, (len(X_train), seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = to_categorical(y_train)
# define the LSTM model
model = Sequential([
    LSTM(256, input_shape=(None, X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(256, dropout=0.2),
    Dense(y.shape[1], activation='softmax')
])

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, None, 256)         264192    
_________________________________________________________________
dropout (Dropout)            (None, None, 256)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 256)         525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 256)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense (Dense)                (None, 27)                6939      
Total params: 1,321,755
Trainable params: 1,321,755
Non-trainable params: 0
______________________________________________

In [10]:
early_stop = [EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=0, mode='auto')]

# fit the model
model.fit(X, y, epochs=100, batch_size=128, callbacks=early_stop, validation_split=0.1)

Train on 745417 samples, validate on 82825 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50


KeyboardInterrupt: 

In [None]:
model.save('trump_model.h5')