In [1]:
from numpy import array
from pickle import dump
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# load doc into memory
def load_sequences(filename, encoding=None):
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the text file to read.
        encoding: Optional text encoding passed to ``open``. ``None`` (the
            default) keeps the platform default encoding.

    Returns:
        The full contents of the file as one ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # the context manager closes the handle even if read() raises
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

In [7]:
# load the training sequences, one sequence per line
in_filename = 'data/trump_sequences.txt'
doc = load_sequences(in_filename)
# Drop blank lines (e.g. the empty string left behind by a trailing newline):
# they would encode to an empty sequence and make the array below ragged.
lines = [line for line in doc.split('\n') if line.strip()]

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size (+1 because Tokenizer word indices start at 1; 0 is reserved)
vocab_size = len(tokenizer.word_index) + 1

# Every encoded line must have the same number of tokens, otherwise the
# 2-D slicing below cannot work; fail early with a readable message.
seq_lengths = {len(seq) for seq in sequences}
if len(seq_lengths) != 1:
    raise ValueError(
        f"expected equal-length sequences in {in_filename!r}, "
        f"got lengths {sorted(seq_lengths)}"
    )

# separate into input and output: the last token of each line is the target
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
# one-hot encode the target word to match the softmax output layer
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

# define model: word embedding -> two stacked LSTMs -> dense classifier over the vocabulary
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_length),
    # return_sequences=True so the second LSTM receives the full sequence
    LSTM(512, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    LSTM(512, dropout=0.2, recurrent_dropout=0.2),
    Dense(100, activation='relu'),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 50)            204900    
_________________________________________________________________
lstm_4 (LSTM)                (None, 50, 512)           1153024   
_________________________________________________________________
lstm_5 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dense_4 (Dense)              (None, 100)               51300     
_________________________________________________________________
dense_5 (Dense)              (None, 4098)              413898    
Total params: 3,922,322
Trainable params: 3,922,322
Non-trainable params: 0
_________________________________________________________________


In [8]:
# early stop callback: stop once val_loss has not improved for 5 consecutive epochs
# NOTE(review): restore_best_weights is left at its default, so the model saved
# below carries the weights of the last epoch run, not of the best val_loss
# epoch -- confirm that this is intended.
early_stop = [EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')]

# fit model, holding out 10% of the samples for validation
model.fit(X, y, batch_size=128, epochs=100, callbacks=early_stop, validation_split=0.1)

# save the model to file
model.save('models/trump_model.h5')

# save the tokenizer; the context manager flushes and closes the file handle
with open('data/tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

Train on 18868 samples, validate on 2097 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
