In [1]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
#alllines-reduced.txt includes Act I, Scenes I-III of Henry IV
with open('datasets/alllines-reduced.txt', 'r') as file:
    text = file.read()
    lines = text.lower().split('\n')

#Define words, vocabulary size and sequences of words as lines
#from keras.preprocessing.text import text_to_word_sequence, Tokenizer
words = tf.keras.preprocessing.text.text_to_word_sequence(text)
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(words)
vocabulary_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(lines)

#Find subsequences 
subsequences = []
for sequence in sequences:
    for i in range(1, len(sequence)):
        subsequence = sequence[:i+1]
        subsequences.append(subsequence)

In [3]:
vocabulary_size

1464

In [4]:
#from keras.preprocessing.sequence import pad_sequences
sequence_length = max([len(sequence) for sequence in sequences])
sequences = tf.keras.preprocessing.sequence.pad_sequences(subsequences, maxlen=sequence_length, padding='pre')

In [5]:
#from keras.utils import to_categorical
x, y = sequences[:,:-1],sequences[:,-1]
y = tf.keras.utils.to_categorical(y, num_classes=vocabulary_size)

In [6]:
print(x.shape,y.shape)

(4339, 15) (4339, 1464)


In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

In [8]:
model = tf.keras.models.Sequential()

In [9]:
#1. An embedding layer with the following parameters:
    #The input dimension is vocabulary_size
    #The output dimension is 100
    #The input length is sequence_length - 1
model.add(tf.keras.layers.Embedding(input_dim=vocabulary_size, output_dim=100, input_length=sequence_length-1))

#2. An LSTM layer with 100 units
model.add(tf.keras.layers.LSTM(100))

#3. A dropout layer with a dropout rate of 10%
model.add(tf.keras.layers.Dropout(0.10))

#4.A dense layer with the following parameters:
    #Activation function is softmax
    #The number of units is vocabulary_size
model.add(tf.keras.layers.Dense(units=vocabulary_size, activation='softmax'))

In [10]:
#Build the network using the following parameters:
    #Optimizer: Adam
    #Loss function: categorical_crossentropy
    #Metrics: accuracy
    #Epochs: 500
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 100)           146400    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1464)              147864    
Total params: 374,664
Trainable params: 374,664
Non-trainable params: 0
_________________________________________________________________


In [12]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

In [13]:
model.fit(x_train, y_train, epochs=500, batch_size=32,callbacks=[callback])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500


<tensorflow.python.keras.callbacks.History at 0x1a3ccd8d10>

In [14]:
loss, accuracy = model.evaluate(x_test, y_test)



In [15]:
print("loss:",loss)
print("accuracy:",accuracy)

loss: 12.625411033630371
accuracy: 0.0552995391190052
