In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [None]:
# Load the full corpus. `text` keeps the raw file contents (it is used to
# build the vocabulary); `lines` is the lower-cased corpus split on '\n',
# giving one entry per line of the file.
with open('datasets/alllines.txt') as file:
    text = file.read()
lines = text.lower().split('\n')

# Vocabulary and per-line token sequences.
# `words` is the flat list of word tokens found in the raw corpus; the
# tokenizer is fitted on it and then maps every lower-cased line to a list of
# integer word indices.
keras_text = tf.keras.preprocessing.text
words = keras_text.text_to_word_sequence(text)
tokenizer = keras_text.Tokenizer()
tokenizer.fit_on_texts(words)
sequences = tokenizer.texts_to_sequences(lines)

# +1 because the tokenizer's word indices start at 1; index 0 is never
# assigned to a word (it is the default fill value used by pad_sequences).
vocabulary_size = len(tokenizer.word_index) + 1

# Expand every tokenised line into all of its prefixes of length >= 2,
# e.g. [a, b, c] -> [a, b], [a, b, c]. Lines with fewer than two tokens
# contribute nothing.
subsequences = [
    sequence[:end]
    for sequence in sequences
    for end in range(2, len(sequence) + 1)
]

In [None]:
# Length (in tokens) of the longest tokenised line; every prefix is padded
# up to this length.
sequence_length = max(map(len, sequences))

# Left-pad ('pre') each prefix so that its final token always sits in the
# last column.
# NOTE: `sequences` is rebound here from the per-line token lists to the
# padded 2-D array of prefixes.
sequences = tf.keras.preprocessing.sequence.pad_sequences(
    subsequences,
    maxlen=sequence_length,
    padding='pre',
)

In [None]:
# Inputs: every column except the last. Targets: the last column (the word
# to predict), one-hot encoded over the whole vocabulary.
# NOTE(review): the one-hot matrix has len(sequences) x vocabulary_size
# entries, which can be very large for a big corpus -- consider integer
# targets with a sparse categorical loss if memory becomes a problem.
x = sequences[:, :-1]
y = tf.keras.utils.to_categorical(
    sequences[:, -1],
    num_classes=vocabulary_size,
)

In [None]:
# Hold out 40% of the samples for testing; the fixed random_state makes the
# shuffle/split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.40,
    random_state=0,
)

In [None]:
# Start from an empty sequential (layer-by-layer) model.
model = tf.keras.Sequential()

In [None]:
# Define the next-word prediction network.
#
# The model is (re)created in this cell instead of being appended to an
# existing instance, so re-running the cell never stacks a second copy of
# the layers onto `model`.
model = tf.keras.models.Sequential([
    # Explicit input spec: each sample is a row of sequence_length - 1 word
    # indices (a padded sequence without its final, target token). This
    # replaces the Embedding `input_length` argument, which newer Keras
    # releases deprecate and ignore, and it guarantees the model is built so
    # that model.summary() can report output shapes and parameter counts.
    tf.keras.Input(shape=(sequence_length - 1,)),

    #1. An embedding layer with the following parameters:
    #    The input dimension is vocabulary_size
    #    The output dimension is 100
    #    The input length is sequence_length - 1 (declared by the Input above)
    tf.keras.layers.Embedding(input_dim=vocabulary_size, output_dim=100),

    #2. An LSTM layer with 100 units
    tf.keras.layers.LSTM(100),

    #3. A dropout layer with a dropout rate of 10%
    tf.keras.layers.Dropout(0.10),

    #4. A dense layer with the following parameters:
    #    Activation function is softmax
    #    The number of units is vocabulary_size
    tf.keras.layers.Dense(units=vocabulary_size, activation='softmax'),
])

In [None]:
# Compile the network with the following settings:
#   Optimizer:     Adam
#   Loss function: categorical_crossentropy (targets are one-hot encoded)
#   Metrics:       accuracy
# "Epochs: 500" is a training-time setting -- it is an argument of
# model.fit(), not of compile().
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print a layer-by-layer overview of the model.
model.summary()

In [None]:
# Training step, currently disabled (commented out): fits the model on the
# training split for 5 epochs with batches of 100 samples.
#model.fit(x_train, y_train, epochs=5, batch_size=100)

In [None]:
# Evaluation step, currently disabled (commented out): would compute the
# loss and accuracy of the model on the held-out test split.
#loss, accuracy = model.evaluate(x_test, y_test)