### Import the required libraries

In [0]:

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences


### Download and store the required dataset

In [0]:

!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/irish-lyrics-eof.txt \
    -O /tmp/irish-lyrics-eof.txt

data = open('/tmp/irish-lyrics-eof.txt').read()
corpus = data.lower().split("\n")


### Tokenize the dataset and split it into inputs and outputs

In [0]:

# Fit a word-level tokenizer on the corpus. The vocabulary size is one more
# than the number of entries in word_index.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = 1 + len(tokenizer.word_index)

# For every corpus line, collect each leading slice of its token ids that has
# at least two tokens (the first tokens plus the token that follows them).
input_sequences = [
    line_tokens[:end]
    for line_tokens in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(line_tokens) + 1)
]

# Left-pad every slice to the length of the longest one so they stack into a
# single rectangular array.
max_sequence_len = max(len(sequence) for sequence in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# The last column is the word to predict; everything before it is the input.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)


### Define and train the model

In [0]:

# Next-word model: embedding -> bidirectional LSTM -> softmax over the vocabulary.
ip_model = tf.keras.models.Sequential()
ip_model.add(tf.keras.layers.Embedding(total_words, 100, input_length = max_sequence_len-1))
ip_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(200)))
ip_model.add(tf.keras.layers.Dense(total_words, activation = 'softmax'))

# Use `learning_rate` rather than the deprecated `lr` alias, which newer
# Keras optimizers no longer accept.
ip_model.compile(loss='categorical_crossentropy',
                 optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01),
                 metrics = ['accuracy'])

# Keep the returned History object so the training curves can be plotted later.
history = ip_model.fit(xs, ys,
                       epochs = 100,
                       verbose = 1)


### Plot the accuracy of the model

In [0]:

def plot_graphs(history, string):
    """Plot one recorded training metric against the epoch index.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch values (here, the value returned by ``ip_model.fit``).
        string: key of the metric to plot; also used as the y-axis label.
    """
    metric_values = history.history[string]
    plt.plot(metric_values)
    # Axis labels are drawn in white, as in the original notebook.
    label_style = {"color": "white"}
    plt.xlabel("Epochs", **label_style)
    plt.ylabel(string, **label_style)
    plt.show()


plot_graphs(history, 'accuracy')


### Test the model using some initial text

In [0]:

seed_text = "I have a bad feeling about this"
next_words = 100

# Repeatedly predict the next word and append it to the running text.
for _ in range(next_words):
    # Encode the text so far and left-pad/truncate it to the model's input length.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen = max_sequence_len-1, padding = 'pre')

    # `Sequential.predict_classes` was removed from TensorFlow; take the argmax
    # of the predicted probabilities instead, and unwrap the batch of one into
    # a plain int.
    probabilities = ip_model.predict(token_list, verbose = 0)
    predicted = int(np.argmax(probabilities, axis = -1)[0])

    # Map the id straight back to its word. Ids with no vocabulary entry
    # (such as the padding id 0) yield an empty string, as before.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
print(seed_text)
