In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import re
from nltk.corpus import stopwords
import string

import keras
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from tensorflow.keras.optimizers import Adam

import pickle

In [3]:
def load_data(filename, encoding=None):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding forwarded to ``open``. ``None`` (the default)
            keeps the platform's default encoding.

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()
# Read the whole corpus file into memory as a single string.
data = load_data("../input/game-of-thrones-text-generation/got1.txt")

In [4]:
# Preview the first 1000 characters of the raw text.
data[:1000]

In [5]:
# data cleaning process
import re                                # Regular expressions to use sub function for replacing the useless text from the data

def clean_text(text):
    """Return `text` with punctuation and newline characters removed.

    The characters deleted are: comma, straight and curly quotes, the
    apostrophe, parentheses, newline, full stop, semicolon, colon and hyphen.
    Nothing is inserted in their place, so "Jon-Snow" becomes "JonSnow".
    """
    # One character class covers every symbol that has to be deleted.
    unwanted = r"[,'\"()\n“”’.;:\-]"
    return re.sub(unwanted, '', text)

In [6]:
# Normalise the corpus before cleaning.
lower_data = data.lower()           # lower-case the whole text for uniformity

split_data = lower_data.splitlines()      # one list entry per line of the file; punctuation has not been removed yet

# Preview the first 50 (still uncleaned) lines.
split_data[:50] 

In [7]:
# Clean every line and rebuild one string in which each cleaned line is
# preceded by '\n' (so a non-empty result starts with a newline).
# str.join builds the string in one pass instead of re-allocating it on
# every `final += ...`.
final = ''.join('\n' + clean_text(line) for line in split_data)

# Preview the first 10 characters. The slice has to be applied to the string:
# print() returns None, so slicing its result raises TypeError.
print(final[:10])


In [8]:
# Split the cleaned text back into a list with one cleaned line per element.
# `final` is built by prefixing each line with '\n', so when any lines were
# processed the first element of this list is an empty string.
final_data = final.split('\n')
# Preview the first 10 entries.
final_data[:10]

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [10]:
# Build the tokenizer and fit its word -> index vocabulary on the cleaned lines.
# NOTE(review): per the Keras docs, `num_words` limits texts_to_sequences to the
# (num_words - 1) most frequent words and silently drops rarer ones — confirm
# that a cap of 100 is really intended for this corpus.
max_vocab = 100
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(final_data)

In [11]:
# Word -> integer index mapping learned by the tokenizer.
# NOTE(review): Keras documents word_index as holding every word seen during
# fitting, regardless of `num_words` — confirm against the installed version.
word2idx = tokenizer.word_index
# Number of entries in the mapping, i.e. the count of distinct words.
print(len(word2idx))



In [14]:

# Number of classes the model has to handle.
# +1 because word indices start at 1; index 0 is the padding value.
# Capped at max_vocab: the tokenizer was created with num_words=max_vocab, so
# texts_to_sequences never emits an index >= max_vocab. Sizing the embedding,
# the softmax and the one-hot labels for the whole word_index would only
# allocate rows/classes that can never occur in the training data.
vocab_size = min(max_vocab, len(word2idx) + 1)
print(vocab_size)

# ## Creating n-gram sequences from the sentences
# We will turn the sentences into token sequences line by line and create n-gram sequences from them.

In [16]:
# For every line, emit each prefix of its token sequence that holds at least
# two tokens: the leading tokens are the context, the last one is the target.
input_seq = [
    tokens[:end]
    for tokens in (tokenizer.texts_to_sequences([text])[0] for text in final_data)
    for end in range(2, len(tokens) + 1)
]



In [17]:
# Length of the longest n-gram; used as the common width when padding.
max_seq_length = max(map(len, input_seq))
print(max_seq_length)


In [18]:
# Left-pad ('pre') every n-gram to max_seq_length and store the result as an array.
padded = pad_sequences(input_seq, maxlen=max_seq_length, padding='pre')
input_seq = np.array(padded)
print(input_seq)

In [19]:
# Split each padded row into model input and target:
#   xs     - every column except the last (the context words)
#   labels - the last column (the word the model must predict)
xs, labels = input_seq[:, :-1], input_seq[:, -1]
print("xs: ",xs)
print("labels:",labels)


In [20]:
from tensorflow.keras.utils import to_categorical

In [21]:
# One-hot encode the target word indices, with vocab_size classes per row,
# to match the softmax output and the categorical cross-entropy loss.
ys = to_categorical(labels, num_classes=vocab_size)
print(ys)

In [22]:
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, Bidirectional, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential


In [24]:
# Declare the network as an ordered list of layers:
# embedding -> dropout -> LSTM -> bidirectional LSTM -> max-pool over time
# -> dense hidden layer -> softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, 124, input_length=max_seq_length-1),
    Dropout(0.2),
    LSTM(520, return_sequences=True),
    Bidirectional(LSTM(340, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(1024, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])

In [26]:
# Configure training: Adam optimiser, categorical cross-entropy against the
# one-hot targets, and accuracy as the reported metric.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by current Keras releases.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [27]:
# Print a summary of the model that was just built.
model.summary()    

In [28]:
# Train for 10 epochs on all (xs, ys) pairs; no validation data is passed.
# `r` keeps the object returned by fit(); its `.history` dict is plotted in the next cell.
r = model.fit(xs,ys,epochs=10)

In [29]:
# Plot the accuracy recorded during training for each epoch.
# This is training accuracy only: fit() was given no validation data, so the
# curve says nothing about performance on unseen text.
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(r.history['accuracy'], label='train')
ax.set(title='Training accuracy per epoch', xlabel='Epoch', ylabel='Accuracy')
ax.legend();

In [39]:
# Defining a function to take input of seed text from user and no. of words to be predicted

def predict_words(seed, no_words):
  """Extend `seed` by `no_words` predicted words, printing the text after each step.

  Uses the module-level `tokenizer`, `model` and `max_seq_length`.

  Args:
    seed: Starting text; it is re-tokenised on every step.
    no_words: Number of words to append.

  Returns:
    None. The growing text is printed once per generated word.
  """
  for _ in range(no_words):
    # Encode the current text and left-pad it to the model's input width.
    token_list = tokenizer.texts_to_sequences([seed])[0]
    token_list = pad_sequences([token_list], maxlen=max_seq_length-1, padding='pre')
    # Index of the most probable class for the single sequence in the batch.
    predicted = int(np.argmax(model.predict(token_list), axis=1)[0])

    # Direct reverse lookup. Scanning word_index with a `break` placed outside
    # the match test stops after the first entry, so nearly every prediction
    # would be turned into an empty string.
    # An index with no word (e.g. 0, the padding value) still yields ''.
    new_word = tokenizer.index_word.get(predicted, '')
    seed += " " + new_word
    print(seed)

In [42]:
# Generate text by extending the seed phrase with predicted words.

seed_text = 'i am feeling good today'   # starting phrase fed to the model
next_words = 20                         # number of words to append

predict_words(seed_text, next_words)


In [44]:
# Save the trained model to disk.
# NOTE(review): predict_words also needs the fitted `tokenizer` and
# `max_seq_length`; neither is saved here, so the .h5 file alone is not enough
# to generate text in a new session.

model.save('text_generator.h5') # Will create an HDF5 file of the model