###Text Generation with LSTMs with Keras and Python - Part One

In [0]:
#Process Text
#Clean Text
#Tokenize the Text and Create Sequences with Keras

In [0]:
#Create a function that reads files
#When we give a filepath its going to simply return a single string everything in that particular document
def read_file(filepath):
  with open(filepath) as f:
    str_txt=f.read()

  return str_txt

In [0]:
read_file('/content/moby_dick_four_chapters.txt')

'Call me Ishmael.  Some years ago--never mind how long\nprecisely--having little or no money in my purse, and nothing\nparticular to interest me on shore, I thought I would sail about a\nlittle and see the watery part of the world.  It is a way I have of\ndriving off the spleen and regulating the circulation.  Whenever I\nfind myself growing grim about the mouth; whenever it is a damp,\ndrizzly November in my soul; whenever I find myself involuntarily\npausing before coffin warehouses, and bringing up the rear of every\nfuneral I meet; and especially whenever my hypos get such an upper\nhand of me, that it requires a strong moral principle to prevent me\nfrom deliberately stepping into the street, and methodically knocking\npeople\'s hats off--then, I account it high time to get to sea as soon\nas I can.  This is my substitute for pistol and ball.  With a\nphilosophical flourish Cato throws himself upon his sword; I quietly\ntake to the ship.  There is nothing surprising in this.  If t

In [0]:
#read_file(melville-moby_dick)
#If we want to read the entire file
#the length of this entire file is 1198623

In [0]:
#Tokenize and clean the text
import spacy

In [0]:
#I can add the list of things that I want to disable
nlp=spacy.load('en',disable=['parser','tagger','ner'])

In [0]:
#Provide the maximum length of nlp because at times we work on large texts
nlp.max_length=1198623

In [0]:
#Create a function to remove punctuation
#We pass some document text as string as an input to this function

def separate_punc(doc_text):
  return[token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"#$%&()--.*+,-./:;<=>?@[\\]^_`{|}~\t\n']

#Using nlp we can get actual tokens on that particular document
#I want to get rid of things that are probably not going to be very helpful in training purposes like periods or new lines 
#This is to make sure the text generation neural network doesn't overfit that sort of punctuation
#Otherwise we may get a bunch of periods or a bunch of new lines at the end
#Since those are common enough that the neural network overfit to them
#So, what we are doing is we go for every token
#then check that the token is not in the few things that I mention, this may be random punctuation, I can add a new line
#Anything I pass in the '' will not be added to the list of tokens
#We are going to copy a string taht is provided by keras \n\n \n\n\n!"#$%&()*+,--./:;<=>?@[\\]^_`{|}~\t\n
#We also added double new lines,triple new lines
#What we are doing is, if the token happens to be any of the above mentioned string
#We do not need these punctuations as we are really interested in relationship between words

In [0]:
d=read_file('/content/moby_dick_four_chapters.txt')

In [0]:
d

'Call me Ishmael.  Some years ago--never mind how long\nprecisely--having little or no money in my purse, and nothing\nparticular to interest me on shore, I thought I would sail about a\nlittle and see the watery part of the world.  It is a way I have of\ndriving off the spleen and regulating the circulation.  Whenever I\nfind myself growing grim about the mouth; whenever it is a damp,\ndrizzly November in my soul; whenever I find myself involuntarily\npausing before coffin warehouses, and bringing up the rear of every\nfuneral I meet; and especially whenever my hypos get such an upper\nhand of me, that it requires a strong moral principle to prevent me\nfrom deliberately stepping into the street, and methodically knocking\npeople\'s hats off--then, I account it high time to get to sea as soon\nas I can.  This is my substitute for pistol and ball.  With a\nphilosophical flourish Cato throws himself upon his sword; I quietly\ntake to the ship.  There is nothing surprising in this.  If t

In [0]:
tokens=separate_punc(d)

In [0]:
tokens

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore',
 'i',
 'thought',
 'i',
 'would',
 'sail',
 'about',
 'a',
 'little',
 'and',
 'see',
 'the',
 'watery',
 'part',
 'of',
 'the',
 'world',
 'it',
 'is',
 'a',
 'way',
 'i',
 'have',
 'of',
 'driving',
 'off',
 'the',
 'spleen',
 'and',
 'regulating',
 'the',
 'circulation',
 'whenever',
 'i',
 'find',
 'myself',
 'growing',
 'grim',
 'about',
 'the',
 'mouth',
 'whenever',
 'it',
 'is',
 'a',
 'damp',
 'drizzly',
 'november',
 'in',
 'my',
 'soul',
 'whenever',
 'i',
 'find',
 'myself',
 'involuntarily',
 'pausing',
 'before',
 'coffin',
 'warehouses',
 'and',
 'bringing',
 'up',
 'the',
 'rear',
 'of',
 'every',
 'funeral',
 'i',
 'meet',
 'and',
 'especially',
 'whenever',
 'my',
 'hypos',
 'get',
 'such',
 'an',
 'upper',
 'hand',
 '

In [0]:
#Check the length of tokens of the file read that is containing first four chapters
len(tokens)

11338

In [0]:
#Create a sequence of tokens
#We pass in the first 24 words of a sentence and have our network predict teh 25th word
#Pass 25 words----> Network predict word 26
#Why we select 25 words?
#Because 25 words is long enough to grab the structure of a sentence
#This much of words is enough for text generation

train_len=25+1

text_sequences=[]

for i in range(train_len,len(tokens)):
  seq=tokens[i-train_len:i]
  text_sequences.append(seq)

In [0]:
#Check the type of text_sequences
#It is a giant list
type(text_sequences)

list

In [0]:
#Grab the first item in the list
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [0]:
#Next sequence is 
text_sequences[1]

['me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore']

In [0]:
  #View the first sequences
  ' '.join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [0]:
  #View the next sequences
  ' '.join(text_sequences[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [0]:
  #View the next sequences
  ' '.join(text_sequences[2])

'ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i'

In [0]:
#Our objective is given 25 words, I am suppossed to predict the 26th word
#Use keras tokenization to format this into a numerical system
from keras.preprocessing.text import Tokenizer

In [0]:
#Create an instance of Tokenizer
tokenizer=Tokenizer()

In [0]:
#We are going to fit the text to this tokenizer
tokenizer.fit_on_texts(text_sequences)

In [0]:
sequences=tokenizer.texts_to_sequences(text_sequences)

In [0]:
#Take a look at the sequences
sequences

[[956,
  14,
  263,
  51,
  261,
  408,
  87,
  219,
  129,
  111,
  954,
  260,
  50,
  43,
  38,
  315,
  7,
  23,
  546,
  3,
  150,
  259,
  6,
  2712,
  14,
  24],
 [14,
  263,
  51,
  261,
  408,
  87,
  219,
  129,
  111,
  954,
  260,
  50,
  43,
  38,
  315,
  7,
  23,
  546,
  3,
  150,
  259,
  6,
  2712,
  14,
  24,
  957],
 [263,
  51,
  261,
  408,
  87,
  219,
  129,
  111,
  954,
  260,
  50,
  43,
  38,
  315,
  7,
  23,
  546,
  3,
  150,
  259,
  6,
  2712,
  14,
  24,
  957,
  5],
 [51,
  261,
  408,
  87,
  219,
  129,
  111,
  954,
  260,
  50,
  43,
  38,
  315,
  7,
  23,
  546,
  3,
  150,
  259,
  6,
  2712,
  14,
  24,
  957,
  5,
  60],
 [261,
  408,
  87,
  219,
  129,
  111,
  954,
  260,
  50,
  43,
  38,
  315,
  7,
  23,
  546,
  3,
  150,
  259,
  6,
  2712,
  14,
  24,
  957,
  5,
  60,
  5],
 [408,
  87,
  219,
  129,
  111,
  954,
  260,
  50,
  43,
  38,
  315,
  7,
  23,
  546,
  3,
  150,
  259,
  6,
  2712,
  14,
  24,
  957,
  5,
  60,
  5,
  5

In [0]:
#Take a look at teh first sequences
sequences[0]

[956,
 14,
 263,
 51,
 261,
 408,
 87,
 219,
 129,
 111,
 954,
 260,
 50,
 43,
 38,
 315,
 7,
 23,
 546,
 3,
 150,
 259,
 6,
 2712,
 14,
 24]

In [0]:
#Next entry in the sequence
sequences[1]

[14,
 263,
 51,
 261,
 408,
 87,
 219,
 129,
 111,
 954,
 260,
 50,
 43,
 38,
 315,
 7,
 23,
 546,
 3,
 150,
 259,
 6,
 2712,
 14,
 24,
 957]

In [0]:
#What we did here?
#We replaced the original text sequences that has words to numbers
#Now each of these numbers is an ID for a particular word
#To figure out the relationship, we simply do teh below
tokenizer.index_word

{1: 'the',
 2: 'a',
 3: 'and',
 4: 'of',
 5: 'i',
 6: 'to',
 7: 'in',
 8: 'it',
 9: 'that',
 10: 'he',
 11: 'his',
 12: 'was',
 13: 'but',
 14: 'me',
 15: 'with',
 16: 'as',
 17: 'at',
 18: 'this',
 19: 'you',
 20: 'is',
 21: 'all',
 22: 'for',
 23: 'my',
 24: 'on',
 25: 'be',
 26: "'s",
 27: 'not',
 28: 'from',
 29: 'there',
 30: 'one',
 31: 'up',
 32: 'what',
 33: 'him',
 34: 'so',
 35: 'bed',
 36: 'now',
 37: 'about',
 38: 'no',
 39: 'into',
 40: 'by',
 41: 'were',
 42: 'out',
 43: 'or',
 44: 'harpooneer',
 45: 'had',
 46: 'then',
 47: 'have',
 48: 'an',
 49: 'upon',
 50: 'little',
 51: 'some',
 52: 'old',
 53: 'like',
 54: 'if',
 55: 'they',
 56: 'would',
 57: 'do',
 58: 'over',
 59: 'landlord',
 60: 'thought',
 61: 'room',
 62: 'when',
 63: 'could',
 64: "n't",
 65: 'night',
 66: 'here',
 67: 'head',
 68: 'such',
 69: 'which',
 70: 'man',
 71: 'did',
 72: 'sea',
 73: 'time',
 74: 'other',
 75: 'very',
 76: 'go',
 77: 'these',
 78: 'more',
 79: 'though',
 80: 'first',
 81: 'sort',


In [0]:
for i in sequences[0]:
  print(f"{i} : {tokenizer.index_word[i]}")

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
315 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2712 : interest
14 : me
24 : on


In [0]:
 #In the above output, 14 is a unique id to the word me
 #Likewise 50 is a unique id for the word little

In [0]:
#Other information that tokenizer provides
tokenizer.word_counts
#This shows how many times a particular word shows up

OrderedDict([('call', 27),
             ('me', 2471),
             ('ishmael', 133),
             ('some', 758),
             ('years', 135),
             ('ago', 84),
             ('never', 449),
             ('mind', 164),
             ('how', 321),
             ('long', 374),
             ('precisely', 37),
             ('having', 142),
             ('little', 767),
             ('or', 950),
             ('no', 1003),
             ('money', 120),
             ('in', 5647),
             ('my', 1786),
             ('purse', 71),
             ('and', 9646),
             ('nothing', 281),
             ('particular', 152),
             ('to', 6497),
             ('interest', 24),
             ('on', 1716),
             ('shore', 26),
             ('i', 7150),
             ('thought', 676),
             ('would', 702),
             ('sail', 104),
             ('about', 1014),
             ('a', 10377),
             ('see', 416),
             ('the', 15540),
             ('watery', 26),
  

In [0]:
#Let us get the size of our vocabulary
vocabulary_size=len(tokenizer.word_counts)

In [0]:
vocabulary_size
#There are 2717 unique words present in the four chapters

2717

In [0]:
#Check the type of sequences
type(sequences)

list

In [0]:
#We want to format the above into numpy matrix
import numpy as np

In [0]:
sequences=np.array(sequences)

In [0]:
sequences
#Each of teh rows represent a single line in the text
#What we are doing is we are shifting one word for every line

array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

In [0]:
sequences[1]
#Here 14,263,51,261.........24 are features i.e. X
#Here 957 is label i.e. Y

array([  14,  263,   51,  261,  408,   87,  219,  129,  111,  954,  260,
         50,   43,   38,  315,    7,   23,  546,    3,  150,  259,    6,
       2712,   14,   24,  957])

###Text Generation with LSTMs with Keras and Python - Part Two

In [0]:
#Create the LSTM based model
#Split the data into Features and Labels
#####X Features(First n words of Sequence)
#####Y Label(Next Word after teh Sequence)
#Fit the Model

In [0]:
from keras.utils import to_categorical

In [0]:
#We take the sequences and grab everything except the last column
X=sequences[:,:-1]

In [0]:
#Now grab only the last column
y=sequences[:,-1]

In [0]:
#Change y into to_categorical
#We mention vocabulary_size+1 because the way keras padding works is because we need extra one to hold zero
y=to_categorical(y,num_classes=vocabulary_size+1)

In [0]:
#We check the shape of X at index 1
seq_len=X.shape[1]

In [0]:
X.shape
#There are 11312 sequences with 25 words in each sequence

(11312, 25)

In [0]:
#Create the model
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [0]:
def create_model(vocabulary_size,seq_len):
  model=Sequential()
  model.add(Embedding(vocabulary_size,seq_len,input_length=seq_len))
  #input_dim=vocabulary_size
  #output_dim=seq_len
  #Let us add LSTM 
  model.add(LSTM(50,return_sequences=True))
  #units: it is always a good idea to put this number as a multiple of seq_len
  #More the units i.e. neuron, it will take more time to train
  model.add(LSTM(50))
  model.add(Dense(50,activation='relu'))
  #We want to translate this into a vocabulary word
  model.add(Dense(vocabulary_size,activation='softmax'))
  model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
  #we are treating each individual word in teh vocabulary as its own category
  model.summary(  )
  return model

In [0]:
model=create_model(vocabulary_size+1,seq_len)






Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            67950     
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 50)            15200     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_2 (Dense)              (None, 2718)              138618    
Total params: 244,518
Trainable params: 244,518
Non-trainable params: 0
_________________________________________________________________


In [0]:
#Train and fit the model
from pickle import dump,load 
#This helps us to save the file and load it later

In [0]:
model.fit(X,y,batch_size=128,epochs=2,verbose=1)
#The batch_size is how many sequences we want to pass at a time
#We don't want to pass everything at a time

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/2





Epoch 2/2


<keras.callbacks.History at 0x7fc8de5fccf8>

In [0]:
#The accuracy is very poor
#Save the model
model.save('my_mobydick_model.h5')

In [0]:
#We are saving the tokenizer that is having information
dump(tokenizer,open('my_simpletokenizer','wb'))
# we are creating a file with the name my_simpletokenizer and are writing binary to it in order to dump this tokenizer to the
#created file

###Text Generation with LSTMS with Keras - Part Three

In [0]:
from keras.preprocessing.sequence import pad_sequences

In [0]:
#We generate new text based off a seed
#Create a function to generate new text based on the seed
def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):
  output_text=[]
  
  #We are supposed to feed some sort of line of 25 tokens
  input_text=seed_text
  #Itz going to generate one word after that
  #We are going to chop off the very first word of the seed
  #Take our new word,put it at the end 
  #We now have our new seed text or a new input text
  #We are going to keep doing that the number of times the user want to generate the words as specified in num_gen_words
  #We will take the input text and encode it to a sequence
  for i in range(num_gen_words):
    encoded_text=tokenizer.texts_to_sequences([input_text])[0]
    #now we pad sequences to our trained array
    
    pad_encoded=pad_sequences([encoded_text],maxlen=seq_len,truncating='pre')
    
    pred_word_ind=model.predict_classes(pad_encoded,verbose=0)[0]

    pred_word=tokenizer.index_word[pred_word_ind]

    #The input text is going to have a blank space plus the predicted word

    input_text += ' '+pred_word

    output_text.append(pred_word)


  return ' '.join(output_text)

In [0]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [0]:
import random
random.seed(101)
random_pick=random.randint(0,len(text_sequences))

In [0]:
#Let us pick some random seed text
random_seed_text=text_sequences[random_pick]

In [0]:
random_seed_text

['thought',
 'i',
 'to',
 'myself',
 'the',
 'man',
 "'s",
 'a',
 'human',
 'being',
 'just',
 'as',
 'i',
 'am',
 'he',
 'has',
 'just',
 'as',
 'much',
 'reason',
 'to',
 'fear',
 'me',
 'as',
 'i',
 'have']

In [0]:
#Let us join the above
seed_text=' '.join(random_seed_text)

In [0]:
seed_text
#Now this looks as a sentence

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [0]:
#you can write your own sentence
#seed_text=''
#This is not recommended unless you mimic the style of Moby

In [0]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)
#This is a very poor model that we trained with poor accuracy as the epochs are less
#I expect to get a very bad results

'the the the the the the the the the the the the the the the the the the the the the the the the the'

###Now we will try training the model on entire Moby file with 300 epochs. BUt the accuracy will be around 60% only