In [1]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
        
    return str_text

In [2]:
import spacy
nlp = spacy.load('en_core_web_lg',disable=['parser','tagger','ner'])
nlp.max_length = 1198623

In [3]:
def separate_punc(doc_text):
    '''Get rid of \n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n'''
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [4]:
d = read_file('moby_dick_four_chapters.txt')
d[:300]

'Call me Ishmael.  Some years ago--never mind how long\nprecisely--having little or no money in my purse, and nothing\nparticular to interest me on shore, I thought I would sail about a\nlittle and see the watery part of the world.  It is a way I have of\ndriving off the spleen and regulating the circula'

In [5]:
tokens = separate_punc(d)
tokens



['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore',
 'i',
 'thought',
 'i',
 'would',
 'sail',
 'about',
 'a',
 'little',
 'and',
 'see',
 'the',
 'watery',
 'part',
 'of',
 'the',
 'world',
 'it',
 'is',
 'a',
 'way',
 'i',
 'have',
 'of',
 'driving',
 'off',
 'the',
 'spleen',
 'and',
 'regulating',
 'the',
 'circulation',
 'whenever',
 'i',
 'find',
 'myself',
 'growing',
 'grim',
 'about',
 'the',
 'mouth',
 'whenever',
 'it',
 'is',
 'a',
 'damp',
 'drizzly',
 'november',
 'in',
 'my',
 'soul',
 'whenever',
 'i',
 'find',
 'myself',
 'involuntarily',
 'pausing',
 'before',
 'coffin',
 'warehouses',
 'and',
 'bringing',
 'up',
 'the',
 'rear',
 'of',
 'every',
 'funeral',
 'i',
 'meet',
 'and',
 'especially',
 'whenever',
 'my',
 'hypos',
 'get',
 'such',
 'an',
 'upper',
 'hand',
 '

In [6]:
len(tokens)

11338

## Create sequence of tokens
###### Pass 25 words and then the network predicts 26th word

In [7]:
train_len = 25 + 1

text_sequences = []

for i in range(train_len,len(tokens)):
    seq = tokens[i - train_len:i]
    
    text_sequences.append(seq)

In [8]:
print(text_sequences[0])
print(text_sequences[1])
print(text_sequences[2])

['call', 'me', 'ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on']
['me', 'ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore']
['ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', 'i']


In [9]:
' '.join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [11]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [12]:
sequences[1]

[14,
 263,
 51,
 261,
 408,
 87,
 219,
 129,
 111,
 954,
 260,
 50,
 43,
 38,
 314,
 7,
 23,
 546,
 3,
 150,
 259,
 6,
 2713,
 14,
 24,
 957]

In [13]:
for i in sequences[1]:
    
    print(f"{i}: {tokenizer.index_word[i]}")
# tokenizer.index_word

14: me
263: ishmael
51: some
261: years
408: ago
87: never
219: mind
129: how
111: long
954: precisely
260: having
50: little
43: or
38: no
314: money
7: in
23: my
546: purse
3: and
150: nothing
259: particular
6: to
2713: interest
14: me
24: on
957: shore


In [14]:
tokenizer.word_counts

OrderedDict([('call', 27),
             ('me', 2471),
             ('ishmael', 133),
             ('some', 758),
             ('years', 135),
             ('ago', 84),
             ('never', 449),
             ('mind', 164),
             ('how', 321),
             ('long', 374),
             ('precisely', 37),
             ('having', 142),
             ('little', 767),
             ('or', 950),
             ('no', 1003),
             ('money', 120),
             ('in', 5647),
             ('my', 1786),
             ('purse', 71),
             ('and', 9646),
             ('nothing', 281),
             ('particular', 152),
             ('to', 6497),
             ('interest', 24),
             ('on', 1716),
             ('shore', 26),
             ('i', 7150),
             ('thought', 676),
             ('would', 702),
             ('sail', 104),
             ('about', 1014),
             ('a', 10377),
             ('see', 416),
             ('the', 15540),
             ('watery', 26),
  

In [15]:
vocabulary_size = len(tokenizer.word_counts)

In [16]:
vocabulary_size

2718

In [17]:
type(sequences)

list

In [18]:
import numpy as np

sequences = np.array(sequences)
sequences

# Notice that number is moving 1 time frame ahead.

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

In [19]:
from tensorflow.keras.utils import to_categorical
X = sequences[:,:-1]
X

array([[ 956,   14,  263, ...,    6, 2713,   14],
       [  14,  263,   51, ..., 2713,   14,   24],
       [ 263,   51,  261, ...,   14,   24,  957],
       ...,
       [ 952,   12,  166, ...,   11,  262,   53],
       [  12,  166, 2712, ...,  262,   53,    2],
       [ 166, 2712,    3, ...,   53,    2, 2718]])

In [20]:
y = sequences[:,-1]
y

array([  24,  957,    5, ...,    2, 2718,   26])

In [21]:
y = to_categorical(y,num_classes=vocabulary_size+1)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [22]:
seq_len = X.shape[1]
seq_len

25

In [23]:
X.shape

(11312, 25)

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding


In [25]:
def create_model(vocabulary_size,seq_len):
    
    model = Sequential()
    model.add(Embedding(vocabulary_size,seq_len,input_length = seq_len))
    model.add(LSTM(50,return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50,activation='relu'))
    
    model.add(Dense(vocabulary_size,activation = 'softmax'))
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    model.summary()
    
    return model
    

In [26]:
model = create_model(vocabulary_size+1,seq_len)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 25)            67975     
_________________________________________________________________
lstm (LSTM)                  (None, 25, 50)            15200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dense_1 (Dense)              (None, 2719)              138669    
Total params: 244,594
Trainable params: 244,594
Non-trainable params: 0
_________________________________________________________________


In [27]:
from pickle import dump,load


In [28]:
model.fit(X,y,batch_size=128,epochs=2,verbose=1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1a692d3bc88>

In [29]:
model.save('my_mobdick_model.h5')

In [30]:
dump(tokenizer,open('my_simpletokenizer','wb'))

In [31]:
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [78]:
def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):
    
    output_text = []
    
    input_text = seed_text
    
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        print(encoded_text)
        pad_encoded = pad_sequences([encoded_text],maxlen = seq_len)
        
        pred_word_ind = model.predict_classes(pad_encoded,verbose=0)[0]
        
        pred_word = tokenizer.index_word[pred_word_ind]
        
        input_text += ' '+pred_word
        
        output_text.append(pred_word)
    
    return ' '.join(output_text)

In [79]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [80]:
import random
random.seed(101)
random_pick = random.randint(0,len(text_sequences))

In [81]:
random_seed_text = text_sequences[random_pick]

In [82]:
random_seed_text

['thought',
 'i',
 'to',
 'myself',
 'the',
 'man',
 "'s",
 'a',
 'human',
 'being',
 'just',
 'as',
 'i',
 'am',
 'he',
 'has',
 'just',
 'as',
 'much',
 'reason',
 'to',
 'fear',
 'me',
 'as',
 'i',
 'have']

In [83]:
seed_text = ' '.join(random_seed_text)
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [84]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1, 1]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1, 1, 1]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1, 1, 1, 1]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1, 1, 1, 1, 1]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1, 1, 1, 1, 1, 1]




[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1, 1, 1, 1, 1, 1, 1]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1, 1, 1, 1, 1, 1, 1, 1]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 5, 232, 10, 346, 105, 16, 92, 286, 6, 541, 14, 16, 5, 47, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[60, 5, 6, 96, 1, 70, 26, 2, 500, 106, 105, 16, 

'the the the the the the the the the the the the the the the the the the the the the the the the the'

In [None]:
! git add Text_Generation_LSTM.ipynb
! git commit -m "00:19/12-02-2022"
! git push orgin main