In [1]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

text = gutenberg.raw('austen-emma.txt')[:10000] 

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/tanishrajput/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
print(text)

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died too long ago for her to have more than an indistinct
remembrance of her caresses; and her place had been supplied
by an excellent woman as governess, who had fallen little short
of a mother in affection.

Sixteen years had Miss Taylor been in Mr. Woodhouse's family,
less as a governess than a friend, very fond of both daughters,
but particularly of Emma.  Between _them_ it was more the intimacy
of sisters.  Even before Miss Taylor had ceased to hold the nominal
office of governess, the mildness o

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
tokenizer = Tokenizer()

In [5]:
tokenizer.fit_on_texts([text])

In [6]:
tokenizer.word_index

{'and': 1,
 'of': 2,
 'a': 3,
 'the': 4,
 'to': 5,
 'her': 6,
 'was': 7,
 'had': 8,
 'in': 9,
 'she': 10,
 'it': 11,
 'i': 12,
 'for': 13,
 'as': 14,
 'but': 15,
 'be': 16,
 'he': 17,
 'with': 18,
 'very': 19,
 'that': 20,
 'not': 21,
 'all': 22,
 'you': 23,
 'have': 24,
 'miss': 25,
 'his': 26,
 'from': 27,
 'mr': 28,
 'taylor': 29,
 'could': 30,
 'is': 31,
 'emma': 32,
 'by': 33,
 'at': 34,
 'how': 35,
 'we': 36,
 'been': 37,
 'friend': 38,
 'them': 39,
 'every': 40,
 'always': 41,
 'must': 42,
 'will': 43,
 'father': 44,
 'any': 45,
 'own': 46,
 'were': 47,
 'no': 48,
 'years': 49,
 'little': 50,
 'house': 51,
 'this': 52,
 'only': 53,
 'such': 54,
 'great': 55,
 'being': 56,
 'much': 57,
 'think': 58,
 'which': 59,
 'so': 60,
 'thought': 61,
 'over': 62,
 'weston': 63,
 'man': 64,
 'when': 65,
 'poor': 66,
 'good': 67,
 'us': 68,
 'see': 69,
 'am': 70,
 'james': 71,
 'woodhouse': 72,
 'some': 73,
 'or': 74,
 'too': 75,
 'long': 76,
 'than': 77,
 'an': 78,
 'affection': 79,
 'now': 

In [7]:
input_sequences = []

for sentence in text.split('\n'):
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

    for i in range(1,len(tokenized_sentence)):
        n_gram = tokenized_sentence[:i + 1]
        input_sequences.append(n_gram)
        

In [8]:
print(input_sequences)

[[32, 33], [32, 33, 234], [32, 33, 234, 235], [32, 33, 234, 235, 236], [237, 12], [238, 12], [32, 72], [32, 72, 239], [32, 72, 239, 240], [32, 72, 239, 240, 1], [32, 72, 239, 240, 1, 241], [32, 72, 239, 240, 1, 241, 18], [32, 72, 239, 240, 1, 241, 18, 3], [32, 72, 239, 240, 1, 241, 18, 3, 242], [32, 72, 239, 240, 1, 241, 18, 3, 242, 243], [1, 130], [1, 130, 131], [1, 130, 131, 244], [1, 130, 131, 244, 5], [1, 130, 131, 244, 5, 245], [1, 130, 131, 244, 5, 245, 73], [1, 130, 131, 244, 5, 245, 73, 2], [1, 130, 131, 244, 5, 245, 73, 2, 4], [1, 130, 131, 244, 5, 245, 73, 2, 4, 246], [1, 130, 131, 244, 5, 245, 73, 2, 4, 246, 247], [2, 248], [2, 248, 1], [2, 248, 1, 8], [2, 248, 1, 8, 132], [2, 248, 1, 8, 132, 249], [2, 248, 1, 8, 132, 249, 250], [2, 248, 1, 8, 132, 249, 250, 97], [2, 248, 1, 8, 132, 249, 250, 97, 49], [2, 248, 1, 8, 132, 249, 250, 97, 49, 9], [2, 248, 1, 8, 132, 249, 250, 97, 49, 9, 4], [2, 248, 1, 8, 132, 249, 250, 97, 49, 9, 4, 251], [18, 19], [18, 19, 50], [18, 19, 50, 5]

In [9]:
lengths = [len(x) for x in input_sequences]
max_len = max(lengths)

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
padded_input_sequences = pad_sequences(input_sequences,maxlen = max_len,padding = 'pre')

In [12]:
X = padded_input_sequences[: ,: -1]
y = padded_input_sequences[:, -1]

In [13]:
X.shape

(1647, 16)

In [14]:
y.shape

(1647,)

In [15]:
from tensorflow.keras.utils import to_categorical

In [16]:
len(tokenizer.word_index)

616

In [17]:
y = to_categorical(y,num_classes = 617) # one-hot encoding starts from 0-index

In [18]:
y.shape

(1647, 617)

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [23]:
model = Sequential()

model.add(Embedding(input_dim=617, output_dim=100, input_length=17))

model.add(LSTM(150))

model.add(Dense(617,activation='softmax'))

In [24]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [26]:
history = model.fit(X,y,epochs = 100)

Epoch 1/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.0218 - loss: 6.2610
Epoch 2/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0381 - loss: 5.6862
Epoch 3/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0331 - loss: 5.6364
Epoch 4/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0340 - loss: 5.5840
Epoch 5/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0523 - loss: 5.4367
Epoch 6/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0585 - loss: 5.3734
Epoch 7/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0697 - loss: 5.2207
Epoch 8/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1003 - loss: 5.0454
Epoch 9/100
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━

In [27]:
model.summary()

In [29]:
text = "She was the"

tokenized_text = tokenizer.texts_to_sequences([text])[0]

padded_input_text = pad_sequences([tokenized_text],maxlen=17,padding='pre')

In [33]:
model.predict(padded_input_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


array([[9.02174264e-08, 1.14790200e-04, 3.84190713e-07, 9.13616259e-06,
        4.95805216e-05, 3.48754543e-06, 8.69295181e-05, 5.25172538e-07,
        1.33398419e-06, 1.49766693e-05, 2.66887059e-06, 1.23482309e-07,
        2.64837149e-06, 4.84151351e-05, 4.99066815e-08, 1.76132016e-05,
        3.44960426e-06, 1.74448701e-06, 4.53999007e-07, 1.33456968e-04,
        1.16308302e-05, 4.75422485e-06, 2.82258793e-06, 1.41328667e-06,
        3.25691144e-05, 5.95107258e-06, 6.55494773e-07, 7.31160708e-06,
        9.24659815e-08, 2.23663250e-07, 7.68805478e-07, 7.82661900e-08,
        1.76392405e-07, 1.22550591e-05, 2.03956279e-06, 3.05209198e-07,
        5.63822937e-07, 4.32157540e-05, 1.10130299e-04, 5.23587744e-07,
        4.36163828e-05, 8.85359441e-06, 1.39978056e-06, 5.15129273e-07,
        2.18925425e-06, 3.05067647e-06, 4.25345570e-05, 1.76752252e-07,
        1.50259768e-06, 2.25917865e-05, 1.81137118e-04, 2.17736000e-03,
        3.46269786e-07, 4.15930754e-06, 1.87913577e-06, 1.500812

In [34]:
import numpy as np

In [37]:
position = np.argmax(model.predict(padded_input_text))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


In [38]:
for word,index in tokenizer.word_index.items():
    if index == position:
        print(word)

youngest


In [42]:
text = "She was the"

for i in range(5):
    tokenize_text = tokenizer.texts_to_sequences([text])[0]
    padded_tokenize_text = pad_sequences([tokenize_text],maxlen=17,padding='pre')
    position = np.argmax(model.predict(padded_tokenize_text))
    
    for word,index in tokenizer.word_index.items():
        if index == position:
            text = text + " " + word
            print(text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
She was the youngest
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
She was the youngest of
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
She was the youngest of the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
She was the youngest of the two
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
She was the youngest of the two daughters


In [44]:
model.save("model.keras")

In [45]:
import pickle

In [46]:
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)