In [39]:
#import necessary libraries
import numpy as np                                                   #for numerical operations
import pandas as pd                                                  #for manipulation
import matplotlib.pyplot as plt                                      #for creating interactive visualizations
import os
import pickle                                                        #used for saving/loading trained machine learning models
import tensorflow as tf                                              #for building/training deep learning models
from tensorflow import keras                                         #provide interface for building/training neural networks
from tensorflow.keras.preprocessing.text import Tokenizer            #to convert text into a sequence of tokens or words 
from tensorflow.keras.layers import Embedding, LSTM, Dense           #Embedding->word embeddings,
                                                                     #LSTM ->type of RNN layer,Dense->fully connected layer
from tensorflow.keras.models import Sequential         #linear stack of layers in Keras(allow us to build model layer by layer)
from tensorflow.keras.preprocessing.sequence import pad_sequences  #ensure that all sequences in a list have the same length 

In [40]:
#read the data file
path=r"C:\Users\taman\Downloads\Sherlock Holmes Dataset.txt"         #path of your text file
#read inside a context manager so the file handle is closed as soon as the read finishes
#NOTE(review): no encoding is passed, so the platform default is used - confirm it matches the file
with open(path) as corpus_file:
    text = corpus_file.read().lower()                                #whole corpus as one lowercase string
print('length of the corpus is: :', len(text))                       #number of characters in the corpus

length of the corpus is: : 610921


In [41]:
#preprocessing
#-----Tokenization------ the corpus is broken into word-level tokens and each distinct
#word is mapped to an integer index
tokenizer = Tokenizer()
#fit_on_texts expects a list of texts, so the whole corpus is passed as a single document
corpus_as_documents = [text]
tokenizer.fit_on_texts(corpus_as_documents)
#word_index maps word -> index; Keras starts these indices at 1 (0 is left unassigned),
#so the number of output classes is the vocabulary size plus one
vocabulary = tokenizer.word_index
total_words = len(vocabulary) + 1
total_words                      #displayed as the cell result

8200

In [42]:
#Convert every line of the corpus into its list of token indices (one list per line)
tokenized_lines = tokenizer.texts_to_sequences(text.split('\n'))
#Training examples are the n-gram prefixes of each line: the first 2 tokens, the first 3
#tokens, ... up to the whole line. Lines with fewer than 2 tokens contribute nothing.
input_sequences = [
    tokens[:end]
    for tokens in tokenized_lines
    for end in range(2, len(tokens) + 1)
]

In [43]:
#length of the longest n-gram sequence; every sequence is padded up to this length
max_sequence_len = max(map(len, input_sequences))

#pad_sequences -> makes all sequences the same length (maxlen)
#padding='pre' -> the padding is inserted at the start of each sequence, so the last
#                 column always holds the final token of the n-gram
padded = pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
#np.array -> store the padded sequences as a single 2-D numpy array
input_sequences = np.array(padded)

In [44]:
#separate every padded sequence into the model input and the word to predict:
#X -> all columns except the last one, y -> the last column only
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [45]:
#inspect the model inputs: one left-padded row of token indices per n-gram prefix
X

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1, 1561],
       [   0,    0,    0, ...,    1, 1561,    5],
       ...,
       [   0,    0,    0, ...,   28,    1, 8198],
       [   0,    0,    0, ...,    1, 8198, 8199],
       [   0,    0,    0, ..., 8198, 8199, 3187]])

In [46]:
#inspect the targets: the last token index of every padded n-gram sequence
y

array([1561,    5,  129, ..., 8199, 3187, 3186])

In [47]:
#one-hot encode the target word indices: each label becomes a vector of length total_words
#the labels are read from the last column of input_sequences (the same values y was built
#from) rather than from y itself, so re-running this cell does not encode an already
#one-hot encoded y a second time
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)


In [48]:
#Next-word prediction network: embedding -> LSTM -> softmax over the vocabulary
model = Sequential([
    #maps each of the total_words token indices to a 100-dimensional vector;
    #the input is one n-gram prefix of max_sequence_len-1 token indices
    Embedding(total_words, 100, input_length=max_sequence_len-1),
    #recurrent layer with 128 units that summarises the embedded sequence
    LSTM(128),
    #one output per vocabulary index; softmax turns them into a probability distribution
    Dense(total_words, activation='softmax'),
])

#summary() prints the table itself and returns None, so it is called directly
#(wrapping it in print() adds a stray "None" line to the output)
model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 17, 100)           820000    
                                                                 
 lstm_2 (LSTM)               (None, 128)               117248    
                                                                 
 dense_2 (Dense)             (None, 8200)              1057800   
                                                                 
Total params: 1,995,048
Trainable params: 1,995,048
Non-trainable params: 0
_________________________________________________________________
None


In [53]:
#categorical cross-entropy matches the one-hot encoded targets; accuracy is reported per epoch
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
#epochs  -> number of full passes over the training data
#verbose -> 1 prints progress information for every epoch
#the object returned by fit() is kept in `lstm`; its .history holds the per-epoch metrics
lstm = model.fit(x=X, y=y, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [60]:
#training loss recorded by fit(), one value per epoch
lstm.history['loss']

[5.7821946144104,
 5.259610176086426,
 4.8975725173950195,
 4.5778350830078125,
 4.287757873535156,
 4.018226623535156,
 3.7651207447052,
 3.5243349075317383,
 3.3005177974700928,
 3.09148907661438,
 2.8988306522369385,
 2.720496654510498,
 2.5586838722229004,
 2.406815767288208,
 2.2679531574249268,
 2.1399126052856445,
 2.0239107608795166,
 1.9158556461334229,
 1.8185195922851562,
 1.7265570163726807,
 1.6451746225357056,
 1.5676207542419434,
 1.4959113597869873,
 1.4310874938964844,
 1.3686628341674805,
 1.3150478601455688,
 1.2614696025848389,
 1.2141187191009521,
 1.171701192855835,
 1.1290698051452637,
 1.0942432880401611,
 1.0568344593048096,
 1.0271456241607666,
 0.9955662488937378,
 0.9696739912033081,
 0.9431201219558716,
 0.9200921058654785,
 0.895290732383728,
 0.8762571811676025,
 0.8570692539215088,
 0.8386108875274658,
 0.8235945701599121,
 0.806557297706604,
 0.7906132340431213,
 0.7799181938171387,
 0.7652570605278015,
 0.7537423372268677,
 0.7448998093605042,
 0.73358

In [82]:
#Generate text by repeatedly predicting the most likely next word and appending it
seed_text = "i found in"    #starting phrase the model continues from
next_words = 6              #maximum number of words to append

for _ in range(next_words):
    #encode the text generated so far and left-pad it to the model's input length
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    #predict() returns one row of class probabilities for the single input sequence;
    #take the index of the most likely class as a plain int (verbose=0 silences the
    #progress output that would otherwise be printed on every step)
    predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
    #index_word is the tokenizer's index -> word mapping, so no scan over word_index is needed
    output_word = tokenizer.index_word.get(predicted, "")
    if not output_word:
        #the predicted index has no word (e.g. the padding index 0); appending nothing would
        #leave the seed unchanged, so stop instead of accumulating trailing spaces
        break
    seed_text += " " + output_word

print(seed_text)

i found in the morning it was an evil
