In [1]:
#import necessary libraries
import numpy as np                                                   #for numerical operations
import pandas as pd                                                  #for manipulation
import matplotlib.pyplot as plt                                      #for creating interactive visualizations
import os
import pickle                                                        #used for saving/loading trained machine learning models
import tensorflow as tf                                              #for building/training deep learning models
from tensorflow import keras                                         #provide interface for building/training neural networks
from tensorflow.keras.preprocessing.text import Tokenizer            #to convert text into a sequence of tokens or words 
from tensorflow.keras.layers import Embedding, LSTM, Dense           #Embedding->word embeddings,
                                                                     #LSTM ->type of RNN layer,Dense->fully connected layer
from tensorflow.keras.models import Sequential         #linear stack of layers in Keras(allow us to build model layer by layer)
from tensorflow.keras.preprocessing.sequence import pad_sequences  #ensure that all sequences in a list have the same length 

In [2]:
#read the data file
path=r"C:\Users\taman\Downloads\Sherlock Holmes Dataset.txt"         #path of your text file
#open the file in a context manager so the handle is closed as soon as the corpus has been read
#NOTE(review): no encoding is passed, so the platform default is used - confirm it matches the file
with open(path) as corpus_file:
    text = corpus_file.read().lower()                                #read and convert it into lowercase
print('length of the corpus is: :', len(text))                       #checking length (number of characters)

length of the corpus is: : 610921


In [3]:
#preprocessing
#-----Tokenization------break the corpus into word tokens and give every distinct word an integer index
tokenizer = Tokenizer()
#fit_on_texts takes a list of texts, so the whole corpus is passed as a single document
tokenizer.fit_on_texts([text])
#word_index maps word -> integer index
vocabulary = tokenizer.word_index
#one more than the number of distinct words; this value is used later as the size of the
#embedding input and of the softmax output (the padded sequences also contain the value 0)
total_words = 1 + len(vocabulary)
total_words                      #shown as the cell output

8200

In [4]:
#build the training n-grams: for every line of the corpus, keep each prefix of its token ids
#that has at least two tokens (the last token of a prefix is later used as the word to predict)
input_sequences = []
for raw_line in text.split('\n'):                               #one corpus line at a time
    encoded = tokenizer.texts_to_sequences([raw_line])[0]       #token ids of the current line
    #prefixes encoded[:2], encoded[:3], ..., encoded[:len(encoded)]
    input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

In [5]:
#length of the longest n-gram sequence; every sequence is padded up to this length
max_sequence_len = max(map(len, input_sequences))

#pad_sequences-->makes all sequences in input_sequences the same length,
#maxlen->length of every sequence after padding,
#'pre'->padding is added at the beginning of each sequence
padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
#np.array->store the padded sequences as a 2-D numpy array (one row per sequence)
input_sequences = np.array(padded)

In [6]:
#split every padded sequence into model input and target:
#X -> all columns except the last one, y -> the last column (the word to predict)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [7]:
#inspect the input matrix (numpy abbreviates large arrays with '...' in the output)
X

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1, 1561],
       [   0,    0,    0, ...,    1, 1561,    5],
       ...,
       [   0,    0,    0, ...,   28,    1, 8198],
       [   0,    0,    0, ...,    1, 8198, 8199],
       [   0,    0,    0, ..., 8198, 8199, 3187]])

In [8]:
#inspect the target word ids (still integer labels at this point)
y

array([1561,    5,  129, ..., 8199, 3187, 3186])

In [9]:
#one-hot encode the target word ids: one row per sample, total_words columns
#(this is the label format expected by the 'categorical_crossentropy' loss)
#the integer labels are read from the last column of input_sequences instead of from y,
#so re-running this cell does not try to one-hot encode an already encoded y
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)


In [10]:
#next-word model: word ids -> 100-dimensional embeddings -> LSTM(128) -> softmax over the vocabulary
model = Sequential()
# Adding an Embedding layer (input length is one less than the padded sequences,
# because the last token of every sequence is used as the target)
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))

# Adding an LSTM layer
model.add(LSTM(128))

# Adding a Dense layer with one output unit per vocabulary index
model.add(Dense(total_words, activation='softmax'))

# Printing the model summary
# summary() prints the table itself and returns None, so it must not be wrapped in print()
# (doing so emitted a stray "None" after the table)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 17, 100)           820000    
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 8200)              1057800   
                                                                 
Total params: 1,995,048
Trainable params: 1,995,048
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
#configure training: cross-entropy over the one-hot targets, Adam optimizer, accuracy reported as a metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
#epochs->number of times the model will iterate over the entire training dataset
#verbose->controls the amount of information printed during training (1 = progress output for each epoch)
#the object returned by fit() is kept in 'lstm'; its .history dict is read in a later cell
lstm = model.fit(X, y, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [12]:

#loss values recorded during training, taken from the object returned by model.fit
lstm.history['loss']

[6.260003089904785,
 5.541238307952881,
 5.1647539138793945,
 4.847181797027588,
 4.5656328201293945,
 4.304337024688721,
 4.054908752441406,
 3.8156862258911133,
 3.588751792907715,
 3.3739705085754395,
 3.17478346824646,
 2.9884605407714844,
 2.818697929382324,
 2.661052703857422,
 2.513260841369629,
 2.379453420639038,
 2.2546467781066895,
 2.1416709423065186,
 2.03425931930542,
 1.9367928504943848,
 1.8461313247680664,
 1.7637544870376587,
 1.6855756044387817,
 1.613268494606018,
 1.5465370416641235,
 1.4851809740066528,
 1.4287761449813843,
 1.3756815195083618,
 1.3268758058547974,
 1.2798045873641968,
 1.2367539405822754,
 1.1981438398361206,
 1.159767746925354,
 1.1228121519088745,
 1.0922101736068726,
 1.061352252960205,
 1.0328521728515625,
 1.0049593448638916,
 0.9803075790405273,
 0.9538580179214478,
 0.9332873225212097,
 0.9114183783531189,
 0.892274796962738,
 0.8730690479278564,
 0.8549835085868835,
 0.8364862203598022,
 0.8233078122138977,
 0.8090895414352417,
 0.7934899

In [22]:
seed_text = "i found in"       #prompt the generated words are appended to
next_words = 6                  #number of words to generate

#greedy generation: predict the most likely next word, append it, and feed the longer text back in
for _ in range(next_words):
    #encode the current text with the same tokenizer that was used for training
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    #pad on the left to the input length the model was built with
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    #predict() returns one probability row for the single input sequence;
    #take the index of the largest probability as a plain Python int
    predicted = int(np.argmax(model.predict(token_list), axis=-1)[0])
    #index_word is the tokenizer's index -> word mapping, so no scan over word_index is needed;
    #an index without a word yields "" exactly as the original loop did
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word

print(seed_text)

i found in the direction of the wood sherlock
