In [27]:
import nltk
from nltk.corpus import gutenberg
import pandas as pd

In [28]:
# Make sure the Gutenberg sample corpus is present in the local nltk_data
# directory; the value of the call is displayed as the cell output.
import nltk
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Panks\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [29]:
## load the data
# Pull the raw text of Hamlet out of the NLTK Gutenberg corpus and keep a
# local copy on disk so the following cells can work from a plain file.
data = gutenberg.raw('shakespeare-hamlet.txt')
with open('dataset.txt', mode='w') as corpus_file:
    corpus_file.write(data)

In [30]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [31]:
# Read the saved corpus back in and normalise it to lower case.
with open('dataset.txt') as corpus_file:
    raw_text = corpus_file.read()
text = raw_text.lower()

In [32]:
## tokenize the text
# Fit a Keras Tokenizer on the whole corpus to build the word -> integer id map.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# Word ids start at 1; the +1 accounts for id 0, which is reserved and later
# used as the padding value.
total_words = len(tokenizer.word_index) + 1
# NOTE: `input_sequences` is initialised in the n-gram cell right before it is
# filled, so the duplicate empty-list initialisation that used to live here
# was dead code and has been removed.

In [33]:
# n-gram (prefix) sequence creation
# Each line of the corpus is converted to a list of token ids. For a line with
# ids [w1, w2, ..., wk] we emit every prefix of length >= 2:
#   [w1, w2], [w1, w2, w3], ..., [w1, ..., wk]
# Lines with fewer than two known tokens contribute nothing.
# `input_sequences` collects these prefixes for the whole text, in order.

input_sequences = []
for encoded_line in tokenizer.texts_to_sequences(text.split('\n')):
    # prefix_end runs over 2..len(encoded_line), i.e. slices [:2], [:3], ...
    input_sequences.extend(
        encoded_line[:prefix_end]
        for prefix_end in range(2, len(encoded_line) + 1)
    )

In [34]:
# Inspect the generated prefix sequences (each entry is a list of token ids).
input_sequences

[[1, 687],
 [1, 687, 4],
 [1, 687, 4, 45],
 [1, 687, 4, 45, 41],
 [1, 687, 4, 45, 41, 1886],
 [1, 687, 4, 45, 41, 1886, 1887],
 [1, 687, 4, 45, 41, 1886, 1887, 1888],
 [1180, 1889],
 [1180, 1889, 1890],
 [1180, 1889, 1890, 1891],
 [57, 407],
 [57, 407, 2],
 [57, 407, 2, 1181],
 [57, 407, 2, 1181, 177],
 [57, 407, 2, 1181, 177, 1892],
 [407, 1182],
 [407, 1182, 63],
 [408, 162],
 [408, 162, 377],
 [408, 162, 377, 21],
 [408, 162, 377, 21, 247],
 [408, 162, 377, 21, 247, 882],
 [18, 66],
 [451, 224],
 [451, 224, 248],
 [451, 224, 248, 1],
 [451, 224, 248, 1, 30],
 [408, 407],
 [451, 25],
 [408, 6],
 [408, 6, 43],
 [408, 6, 43, 62],
 [408, 6, 43, 62, 1893],
 [408, 6, 43, 62, 1893, 96],
 [408, 6, 43, 62, 1893, 96, 18],
 [408, 6, 43, 62, 1893, 96, 18, 566],
 [451, 71],
 [451, 71, 51],
 [451, 71, 51, 1894],
 [451, 71, 51, 1894, 567],
 [451, 71, 51, 1894, 567, 378],
 [451, 71, 51, 1894, 567, 378, 80],
 [451, 71, 51, 1894, 567, 378, 80, 3],
 [451, 71, 51, 1894, 567, 378, 80, 3, 273],
 [451, 71

In [35]:
# Length (in tokens) of the longest prefix sequence produced above.
max_sequence_length = max(map(len, input_sequences))
max_sequence_length

14

In [36]:
# Left-pad every prefix sequence with zeros so that all rows have the length of
# the longest sequence (maxlen=max_sequence_length).
# padding='pre' puts the zeros at the beginning, which keeps the real tokens
# right-aligned: the LAST column of each row is the word to predict and all
# columns before it are the context fed to the model.
# pad_sequences already returns a NumPy array, so wrapping it in np.array()
# only made a redundant copy.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')


In [37]:
# Inspect the padded array: zeros on the left, token ids right-aligned.
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [38]:
import tensorflow as tf

# Every column except the last is the model input (the context words);
# the last column is the target (the next word to predict).
x = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [39]:
# One-hot encode the targets: each word id becomes a vector of length
# total_words that is zero everywhere except for a 1 at the position of that id
# (for example, id 3 -> [0, 0, 0, 1, 0, ...]).
y = tf.keras.utils.to_categorical(
    y,
    num_classes=total_words,
)

In [40]:
# train-test split
# Hold out 20% of the samples. Fixing random_state makes the split, and
# therefore every metric reported below, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [41]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the validation loss has gone 3 epochs without improving,
# and roll the model back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU

# Stacked-LSTM next-word model.
# Input: max_sequence_length-1 context token ids; output: a probability for
# every word in the vocabulary.
model1 = Sequential()
model1.add(Embedding(total_words, 64, input_length=max_sequence_length-1)) # embedding layer to convert word ids to dense vectors
model1.add(LSTM(150, return_sequences=True)) # first LSTM returns the full sequence so the next LSTM can consume it
model1.add(Dropout(0.2)) # dropout layer to reduce overfitting
model1.add(LSTM(100)) # second LSTM keeps only its final state
# The output layer must be attached to model1. It was previously added to
# `model2`, which is not defined yet at this point on a fresh kernel (NameError)
# and, on a stale kernel, left model1 without its softmax head.
model1.add(Dense(total_words, activation='softmax')) # dense layer to output the probabilities of each word
model1.build(input_shape=(None, max_sequence_length-1)) # build the model with the input shape
model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) # compile the model with the loss function and optimizer
model1.summary() # print the model summary



In [43]:
# GRU variant of the next-word model: embedding -> GRU -> dropout -> GRU ->
# softmax over the vocabulary. The first GRU returns the whole sequence so the
# second one can consume it.
model2 = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_length-1),
    GRU(150, return_sequences=True),
    Dropout(0.2),
    GRU(100),
    Dense(total_words, activation='softmax'),
])
# Fix the input shape (batch, context length) so summary() can report sizes.
model2.build(input_shape=(None, max_sequence_length-1))
model2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model2.summary()

In [45]:
# Sanity-check the training arrays: inputs first, one-hot targets second.
print(x_train.shape, y_train.shape, sep='\n')


(20585, 13)
(20585, 4818)


In [46]:
# Train the GRU model for at most 50 epochs; the held-out split is used for
# validation and the early-stopping callback may end training sooner.
history1 = model2.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=50,
    callbacks=[early_stopping],
)

Epoch 1/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 30ms/step - accuracy: 0.0315 - loss: 7.2112 - val_accuracy: 0.0303 - val_loss: 6.9202
Epoch 2/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 29ms/step - accuracy: 0.0380 - loss: 6.4896 - val_accuracy: 0.0527 - val_loss: 6.7965
Epoch 3/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 29ms/step - accuracy: 0.0538 - loss: 6.1798 - val_accuracy: 0.0631 - val_loss: 6.7561
Epoch 4/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 28ms/step - accuracy: 0.0720 - loss: 5.8741 - val_accuracy: 0.0676 - val_loss: 6.7506
Epoch 5/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 26ms/step - accuracy: 0.0857 - loss: 5.6186 - val_accuracy: 0.0723 - val_loss: 6.8882
Epoch 6/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 28ms/step - accuracy: 0.0940 - loss: 5.4039 - val_accuracy: 0.0740 - val_loss: 6.9405
Epoch 7/50
[1m6

In [48]:
def predict_next_word(model, tokenizer, text, max_sequence_length):
    """Predict the word most likely to follow ``text``.

    Args:
        model: Trained model whose ``predict`` returns one row of per-word
            scores for every input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` reverse mapping.
        text: Seed text to continue.
        max_sequence_length: Length the training sequences were padded to.
            The model itself consumes ``max_sequence_length - 1`` context
            tokens (the last position was the training target).

    Returns:
        The predicted word, or ``None`` when the highest-scoring index has no
        word attached (for example the padding index 0).
    """
    context_length = max_sequence_length - 1
    token_list = tokenizer.texts_to_sequences([text])[0]
    # Keep only the MOST RECENT tokens that fit in the model's context window.
    # (The previous slice `token_list[max_sequence_length:]` dropped them
    # instead, leaving an empty or stale context for longer inputs.)
    if len(token_list) > context_length:
        token_list = token_list[len(token_list) - context_length:]
    token_list = pad_sequences([token_list], maxlen=context_length, padding='pre')
    predicted = model.predict(token_list)
    # A single row is predicted, so take its argmax as a plain Python int
    # rather than comparing against a one-element array.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])
    # index_word is the tokenizer's id -> word map; .get returns None if absent.
    return tokenizer.index_word.get(predicted_word_index)