In [None]:
# Data Collection

import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd

# Load the dataset
data = gutenberg.raw('shakespeare-hamlet.txt')

# Save to a file
# The encoding is pinned so the bytes written do not depend on the platform's
# locale default (which can differ between machines and raise on some text).
with open("shakespeare.txt", 'w', encoding='utf-8') as file:
    file.write(data)


In [None]:
# Data preprocessing

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from sklearn.model_selection import train_test_split

# Load the dataset
# The encoding is stated explicitly instead of relying on the locale default,
# so the notebook reads the file the same way on every platform.
with open('shakespeare.txt', 'r', encoding='utf-8') as file:
    text = file.read().lower()

#Tokenize the text
tokenzier = Tokenizer()
tokenzier.fit_on_texts([text])
# Keras' Tokenizer numbers words from 1 and reserves index 0 (used here for
# padding), hence the +1 when sizing the vocabulary.
total_words = len(tokenzier.word_index)+1
total_words


In [22]:
# Create Input Sequences
# Each line of the corpus contributes one n-gram per prefix of length >= 2:
# [w1, w2], [w1, w2, w3], ... up to the whole tokenized line.
tokenized_lines = (
    tokenzier.texts_to_sequences([raw_line])[0]
    for raw_line in text.split('\n')
)
input_sequence = [
    tokens[:prefix_len]
    for tokens in tokenized_lines
    for prefix_len in range(2, len(tokens) + 1)
]


In [None]:
# Preview only the first few n-grams; displaying the whole collection floods
# the cell output and bloats the saved notebook.
input_sequence[:5]

In [None]:
# Pad Sequences
# Length of the longest n-gram; every sequence is padded at the front
# (padding='pre') up to this width so the rows form a rectangular array.
max_sequence_len = max(map(len, input_sequence))

input_sequence = np.array(
    pad_sequences(
        input_sequence,
        maxlen=max_sequence_len,
        padding='pre',
    )
)
input_sequence

In [25]:
# create predictors and label
import tensorflow as tf

# Every column but the last is the model input (context words);
# the last column is the word to predict.
X = input_sequence[:, :-1]
y = input_sequence[:, -1]



In [None]:
# One-hot encode the labels. They are read from the last column of
# input_sequence rather than from y itself, so re-running this cell does not
# one-hot encode an array that is already one-hot.
y = tf.keras.utils.to_categorical(input_sequence[:, -1], num_classes=total_words)
y

In [27]:
#Split the data into training and testing
# 20% of the samples are held out; the fixed random_state makes the partition
# repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# #Train LSTM RNN

# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
# from tensorflow.keras.regularizers import l2
# from tensorflow.keras.optimizers import Adam

# #Define Model
# model = Sequential()
# model.add(Embedding(input_dim=total_words,output_dim=100,input_length=max_sequence_len))
# model.add(LSTM(150,return_sequences=True))
# model.add(Dropout(0.2))
# model.add(LSTM(100))
# model.add(Dense(total_words,activation='softmax'))

# # Build the Model
# model.build(input_shape=(None, max_sequence_len))  # None for batch size, max_sequence_len for sequence length

# optimizer = Adam(learning_rate=0.0005)
# #Compile Model
# model.compile(loss='categorical_crossentropy',optimizer = optimizer, metrics =['accuracy'])

In [None]:
# model.summary()

In [11]:
# # Early Stopping

# from tensorflow.keras.callbacks import EarlyStopping

# early_stopping = EarlyStopping(
#     monitor ='val_loss',
#     patience = 3,
#     restore_best_weights = True
# )

In [None]:
# # Train Model

# history = model.fit(
#     X_train,
#     y_train,
#     epochs = 50,
#     batch_size = 64,
#     validation_data=(X_test,y_test),
#     #callbacks=[early_stopping],
#     verbose=1
# )

In [34]:
#Train GRU RNN

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

#Define Model
model = Sequential()
# `input_length` is deprecated in current Keras (it is ignored with a warning);
# the input shape is declared once, by model.build() below.
model.add(Embedding(input_dim=total_words,output_dim=32))
# Two stacked GRU layers; the first returns the full sequence so the second
# can consume it. L2 penalties and dropout are there to limit overfitting.
model.add(GRU(150,return_sequences=True,kernel_regularizer=l2(0.001)))
model.add(Dropout(0.2))
model.add(GRU(100,kernel_regularizer=l2(0.001)))
# One softmax output per vocabulary index.
model.add(Dense(total_words,activation='softmax',kernel_regularizer=l2(0.001)))

# Build the Model
# NOTE(review): X has max_sequence_len - 1 columns (the last token of every
# padded n-gram is the label), so the length declared here is one more than the
# training inputs. It is left as-is because the prediction cell reads
# model.input_shape[1] and pads to that value minus 1; change both together.
model.build(input_shape=(None, max_sequence_len))  # None for batch size, max_sequence_len for sequence length


#Compile Model
model.compile(loss='categorical_crossentropy',optimizer = 'adam', metrics =['accuracy'])




In [None]:
# Train Model
# Training options gathered in one place; the held-out split is passed as
# validation data so its loss/accuracy are reported after every epoch.
fit_options = dict(
    epochs=50,
    batch_size=64,
    validation_data=(X_test, y_test),
    verbose=1,
    # callbacks=[early_stopping],
)
history = model.fit(X_train, y_train, **fit_options)

In [38]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the most likely next word for ``text``, or ``None``.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-index scores for every input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and a
            ``word_index`` mapping (word -> integer index). Its ``index_word``
            mapping is used for the reverse lookup when available.
        text: Seed text to continue.
        max_sequence_len: Length of a full n-gram (context plus the word to
            predict); the context given to the model is
            ``max_sequence_len - 1`` tokens wide.

    Returns:
        The word whose index scores highest, or ``None`` when that index is
        not in the vocabulary (for example index 0, which Keras' Tokenizer
        never assigns to a word).
    """
    context_len = max_sequence_len - 1
    token_list = tokenizer.texts_to_sequences([text])[0]
    # Keep only the most recent tokens that fit in the context window.
    token_list = token_list[max(len(token_list) - context_len, 0):]
    token_list = pad_sequences([token_list], maxlen=context_len, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    # argmax over axis=1 yields one index per row; a single text means a single
    # row, so take element 0 as a plain int instead of comparing to an array.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])
    # Direct reverse lookup; fall back to inverting word_index for
    # tokenizer-like objects that do not carry index_word.
    index_word = getattr(tokenizer, 'index_word', None)
    if not index_word:
        index_word = {index: word for word, index in tokenizer.word_index.items()}
    return index_word.get(predicted_word_index)

In [None]:
# Demo: ask the trained model to continue a well-known line.
input_text = "To be or not to be"
print("Input text:" + input_text)
# Sequence length the model was built with; predict_next_word pads the
# context to this value minus one.
max_sequence_len = model.input_shape[1]
next_word = predict_next_word(
    model,
    tokenzier,
    input_text,
    max_sequence_len,
)
print("Next Word PRediction:" + str(next_word))

In [None]:
## Save the model
model.save("next_word_GRU.h5")
## Save the tokenizer
# The fitted tokenizer is pickled next to the model so the same word -> index
# mapping can be restored at inference time.
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    handle.write(pickle.dumps(tokenzier, protocol=pickle.HIGHEST_PROTOCOL))