# Importing Libraries

In [None]:
import nltk
import pandas as pd

# Fetch the Gutenberg corpus first, then import its reader.
nltk.download('gutenberg')
from nltk.corpus import gutenberg

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


In [None]:
# Pull the raw text of Hamlet out of the NLTK Gutenberg corpus and keep a
# plain-text copy in the working directory.
data = gutenberg.raw('shakespeare-hamlet.txt')
with open('hamlet.txt', 'w') as out_file:
  out_file.write(data)

# Data Preprocessing

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Loading the dataset

In [None]:
# Read the saved play and lowercase it in one step.
with open('hamlet.txt', mode='r') as corpus_file:
  text = corpus_file.read().lower()

Tokenize the Text

In [None]:
# Learn the word -> integer index from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# One more than the number of distinct words: Keras word indices start at 1,
# which leaves index 0 free for the padding value.
total_words = 1 + len(tokenizer.word_index)
total_words

4818

Creating the input sequences

In [None]:
# For every line of the play, emit each prefix of length >= 2 as one training
# sequence: the final token is the target, the preceding ones are the context.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(text.split('\n')):
  input_sequences.extend(
      token_list[:end] for end in range(2, len(token_list) + 1)
  )

Pad Sequences

In [None]:
# Length of the longest n-gram sequence.
max_sequence_len = max(map(len, input_sequences))
max_sequence_len

14

In [None]:
# Left-pad every sequence to the same length so the target word always sits
# in the last column of the resulting 2-D array.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)
input_sequences.shape

(25732, 14)

# Creating Predictors and Label

In [None]:
import tensorflow as tf

# The last column is the word to predict; everything before it is the context.
x = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [None]:
# One-hot encode the target word ids over the full vocabulary.
# NOTE: this overwrites `y`, so the cell is not idempotent -- re-run the cell
# that slices `x, y` before executing this one a second time.
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Split the data into training and testing sets

In [None]:
# Hold out 20% of the sequences for validation. The seed is fixed so that the
# same split (and therefore comparable metrics) is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout

# Defining the model: embedding -> stacked LSTMs -> softmax over the vocabulary.
model=Sequential()
# `input_length` is deprecated in Keras 3 and only triggers a warning there;
# the input width is fixed by the explicit model.build(...) call below.
model.add(Embedding(total_words,100))
# return_sequences=True so the second LSTM receives the full sequence.
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
# One output unit per vocabulary entry; softmax yields next-word probabilities.
model.add(Dense(total_words,activation='softmax'))

# Each sample is the context only, i.e. the padded sequence minus the target word.
model.build(input_shape=(None, max_sequence_len - 1))

# categorical_crossentropy matches the one-hot encoded labels.
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()



In [None]:
# Train for 100 epochs, evaluating on the held-out split after every epoch.
history = model.fit(
    x_train, y_train,
    epochs=100,
    validation_data=(x_test, y_test),
    verbose=1,
)
# Backward-compatible alias for the original (misspelled) variable name.
histroy = history

Epoch 1/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 73ms/step - accuracy: 0.0319 - loss: 7.1545 - val_accuracy: 0.0344 - val_loss: 6.7363
Epoch 2/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 68ms/step - accuracy: 0.0357 - loss: 6.4725 - val_accuracy: 0.0439 - val_loss: 6.8222
Epoch 3/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 71ms/step - accuracy: 0.0435 - loss: 6.2835 - val_accuracy: 0.0482 - val_loss: 6.8716
Epoch 4/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 70ms/step - accuracy: 0.0548 - loss: 6.1485 - val_accuracy: 0.0505 - val_loss: 6.8848
Epoch 5/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 68ms/step - accuracy: 0.0546 - loss: 6.0000 - val_accuracy: 0.0544 - val_loss: 6.9056
Epoch 6/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 68ms/step - accuracy: 0.0643 - loss: 5.8668 - val_accuracy: 0.0600 - val_loss: 6.9512
Epoch 7/10

accuracy: 0.6781 - loss: 1.3928 - val_accuracy: 0.0492 - val_loss: 14.1143

# Predicting The Next Word

In [None]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
  """Return the word the model considers most likely to follow `text`.

  Args:
    model: Trained model whose `predict` accepts a batch of padded id
      sequences of width `max_sequence_len - 1`.
    tokenizer: Fitted tokenizer providing `texts_to_sequences` and
      `word_index` (and normally `index_word`).
    text: Seed text to continue.
    max_sequence_len: Sequence length used during training, target word
      included; the model input is one token shorter.

  Returns:
    The predicted word, or None when the predicted index has no vocabulary
    entry (e.g. index 0, which is the padding value).
  """
  token_list = tokenizer.texts_to_sequences([text])[0]
  # Keep only the most recent tokens that fit into the model's input window.
  if len(token_list) >= max_sequence_len:
    token_list = token_list[-(max_sequence_len-1):]
  token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
  prediction = model.predict(token_list,verbose=0)
  predicted_word_index = int(np.argmax(prediction,axis=1)[0])
  # Use the tokenizer's reverse mapping for an O(1) lookup instead of scanning
  # word_index; fall back to building it for tokenizers that do not expose it.
  index_to_word = getattr(tokenizer, 'index_word', None)
  if not index_to_word:
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
  return index_to_word.get(predicted_word_index)

In [None]:
input_text = "to be or not to be"
# Derive the training sequence length from the model itself: its input width
# is the sequence length minus the target word.
max_sequence_len = 1 + model.input_shape[1]
predicted_word = predict_next_word(
    model, tokenizer, input_text, max_sequence_len
)
print(f"Input Text: {input_text}")
print(f"Predicted Word: {predicted_word}")

Input Text: to be or not to be
Predicted Word: buried


# Saving the Model

In [None]:
import pickle

# Persist the trained network to an HDF5 file.
model.save('next_word_lstm.h5')

# Persist the fitted tokenizer alongside it so the same word <-> id mapping
# can be restored at inference time.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
  pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)



In [None]:
! pip install streamlit -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Print this machine's public IPv4 address (it matches the "External URL"
# host that Streamlit reports when the app starts).
# NOTE(review): presumably needed as the localtunnel access password -- confirm.
!wget -q -O - ipv4.icanhazip.com

34.125.171.225


In [None]:
# Start the Streamlit app in the background and expose its port (8501)
# through a localtunnel public URL.
# NOTE(review): app.py is not created by any cell visible here; it must
# already exist in the working directory.
! streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.171.225:8501[0m
[0m
[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0Kyour url is: https://hip-oranges-think.loca.lt
2024-12-11 17:50:00.703135: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-11 17:50:00.729557: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-11 17:50:00.737415: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS fa