## Importing Libraries

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text  import Tokenizer
from tensorflow.keras.layers import Embedding,LSTM,Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os


In [2]:
file = open("/content/metamorphosis_clean.txt" , "r" , encoding ="utf8")
lines = []

for i in file:
  lines.append(i)

print("The First line: ", lines[0])


The First line:  ﻿One morning, when Gregor Samsa woke from troubled dreams, he found



## Cleaning the data

In [3]:
data = ""
for i in lines:
  data = ' '.join(lines)

data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data[:360]


'One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.  He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections.  The bedding was hardly able to cover it and seemed ready to slide off any moment.'

In [4]:
## replaces punctuation
import string

translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space
new_data = data.translate(translator)

new_data[:500]

'One morning  when Gregor Samsa woke from troubled dreams  he found himself transformed in his bed into a horrible vermin   He lay on his armour like back  and if he lifted his head a little he could see his brown belly  slightly domed and divided by arches into stiff sections   The bedding was hardly able to cover it and seemed ready to slide off any moment   His many legs  pitifully thin compared with the size of the rest of him  waved about helplessly as he looked    What s happened to me   he'

In [5]:
z = []

for i in data.split():
  if i not in z:
    z.append(i)

data =' '.join(z)

print(z)



### Tokenization

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:10]

[17, 53, 293, 2, 18, 729, 135, 730, 294, 8]

In [7]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

2617


In [8]:
sequences = []

for i in range(1, len(sequence_data)):
    words = sequence_data[i-1:i+1]
    sequences.append(words)

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  3889


array([[ 17,  53],
       [ 53, 293],
       [293,   2],
       [  2,  18],
       [ 18, 729],
       [729, 135],
       [135, 730],
       [730, 294],
       [294,   8],
       [  8, 731]])

In [9]:
X = []
y = []

for i in sequences:
    X.append(i[0])
    y.append(i[1])

X = np.array(X)
y = np.array(y)

In [10]:
sequence_length = 10
X_sequences = []
y_labels = []

for i in range(len(X) - sequence_length):
    X_sequences.append(X[i:i + sequence_length])
    y_labels.append(X[i + sequence_length])

X_sequences = np.array(X_sequences)
y_labels = np.array(y_labels)


from tensorflow.keras.utils import to_categorical
y_labels = to_categorical(y_labels, num_classes=vocab_size)


### Model

In [11]:
model = Sequential()
model.add(Embedding(vocab_size, 10))

model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [12]:
model.build(input_shape=(None, 10))

In [13]:
model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=0.001)
)


In [14]:
X.shape

(3889,)

In [15]:
X

array([  17,   53,  293, ..., 2615,  294,  591])

In [16]:
model.fit(X_sequences, y_labels, epochs=20)

Epoch 1/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 963ms/step - loss: 7.8758
Epoch 2/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 968ms/step - loss: 7.8538
Epoch 3/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 956ms/step - loss: 7.8058
Epoch 4/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 961ms/step - loss: 7.6810
Epoch 5/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 947ms/step - loss: 7.4382
Epoch 6/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 969ms/step - loss: 6.9085
Epoch 7/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 947ms/step - loss: 5.9836
Epoch 8/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 939ms/step - loss: 4.6427
Epoch 9/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 953ms/step - loss: 3.2131
Epoch 10/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x7de582d3be50>

array([[  17,   53,  293, ...,  730,  294,    8],
       [  53,  293,    2, ...,  294,    8,  731],
       [ 293,    2,   18, ...,    8,  731,   19],
       ...,
       [ 367,  398, 2609, ..., 2613, 2614,   20],
       [ 398, 2609, 2610, ..., 2614,   20, 2615],
       [2609, 2610, 2611, ...,   20, 2615,  294]])

In [18]:
def predict_next_word(model, tokenizer, seed_text, sequence_length=10):

    tokens = tokenizer.texts_to_sequences([seed_text])[0]

    tokens = tokens[-sequence_length:]

    from tensorflow.keras.preprocessing.sequence import pad_sequences
    padded = pad_sequences([tokens], maxlen=sequence_length)

    predicted_probs = model.predict(padded, verbose=0)
    predicted_id = predicted_probs.argmax()

    index_word = {v: k for k, v in tokenizer.word_index.items()}
    return index_word.get(predicted_id, "<UNK>")


In [33]:
seed_text = "what a strenous"
next_word = predict_next_word(model, tokenizer, seed_text)
print("Next word prediction:", next_word)


Next word prediction: future
