In [3]:
!pip install datasets
from datasets import load_dataset
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [5]:
# Fetch the raw WikiText-2 dataset.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Keep only the first 1000 lines of the training split and merge them into a
# single newline-separated string that the later cells tokenize.
train_lines = dataset["train"]["text"][:1000]
text = "\n".join(train_lines)

Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at C:\Users\Shob raj\.cache\huggingface\datasets\wikitext\wikitext-2-raw-v1\0.0.0\b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Tue Jan 14 18:41:48 2025).


In [6]:
# Tokenize the corpus, keeping only the most frequent words.
# With num_words=N the tokenizer only emits indices in [1, N-1]; rarer words are
# dropped by texts_to_sequences.
tokenizer=Tokenizer(num_words=10000)
tokenizer.fit_on_texts([text])
# Number of output classes (index 0 is reserved for padding).
# word_index lists *every* word seen and ignores num_words, so cap the size at
# num_words: larger indices can never appear in a sequence, and sizing the
# softmax layer / one-hot labels for them only wastes memory and parameters.
total_words=min(tokenizer.num_words,len(tokenizer.word_index)+1)

In [10]:
# Build the training n-grams: for every line of the corpus, emit each prefix of
# its token list that has at least two tokens (context + one target token).
input_sequences = [
    line_tokens[:prefix_len]
    for line_tokens in (
        tokenizer.texts_to_sequences([raw_line])[0]
        for raw_line in text.split("\n")
    )
    for prefix_len in range(2, len(line_tokens) + 1)
]

In [11]:
# Common length of every training sequence (context tokens plus the final target token).
max_sequence_len = 20

# padding='pre' puts the padding in front, so the target token stays in the last column.
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)


In [17]:
# Split each padded sequence into model input and target:
# every column except the last is the context, the last column is the word to predict.
predictors = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the target indices over the vocabulary for categorical cross-entropy.
labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [18]:
# Next-word model: token embedding -> LSTM -> softmax over the vocabulary.
model = Sequential()
# 50-dimensional embedding for each of the total_words token indices.
model.add(Embedding(total_words, 50, input_length=max_sequence_len - 1))
# Single recurrent layer with 100 units; only its final output is passed on.
model.add(LSTM(100))
# One probability per vocabulary index.
model.add(Dense(total_words, activation='softmax'))

In [19]:
# Configure training: one-hot targets -> categorical cross-entropy, Adam at lr=0.01.
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs and keep the per-epoch metrics in `history`.
history = model.fit(
    predictors,
    labels,
    epochs=5,
    verbose=1,
)

Epoch 1/5
[1m1424/1424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 19ms/step - accuracy: 0.0826 - loss: 7.3351
Epoch 2/5
[1m1424/1424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 19ms/step - accuracy: 0.1380 - loss: 6.2009
Epoch 3/5
[1m1424/1424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 19ms/step - accuracy: 0.1815 - loss: 5.2855
Epoch 4/5
[1m1424/1424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 19ms/step - accuracy: 0.2349 - loss: 4.3832
Epoch 5/5
[1m1424/1424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 19ms/step - accuracy: 0.3068 - loss: 3.6444


In [20]:
import numpy as np

In [26]:
def generate_next_word(model,tokenizer,input_text,max_sequence_len=10):
    """Predict the single most likely word to follow ``input_text``.

    The text is converted to token indices, cut to the most recent
    ``max_sequence_len - 1`` tokens, left-padded with zeros to exactly that
    length and passed to ``model.predict`` as a batch of one.

    Args:
        model: Object with a ``predict(batch)`` method returning one score per
            vocabulary index for the single input row.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``index_word`` mapping from index to word.
        input_text: Prompt whose continuation is predicted. May be empty, in
            which case the model sees an all-padding context.
        max_sequence_len: Total sequence length (context + target) the model
            was trained with; the context fed to the model has
            ``max_sequence_len - 1`` tokens. The default of 10 is kept for
            backward compatibility; pass the length that was used to build the
            training sequences.

    Returns:
        The predicted word, or ``""`` if the winning index has no entry in
        ``tokenizer.index_word``.

    Raises:
        ValueError: If ``max_sequence_len`` is smaller than 2, i.e. there is
            no room for any context token.
    """
    context_len=max_sequence_len-1
    if context_len<1:
        raise ValueError("max_sequence_len must be at least 2")

    # Keep only the most recent tokens when the prompt is longer than the context window.
    input_sequence=list(tokenizer.texts_to_sequences([input_text])[0])[-context_len:]

    # Left-pad with zeros into an integer array. Building it explicitly keeps the
    # dtype integral even for an empty prompt (np.pad on [] would yield floats).
    model_input=np.zeros((1,context_len),dtype="int64")
    if input_sequence:
        model_input[0,-len(input_sequence):]=input_sequence

    prediction=model.predict(model_input)
    scores=np.asarray(prediction).reshape(-1)

    # Index 0 is the padding value, not a word, so it has no index_word entry.
    # Exclude it from the argmax instead of failing with a KeyError when it wins.
    if scores.size>1:
        predicted_index=int(np.argmax(scores[1:]))+1
    else:
        predicted_index=int(np.argmax(scores))

    return tokenizer.index_word.get(predicted_index,"")

In [29]:
input_text="The quick brown fox was climbing"
# Pass the sequence length used for training so the prompt is padded to the same
# context size (max_sequence_len - 1 tokens) the model was fitted on; the
# function's default of 10 does not match this notebook's training setup.
predicted_word=generate_next_word(model,tokenizer,input_text,max_sequence_len=max_sequence_len)
print(f"INPUT:{input_text}")
print(f"Predicted word:{predicted_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
INPUT:The quick brown fox was climbing
Predicted word:in
