In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [2]:
# Toy training corpus: five short sentences, one sentence per list entry.
corpus = [
    # "I love ..." sentences
    "I love machine learning",
    "I love deep learning",
    "I love natural language processing",
    # "... is ..." sentences
    "Deep learning is amazing",
    "Natural language processing is fun",
]

In [3]:
# Learn the word -> integer-id vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One slot more than the number of distinct words: this value sizes both the
# embedding table and the softmax output layer further down.
vocab_size = len(tokenizer.word_index)
total_words = vocab_size + 1

In [4]:
# Expand every sentence into all of its leading n-grams of length >= 2,
# e.g. [a, b, c] -> [a, b], [a, b, c]. The last id of each n-gram later
# becomes the prediction target and the ids before it become the input.
input_sequences = []
for sentence in corpus:
    token_ids = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(
        token_ids[:end] for end in range(2, len(token_ids) + 1)
    )

In [5]:
# Longest n-gram length; every sequence is left-padded up to this width so the
# final column always holds the target word.
max_seq_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_seq_len,
    padding='pre',
)

In [6]:
# Inputs: every column except the last. Targets: the last column.
X = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the integer targets to match the categorical_crossentropy loss.
y = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [7]:
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation (and thus the
# trained model's predictions) can be reproduced on a fresh kernel.
tf.keras.utils.set_random_seed(42)

# Next-word model: embedding -> GRU -> softmax over the vocabulary.
model = Sequential([
    # Declare the input width explicitly. This replaces the Embedding
    # `input_length` argument, which Keras 3 deprecates, and lets the model be
    # built immediately so `model.summary()` can report shapes and parameters.
    tf.keras.Input(shape=(max_seq_len - 1,)),
    Embedding(input_dim=total_words, output_dim=10),
    GRU(128),
    # One output unit per vocabulary slot (total_words).
    Dense(total_words, activation='softmax'),
])



In [8]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [9]:
# Train silently for 200 epochs; the returned History object is the cell's output.
model.fit(
    x=X,
    y=y,
    epochs=200,
    verbose=0,
)

<keras.src.callbacks.history.History at 0x1c0986f0090>

In [10]:
def predict_next_word(seed_text):
    """Return the word the trained model ranks most likely to follow ``seed_text``.

    The seed is converted to ids with the notebook-level ``tokenizer``, padded
    on the left to the ``max_seq_len - 1`` input width used for training, and
    passed through ``model``.

    Args:
        seed_text: Text whose continuation should be predicted.

    Returns:
        The highest-scoring vocabulary word, as a string.
    """
    token_ids = tokenizer.texts_to_sequences([seed_text])[0]
    padded = pad_sequences([token_ids], maxlen=max_seq_len - 1, padding='pre')
    # A single-row batch goes in, so only row 0 of the output is relevant.
    probabilities = model.predict(padded, verbose=0)[0]
    # Index 0 is the padding id and has no entry in ``tokenizer.index_word``;
    # rank only the real word ids so an argmax of 0 cannot raise KeyError.
    next_index = int(np.argmax(probabilities[1:])) + 1
    return tokenizer.index_word[next_index]

In [11]:
# Try the trained model on a short prompt.
seed = "I love"
print(f"Input: '{seed}'")
prediction = predict_next_word(seed)
print("Predicted next word:", prediction)

Input: 'I love'
Predicted next word: deep
