In [20]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [21]:
with open("dataset.txt","r", encoding="utf-8") as f:
    corpus = f.read().splitlines()

In [22]:
corpus

['The sun sets behind the mountains as the sky turns orange.',
 'I love reading books on rainy evenings with a cup of tea.',
 'Artificial intelligence is transforming the world rapidly.',
 'She opened the door and found a letter waiting for her.',
 'Life is full of surprises and unexpected journeys.',
 'The cat jumped onto the table and knocked over a glass.',
 'He dreamed of traveling to distant galaxies someday.',
 'Music has the power to heal and bring people together.',
 'Do you know where I left my keys yesterday?',
 'Once upon a time, there was a brave little girl.',
 'The train arrived at the station ten minutes late.',
 'Happiness often comes from the simplest of things.',
 'The river flowed quietly under the wooden bridge.',
 'Technology evolves faster than most people can adapt.',
 'She couldn’t believe her eyes when she saw the results.',
 'Have you ever wondered why the sky is blue?',
 'They were walking home when the rain suddenly started.',
 'A dog barked loudly in the mi

In [23]:
tokenizer = Tokenizer()

In [24]:
tokenizer.fit_on_texts(corpus)
tokenizer.word_index

{'the': 1,
 'a': 2,
 'and': 3,
 'is': 4,
 'of': 5,
 'she': 6,
 'to': 7,
 'with': 8,
 'he': 9,
 'in': 10,
 'they': 11,
 'for': 12,
 'was': 13,
 'as': 14,
 'are': 15,
 'you': 16,
 'at': 17,
 'can': 18,
 'every': 19,
 'new': 20,
 'it': 21,
 'sky': 22,
 'on': 23,
 'time': 24,
 'when': 25,
 'i': 26,
 'love': 27,
 'world': 28,
 'life': 29,
 'full': 30,
 'people': 31,
 'from': 32,
 'night': 33,
 'stars': 34,
 'after': 35,
 'that': 36,
 'into': 37,
 'waiting': 38,
 'her': 39,
 'my': 40,
 'saw': 41,
 'rain': 42,
 'day': 43,
 'his': 44,
 'dark': 45,
 'house': 46,
 'filled': 47,
 'small': 48,
 'books': 49,
 'music': 50,
 'has': 51,
 'together': 52,
 'left': 53,
 'dog': 54,
 'be': 55,
 'old': 56,
 'but': 57,
 'clouds': 58,
 'one': 59,
 'built': 60,
 'child': 61,
 'big': 62,
 'work': 63,
 'bus': 64,
 'friend': 65,
 'we': 66,
 'place': 67,
 'good': 68,
 'make': 69,
 'sun': 70,
 'behind': 71,
 'orange': 72,
 'reading': 73,
 'opened': 74,
 'door': 75,
 'found': 76,
 'surprises': 77,
 'cat': 78,
 'trav

In [25]:
# Converting text to sequences
sequences = tokenizer.texts_to_sequences(corpus)
sequences

[[1, 70, 160, 71, 1, 161, 14, 1, 22, 162, 72],
 [26, 27, 73, 49, 23, 163, 164, 8, 2, 165, 5, 166],
 [167, 168, 4, 169, 1, 28, 170],
 [6, 74, 1, 75, 3, 76, 2, 171, 38, 12, 39],
 [29, 4, 30, 5, 77, 3, 172, 173],
 [1, 78, 174, 175, 1, 176, 3, 177, 178, 2, 179],
 [9, 180, 5, 79, 7, 181, 182, 183],
 [50, 51, 1, 184, 7, 185, 3, 186, 31, 52],
 [187, 16, 80, 81, 26, 53, 40, 188, 189],
 [190, 191, 2, 24, 192, 13, 2, 193, 194, 195],
 [1, 196, 197, 17, 1, 198, 199, 200, 201],
 [82, 202, 83, 32, 1, 203, 5, 84],
 [1, 204, 205, 206, 207, 1, 208, 209],
 [210, 211, 212, 85, 86, 31, 18, 213],
 [6, 214, 215, 39, 216, 25, 6, 41, 1, 217],
 [87, 16, 218, 219, 220, 1, 22, 4, 221],
 [11, 222, 223, 88, 25, 1, 42, 224, 89],
 [2, 54, 225, 226, 10, 1, 227, 5, 1, 90, 33],
 [1, 28, 4, 30, 5, 228, 38, 7, 55, 91],
 [229, 27, 7, 230, 231, 3, 232, 1, 233],
 [9, 92, 93, 19, 43, 7, 94, 44, 95],
 [1, 34, 234, 96, 10, 1, 45, 33, 22],
 [97, 2, 20, 98, 18, 99, 235, 7, 20, 236],
 [6, 100, 25, 6, 41, 39, 56, 101, 35, 102],
 [

In [26]:
input_sequences = []

for seq in sequences:
    for i in range(1, len(seq)):
        n_gram_seq = seq[:i+1]
        input_sequences.append(n_gram_seq)
input_sequences

[[1, 70],
 [1, 70, 160],
 [1, 70, 160, 71],
 [1, 70, 160, 71, 1],
 [1, 70, 160, 71, 1, 161],
 [1, 70, 160, 71, 1, 161, 14],
 [1, 70, 160, 71, 1, 161, 14, 1],
 [1, 70, 160, 71, 1, 161, 14, 1, 22],
 [1, 70, 160, 71, 1, 161, 14, 1, 22, 162],
 [1, 70, 160, 71, 1, 161, 14, 1, 22, 162, 72],
 [26, 27],
 [26, 27, 73],
 [26, 27, 73, 49],
 [26, 27, 73, 49, 23],
 [26, 27, 73, 49, 23, 163],
 [26, 27, 73, 49, 23, 163, 164],
 [26, 27, 73, 49, 23, 163, 164, 8],
 [26, 27, 73, 49, 23, 163, 164, 8, 2],
 [26, 27, 73, 49, 23, 163, 164, 8, 2, 165],
 [26, 27, 73, 49, 23, 163, 164, 8, 2, 165, 5],
 [26, 27, 73, 49, 23, 163, 164, 8, 2, 165, 5, 166],
 [167, 168],
 [167, 168, 4],
 [167, 168, 4, 169],
 [167, 168, 4, 169, 1],
 [167, 168, 4, 169, 1, 28],
 [167, 168, 4, 169, 1, 28, 170],
 [6, 74],
 [6, 74, 1],
 [6, 74, 1, 75],
 [6, 74, 1, 75, 3],
 [6, 74, 1, 75, 3, 76],
 [6, 74, 1, 75, 3, 76, 2],
 [6, 74, 1, 75, 3, 76, 2, 171],
 [6, 74, 1, 75, 3, 76, 2, 171, 38],
 [6, 74, 1, 75, 3, 76, 2, 171, 38, 12],
 [6, 74, 1, 7

In [27]:
# Pad all sequences to the same length
max_seq_len = max(len(x) for x in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

In [28]:
# Split predictors (X) and label (y)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [29]:
vocab_size = len(tokenizer.word_index) + 1

In [30]:
# Convert y to categorical (one-hot encoding)
from keras.utils import to_categorical
y = to_categorical(y, num_classes=vocab_size)

In [31]:
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (1035, 11)
Shape of y: (1035, 570)


In [32]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(vocab_size, 50, input_shape=(max_seq_len-1,)))
model.add(LSTM(150))
model.add(Dense(vocab_size, activation='softmax'))


In [33]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [35]:
model.fit(X,y,epochs=100)

Epoch 1/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1913 - loss: 3.8297
Epoch 2/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1981 - loss: 3.7020
Epoch 3/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2261 - loss: 3.5655
Epoch 4/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2454 - loss: 3.4390
Epoch 5/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.2773 - loss: 3.3277
Epoch 6/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2986 - loss: 3.2138
Epoch 7/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.3256 - loss: 3.0957
Epoch 8/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3459 - loss: 2.9877
Epoch 9/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x209526ab250>

In [36]:
import numpy as np
text1 = "The mountain"

for i in range(10):
    token_text = tokenizer.texts_to_sequences([text1])[0]

    padded_token_text = pad_sequences([token_text], maxlen=vocab_size,padding='pre')

    pos= np.argmax(model.predict(padded_token_text))

    for word, index in tokenizer.word_index.items():
        if index==pos:
            text1 = text1 + " " + word
            print(text1)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step
The mountain peak
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
The mountain peak was
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
The mountain peak was is
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
The mountain peak was is smiling
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
The mountain peak was is smiling but
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
The mountain peak was is smiling but tall
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
The mountain peak was is smiling but tall against
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
The mountain peak was is smiling but tall against the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
The mountain peak was is smiling but tall against the lesson
[1m1/1[

In [37]:
text2 = "He smiled politely"

for i in range(6):
    token_text = tokenizer.texts_to_sequences([text2])[0]

    padded_token_text = pad_sequences([token_text], maxlen=vocab_size,padding='pre')

    pos= np.argmax(model.predict(padded_token_text))

    for word, index in tokenizer.word_index.items():
        if index==pos:
            text2 = text2 + " " + word
            print(text2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
He smiled politely and
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
He smiled politely and traveling
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
He smiled politely and traveling to
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
He smiled politely and traveling to visit
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
He smiled politely and traveling to visit to
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
He smiled politely and traveling to visit to find


In [38]:
text3 = "the dog"
for i in range(3):
    token_text = tokenizer.texts_to_sequences([text3])[0]

    padded_token_text = pad_sequences([token_text], maxlen=vocab_size,padding='pre')

    pos= np.argmax(model.predict(padded_token_text))

    for word, index in tokenizer.word_index.items():
        if index==pos:
            text3 = text3 + " " + word
            print(text3)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
the dog wagged
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
the dog wagged its
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
the dog wagged its tail


In [39]:
text4 = "A butterfly"
for i in range(8):
    token_text = tokenizer.texts_to_sequences([text4])[0]

    padded_token_text = pad_sequences([token_text], maxlen=vocab_size,padding='pre')

    pos= np.argmax(model.predict(padded_token_text))

    for word, index in tokenizer.word_index.items():
        if index==pos:
            text4 = text4 + " " + word
            print(text4)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
A butterfly landed
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
A butterfly landed a
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
A butterfly landed a house
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
A butterfly landed a house at
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
A butterfly landed a house at the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
A butterfly landed a house at the bus
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
A butterfly landed a house at the bus every
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
A butterfly landed a house at the bus every morning
