In [None]:
# Install NLTK
!pip install nltk

# Import required libraries
import re

import nltk
import numpy as np
import tensorflow as tf
from nltk.tokenize import sent_tokenize, word_tokenize
from tensorflow.keras.layers import Dense, Dropout, Embedding, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Fetch the Punkt tokenizer data packages (presumably what sent_tokenize /
# word_tokenize need at runtime -- confirm against the installed NLTK version).
nltk.download('punkt')
nltk.download('punkt_tab')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Upload the dataset (TGDataset.txt) manually in Colab using the file uploader

from google.colab import files
uploaded = files.upload()

# Read the text.
# Colab saves an upload as "name (n).txt" when "name.txt" already exists, so
# re-opening the hard-coded path would silently load the older copy. Decode
# the bytes that were just uploaded instead; fall back to the file on disk only
# when nothing was uploaded (e.g. the dialog was cancelled).
if uploaded:
    raw_text = next(iter(uploaded.values())).decode('utf-8')
else:
    with open('TGDataset.txt', 'r', encoding='utf-8') as f:
        raw_text = f.read()
text = raw_text.lower()

# Remove unwanted characters
text = re.sub(r'\([^)]*\)', '', text)      # drop parenthesised asides
# NOTE(review): this keeps only letters and whitespace, so sentence-ending
# punctuation is removed too; the later sent_tokenize(text) call therefore has
# no sentence boundaries left to split on -- confirm that is intended.
text = re.sub(r'[^a-zA-Z\s]', '', text)
text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace runs to one space

print("Sample cleaned text:", text[:300])



Saving TGDataset.txt to TGDataset (2).txt
Sample cleaned text: the sun was shining brightly in the clear blue sky and a gentle breeze rustled the leaves of the tall trees people were out enjoying the beautiful weather some sitting in the park others taking a leisurely stroll along the riverbank children were playing games and laughter filled the air as the day 


In [None]:
# Split the cleaned text into sentences, then split each sentence into word tokens.
sentences = sent_tokenize(text)
tokenized_sentences = list(map(word_tokenize, sentences))

In [None]:
from nltk.util import ngrams
# Training corpus: every bigram and trigram of each tokenized sentence,
# stored as a single space-joined string per n-gram.
corpus = [
    ' '.join(gram)
    for tokens in tokenized_sentences
    for n in (2, 3)
    for gram in ngrams(tokens, n)
]


In [None]:
# Learn the word -> integer index mapping from the n-gram corpus, then encode
# every n-gram string as a list of those integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)


In [None]:
# Left-pad every encoded n-gram to the longest length so they stack into one 2-D array.
max_sequence_len = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')

# One class per word in the tokenizer's index, plus one extra slot
# (presumably for id 0, which the left-padding uses -- see the Keras Tokenizer docs).
vocab_size = len(tokenizer.word_index) + 1

# Inputs are all tokens but the last; the last token is the target, one-hot encoded.
X, next_word_ids = sequences[:, :-1], sequences[:, -1]
y = to_categorical(next_word_ids, num_classes=vocab_size)


In [None]:
# Next-word model: learned word embeddings -> two stacked LSTMs -> softmax over
# the vocabulary. Layer classes come from the notebook's top import cell.
model = Sequential([
    # An explicit Input layer replaces Embedding's deprecated `input_length`
    # argument and lets summary() report real output shapes and parameter
    # counts before the first fit() call.
    Input(shape=(max_sequence_len - 1,)),
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(256, return_sequences=True),  # emit the full sequence for the next LSTM
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [None]:
# Training hyperparameters, named so they are easy to find and tune.
EPOCHS = 50
BATCH_SIZE = 256

# verbose=2 prints one summary line per epoch instead of a progress bar.
history = model.fit(X, y, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=2)


Epoch 1/50
201/201 - 27s - 136ms/step - accuracy: 0.0463 - loss: 7.1213
Epoch 2/50
201/201 - 23s - 112ms/step - accuracy: 0.0502 - loss: 6.6634
Epoch 3/50
201/201 - 39s - 192ms/step - accuracy: 0.0529 - loss: 6.5114
Epoch 4/50
201/201 - 22s - 109ms/step - accuracy: 0.0591 - loss: 6.2941
Epoch 5/50
201/201 - 41s - 206ms/step - accuracy: 0.0661 - loss: 6.1245
Epoch 6/50
201/201 - 41s - 202ms/step - accuracy: 0.0774 - loss: 5.9467
Epoch 7/50
201/201 - 41s - 205ms/step - accuracy: 0.0945 - loss: 5.7123
Epoch 8/50
201/201 - 40s - 201ms/step - accuracy: 0.1118 - loss: 5.4991
Epoch 9/50
201/201 - 41s - 204ms/step - accuracy: 0.1252 - loss: 5.3081
Epoch 10/50
201/201 - 20s - 102ms/step - accuracy: 0.1376 - loss: 5.1403
Epoch 11/50
201/201 - 20s - 101ms/step - accuracy: 0.1478 - loss: 4.9812
Epoch 12/50
201/201 - 22s - 108ms/step - accuracy: 0.1589 - loss: 4.8297
Epoch 13/50
201/201 - 20s - 101ms/step - accuracy: 0.1669 - loss: 4.6917
Epoch 14/50
201/201 - 22s - 109ms/step - accuracy: 0.1771 - 

In [1]:
def predict_next_words(seed_text, next_words=20):
    """Greedily extend ``seed_text`` with the model's most likely next words.

    Uses the notebook-level ``tokenizer``, ``model`` and ``max_sequence_len``.
    At each step the current token ids are left-padded/truncated to
    ``max_sequence_len - 1``, fed to the model, and the highest-probability
    class is taken as the next word.

    Args:
        seed_text: Text to continue. Words the tokenizer does not encode
            simply contribute no ids to the context.
        next_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, each preceded by a
        single space. Generation stops early if the predicted class id has no
        entry in ``tokenizer.index_word``.
    """
    # Encode the seed once and extend the id list as words are generated,
    # rather than re-tokenizing the whole growing string on every step.
    token_ids = list(tokenizer.texts_to_sequences([seed_text])[0])
    generated = []
    for _ in range(next_words):
        # pad_sequences keeps only the trailing max_sequence_len-1 ids.
        model_input = pad_sequences([token_ids], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(model_input, verbose=0)
        predicted = int(np.argmax(predicted_probs, axis=-1)[0])
        output_word = tokenizer.index_word.get(predicted)
        if output_word is None:
            break
        generated.append(output_word)
        token_ids.append(predicted)
    return ' '.join([seed_text] + generated)
