In [1]:
import re

import nltk
import numpy as np
import tensorflow as tf
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
# Fetch the Punkt tokenizer data needed by sent_tokenize / word_tokenize.
# Two resource names are requested; presumably so the notebook works on both
# newer NLTK releases ('punkt_tab') and older ones ('punkt') — TODO confirm.
# The return value of the final call is what the cell displays as its output.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from google.colab import files

# Colab-only step: opens the browser upload widget so the dataset file lands
# in the runtime's working directory.
uploaded = files.upload()

# Load the whole dataset as one lower-cased string.
with open('TGDataset.txt', 'r', encoding='utf-8') as dataset_file:
    text = dataset_file.read().lower()

# Cleanup passes, applied in order.
# NOTE(review): the second pass removes every non-letter character, including
# '.', '!' and '?', so the cleaned text carries no sentence boundaries for the
# sentence tokenizer that runs in the next cell.
cleanup_steps = (
    (r'\([^)]*\)', ''),     # drop parenthesised spans
    (r'[^a-zA-Z\s]', ''),   # keep only letters and whitespace
    (r'\s+', ' '),          # collapse whitespace runs to a single space
)
for pattern, replacement in cleanup_steps:
    text = re.sub(pattern, replacement, text)
text = text.strip()

print("Sample cleaned text:", text[:300])

Saving TGDataset.txt to TGDataset.txt
Sample cleaned text: the sun was shining brightly in the clear blue sky and a gentle breeze rustled the leaves of the tall trees people were out enjoying the beautiful weather some sitting in the park others taking a leisurely stroll along the riverbank children were playing games and laughter filled the air as the day 


In [3]:
# Split the cleaned text into sentences, then each sentence into word tokens.
# NOTE(review): `text` has already had all punctuation stripped, so
# sent_tokenize has no sentence terminators to split on and is expected to
# return the whole text as a single sentence — confirm whether that is intended.
sentences = sent_tokenize(text)
tokenized_sentences = list(map(word_tokenize, sentences))

In [4]:
# Build the training corpus: every bigram and trigram of each tokenized
# sentence, joined back into a space-separated string.
# `ngrams` comes from the notebook's import cell (it was previously imported
# twice inside this cell).
NGRAM_SIZES = (2, 3)  # bigrams and trigrams

# Iteration order matches the original nested loops: sentence, then n, then
# each n-gram, so `corpus` has the same contents in the same order.
corpus = [
    ' '.join(gram)
    for tokens in tokenized_sentences
    for n in NGRAM_SIZES
    for gram in ngrams(tokens, n)
]

In [5]:
# Build the word -> integer-index vocabulary from the n-gram strings, then
# encode every n-gram as a list of those integer indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

In [6]:
# Length of the longest encoded n-gram; every sequence is left-padded
# (padding='pre') up to this length so they stack into one 2-D array.
max_sequence_len = max(map(len, sequences))
sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_len,
    padding='pre',
)

In [7]:


# Vocabulary size is the number of known words plus one extra slot, because
# the padded sequences also contain index 0.
vocab_size = len(tokenizer.word_index) + 1

# Inputs are all but the last token of each padded sequence; the last token
# is the word to predict, one-hot encoded over the vocabulary.
X, labels = sequences[:, :-1], sequences[:, -1]
y = to_categorical(labels, num_classes=vocab_size)

In [8]:
# Next-word model: embedding -> two stacked LSTMs -> dense softmax over the
# vocabulary.
model = Sequential([
    # Declare the input shape explicitly instead of passing `input_length` to
    # Embedding: that argument is deprecated in Keras 3, and without a known
    # input shape the model is still unbuilt when summary() runs, so the
    # summary cannot report output shapes or parameter counts.
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    Embedding(input_dim=vocab_size, output_dim=128),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(128, activation='relu'),
    # One probability per vocabulary index; pairs with the one-hot labels in y.
    Dense(vocab_size, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [None]:
# Training hyperparameters, named so they are easy to find and tune.
EPOCHS = 40
BATCH_SIZE = 256

# Train on the full dataset (no validation split); verbose=2 prints one
# summary line per epoch. The returned History object is kept in `history`.
history = model.fit(
    X,
    y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=2,
)

Epoch 1/40
201/201 - 28s - 140ms/step - accuracy: 0.0485 - loss: 7.1313
Epoch 2/40
201/201 - 42s - 207ms/step - accuracy: 0.0502 - loss: 6.6527
Epoch 3/40
201/201 - 41s - 204ms/step - accuracy: 0.0522 - loss: 6.4932
Epoch 4/40
201/201 - 40s - 201ms/step - accuracy: 0.0588 - loss: 6.2838
Epoch 5/40
201/201 - 40s - 200ms/step - accuracy: 0.0647 - loss: 6.1007
Epoch 6/40
201/201 - 22s - 107ms/step - accuracy: 0.0785 - loss: 5.9134
Epoch 7/40
201/201 - 40s - 197ms/step - accuracy: 0.0908 - loss: 5.7122
Epoch 8/40
201/201 - 22s - 107ms/step - accuracy: 0.1066 - loss: 5.5165
Epoch 9/40
201/201 - 41s - 203ms/step - accuracy: 0.1180 - loss: 5.3497
Epoch 10/40
201/201 - 22s - 109ms/step - accuracy: 0.1306 - loss: 5.1938
Epoch 11/40
201/201 - 42s - 209ms/step - accuracy: 0.1422 - loss: 5.0461
Epoch 12/40
201/201 - 39s - 192ms/step - accuracy: 0.1518 - loss: 4.8967
Epoch 13/40
201/201 - 43s - 213ms/step - accuracy: 0.1634 - loss: 4.7513
Epoch 14/40
201/201 - 40s - 199ms/step - accuracy: 0.1719 - 