###Import The Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

###Preprocessing

In [2]:
# Define tokenizer and other parameters
tokenizer = Tokenizer()

# Function to preprocess text
def preprocess_text(text):
    return re.sub(r'[^\w\s]', '', text)


# Generator to yield input-output pairs
def text_data_generator(file_path, tokenizer, max_sequence_length):
    with open(file_path, 'r') as file:
        for line in file:
            line = preprocess_text(line.strip())  # Preprocess each line
            sequence = tokenizer.texts_to_sequences([line])[0]   #Convert the sequence into digits
            for i in range(1, len(sequence)):
                input_sequence = sequence[:i]           #Split into input and target words
                target_word = sequence[i]
                # Pad the input sequence
                input_sequence = pad_sequences([input_sequence], maxlen=max_sequence_length, padding='pre')[0]   #Pad the sequence
                yield torch.tensor(input_sequence, dtype=torch.long), torch.tensor(target_word, dtype=torch.long)

###Define The Model

In [3]:
class Next_Word_Predictor(nn.Module):
    def __init__(self, num_classes, embedding_dim=100, lstm_units=150,dropout_prob=0.5):
        super(Next_Word_Predictor, self).__init__()
        self.embedding = nn.Embedding(num_classes, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True, bidirectional=True,dropout=dropout_prob)
        self.fc = nn.Linear(lstm_units * 2, num_classes)
        self.dropout=nn.Dropout(dropout_prob)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.fc(x)
        return x

### Read the corpus and build the vocabulary

In [4]:
with open('/content/Corpus.txt', 'r') as file:
    Corpus = file.read()

# Tokenizer fits on the entire corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([Corpus])
vocab = tokenizer.word_index
num_classes = len(vocab) + 1

# Determine max sequence length based on corpus
sentences = Corpus.split('\n')
max_sequence_length = max(len(tokenizer.texts_to_sequences([preprocess_text(sentence)])[0]) for sentence in sentences)

In [5]:
print(max_sequence_length)  #Need the value during testing

19


### Initialize model, criterion, and optimizer

In [6]:
model = Next_Word_Predictor(num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



###Training Loop

In [7]:
# Training loop using the generator
def train_model(model, file_path, tokenizer, max_sequence_length, optimizer, num_epochs=10, steps_per_epoch=6000):
    for epoch in range(num_epochs):
        generator = text_data_generator(file_path, tokenizer, max_sequence_length)
        for step in range(steps_per_epoch):
            input_seq, target_word = next(generator)
            optimizer.zero_grad()
            output = model(input_seq.unsqueeze(0))  # Add batch dimension
            loss = criterion(output, target_word.unsqueeze(0))
            loss.backward()
            optimizer.step()
        print(f"Epoch: {epoch + 1}/{num_epochs}\tLoss: {loss:.4f}")

In [8]:
# Train the model
train_model(model, '/content/Corpus.txt', tokenizer, max_sequence_length, optimizer)

Epoch: 1/10	Loss: 5.2316
Epoch: 2/10	Loss: 4.2431
Epoch: 3/10	Loss: 2.4903
Epoch: 4/10	Loss: 1.0102
Epoch: 5/10	Loss: 0.9458
Epoch: 6/10	Loss: 0.0690
Epoch: 7/10	Loss: 0.6529
Epoch: 8/10	Loss: 0.1621
Epoch: 9/10	Loss: 0.1580
Epoch: 10/10	Loss: 0.2695


###Save The Model And The Vocabulary

In [9]:
torch.save(model,'/content/next_word_predictor.pth')

In [10]:
import json
# Save the vocabulary
with open('/content/vocabulary.json', 'w') as f:
    json.dump(tokenizer.word_index, f)

###Sample Code To Generate A Corpus

In [None]:
#import random
#
## Example templates for sentence structures
#templates = [
#    "The {} {} {} the {}.",  # Needs 4 arguments
#    "{} {} {} to the {}.",  # Needs 4 arguments
#    "{} {} {} a {} {}.",    # Needs 5 arguments
#    "In the {}, the {} {} {}.",  # Needs 4 arguments
#    "After the {}, the {} {} {} {}."  # Needs 5 arguments
#]
#
## Example vocabularies
#nouns = ["cat", "dog", "car", "man", "woman", "child", "building", "tree", "river", "mountain"]
#verbs = ["jumps", "runs", "drives", "flies", "walks", "sings", "dances", "writes", "reads", "talks"]
#adjectives = ["quick", "lazy", "tall", "short", "bright", "dark", "happy", "sad", "big", "small"]
#places = ["park", "city", "forest", "beach", "school", "office", "home", "village", "market", "station"]
#
## Generate sentences
#corpus = []
#for _ in range(3000):  # Generate 1000 sentences
#    template = random.choice(templates)
#    if template.count('{}') == 4:
#        # Choose 4 arguments if the template requires 4 placeholders
#        sentence = template.format(
#            random.choice(adjectives),
#            random.choice(nouns),
#            random.choice(verbs),
#            random.choice(places)
#        )
#    else:
#        # Choose 5 arguments if the template requires 5 placeholders
#        sentence = template.format(
#            random.choice(adjectives),
#            random.choice(nouns),
#            random.choice(verbs),
#            random.choice(adjectives),
#            random.choice(places)
#        )
#    corpus.append(sentence)
#
## Join sentences into a corpus
#generated_corpus = "\n".join(corpus)
#
## Save the corpus to a file
#output_file = '/content/generated_corpus.txt'
#with open(output_file, 'w') as file:
#    file.write(generated_corpus)
#
#print(f"Corpus saved to {output_file}")
