In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import os
import torch

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Assign end-of-text as the padding token
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Load and process the Ramayana text data
ramayana_train_file = '/kaggle/input/ramayana-processed/ramayana train.txt'  

with open(ramayana_train_file, 'r', encoding='utf-8') as file:
    ramayana_text = file.readlines()

validation_file_path = '/kaggle/input/ramayana-processed/validation.txt'  

with open(validation_file_path, 'r', encoding='utf-8') as file:
    validation_text = file.read()

validation_dir = '/kaggle/working/validation/'  
os.makedirs(validation_dir, exist_ok=True)

validation_file_path = os.path.join(validation_dir, 'validation_ramayana.txt')
with open(validation_file_path, 'w', encoding='utf-8') as file:
    file.write(validation_text)



Downloading vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [2]:

class RamayanaDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, data):
        self.examples = tokenizer.batch_encode_plus(
            data,
            padding="max_length",
            max_length=130,
            truncation=True,
            return_tensors="pt"  # This parameter generates PyTorch tensors
        )

        # Move tensors to CUDA
        self.examples = {key: value.to('cuda') for key, value in self.examples.items()}

    # Other methods of the class...


    def __len__(self):
        return len(self.examples["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.examples["input_ids"][idx],
            "labels": self.examples["input_ids"][idx].clone()  # For language modeling tasks
        }

train_dataset = RamayanaDataset(tokenizer, ramayana_text)

In [3]:


training_args = TrainingArguments(
    output_dir="./ramayana_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    prediction_loss_only=True,
    report_to="none",
    eval_steps=500,  
    evaluation_strategy="steps"
)

In [4]:

import pickle
#pickle.dump(rf,open("rf.h5","wb"))

model = pickle.load(open("/kaggle/input/samskrta/samskrta.h5","rb"))

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=train_dataset,
    eval_dataset=RamayanaDataset(tokenizer, [validation_text])  # Use the validation text as a single example
)

In [5]:
import torch

In [None]:
# Move input_ids tensor to CUDA

In [6]:
text_prompt =  "तपःस्वाध्यायनिरतं "

# Tokenize the text prompt
input_ids = tokenizer.encode(text_prompt, return_tensors="pt")
input_ids = input_ids.to('cuda')

# Ensure attention_mask is set for reliable results
attention_mask = torch.ones_like(input_ids)

# Generate predictions
with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=130,
        temperature=1.35,
        pad_token_id=tokenizer.eos_token_id  # Set pad_token_id to eos_token_id
    )



In [7]:
# Convert the generated output tensor to a list of integers
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print(output_text)

तपःस्वाध्यायनिरतं ्यं सर्वं समरे स्थितम्

यथा सह सीतां विश्रम्

यम्

यम्

यम्



In [None]:

#Downloaded the trained model with its weights
import pickle

Pkl_Filename = "samskrta ocr.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(model, file)

 from IPython.display import FileLink  

FileLink(r'samskrta.h5')