In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, Trainer, TrainingArguments
import os
import torch

In [3]:

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.mask_token = None  # Set the mask token to None

model = GPT2LMHeadModel.from_pretrained("gpt2")


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:

# Load and process the Ramayana text data
ramayana_text_file = '/kaggle/input/sampurna-ramayanam/ramayana.txt'

with open(ramayana_text_file, 'r', encoding='utf-8') as file:
    ramayana_text = file.read()


In [8]:

# Define the directory for training files
training_dir = '/kaggle/working/'
os.makedirs(training_dir, exist_ok=True)

In [9]:

# Save the Ramayana text as a training file
training_file_path = os.path.join(training_dir, 'ramayana.txt')
with open(training_file_path, 'w', encoding='utf-8') as file:
    file.write(ramayana_text)

In [24]:

# Create a TextDataset
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=training_file_path,
    block_size=128  # Adjust the block_size according to your data length
)

In [25]:

# Define the training arguments for causal language modeling
training_args = TrainingArguments(
    output_dir="./ramayana_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    prediction_loss_only=True,
    report_to="none"
)


In [37]:
def my_data_collator(examples):
    input_ids = [example['input_ids'] for example in examples]

    # Determine the maximum length among input_ids
    max_length = max(len(ids) for ids in input_ids)

    # Pad the input_ids directly
    padded_input_ids = [ids + [tokenizer.pad_token_id] * (max_length - len(ids)) for ids in input_ids]

    return {
        'input_ids': torch.tensor(padded_input_ids, dtype=torch.long)
    }


In [38]:
# Start the training process with the custom data collator
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=my_data_collator,
    train_dataset=train_dataset
)

In [39]:
# Train the model
trainer.train()

IndexError: too many indices for tensor of dimension 1

In [42]:
import re

# Example verses with potential non-Devanagari characters
verses = ["
    % Ramayana: Balakanda
% Last updated: Thu Oct 21 2021
% Encoding: Unicode Devanagari
%
1001001a तपःस्वाध्यायनिरतं तपस्वी वाग्विदां वरम्
1001001c नारदं परिपप्रच्छ वाल्मीकिर्मुनिपुंगवम्
1001002a को न्वस्मिन्साम्प्रतं लोके गुणवान्कश्च वीर्यवान्
1001002c धर्मज्ञश्च कृतज्ञश्च सत्यवाक्यो दृढव्रतः
1001003a चारित्रेण च को युक्तः सर्वभूतेषु को हितः
1001003c विद्वान्कः कः समर्थश्च कश्चैकप्रियदर्शनः
1001004a आत्मवान्को जितक्रोधो द्युतिमान्कोऽनसूयकः"
]

# Regular expression to filter only Devanagari characters
devanagari_pattern = re.compile("[\u0900-\u097F]+")

# Filter non-Devanagari characters from the verses
devanagari_verses = ["".join(re.findall(devanagari_pattern, verse)) for verse in verses]


SyntaxError: invalid decimal literal (2278479223.py, line 9)

In [41]:

# Display the filtered verses
for verse in devanagari_verses:
    print(verse)


तपःस्वाध्यायनिरतंतपस्वीवाग्विदांवरम्

नारदंपरिपप्रच्छवाल्मीकिर्मुनिपुंगवम्
