In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import os
import torch




In [2]:
# Load the pretrained GPT-2 checkpoint: tokenizer first, then the LM-head model.
MODEL_NAME = "gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
# Explicitly clear the mask token.
# NOTE(review): presumably a no-op here, since the collator further down is
# built with mlm=False — confirm whether this line is still needed.
tokenizer.mask_token = None

model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [3]:

# Read the whole Ramayana corpus into memory as a single UTF-8 string.
ramayana_text_file = '/kaggle/input/sampurna-ramayanam/ramayana.txt'  # Replace with your dataset file path

with open(ramayana_text_file, mode='r', encoding='utf-8') as corpus_in:
    ramayana_text = corpus_in.read()


In [4]:

# Directory that will hold the training copy of the corpus.
# It is created if missing; an already-existing directory is not an error.
training_dir = '/kaggle/working/' # Replace with your desired directory
os.makedirs(name=training_dir, exist_ok=True)


In [5]:

# Write the in-memory corpus to a file under training_dir so the dataset can
# be built from that path.
# NOTE(review): presumably the copy exists because TextDataset caches features
# next to its input file and the Kaggle input directory is read-only — confirm.
training_file_path = os.path.join(training_dir, 'ramayana.txt')
with open(training_file_path, mode='w', encoding='utf-8') as corpus_out:
    corpus_out.write(ramayana_text)


In [6]:

# Build the language-modeling dataset from the training file.
# NOTE(review): TextDataset is reported as deprecated in recent transformers
# releases — confirm against the installed version before upgrading.
BLOCK_SIZE = 128  # Adjust the block_size according to your data length

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=training_file_path,
    block_size=BLOCK_SIZE,
)




In [7]:
# Fine-tuning hyperparameters, grouped by concern.
training_args = TrainingArguments(
    # Schedule
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # Checkpointing: where checkpoints go, how often, and how many are kept.
    output_dir="./ramayana_model",
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=2,
    # Logging / reporting: no external experiment tracker is used.
    logging_dir='./logs',
    report_to="none",
    prediction_loss_only=True,
)
   


In [8]:

# Collator configured for causal language modeling (mlm=False).
lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Trainer tying together the model, the training arguments, the dataset and the collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=lm_collator,
)


In [9]:

# Fine-tune the model; keep the TrainOutput so it is still shown as the cell result.
train_result = trainer.train()

# Persist the fine-tuned weights explicitly. The recorded run stops at global
# step 8169, below save_steps=10_000, so periodic checkpointing alone may never
# write the model to disk.
trainer.save_model()  # with no argument, saves to trainer.args.output_dir
# The tokenizer was not passed to the Trainer, so store it next to the weights
# to make the output directory loadable on its own.
tokenizer.save_pretrained(trainer.args.output_dir)

train_result




Step,Training Loss
500,1.5314
1000,1.3557
1500,1.2824
2000,1.2324
2500,1.1981
3000,1.1682
3500,1.1484
4000,1.1306
4500,1.119
5000,1.1098


TrainOutput(global_step=8169, training_loss=1.1701064206115297, metrics={'train_runtime': 2468.6107, 'train_samples_per_second': 26.471, 'train_steps_per_second': 3.309, 'total_flos': 4268597280768000.0, 'train_loss': 1.1701064206115297, 'epoch': 3.0})