# Swara Transformer – Hindustani Classical Music

In [ ]:
!pip install transformers torch

In [ ]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Sample demo corpus: raga name -> list of phrases, each phrase a
# space-separated string of swara symbols.
raga_data = {
    'Raga_Yaman': ["S R G M D N S'", "S' N D P M G R S"],
    'Raga_Bhairav': ["S r G M P d N S'", "S' N d P M G r S"]
}

# Flatten corpus with raga label: every phrase becomes one line prefixed
# with its '|<raga>|' tag.
lines = [
    f'|{raga}| {phrase}'
    for raga, phrases in raga_data.items()
    for phrase in phrases
]

# Tokenizer: start from the pretrained GPT-2 vocabulary and register the
# raga tags and swara symbols as dedicated tokens.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.add_special_tokens({
    'additional_special_tokens': [
        '|Raga_Yaman|', '|Raga_Bhairav|',
        'S', 'R', 'G', 'M', 'P', 'D', 'N', "S'",
        # 'r' and 'd' occur in the Raga_Bhairav phrases above; they are
        # appended last so the ids of the entries before them are unchanged.
        'r', 'd',
    ],
})
# Join with an escaped newline: a raw line break inside a single-quoted
# string literal is a syntax error.
encodings = tokenizer('\n'.join(lines), return_tensors='pt')

# Dataset
class SwaraDataset(Dataset):
    """Sliding-window language-modelling dataset over one token sequence.

    Each item is a window of ``block_size`` consecutive token ids, with
    consecutive items starting one token apart.

    ``labels`` is a copy of ``input_ids``. Hugging Face causal-LM heads such
    as ``GPT2LMHeadModel`` shift the labels by one position internally when
    computing the loss, so shifting them here as well would train the model
    to predict the token two steps ahead.
    """

    def __init__(self, encodings, block_size=16):
        """Store the token ids and the window length.

        Args:
            encodings: Mapping whose ``'input_ids'`` entry is indexable with
                ``[0]`` to obtain a 1-D sequence of token ids (for example
                tokenizer output created with ``return_tensors='pt'``).
            block_size: Number of tokens in every window.
        """
        self.input_ids = encodings['input_ids'][0]
        self.block_size = block_size

    def __len__(self):
        """Return the number of full windows, or 0 if the text is too short."""
        # Clamp at zero: len() raises ValueError on a negative result.
        return max(0, len(self.input_ids) - self.block_size + 1)

    def __getitem__(self, idx):
        """Return the window starting at token ``idx``.

        Returns:
            Dict with ``'input_ids'`` and ``'labels'``, both of length
            ``block_size`` and holding the same token ids.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``. Slicing
                alone would silently return a truncated window, and plain
                iteration over the dataset would never terminate.
        """
        if not 0 <= idx < len(self):
            raise IndexError(f'index {idx} out of range for {len(self)} windows')
        window = self.input_ids[idx:idx + self.block_size]
        # Clone so the two entries do not alias the same storage.
        return {'input_ids': window, 'labels': window.clone()}

# Wrap the tokenized corpus in the sliding-window dataset (default block size).
dataset = SwaraDataset(encodings)

# Model: load pretrained GPT-2, then resize its token embeddings to the
# tokenizer's current vocabulary size.
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

# Training: configure the run, build the Trainer, and start training.
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=2,
    logging_steps=10,
    report_to='none',
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()