In [None]:
# ================================
# Setup: Install and Import Libraries
# ================================
!pip install -q peft

import os
import torch
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
from tqdm.auto import tqdm
from torch.optim import AdamW
# Evaluation metrics
!pip install -q sacrebleu bert-score evaluate unbabel-comet

import sacrebleu
from bert_score import score as bert_score
import evaluate
from comet import download_model, load_from_checkpoint


# PEFT (LoRA)
from peft import LoraConfig, get_peft_model, TaskType


In [None]:
# Enter Access Token and rerun
# Authenticates this kernel with the Hugging Face Hub before any checkpoint
# is downloaded below.
# NOTE(review): per the huggingface_hub docs, new_session=False is expected to
# reuse a token that is already saved instead of prompting again - confirm for
# the installed version.
from huggingface_hub import login
login(new_session=False)

In [None]:
# ================================
# Load Base Model and Tokenizer
# ================================
# Checkpoint id shared by the model and its tokenizer.
ckpt = "ai4bharat/indictrans2-indic-en-1B"

# Both the model and the tokenizer of this checkpoint are implemented as custom
# code inside the Hub repository, so each loader needs trust_remote_code=True.
# Leaving it off the tokenizer makes AutoTokenizer stop for confirmation (or
# fail in a non-interactive run), which breaks Restart & Run All.
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print("Model loaded on", device)


In [None]:
# ================================
# Prepare Domain-Specific Data
# ================================

raw_data = load_dataset("atrisaxena/mini-iitb-english-hindi")
raw_data = raw_data["train"]  # use train split

# Sizes of the fine-tuning subset and of the small dev set.
TRAIN_SIZE = 1000
VAL_SIZE = 50

if len(raw_data) < TRAIN_SIZE + VAL_SIZE:
    raise ValueError(
        f"Need at least {TRAIN_SIZE + VAL_SIZE} examples for disjoint "
        f"train/dev subsets, but the split only has {len(raw_data)}."
    )

# Shuffle once with a fixed seed and cut two NON-overlapping slices out of the
# same permutation. Selecting rows 100-149 of an identically seeded shuffle
# would pick examples that are already inside the first 1000 training rows,
# so the validation loss would be measured on training data.
shuffled_data = raw_data.shuffle(seed=42)
domain_train = shuffled_data.select(range(TRAIN_SIZE))
domain_val   = shuffled_data.select(range(TRAIN_SIZE, TRAIN_SIZE + VAL_SIZE))  # small dev set



In [None]:
# ================================
# Tokenization Function (same format as Task 2)
# ================================
SRC_TAG = "hin_Deva "
TGT_TAG = "eng_Latn"
source_lang = "hi"; target_lang = "en"
max_input_length = 128; max_target_length = 128

# Label id excluded from the loss (default ignore_index of PyTorch's
# cross-entropy and default label padding value of DataCollatorForSeq2Seq).
LABEL_IGNORE_INDEX = -100


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq fine-tuning.

    Args:
        examples: Batched mapping with a "translation" entry holding a list of
            dicts keyed by ``source_lang`` and ``target_lang``.

    Returns:
        The tokenizer output for the tagged source sentences, extended with a
        "labels" entry: the tokenized targets, padded to ``max_target_length``,
        with every padding position replaced by ``LABEL_IGNORE_INDEX``.
    """
    # SRC_TAG carries a trailing space; strip both tags so the prefix is
    # exactly "<src> <tgt> " with single separators between the fields.
    src_tag, tgt_tag = SRC_TAG.strip(), TGT_TAG.strip()

    pairs = examples["translation"]
    inputs = [f"{src_tag} {tgt_tag} {ex[source_lang].strip()}" for ex in pairs]
    # NOTE(review): targets keep the language-tag prefix used by the original
    # cell - confirm this matches the target format the tokenizer expects.
    targets = [f"{tgt_tag} {ex[target_lang].strip()}" for ex in pairs]

    model_inputs = tokenizer(inputs, max_length=max_input_length,
                             truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length,
                           truncation=True, padding="max_length")

    # Labels were padded with the pad token id; mask those positions so the
    # loss is computed on real target tokens only, not on padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [LABEL_IGNORE_INDEX if token_id == pad_id else token_id for token_id in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs



In [None]:
# Apply preprocess_function to the train split and then the dev split, in
# batches, dropping the raw "translation" column from each result.
tokenized_train, tokenized_val = (
    split.map(preprocess_function, batched=True, remove_columns=["translation"])
    for split in (domain_train, domain_val)
)

In [None]:
# Collator that turns lists of tokenized examples into PyTorch tensor batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

# Training batches are reshuffled on every pass over the loader.
train_loader = DataLoader(
    tokenized_train,
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
    num_workers=2,
)

# Validation batches keep the dataset order.
val_loader = DataLoader(
    tokenized_val,
    batch_size=16,
    shuffle=False,
    collate_fn=data_collator,
    num_workers=1,
)


In [None]:
# ================================
# Configure LoRA Adapters
# ================================
from peft import LoraConfig, TaskType

# Prepare the base model for training: turn on gradient checkpointing and
# switch off the generation cache.
model.gradient_checkpointing_enable()
model.config.use_cache = False

# Adapter settings: rank 4, alpha 8 (scaling), 5% dropout, no bias training,
# adapters attached ONLY to the query and value projections.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Wrap the model with the LoRA adapters and report how many parameters train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


In [None]:
# ================================
# Training Loop with LoRA Fine-Tuning (AMP-safe)
# ================================

from torch.amp import autocast, GradScaler
from torch.optim import AdamW

# `device` falls back to CPU when no GPU is present, so mixed precision and
# loss scaling follow the actual device instead of a hard-coded "cuda".
# On CPU both are disabled and the loop runs in full precision.
use_amp = device.type == "cuda"

# Hand the optimizer only the parameters that still require gradients (the
# adapter weights); the frozen base weights never receive updates.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=2e-5, weight_decay=0.01)
scaler = GradScaler(device.type, enabled=use_amp)

num_epochs = 1
model.train()

for epoch in range(num_epochs):
    total_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        # Move every tensor of the batch to the training device
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad(set_to_none=True)

        # Mixed Precision Forward Pass (a no-op context when AMP is disabled)
        with autocast(device.type, enabled=use_amp):
            outputs = model(**batch)
            loss = outputs.loss

        # Backward pass with gradient scaling; step() and update() fall back
        # to a plain optimizer step when the scaler is disabled
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    # Mean of the per-batch training losses for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Train Loss: {avg_loss:.4f}")

    # ================================
    # Validation
    # ================================
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with autocast(device.type, enabled=use_amp):
                outputs = model(**batch)
                val_loss += outputs.loss.item()

    # Mean of the per-batch validation losses
    val_loss /= len(val_loader)
    print(f"Epoch {epoch+1} | Validation Loss: {val_loss:.4f}")

    # Back to training mode for the next epoch
    model.train()

print(" LoRA Fine-tuning complete.")


In [None]:
# Log in to the Hugging Face Hub again before uploading.
# NOTE(review): the push below presumably needs a token with write access to
# the target repository - confirm the token scope.
from huggingface_hub import login

login()  # paste your HF token

In [None]:
# Hub repository id that the model and the tokenizer are pushed to below.
repo_id = "Vir123-dev/indictrans2_hi_en_finetune_1B"


In [None]:
# Upload the LoRA-wrapped model to the Hub repository with a descriptive
# commit message.
adapter_commit_message = "LoRA fine-tuned IndicTrans2 on Domain-1 (HI-EN)"
model.push_to_hub(repo_id, commit_message=adapter_commit_message)

# Upload the tokenizer to the same repository (IMPORTANT: keeps the published
# repo loadable together with its tokenizer).
tokenizer.push_to_hub(repo_id)
