In [4]:
# ================================
# Setup: Install and Import Libraries
# ================================
!pip install -q peft

import os
import torch
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
from tqdm.auto import tqdm
from torch.optim import AdamW
# Evaluation metrics
!pip install -q sacrebleu bert-score evaluate unbabel-comet

import sacrebleu
from bert_score import score as bert_score
import evaluate
from comet import download_model, load_from_checkpoint


# PEFT (LoRA)
from peft import LoraConfig, get_peft_model, TaskType


In [5]:
# Enter Access Token and rerun
from huggingface_hub import login
login(new_session=False)

In [6]:
# ================================
# Load Base Model and Tokenizer
# ================================
ckpt = "ai4bharat/indictrans2-en-indic-1B"
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(ckpt)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print("Model loaded on", device)


The repository ai4bharat/indictrans2-en-indic-1B contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/ai4bharat/indictrans2-en-indic-1B .
 You can inspect the repository content at https://hf.co/ai4bharat/indictrans2-en-indic-1B.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Model loaded on cuda


In [23]:
# ================================
# Prepare Domain-Specific Data
# ================================

raw_data = load_dataset("LingoIITGN/PHINC")
def convert_to_translation(example):
    return {
        "translation": {
            "en": example["English_Translation"],
            "hing": example["Sentence"]
        }
    }

raw_data = raw_data.map(convert_to_translation)

raw_data = raw_data["train"]  # use train split

# 300 statements --> 10000 is taking too long and my net speed is not high(reason)
domain_train = raw_data.shuffle(seed=42).select(range(300))
domain_val   = raw_data.shuffle(seed=42).select(range(100, 150))  # small dev set

print(domain_train[0])


{'Sentence': "@205HiDeeps arey Ashuuu sir ki khaas ' Twamily ' ki member kyu ni baat krungi yar ! Kamaal krte ho ap b", 'English_Translation': "@205HiDeeps oh shishu sir's special 'twamily' member, why will I not talk! you too do amazing", 'translation': {'en': "@205HiDeeps oh shishu sir's special 'twamily' member, why will I not talk! you too do amazing", 'hing': "@205HiDeeps arey Ashuuu sir ki khaas ' Twamily ' ki member kyu ni baat krungi yar ! Kamaal krte ho ap b"}}


In [25]:
raw_data

Dataset({
    features: ['Sentence', 'English_Translation', 'translation'],
    num_rows: 13738
})

In [26]:
import torch
from datasets import Dataset

# -------------------------------
# Language tags
# -------------------------------
SRC_TAG = "eng_Latn"     # Hinglish (Latin)
TGT_TAG = "hin_Deva"     # Hindi (Devanagari)

MAX_LEN = 128

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

IndicTransForConditionalGeneration(
  (model): IndicTransModel(
    (encoder): IndicTransEncoder(
      (embed_tokens): Embedding(32322, 1024, padding_idx=1)
      (embed_positions): IndicTransSinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-17): 18 x IndicTransEncoderLayer(
          (self_attn): IndicTransAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=8192, bias=True)
          (fc2): Linear(in_features=8192, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05,

In [27]:
import re
import torch

# ----------------------------------
# Patterns
# ----------------------------------
URL_PATTERN = r"https?://\S+"
HANDLE_PATTERN = r"@\w+"

TECH_WORDS = [
    "ai/ml", "ai", "ml",
    "artificial intelligence",
    "machine learning",
    "data science",
    "deep learning"
]

SOCIAL_WORDS = [
    "really", "amazing", "awesome", "emotional",
    "touching", "bhai", "and", "sir", "madam",
    "fan", "fans", "love", "respect", "support"
]

WORD_PATTERN = r"\b(" + "|".join(map(re.escape, TECH_WORDS + SOCIAL_WORDS)) + r")\b"


# ----------------------------------
# Token protection
# ----------------------------------
def protect_tokens(text):
    protected = {}
    idx = 0

    patterns = [
        URL_PATTERN,      # URLs
        HANDLE_PATTERN,   # @handles
        WORD_PATTERN      # tech + social words
    ]

    for pattern in patterns:
        def repl(match):
            nonlocal idx
            key = f"XQZPLCH{idx}XQZ"
            protected[key] = match.group()
            idx += 1
            return key

        text = re.sub(pattern, repl, text, flags=re.IGNORECASE)

    return text, protected


def restore_tokens(text, protected):
    for k, v in protected.items():
        text = text.replace(k, v)
    return text



In [29]:
def hinglish_to_hindi_batch(
    sentences,
    model,
    tokenizer,
    batch_size=32,
    src_tag="eng_Latn",
    tgt_tag="hin_Deva",
    max_length=96
):
    hindi_outputs = []

    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]

        # Protect tokens (sentence-wise)
        safe_batch = []
        protected_maps = []

        for s in batch:
            safe_text, protected = protect_tokens(s)
            safe_batch.append(f"{src_tag} {tgt_tag} {safe_text}")
            protected_maps.append(protected)

        #  Tokenize batch
        inputs = tokenizer(
            safe_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length
        ).to(model.device)

        # Generate translations
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_length=max_length,
                num_beams=1,        # IndicTrans2 stable
                use_cache=False,   # REQUIRED
                early_stopping=True
            )

        # Decode batch
        decoded = tokenizer.batch_decode(
            output_ids,
            skip_special_tokens=True
        )

        #  Restore protected tokens (sentence-wise)
        for text, protected in zip(decoded, protected_maps):
            restored = restore_tokens(text, protected)
            hindi_outputs.append(restored)

    return hindi_outputs


In [30]:
def add_hi_column(
    examples,
    model,
    tokenizer,
    batch_size=16
):
    """
    Adds a 'hi' (Hindi) field to each translation dict
    by converting Hinglish ('hing') → Hindi using IndicTrans2.
    """

    # examples["translation"] is a LIST of dicts (batched=True)
    translations = examples["translation"]

    # 1️⃣ Collect Hinglish sentences
    hinglish_sentences = [
        t.get("hing", "") for t in translations
    ]

    # 2️⃣ Convert Hinglish → Hindi (BATCH)
    hindi_sentences = hinglish_to_hindi_batch(
        sentences=hinglish_sentences,
        model=model,
        tokenizer=tokenizer,
        batch_size=batch_size
    )

    # 3️⃣ Rebuild translation dicts with added 'hi'
    new_translations = []
    for t, hi in zip(translations, hindi_sentences):
        t_new = dict(t)   # shallow copy (safe)
        t_new["hi"] = hi
        new_translations.append(t_new)

    # 4️⃣ Return updated batch
    return {
        "translation": new_translations
    }


In [31]:
domain_train = domain_train.map(
    add_hi_column,
    batched=True,
    batch_size=64,
    keep_in_memory=True,
    fn_kwargs={
        "model": model,
        "tokenizer": tokenizer,
        "batch_size":16
    }
)


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [32]:
domain_val = domain_val.map(
    add_hi_column,
    batched=True,
    batch_size=32,
    keep_in_memory=True,
    fn_kwargs={
        "model": model,
        "tokenizer": tokenizer,
        "batch_size": 32   # translation batch size (GPU-friendly)
    }
)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [33]:
SRC_TAG = "eng_Latn"
TGT_TAG = "hin_Deva"

source_lang = "en"
target_lang = "hi"

max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    # -------- source side --------
    sources = [
        f"{SRC_TAG} {TGT_TAG} {ex[source_lang].strip()}"
        for ex in examples["translation"]
    ]

    # -------- target side (IMPORTANT) --------
    targets = [
        f"{SRC_TAG} {TGT_TAG} {ex[target_lang].strip()}"
        for ex in examples["translation"]
    ]

    # tokenize source
    model_inputs = tokenizer(
        sources,
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    # tokenize target (WITH TAGS)
    labels = tokenizer(
        targets,
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    # replace pad tokens with -100 for loss masking
    labels["input_ids"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in label]
        for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [34]:
tokenized_train = domain_train.map(
    preprocess_function,
    batched=True,
    remove_columns=domain_train.column_names
)

tokenized_val = domain_val.map(
    preprocess_function,
    batched=True,
    remove_columns=domain_val.column_names
)


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [36]:
# DataLoader setup
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")
train_loader = DataLoader(tokenized_train, batch_size=16, shuffle=True, collate_fn=data_collator, num_workers=2)
val_loader   = DataLoader(tokenized_val,   batch_size=16, shuffle=False, collate_fn=data_collator, num_workers=1)


In [37]:
# ================================
# Configure LoRA Adapters
# ================================
from peft import LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=4,                     
    lora_alpha=8,            # scaling
    lora_dropout=0.05,
    bias="none",

    # ONLY query & value projections
    target_modules=[
        "q_proj",
        "v_proj"
    ]
)
model.gradient_checkpointing_enable()
model.config.use_cache = False


# Wrap the model with LoRA adapters
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # (Optional) shows how many params are trainable


You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


trainable params: 884,736 || all params: 1,116,428,288 || trainable%: 0.0792


In [38]:
# ================================
# Training Loop with LoRA Fine-Tuning (AMP-safe)
# ================================

from torch.amp import autocast, GradScaler
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scaler = GradScaler("cuda")

num_epochs = 1
model.train()

for epoch in range(num_epochs):
    total_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        # Move batch to GPU
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad(set_to_none=True)

        # Mixed Precision Forward Pass
        with autocast("cuda"):
            outputs = model(**batch)
            loss = outputs.loss

        # Backward pass with gradient scaling
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Train Loss: {avg_loss:.4f}")

    # ================================
    # Validation
    # ================================
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with autocast("cuda"):
                outputs = model(**batch)
                val_loss += outputs.loss.item()

    val_loss /= len(val_loader)
    print(f"Epoch {epoch+1} | Validation Loss: {val_loss:.4f}")

    model.train()

print(" LoRA Fine-tuning complete.")


Training Epoch 1:   0%|          | 0/19 [00:00<?, ?it/s]

  return fn(*args, **kwargs)


Epoch 1 | Train Loss: 12.7065
Epoch 1 | Validation Loss: 12.8074
 LoRA Fine-tuning complete.


In [39]:
from huggingface_hub import login

login()  # paste your HF token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [40]:
repo_id = "Vir123-dev/indictrans2_en_hing_finetune_1B"


In [41]:
# Save LoRA adapter + push to hub
model.push_to_hub(
    repo_id,
    commit_message="LoRA fine-tuned IndicTrans2 on Domain-1 (EN-HINGLISH(converted to HI))"
)

# Save tokenizer (IMPORTANT)
tokenizer.push_to_hub(repo_id)


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/Vir123-dev/indictrans2_en_hing_finetune_1B/commit/1e34844f03f3f320f224b7bd1b666150d7ba7861', commit_message='Upload tokenizer', commit_description='', oid='1e34844f03f3f320f224b7bd1b666150d7ba7861', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Vir123-dev/indictrans2_en_hing_finetune_1B', endpoint='https://huggingface.co', repo_type='model', repo_id='Vir123-dev/indictrans2_en_hing_finetune_1B'), pr_revision=None, pr_num=None)