In [None]:
import torch
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

### Load Model

In [2]:
from transformers import AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model
model_pth = "../../models/Qwen3-0.6B"

tokenizer = AutoTokenizer.from_pretrained(model_pth,fast=True)
lora_config = LoraConfig(
            r=8,
            task_type=TaskType.CAUSAL_LM,
            lora_alpha=16,
            lora_dropout=0.01,
            # init_lora_weights='gaussian',
            target_modules=['gate_proj','up_proj','down_proj','lm_head'],
        )
model = AutoModelForCausalLM.from_pretrained(model_pth,
                                            device_map='auto',
                                            torch_dtype=torch.bfloat16,
                                            
                                            )
model.config.pad_token_id=model.config.eos_token_id
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
tokenizer.model_max_length=64
tokenizer.padding_side='left'

  from .autonotebook import tqdm as notebook_tqdm


trainable params: 3,976,192 || all params: 600,026,112 || trainable%: 0.6627


### Load Dataset

In [4]:
def filtering(examples):
    return tokenizer.encode(f"Translate English to Hindi.\nEnglish:\n{examples['en']}\n##Hindi:\n{examples['hi']}",return_tensors='pt').shape[-1]<tokenizer.model_max_length
def formatting(examples):
    prompt = f"Translate English to Hindi.\nEnglish:\n{examples['en']}\n##Hindi:\n"
    response = examples['hi']
    return dict(prompt=prompt,completion=response)

In [None]:
from datasets import load_dataset,DatasetDict
dataset=load_dataset("bajpaideeksha/english-hindi-colloquial-dataset")['train']
dataset=dataset.rename_columns({'"English Input"':"en",'"Hindi Output"':"hi"})
dataset = dataset.filter(filtering)
dataset = dataset.map(formatting,remove_columns=dataset.column_names)
dataset = dataset.train_test_split(0.1)

## Data Collator

In [6]:
from typing import List, Dict
from collections import defaultdict

class PromptResponseDataCollator:
    def __init__(self, tokenizer: AutoTokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch: List[Dict[str, str]]):
        inputs = defaultdict(list)
        labels = defaultdict(list)
        # Extract texts
        texts = [example['prompt'] + example['completion'] + self.tokenizer.eos_token for example in batch]
        prompts = [example['prompt'] for example in batch]

        for text,prompt in zip(texts,prompts):
            prompt_completion_ids=tokenizer.encode(text)
            prompt_ids=tokenizer.encode(prompt)
            assert prompt_completion_ids[:len(prompt_ids)]==prompt_ids,"Mismatch between tokenized prompt and the start of tokenized prompt+completion."

            #Set Prompt value to ignore_index value
            prompt_completion_ids[:len(prompt_ids)]=[ -100 ]*len(prompt_ids)

            #Prompots
            output = tokenizer.encode(text)
            inputs['input_ids'].append(output)
            labels['input_ids'].append(prompt_completion_ids)

        # Apply Padding
        inputs=tokenizer.pad(inputs,return_tensors='pt')
        labels=tokenizer.pad(labels,return_tensors='pt')
        
        labs=labels['input_ids']
        mask=labels['attention_mask']
        labs[mask==0]=-100
        inputs['labels']=labs

        return inputs
datacollator=PromptResponseDataCollator(tokenizer)

### Metrics

In [7]:
import evaluate
import numpy as np
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    global preds,labels
    preds, labels = eval_preds
    preds= preds.argmax(axis=-1)
    preds[labels==-100]=tokenizer.pad_token_id
    labels[labels==-100]=tokenizer.pad_token_id
    decoded_preds = tokenizer.batch_decode(labels,skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(preds,skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}


### Trainer

In [8]:
from transformers import Trainer,TrainingArguments
training_args = TrainingArguments(
    output_dir='./tmp',
    lr_scheduler_type="constant",
    num_train_epochs=10,
    # max_steps=2,
    per_device_train_batch_size=8,
    # gradient_accumulation_steps=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=10,
    logging_strategy='epoch',
    eval_strategy='epoch',
    # eval_steps=2,
    # logging_steps=2,
    bf16=True,
    remove_unused_columns=False,

)
trainer = Trainer(
    model,
    training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    data_collator=datacollator,
)
trainer.train()

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu
1,3.7063,2.213621,6.711379
2,1.8554,1.637332,16.191307
3,1.4455,1.402447,28.157392
4,1.2034,1.249496,34.754514
5,1.0617,1.151669,39.502582
6,0.9058,1.096217,43.472561
7,0.7696,1.046083,44.54307
8,0.6649,1.029133,45.971624
9,0.5898,1.012726,46.914798
10,0.5121,1.008392,48.595655


TrainOutput(global_step=730, training_loss=1.2714380865227686, metrics={'train_runtime': 111.4944, 'train_samples_per_second': 51.751, 'train_steps_per_second': 6.547, 'total_flos': 578689629652992.0, 'train_loss': 1.2714380865227686, 'epoch': 10.0})

## Test

In [10]:
eval_model = trainer.model
eval_model.eval()
prompt = f'Translate English to Hindi.\nEnglish:\n"Let’s skip the lecture today."\n##Hindi:\n'
with torch.no_grad():
    tokens=tokenizer.encode(prompt,return_tensors='pt')
    output = eval_model.generate(tokens.to(eval_model.device),do_sample=False)
    print("Prediction   :",tokenizer.batch_decode(output.cpu(),skip_special_tokens=True)[0][len(prompt):])
    print("Original     :",'"Lecture bunk karte hain aaj."')

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Prediction   : "Today lecture skip karte hain."
Original     : "Lecture bunk karte hain aaj."


## Save

In [None]:
save_path="final_model"
eval_model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)