# Setting Up

In [1]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets trl

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m110.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.1/362.1 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.0/348.0 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import os
import json
import torch
import time
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from functools import partial
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# Initialize Model

In [2]:
max_seq_length = 1024
device = "cuda"

In [4]:
model_name = "Qwen/Qwen3-0.6B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

# Data Preparation

In [5]:
with open("processed.json", 'rb') as f:
    data = json.load(f)

In [6]:
data[0]

'Question: (LCA) I would like to open an account with my son, do u have any product for kids?\nAnswer: Main Yes our product is Little Champs Account. It is designed specifically for minors (individuals below the age of 18 years). A child requires the help of a parental/legal guardian to open this account and avail its facilities. Little Champs get a Debit Card and chequebook which is free the first time What are the main Features of the Little Champs Account. Minimum initial deposit of Rs.100/- Little Champs Savings A/C Free first chequebook* Profit Payment Profit Rate Free debit card* (annual/replacement fees apply). This debit card has the following limits Semi-Annually 0.19 Daily funds Transfer Limit: Rs.100,000/- Daily ATM Withdrawal Limit: Rs.25,000/- Daily POS Limit: Rs.50,000/- * For Current Account only'

In [7]:
dataset = Dataset.from_dict({"texts": data})

In [8]:
def process_batch(batch):
    return tokenizer(batch['texts'], padding=True)

dataset = dataset.map(
    process_batch,
    batched=True,
)

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

In [9]:
dataset = dataset.shuffle()

# Fine-Tuning

In [10]:
model

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwe

In [11]:
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'o_proj',
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model.gradient_checkpointing_enable()

model = get_peft_model(model, lora_config)

In [12]:
training_args = TrainingArguments(
    output_dir = "outputs/",
    warmup_steps=1,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=1e-3,
    optim="paged_adamw_8bit",
    logging_steps=3,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    eval_strategy="no",
    do_eval=False,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
    dataloader_pin_memory=False,
    save_total_limit=3,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
)

trainer.train()

Truncating train dataset:   0%|          | 0/308 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
3,6.9357
6,0.6783
9,0.5505
12,0.6225
15,0.412
18,0.5893
21,0.4268
24,0.3778
27,0.5051
30,0.4969


TrainOutput(global_step=77, training_loss=0.728941015608899, metrics={'train_runtime': 159.7887, 'train_samples_per_second': 1.928, 'train_steps_per_second': 0.482, 'total_flos': 344839823032320.0, 'train_loss': 0.728941015608899})