# Qwen3-4B + Unsloth 4-bit (QLoRA) — Banking Chatbot Fine-tune
**Generated:** 2025-09-18 09:22:08

This notebook replaces the baseline "HF Transformers + PEFT QLoRA" pipeline with **Unsloth 4‑bit QLoRA** for **Qwen/Qwen3-4B-Instruct-2507**.
It is configured for **GPUs without FlashAttention2** (e.g., **Tesla P100**) and focuses on **packing**, **length‑grouping**, and **gradient accumulation** to reduce wall time.

> Tip: If you previously trained with Hugging Face QLoRA and saw ~30h/epoch, Unsloth typically brings a noticeable speedup; exact gains depend on your GPU and I/O.

In [None]:
!pip install transformers unsloth datasets peft accelerate trl

In [None]:
!pip install bitsandbytes

In [None]:
# %% [config] Hyperparameters (carefully chosen for P100-like GPUs)
from dataclasses import dataclass

@dataclass
class CFG:
    base_model: str = "Qwen/Qwen3-4B-Instruct-2507"
    output_dir: str = "qwen3_4b_banking_unsloth_lora"
    max_seq_length: int = 1024
    train_on_inputs: bool = False
    lr: float = 1e-4
    warmup_ratio: float = 0.03
    weight_decay: float = 0.0
    num_train_epochs: float = 2.0
    per_device_train_batch_size: int = 1
    gradient_accumulation_steps: int = 16
    logging_steps: int = 10
    eval_strategy: str = "no"
    save_steps: int = 500
    save_total_limit: int = 2
    fp16: bool = True
    bf16: bool = False
    gradient_checkpointing: str = "unsloth"
    lora_r: int = 16
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    target_modules: tuple = ("q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj")
    use_rslora: bool = False
    dataset_path: str = "data/train.csv"
    text_columns: tuple = ("prompt","response")
    packing: bool = True
    group_by_length: bool = True
    max_steps: int | None = None

CFG = CFG()
print(CFG)

In [None]:
# %% [load] Model & Tokenizer via Unsloth
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported
from transformers import AutoTokenizer

# Choose the compute dtype once: bfloat16 where the GPU supports it,
# float16 otherwise. The value is passed to the loader below so the choice is
# explicit instead of being computed and then left unused.
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16

# Load the 4-bit quantised base model together with its tokenizer.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo
# to execute locally — confirm it is actually required for this checkpoint.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = CFG.base_model,
    max_seq_length = CFG.max_seq_length,
    load_in_4bit = True,
    dtype = dtype,
    device_map = "auto",
    trust_remote_code = True,
)

# Attach LoRA adapters to the attention and MLP projections listed in CFG.
# NOTE(review): CFG.lora_dropout is non-zero; Unsloth's fast LoRA path is
# presumably tuned for dropout = 0 — verify whether this costs throughput.
model = FastLanguageModel.get_peft_model(
    model,
    r = CFG.lora_r,
    lora_alpha = CFG.lora_alpha,
    lora_dropout = CFG.lora_dropout,
    target_modules = list(CFG.target_modules),
    use_rslora = CFG.use_rslora,
    bias = "none",
    random_state = 42,
    use_gradient_checkpointing = CFG.gradient_checkpointing,
)

# Pad and truncate on the right-hand side of each sequence.
tokenizer.padding_side = "right"
tokenizer.truncation_side = "right"
print("Loaded model with Unsloth 4-bit + LoRA.")

In [None]:
# %% [data] FAST PATH: load CSV -> split -> save JSONL to /kaggle/working (no heavy cleaning)

import os, random
import pandas as pd
from datasets import load_dataset, Dataset

# ========= Config =========
CSV_PATH     = "/kaggle/input/data-banking-processed/final_sua_mapped_v2.csv"  # change if your file lives elsewhere
OUT_DIR      = "/kaggle/working/prepared"
VAL_RATIO    = 0.05
TEST_RATIO   = 0.05
SEED         = 42

# Optional: stratify the split by one label column (None is the fastest).
STRATIFY_COL = None  # e.g. "category" or "intent" to enable stratification

# Use the fast hand-written template (skips tokenizer.apply_chat_template to avoid a slow Python loop).
USE_HF_CHAT_TEMPLATE = False  # set True to render with the tokenizer's own template (slower)

SYSTEM_PROMPT = "You are a helpful, safe, and precise banking financial assistant. Answer concisely."
# ==========================

os.makedirs(OUT_DIR, exist_ok=True)

# 1) Load the CSV; every column is read as a string.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV not found: {CSV_PATH}")

# Try strict UTF-8 first; if that fails, re-read while ignoring undecodable bytes.
try:
    df = pd.read_csv(CSV_PATH, dtype=str, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(CSV_PATH, dtype=str, encoding_errors="ignore")

# 2) Both columns below are mandatory; stop at the first one that is missing.
for col in ["instruction", "response"]:
    if col not in df.columns:
        raise ValueError(f"❌ Thiếu cột '{col}'. Cột hiện có: {df.columns.tolist()}")

# Minimal cleaning: drop rows with a missing or very short instruction/response.
df = df.dropna(subset=["instruction", "response"])
df = df[(df["instruction"].str.len() >= 3) & (df["response"].str.len() >= 5)].copy()

# Fail fast instead of writing empty JSONL files that would only break later steps.
if df.empty:
    raise ValueError(
        "No usable rows left after dropping empty or too-short "
        "'instruction'/'response' pairs."
    )

# 3) Tạo text theo template nhanh (không gọi tokenizer để tránh overhead vòng lặp Python)
def fast_template(u: str, a: str) -> str:
    return f"<|system|>\n{SYSTEM_PROMPT}\n<|user|>\n{u}\n<|assistant|>\n{a}"

# Build the "text" column: one fully formatted conversation per row.
if not USE_HF_CHAT_TEMPLATE:
    # Fast path: plain string formatting.
    df["text"] = [
        fast_template(question, answer)
        for question, answer in zip(df["instruction"], df["response"])
    ]
else:
    # Slow path: render every conversation with the tokenizer's chat template.
    # The tokenizer must already be loaded by an earlier cell.
    rendered = []
    for question, answer in zip(df["instruction"].tolist(), df["response"].tolist()):
        conversation = [
            {"role":"system","content":SYSTEM_PROMPT},
            {"role":"user","content":str(question)},
            {"role":"assistant","content":str(answer)},
        ]
        rendered.append(
            tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
        )
    df["text"] = rendered

# 4) Shuffle once, then split (random by default, stratified when a column is configured).
random.seed(SEED)
df = df.sample(frac=1.0, random_state=SEED).reset_index(drop=True)
n = len(df)
n_val  = int(round(n * VAL_RATIO))
n_test = int(round(n * TEST_RATIO))
n_train = max(0, n - n_val - n_test)

stratified = bool(STRATIFY_COL) and STRATIFY_COL in df.columns and df[STRATIFY_COL].nunique() >= 2

if not stratified:
    # Contiguous slices of the shuffled frame: train, then validation, then test.
    val_end = n_train + n_val
    train_idx = list(range(n_train))
    val_idx   = list(range(n_train, val_end))
    test_idx  = list(range(val_end, val_end + n_test))
else:
    # Lightweight stratification: apply the same ratios inside every group.
    train_idx, val_idx, test_idx = [], [], []
    for _, group in df.groupby(STRATIFY_COL, dropna=False):
        shuffled = group.sample(frac=1.0, random_state=SEED)
        size = len(shuffled)
        take_val = int(round(size * VAL_RATIO))
        take_test = int(round(size * TEST_RATIO))
        take_train = max(0, size - take_val - take_test)
        labels = list(shuffled.index)
        train_idx.extend(labels[:take_train])
        val_idx.extend(labels[take_train:take_train + take_val])
        test_idx.extend(labels[take_train + take_val:take_train + take_val + take_test])
    # Safety net: any row that ended up in no split goes to train.
    assigned = set(train_idx) | set(val_idx) | set(test_idx)
    train_idx.extend(i for i in range(n) if i not in assigned)

train_df = df.iloc[train_idx].copy()
val_df   = df.iloc[val_idx].copy()
test_df  = df.iloc[test_idx].copy()

print(f"Split sizes -> train={len(train_df)}, val={len(val_df)}, test={len(test_df)}")

# 5) Write each split as JSONL under OUT_DIR; every line is {"text": "..."}.
train_path = os.path.join(OUT_DIR, "train.jsonl")
val_path   = os.path.join(OUT_DIR, "val.jsonl")
test_path  = os.path.join(OUT_DIR, "test.jsonl")

for frame, destination in ((train_df, train_path), (val_df, val_path), (test_df, test_path)):
    frame[["text"]].to_json(destination, orient="records", lines=True, force_ascii=False)

print("Saved:", train_path, val_path, test_path, sep="\n")

# 6) Reload the files with `datasets` as a quick check; the trainer consumes these splits.
from datasets import load_dataset
split_files = {"train": train_path, "validation": val_path, "test": test_path}
ds = load_dataset("json", data_files=split_files)
train_ds = ds["train"]
val_ds = ds["validation"]
test_ds = ds["test"]
print(train_ds, val_ds, test_ds, sep="\n")


In [None]:
# %% [train] TRL SFTTrainer with packing (padding-free) — do NOT pass a data_collator
from trl import SFTTrainer, SFTConfig

# Mixed-precision flavour must match the dtype the model was loaded in:
# bfloat16 on GPUs that support it, float16 otherwise (e.g. P100).
# CFG.fp16 / CFG.bf16 decide whether mixed precision is requested at all;
# the hardware check decides which flavour is used.
want_mixed_precision = CFG.fp16 or CFG.bf16
bf16_ok = is_bfloat16_supported()

train_args = SFTConfig(
    output_dir = CFG.output_dir,
    # When a step budget is set, max_steps drives the run and epochs is a placeholder.
    num_train_epochs = CFG.num_train_epochs if CFG.max_steps is None else 1.0,
    max_steps = CFG.max_steps if CFG.max_steps is not None else -1,
    learning_rate = CFG.lr,
    warmup_ratio = CFG.warmup_ratio,
    lr_scheduler_type = "cosine",
    per_device_train_batch_size = CFG.per_device_train_batch_size,
    gradient_accumulation_steps = CFG.gradient_accumulation_steps,
    weight_decay = CFG.weight_decay,
    logging_steps = CFG.logging_steps,
    save_steps = CFG.save_steps,
    save_total_limit = CFG.save_total_limit,
    fp16 = want_mixed_precision and not bf16_ok,
    bf16 = want_mixed_precision and bf16_ok,
    packing = CFG.packing,          # taken from the config (default True) instead of hard-coded
    dataset_text_field = "text",    # each record is a single pre-formatted 'text' field
    dataset_num_proc = 2,
    group_by_length = CFG.group_by_length,
    report_to = "none",
    eval_strategy = "steps",        # periodic evaluation on the validation split
    eval_steps = 500,               # adjust as needed
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_ds,       # splits reloaded from the saved JSONL files
    eval_dataset = val_ds,          # used by the periodic evaluation above
    args = train_args,
    # No data_collator is passed on purpose: the trainer handles packed batches itself.
)

trainer.train()

# Persist the trained adapter and the tokenizer side by side.
trainer.save_model(CFG.output_dir)
tokenizer.save_pretrained(CFG.output_dir)
print("Training complete.")


In [None]:
# %% [export] Merge LoRA into base weights (optional) and save
# Optional export: fold the LoRA weights into the base model and save in 16-bit.
# Best-effort by design — a failure is reported but does not stop the notebook.
merge_dir = f"{CFG.output_dir}_merged"
try:
    model.save_pretrained_merged(merge_dir, tokenizer=tokenizer, save_method="merged_16bit")
except Exception as err:
    print("Merge step skipped or failed:", err)
else:
    print("Merged model saved to:", merge_dir)

## Notes & Rationale
- **LoRA**: `r=16`, `alpha=16`, `dropout=0.05` balance capacity & stability for noisy mixed‑domain banking QA.  
- **Target modules**: `q/k/v/o` + `gate/up/down` (MLP) are standard for Qwen3.  
- **Packing & group-by-length**: reduce padding overhead on short Q/A turns.  
- **Max sequence**: 1024 covers most banking interactions; raise if needed.  
- **P100**: FA2 not required; speed gains come from Unsloth kernels + packing + accumulation.  
- **Time budget**: set `CFG.max_steps` (e.g., 8000–30000) to cap wall time; when it is set, training runs for that many optimizer steps instead of `num_train_epochs`.