# QLoRA Fine-tuning with Unsloth on **FreedomIntelligence/medical-o1-reasoning-SFT**  
**Base model:** `deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B`  

**Method:** QLoRA (4-bit) via **Unsloth**  

**Goal:** Distill the dataset's reasoning style into the base model and compare generations **before** vs **after** fine-tuning.

> Run this notebook on Google Colab / Kaggle (GPU T4/A100/L4). Internet must be enabled.

## 1) Environment setup & installs

In [None]:
!pip -q install "unsloth>=2024.12.0" "torch>=2.3" --index-url https://download.pytorch.org/whl/cu121 -U
!pip -q install transformers accelerate peft trl datasets bitsandbytes evaluate rouge-score sacrebleu --upgrade
!pip -q install "xformers>=0.0.27" --index-url https://download.pytorch.org/whl/cu121 -U || true
import torch, sys, os, random
print("Torch:", torch.__version__, "CUDA:", torch.version.cuda, "GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)


## 2) Config

In [None]:
from dataclasses import dataclass

@dataclass
class CFG:
    """Central configuration for the notebook.

    The cells read these values straight off the class (``CFG.lr``), so every
    field has a default and no instance needs to be created.
    """

    # Hub id of the checkpoint that is loaded in 4-bit and adapted with LoRA.
    base_model: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    # Hub id of the SFT dataset.
    dataset_name: str = "FreedomIntelligence/medical-o1-reasoning-SFT"
    # Column names looked up on each raw row. The dataset exposes the columns
    # `Question`, `Complex_CoT` and `Response`; it has no instruction column,
    # so that lookup falls back to "" (the prompt template hard-codes the
    # instruction text anyway).
    text_field_instruction: str = "instruction"
    text_field_input: str = "Question"
    text_field_output: str = "Response"
    # Maximum sequence length used for model loading and tokenization.
    max_seq_len: int = 4096
    # Per-device batch size and gradient-accumulation steps.
    micro_batch_size: int = 2
    grad_accum_steps: int = 8
    # Optimisation schedule.
    epochs: int = 1
    lr: float = 2e-4
    # LoRA hyper-parameters.
    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.05
    weight_decay: float = 0.0
    warmup_ratio: float = 0.03
    logging_steps: int = 10
    # Upper bound on the number of rows sampled for before/after comparison.
    eval_samples: int = 64
    # Output directory for checkpoints, the adapter and the tokenizer.
    save_dir: str = "outputs_deepseek_qwen15b_med_o1_qLoRA"
    # Default generation budget for the generation helpers.
    max_new_tokens: int = 512
    # Descriptive label only; not referenced by the cells visible in this section.
    prompt_template_name: str = "med_o1_reason"
    # Dataset subset to load. "zh" matches the Chinese prompt template; the
    # dataset also offers an English subset ("en").
    dataset_config: str = "zh"
CFG


## 3) Load dataset & build SFT samples

In [None]:
from datasets import load_dataset
import random, json

# The dataset is published as several named subsets, so `load_dataset` needs a
# subset name as its second argument. `CFG.dataset_config` is used when the
# config defines it; otherwise the Chinese subset is loaded, matching the
# Chinese prompt template used for the SFT samples.
raw = load_dataset(CFG.dataset_name, getattr(CFG, "dataset_config", "zh"))
print(raw)

def build_prompt(inst, inp):
    """Render the fixed instruction template around one patient complaint.

    Args:
        inst: Accepted for signature compatibility; not used, because the
            instruction wording is hard-coded in the template.
        inp: Patient information / chief complaint, interpolated after the
            patient-information label on the second-to-last line.

    Returns:
        The prompt string, wrapped in ``[INST] ... [/INST]`` markers.
    """
    template_lines = (
        "[INST] 你是一名严谨的医学辅助决策系统。请阅读“病人信息/主诉”，进行分步推理（可列出鉴别诊断），最后给出**非诊断性的**专业建议与就医检查建议。",
        "- 要求：",
        "  1) 列出关键线索",
        "  2) 分步分析（可能机制/风险点）",
        "  3) 可能诊断方向（Top-3，含不确定性）",
        "  4) 建议下一步检查/转诊科室",
        "  5) 安全提示（不能替代医生）",
        f"病人信息/主诉：{inp}",
        "[/INST]",
    )
    return "\n".join(template_lines)

def build_response(output):
    """Return the training target for a sample.

    Identity hook: the dataset-provided answer is passed through unchanged.
    """
    response_text = output
    return response_text

def to_sft(example):
    """Convert one raw dataset row into a ``{"prompt", "response"}`` pair.

    The three source columns are looked up by the names configured on ``CFG``;
    a column that is absent from the row is treated as the empty string.
    """
    instruction, patient_input, answer = (
        example.get(column, "")
        for column in (
            CFG.text_field_instruction,
            CFG.text_field_input,
            CFG.text_field_output,
        )
    )
    prompt = build_prompt(instruction, patient_input)
    response = build_response(answer)
    return {"prompt": prompt, "response": response}

# Replace every original column of the train split with the prompt/response pair.
original_columns = raw["train"].column_names
train = raw["train"].map(to_sft, remove_columns=original_columns)
print(train[0])
print("Train size:", len(train))


In [None]:
# Eyeball the first two SFT pairs, each field clipped to 600 characters.
separator = "=" * 80
for i in (0, 1):
    row = train[i]
    print(separator)
    print(row["prompt"][:600])
    print("---")
    print(row["response"][:600])


## 4) Load base model in 4-bit & prepare QLoRA (Unsloth)

In [None]:
from unsloth import FastLanguageModel
import torch

max_seq_len = CFG.max_seq_len
dtype = None  # let unsloth decide BF16/FP16

# Load the base checkpoint quantised to 4-bit, together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = CFG.base_model,
    max_seq_length = max_seq_len,
    dtype = dtype,
    load_in_4bit = True,
)

# Attach the LoRA adapters. `get_peft_model` is the Unsloth entry point that
# takes the LoRA hyper-parameters and returns the adapter-wrapped model, so its
# result must be re-assigned. `for_training` only toggles training mode and does
# not create adapters, which leaves a purely quantised model with nothing to train.
model = FastLanguageModel.get_peft_model(
    model,
    r = CFG.lora_r,
    lora_alpha = CFG.lora_alpha,
    lora_dropout = CFG.lora_dropout,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    # Attention and MLP projections of the Qwen-style decoder block.
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)
# Padding needs a pad token. Fall back to EOS only when the tokenizer has none,
# so an existing dedicated pad token is not overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


## 5) Baseline generation (before fine-tune)

In [None]:
import random
from transformers import TextStreamer

def generate_text(prompts, max_new_tokens=CFG.max_new_tokens, temp=0.7, top_p=0.9):
    """Sample one completion per prompt from the module-level ``model``.

    Args:
        prompts: Iterable of prompt strings; each is generated on its own.
        max_new_tokens: Generation budget per prompt.
        temp: Sampling temperature.
        top_p: Nucleus-sampling threshold.

    Returns:
        A list with one decoded string per prompt, in input order. Each string
        is the decoding of the whole returned sequence with special tokens
        skipped.
    """
    model.eval()
    sampling_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temp,
        top_p=top_p,
        repetition_penalty=1.05,
        eos_token_id=tokenizer.eos_token_id,
    )
    decoded = []
    for prompt in prompts:
        encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            sequences = model.generate(**encoded, **sampling_kwargs)
        decoded.append(tokenizer.decode(sequences[0], skip_special_tokens=True))
    return decoded

# Draw the evaluation subset: at most CFG.eval_samples distinct row indices.
n_eval = min(CFG.eval_samples, len(train))
sample_idx = random.sample(range(len(train)), n_eval)
eval_prompts = [train[row]["prompt"] for row in sample_idx]
# Generations from the model before any training step, kept for comparison.
baseline_outputs = generate_text(eval_prompts, max_new_tokens=384)
print(baseline_outputs[0][:1200])


## 6) Tokenize SFT dataset

In [None]:
from datasets import Dataset
def format_example(ex):
    """Join one prompt/response pair into a single training string.

    Args:
        ex: Row with string fields ``"prompt"`` and ``"response"``.

    Returns:
        ``{"text": prompt + "\\n" + response + EOS}``.
    """
    # supervised fine-tuning pairs: prompt + response
    # The EOS token is appended explicitly so every target ends with a stop
    # signal; without it the fine-tuned model gets no signal to stop generating.
    # An absent EOS token degrades to the empty string.
    text = ex["prompt"] + "\n" + ex["response"] + (tokenizer.eos_token or "")
    return {"text": text}

# Collapse each prompt/response pair into a single "text" column.
columns_to_drop = train.column_names
train_text = train.map(format_example, remove_columns=columns_to_drop)

from transformers import AutoTokenizer
tok = tokenizer  # alias
def tokenize(examples):
    """Tokenize the ``"text"`` field and build loss labels.

    Sequences are truncated and padded to ``CFG.max_seq_len``. Labels mirror
    ``input_ids`` except at padding positions (attention mask 0), which are set
    to -100 so the language-modelling loss ignores them.

    Args:
        examples: Mapping with a ``"text"`` entry, either one string or a list
            of strings (as passed by a batched ``map``).

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    toks = tok(examples["text"], truncation=True, padding="max_length", max_length=CFG.max_seq_len)
    ids, mask = toks["input_ids"], toks["attention_mask"]
    # A single string yields flat lists; wrap them so one code path handles both.
    batched = bool(ids) and isinstance(ids[0], (list, tuple))
    if not batched:
        ids, mask = [ids], [mask]
    labels = [
        [token_id if keep else -100 for token_id, keep in zip(row_ids, row_mask)]
        for row_ids, row_mask in zip(ids, mask)
    ]
    toks["labels"] = labels if batched else labels[0]
    return toks

# Tokenize in batches, drop the raw text column, then shuffle with a fixed seed.
tokenized = train_text.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
).shuffle(seed=42)
tokenized


## 7) SFT Training (QLoRA)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
import math, os

from unsloth import is_bfloat16_supported

os.makedirs(CFG.save_dir, exist_ok=True)

# Choose mixed precision from what the GPU can actually do: bf16 where it is
# supported (e.g. A100/L4), fp16 on other CUDA devices such as the T4, and
# neither without CUDA. Tying bf16 to "CUDA is available" requests bf16 on a
# T4, which does not support it.
use_bf16 = torch.cuda.is_available() and is_bfloat16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

train_args = TrainingArguments(
    output_dir = CFG.save_dir,
    per_device_train_batch_size = CFG.micro_batch_size,
    gradient_accumulation_steps = CFG.grad_accum_steps,
    num_train_epochs = CFG.epochs,
    learning_rate = CFG.lr,
    lr_scheduler_type = "cosine",
    warmup_ratio = CFG.warmup_ratio,
    logging_steps = CFG.logging_steps,
    bf16 = use_bf16,
    fp16 = use_fp16,
    gradient_checkpointing = True,
    optim = "paged_adamw_8bit",
    weight_decay = CFG.weight_decay,
    max_grad_norm = 0.3,
    # One checkpoint at the end of every epoch; no external experiment tracker.
    save_strategy = "epoch",
    report_to = "none",
)

# Build the trainer on the pre-tokenized dataset. No text column is named and
# packing is off because tokenization already happened in the previous section.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = tokenized,
    dataset_text_field = None,  # we already tokenized
    args = train_args,
    packing = False,
)
# Run the fine-tuning loop, then persist the trainer state via save_state().
trainer.train()
trainer.save_state()
# Save PEFT adapter
trainer.model.save_pretrained(CFG.save_dir)
# The tokenizer is written next to the adapter so CFG.save_dir is self-contained.
tokenizer.save_pretrained(CFG.save_dir)
print("Saved adapter to", CFG.save_dir)


## 8) Post-training evaluation & comparison

In [None]:
# Reload model with adapter (safety)
from unsloth import FastLanguageModel
from peft import PeftModel

# Load a fresh 4-bit copy of the base checkpoint (and its tokenizer), independent
# of the `model` object that was trained above.
base_model, tok2 = FastLanguageModel.from_pretrained(
    model_name = CFG.base_model,
    max_seq_length = CFG.max_seq_len,
    dtype = None,
    load_in_4bit = True,
)
# Wrap the fresh copy with the adapter saved under CFG.save_dir. Despite its
# name, `base_model` refers to the adapter-wrapped model from here on.
base_model = PeftModel.from_pretrained(base_model, CFG.save_dir)
# Switch to evaluation mode before generating.
base_model.eval()

def generate_with(model, tokenizer, prompts, max_new_tokens=CFG.max_new_tokens):
    """Sample one completion per prompt from an explicit model/tokenizer pair.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        prompts: Iterable of prompt strings; each is generated on its own.
        max_new_tokens: Generation budget per prompt.

    Returns:
        A list with one decoded string per prompt, in input order. Each string
        is the decoding of the whole returned sequence with special tokens
        skipped.
    """

    def _sample_one(prompt):
        # Encode on the model's device and sample without tracking gradients.
        encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            sequences = model.generate(
                **encoded,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.05,
                eos_token_id=tokenizer.eos_token_id,
            )
        return tokenizer.decode(sequences[0], skip_special_tokens=True)

    return [_sample_one(prompt) for prompt in prompts]

# Generate with the adapter-wrapped model on the same prompts as the baseline.
finetuned_outputs = generate_with(base_model, tok2, eval_prompts, max_new_tokens=384)

# Show the first evaluation sample before and after, clipped to 1200 characters.
for banner, sample in (
    ("=== SAMPLE BEFORE ===", baseline_outputs[0][:1200]),
    ("\n=== SAMPLE AFTER ===", finetuned_outputs[0][:1200]),
):
    print(banner)
    print(sample)


## 9) Quantitative proxy metrics (ROUGE-L / BLEU vs dataset `response`)

In [None]:
import evaluate
import numpy as np

import re

rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")

refs = [train[i]["response"] for i in sample_idx]


def _completion_only(generated, prompt):
    """Drop the echoed prompt from the start of a decoded generation.

    The generation helpers decode the whole returned sequence, so each output
    begins with the prompt. Text that does not start with the prompt is
    returned unchanged.
    """
    return generated[len(prompt):] if generated.startswith(prompt) else generated


def _mixed_tokens(text):
    """Split text into single CJK characters and lowercase alphanumeric words."""
    return re.findall(r"[\u4e00-\u9fff]|[a-z0-9]+", text.lower())


# Score only the model's own continuation against the reference answer.
preds_before = [_completion_only(g, p) for g, p in zip(baseline_outputs, eval_prompts)]
preds_after = [_completion_only(g, p) for g, p in zip(finetuned_outputs, eval_prompts)]

# ROUGE's default tokenizer keeps only ASCII alphanumerics, which erases Chinese
# text, so a tokenizer that also emits CJK characters is supplied. Stemming
# belongs to the default tokenizer and is therefore not requested here.
rouge_before = rouge.compute(predictions=preds_before, references=refs, tokenizer=_mixed_tokens)
rouge_after  = rouge.compute(predictions=preds_after, references=refs, tokenizer=_mixed_tokens)
# sacreBLEU's "zh" tokenization separates Chinese characters and still handles
# Latin-script words.
bleu_before  = bleu.compute(predictions=preds_before, references=[[r] for r in refs], tokenize="zh")
bleu_after   = bleu.compute(predictions=preds_after, references=[[r] for r in refs], tokenize="zh")

def fmt(d):
    """Return a new dict with every value of ``d`` cast to float and rounded to 4 places."""
    rounded = {}
    for key, value in d.items():
        rounded[key] = round(float(value), 4)
    return rounded

# Headline numbers only: the "rougeL" and BLEU "score" entries, before vs after.
rouge_l_before, rouge_l_after = (
    round(result["rougeL"], 4) for result in (rouge_before, rouge_after)
)
bleu_score_before, bleu_score_after = (
    round(result["score"], 4) for result in (bleu_before, bleu_after)
)
print("ROUGE-L (before):", rouge_l_before, " | (after):", rouge_l_after)
print("BLEU (before):", bleu_score_before, " | (after):", bleu_score_after)


## 10) Export qualitative comparison samples (Markdown table)

In [None]:
import pandas as pd
# One row per evaluation sample; every text field is clipped to 2200 characters.
rows = [
    {
        "idx": i,
        "prompt": p[:2200],
        "baseline": b[:2200],
        "finetuned": a[:2200],
        "reference": r[:2200],
    }
    for i, (p, b, a, r) in enumerate(
        zip(eval_prompts, baseline_outputs, finetuned_outputs, refs)
    )
]
df = pd.DataFrame(rows)

# A raw newline ends a Markdown table row and a bare "|" starts a new cell, so
# the Markdown export is written from an escaped copy. The CSV keeps the
# untouched text.
df_md = df.copy()
for col in ("prompt", "baseline", "finetuned", "reference"):
    if col in df_md.columns:  # an empty comparison has no columns at all
        df_md[col] = (
            df_md[col]
            .str.replace("|", "\\|", regex=False)
            .str.replace("\r\n", "<br>", regex=False)
            .str.replace("\n", "<br>", regex=False)
        )
df_md.to_markdown("comparison.md", index=False)
df.to_csv("comparison.csv", index=False)
print("Saved comparison.md and comparison.csv")
df.head(3)


## 11) Try your own prompts

In [None]:
# Free-form prompt for ad-hoc testing. It uses the same [INST] ... [/INST]
# wrapper as the training prompts; edit the text between the markers to try
# other cases.
user_prompt = """[INST] 请阅读下述主诉，按**关键线索→分步分析→可能诊断方向(Top-3)→建议检查与转诊→安全提示**的结构回答：
主诉：男性，28岁，程序员，近一周熬夜后出现头晕、颈部疼痛、间歇性恶心。[/INST]"""
# Generate with the adapter-wrapped model and print the single decoded result.
print(generate_with(base_model, tok2, [user_prompt])[0])


## 12) Notes & Export

- The adapter (LoRA weights) is saved in `outputs_deepseek_qwen15b_med_o1_qLoRA/`.  
- To deploy a standalone model, load the adapter onto the base weights with `peft` and merge it (e.g. `PeftModel.merge_and_unload()`).  
- Files exported:
  - `comparison.csv` and `comparison.md` for qualitative review.
  - LoRA adapter directory with `adapter_config.json` and weight files.