# Fine-tune Qwen 2.5 7B với Dataset Rephrase sử dụng LoRA

Notebook này hướng dẫn fine-tune Qwen 2.5 7B Instruct với dataset Rephrase để tạo JSON output từ query tiếng Việt.

## Thông tin
- **Base Model**: Qwen/Qwen2.5-7B-Instruct
- **PEFT Method**: LoRA (r=8 hoặc r=16)
- **Dataset**: Rephrase (1,000 samples)
- **Task**: Text-to-JSON Generation


In [2]:
# Install dependencies (chạy lần đầu)
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install "transformers>=4.35.0"
# !pip install "peft>=0.6.0"
# !pip install "datasets>=2.14.0"
# !pip install "accelerate>=0.24.0"
# !pip install "bitsandbytes>=0.41.0"
# !pip install scikit-learn
# !pip install tqdm


In [None]:
import json
import torch
from typing import Dict, List
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from tqdm import tqdm
import os

# Check the GPU: report whether CUDA is available and, when it is, the name
# and total memory of device 0.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 so the value is displayed in GB.
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


## Bước 1: Load và Xử Lý Dataset


In [None]:
# Load the raw dataset from a JSON file. The path is relative, so it is
# resolved against the current working directory.
dataset_path = "../../dataset/01_simple/01_dataset_rephrase.json"

with open(dataset_path, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Sanity check: number of samples, the keys of the first record, and the first
# record pretty-printed (ensure_ascii=False keeps Vietnamese text readable).
# Assumes raw_data is a non-empty list of dicts — indexing [0] fails otherwise.
print(f"Total samples: {len(raw_data)}")
print(f"Sample keys: {raw_data[0].keys()}")
print(f"\nExample sample:")
print(json.dumps(raw_data[0], ensure_ascii=False, indent=2))


In [None]:
def format_prompt(query: str) -> str:
    """Build the model input prompt for a single query.

    The query is placed after a ``Query: `` prefix and followed by a blank
    line and the ``Output JSON:`` cue that the completion continues from.
    """
    template = "Query: {}\n\nOutput JSON:"
    return template.format(query)

def format_output(data: Dict) -> str:
    """Serialize the target fields of one record as a single-line JSON string.

    Only the five target fields are kept, in a fixed order. A field missing
    from ``data`` falls back to an empty string, or to ``False`` for
    ``is_in_scope``. Non-ASCII characters are written as-is (not escaped).
    """
    field_defaults = (
        ("keyword", ""),
        ("is_in_scope", False),
        ("reasoning", ""),
        ("message_banner", ""),
        ("message_no_result", ""),
    )
    payload = {name: data.get(name, fallback) for name, fallback in field_defaults}
    return json.dumps(payload, ensure_ascii=False)

def prepare_dataset(raw_data: List[Dict]) -> List[Dict]:
    """Convert raw records into prompt/output training examples.

    Each returned dict holds the formatted ``prompt``, the JSON ``output``
    string, the original ``query`` (empty string when absent) and the
    ``is_in_scope`` flag (``False`` when absent). Input order is preserved.
    """
    def _to_example(item: Dict) -> Dict:
        # One raw record -> one training example.
        query = item.get("query", "")
        return {
            "prompt": format_prompt(query),
            "output": format_output(item),
            "query": query,
            "is_in_scope": item.get("is_in_scope", False),
        }

    return [_to_example(item) for item in raw_data]

# Format the whole dataset into prompt/output examples.
formatted_data = prepare_dataset(raw_data)
print(f"Formatted samples: {len(formatted_data)}")
# Show the first example; the output string is cut to 150 characters for display.
print("\nExample:")
print(f"Prompt: {formatted_data[0]['prompt']}")
print(f"Output: {formatted_data[0]['output'][:150]}...")


In [None]:
# Split the dataset: 80% train, 10% validation, 10% test.
# Step 1 holds out 20% of the data; step 2 splits that hold-out in half.
# Both splits are stratified on `is_in_scope` and use the same fixed seed.
train_data, temp_data = train_test_split(
    formatted_data,
    test_size=0.2,
    random_state=42,
    stratify=[item['is_in_scope'] for item in formatted_data]  # Stratified split
)

val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    stratify=[item['is_in_scope'] for item in temp_data]
)

print(f"Train: {len(train_data)} samples")
print(f"Validation: {len(val_data)} samples")
print(f"Test: {len(test_data)} samples")

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)


## Bước 2: Load Model và Tokenizer


In [None]:
model_name = "Qwen/Qwen2.5-7B-Instruct"

# Quantization config (optional, saves GPU memory).
# Controlled by the flag below rather than by commenting code in or out.
use_quantization = True  # Set to False if there is enough VRAM

if use_quantization:
    # 4-bit NF4 weights with double quantization; compute runs in float16.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )
else:
    quantization_config = None

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="right"
)

# Make sure a pad token exists: fall back to the EOS token when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print("Tokenizer loaded")
print(f"Vocab size: {len(tokenizer)}")
print(f"Pad token: {tokenizer.pad_token}")


In [None]:
# Load model
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
    # An explicit float16 dtype is only requested when quantization is off.
    torch_dtype=torch.float16 if not use_quantization else None
)

print(f"Model loaded: {model_name}")
# Device of the first parameter only.
print(f"Model device: {next(model.parameters()).device}")

# Count parameters of the base model as loaded by this cell.
# NOTE(review): with 4-bit loading, numel() may reflect packed storage rather
# than the logical parameter count — confirm before quoting these numbers.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Trainable %: {100 * trainable_params / total_params:.2f}%")


## Bước 3: Cấu Hình LoRA


In [None]:
# LoRA config - Option 1: Conservative (r=8) - recommended starting point
lora_config = LoraConfig(
    r=8,                          # Rank
    lora_alpha=16,               # Alpha = 2 * r
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Attention layers
    lora_dropout=0.1,            # Dropout to reduce overfitting
    bias="none",                  # Do not train bias terms
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False
)

# Or Option 2: Balanced (r=16) - uncomment to use
# lora_config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
#     lora_dropout=0.1,
#     bias="none",
#     task_type=TaskType.CAUSAL_LM,
#     inference_mode=False
# )

print("LoRA Config:")
print(lora_config)


In [None]:
# Apply LoRA adapters to the base model.
# A model loaded with bitsandbytes quantization has to go through PEFT's
# documented preparation step before adapters are attached; the training
# configuration in this notebook also turns on gradient checkpointing, which
# that step sets up for the quantized model.
if use_quantization:
    # Imported here so this cell works without touching the import cell.
    from peft import prepare_model_for_kbit_training

    model = prepare_model_for_kbit_training(model)

model = get_peft_model(model, lora_config)

# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()


In [None]:
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of prompt/output pairs for causal-LM fine-tuning.

    Each example becomes ``"<prompt> <output>"`` followed by the tokenizer's
    end-of-sequence token, so the model is trained to stop once the JSON
    object is complete.

    Args:
        examples: Batch mapping with parallel ``"prompt"`` and ``"output"``
            lists, as passed by ``Dataset.map(..., batched=True)``.
        max_length: Maximum number of tokens per example, including the
            appended end-of-sequence token. Longer examples are truncated.

    Returns:
        The tokenizer's batch encoding with unpadded per-example lists.
        ``labels`` are intentionally not produced here: they would be ragged
        lists that the tokenizer's padding does not handle, and
        ``DataCollatorForLanguageModeling(mlm=False)`` builds the labels from
        the padded ``input_ids`` itself.
    """
    # Combine prompt and output into one training text per example.
    texts = [
        f"{prompt} {output}"
        for prompt, output in zip(examples["prompt"], examples["output"])
    ]

    eos_id = tokenizer.eos_token_id
    # Reserve one position for the EOS token so truncation never drops it.
    token_budget = max_length - 1 if eos_id is not None else max_length

    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=token_budget,
        padding=False,
        return_tensors=None
    )

    if eos_id is not None:
        has_mask = "attention_mask" in tokenized
        for row, ids in enumerate(tokenized["input_ids"]):
            # Skip when the tokenizer already terminated the sequence itself.
            if ids and ids[-1] == eos_id:
                continue
            ids.append(eos_id)
            if has_mask:
                tokenized["attention_mask"][row].append(1)
        # NOTE(review): if pad_token falls back to eos_token, the collator
        # masks EOS positions in the labels as padding — confirm the tokenizer
        # in use has a distinct pad token.

    return tokenized

# Tokenize datasets. The original text columns are removed so that only the
# tokenizer outputs remain in the mapped datasets.
print("Tokenizing train dataset...")
train_tokenized = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

print("Tokenizing validation dataset...")
val_tokenized = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=val_dataset.column_names
)

print(f"Train tokenized: {len(train_tokenized)} samples")
print(f"Val tokenized: {len(val_tokenized)} samples")
# Token count of the first training example, as a quick length check.
print(f"Example tokenized length: {len(train_tokenized[0]['input_ids'])}")


In [None]:
# Data collator: batches and pads the tokenized examples for the Trainer.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal LM, not masked LM
    pad_to_multiple_of=8  # GPU-friendly padded lengths
)


## Bước 5: Cấu Hình Training


In [None]:
# Directory for checkpoints, the final model and TensorBoard logs.
output_dir = "./results/qwen2.5-7b-rephrase-lora"

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,
    per_device_train_batch_size=4,  # Adjust to the available GPU memory
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # Effective batch size = 4 * 4 = 16
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=100,
    logging_steps=10,
    eval_steps=50,
    save_steps=100,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,  # Mixed precision training
    gradient_checkpointing=True,  # Saves memory
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    logging_dir=f"{output_dir}/logs",
    remove_unused_columns=False,
    save_total_limit=3,  # Keep only the 3 most recent checkpoints
)

print("Training arguments configured")
print(f"Output directory: {output_dir}")


In [None]:
# Initialize the Trainer with the LoRA-wrapped model, the tokenized train and
# validation sets, and the language-modeling collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
)

print("Trainer initialized")
print(f"Training samples: {len(train_tokenized)}")
print(f"Validation samples: {len(val_tokenized)}")


## Bước 6: Training


In [None]:
# Start training
print("Starting training...")
train_result = trainer.train()

# Save the final model and the tokenizer to output_dir.
trainer.save_model()
tokenizer.save_pretrained(output_dir)

print(f"\nTraining completed! Model saved to {output_dir}")
print(f"Training loss: {train_result.training_loss:.4f}")


## Bước 7: Evaluation


In [None]:
# Evaluate on the test set: tokenize it the same way as train/validation,
# then run the Trainer's evaluation loop on it.
print("Tokenizing test dataset...")
test_tokenized = test_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=test_dataset.column_names
)

eval_results = trainer.evaluate(eval_dataset=test_tokenized)
print("\nTest Results:")
# The :.4f format assumes every reported metric is numeric.
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")


In [3]:
# Evaluate JSON validity
def evaluate_json_validity(model, tokenizer, test_data, max_samples=50):
    """Measure how often the model's generated completion parses as JSON.

    For each of the first ``max_samples`` items, the item's ``"prompt"`` is fed
    to ``model.generate`` and the text after the last ``"Output JSON:"`` marker
    is parsed with ``json.loads``. Generation samples with temperature 0.7, so
    the reported rate can vary between runs.

    Args:
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        test_data: Sequence of dicts, each with a ``"prompt"`` key.
        max_samples: Upper bound on the number of items evaluated.

    Returns:
        Fraction of evaluated items whose output is valid JSON, or 0 when no
        item was evaluated.
    """
    max_errors_shown = 3

    model.eval()
    valid_count = 0
    total_count = 0
    errors_shown = 0

    for i, item in enumerate(tqdm(test_data[:max_samples], desc="Evaluating")):
        prompt = item["prompt"]

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract the JSON part (whatever follows the last "Output JSON:")
        if "Output JSON:" in generated_text:
            json_text = generated_text.split("Output JSON:")[-1].strip()
        else:
            json_text = generated_text

        total_count += 1

        # Only parse failures count as invalid; json.JSONDecodeError is a
        # ValueError subclass, so unrelated exceptions are no longer swallowed.
        try:
            json.loads(json_text)
        except ValueError:
            # Show the first few failures regardless of where they occur.
            if errors_shown < max_errors_shown:
                print(f"\nError parsing JSON {i}:")
                print(f"Generated: {json_text[:200]}")
                errors_shown += 1
        else:
            valid_count += 1

    validity_rate = valid_count / total_count if total_count > 0 else 0
    print(f"\nJSON Validity Rate: {validity_rate:.2%} ({valid_count}/{total_count})")
    return validity_rate

# Run the evaluation on at most 50 test examples.
validity_rate = evaluate_json_validity(model, tokenizer, test_data, max_samples=50)


NameError: name 'model' is not defined

## Bước 8: Inference


In [None]:
def generate_json_response(model, tokenizer, query: str, max_length=512):
    """Generate a completion for ``query`` and parse it as JSON.

    The prompt has the same ``Query: ...`` / ``Output JSON:`` layout used for
    training. Generation samples with temperature 0.7 and top-p 0.9, producing
    at most 256 new tokens.

    Args:
        model: Model exposing ``device`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        query: User query text.
        max_length: Accepted for compatibility; the current implementation
            does not use it.

    Returns:
        The parsed JSON value, or ``None`` when the generated text is not
        valid JSON (the error and the text are printed in that case).
    """
    marker = "Output JSON:"
    prompt = "Query: {}\n\n{}".format(query, marker)

    # Encode the prompt and move it to the model's device.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    sampling_options = dict(
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_options)

    decoded = tokenizer.decode(sequences[0], skip_special_tokens=True)

    # Keep only what follows the last marker; without a marker, use the text as-is.
    _, found, tail = decoded.rpartition(marker)
    json_text = tail.strip() if found else decoded

    try:
        return json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
        print(f"Generated text: {json_text}")
    return None

# Test inference on a few sample Vietnamese queries.
test_queries = [
    "sữa cho bé 6 tháng",
    "tã bỉm size M",
    "đồ chơi cho trẻ sơ sinh"
]

print("Testing inference:\n")
for query in test_queries:
    # result is None when the generated text is not valid JSON; json.dumps
    # then prints it as null.
    result = generate_json_response(model, tokenizer, query)
    print(f"Query: {query}")
    print(f"Result: {json.dumps(result, ensure_ascii=False, indent=2)}")
    print("-" * 50)


In [None]:
# Save the adapter (LoRA weights only, small) together with the tokenizer.
adapter_path = f"{output_dir}/adapter"
model.save_pretrained(adapter_path)
tokenizer.save_pretrained(adapter_path)

print(f"Adapter saved to {adapter_path}")
print("\nĐể load lại model sau này:")
# The printed snippet is meant to be copied into a fresh session, so it must
# import everything it uses — including torch for torch.float16.
print(f"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    "{model_name}",
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)

# Load adapter
model = PeftModel.from_pretrained(base_model, "{adapter_path}")
tokenizer = AutoTokenizer.from_pretrained("{adapter_path}")
""")


## Lưu Ý

- Điều chỉnh `per_device_train_batch_size` và `gradient_accumulation_steps` dựa trên GPU memory
- Có thể thử `r=16` nếu `r=8` không đủ hiệu suất
- Monitor training loss và validation loss để tránh overfitting
- Sử dụng TensorBoard để theo dõi: `tensorboard --logdir ./results/qwen2.5-7b-rephrase-lora/logs`
