# Qwen/Qwen3-4B Banking Chatbot — Kaggle Notebook

Notebook này gom **đúng logic & tham số** từ 4 file của bạn theo thứ tự:
1) `data_preprocessing` → 2) `lora_config` → 3) `training_config` → 4) `fine_tune_qwen`.

**Lưu ý:**
- Mình giữ nguyên code gốc (không đổi logic/hyperparams). Các đường dẫn được cấu hình bằng biến trong notebook để chạy trên Kaggle.
- Bạn cần **bật GPU** trong Kaggle (Settings → Accelerator: GPU) trước khi train.

In [None]:
!pip install transformers datasets peft accelerate scikit-learn tokenizers einops

In [None]:
!pip install bitsandbytes

## Session 1 — Data Preprocessing
Thiết lập đường dẫn Kaggle và chạy pipeline tiền xử lý để xuất `train.jsonl`, `validation.jsonl`, `test.jsonl` và `dataset_stats.json`.

In [4]:
# Data Preprocessing Script for Qwen3-4B Banking Chatbot Fine-tuning
# Converts the CSV data into the ChatML message format used for Qwen3-4B

import pandas as pd
import json
import os
from sklearn.model_selection import train_test_split
from typing import List, Dict, Any
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class BankingDataPreprocessor:
    """Convert a CSV of banking Q&A pairs into ChatML-style JSONL splits.

    The CSV must provide ``instruction`` and ``response`` columns. Each row
    with both values present becomes a three-message conversation
    (system / user / assistant) that shares one fixed system prompt.
    """

    def __init__(self, csv_path: str, output_dir: str):
        """Store the input CSV path and the output directory; no I/O happens here."""
        self.csv_path = csv_path
        self.output_dir = output_dir
        # Runtime prompt text placed in the "system" message of every sample.
        self.system_prompt = """Bạn là trợ lý tư vấn tài chính ngân hàng chuyên nghiệp của HDBank. Bạn có kiến thức sâu về các sản phẩm và dịch vụ ngân hàng, luôn hỗ trợ khách hàng một cách tận tình và chính xác. Hãy trả lời các câu hỏi một cách chi tiết, dễ hiểu và thân thiện."""

    def load_data(self) -> pd.DataFrame:
        """Read ``self.csv_path`` as UTF-8 CSV and return it as a DataFrame."""
        logger.info(f"Loading data from {self.csv_path}")
        df = pd.read_csv(self.csv_path, encoding='utf-8')
        logger.info(f"Loaded {len(df)} records")
        return df

    def clean_text(self, text: str) -> str:
        """Normalize one text cell.

        Returns an empty string for missing values. Otherwise the value is
        converted to ``str``, stripped, runs of whitespace are collapsed to a
        single space, and doubled double-quotes (``""``) are reduced to one.
        """
        if pd.isna(text):
            return ""

        # Strip the ends and collapse any internal whitespace run to one space.
        text = str(text).strip()
        text = ' '.join(text.split())

        # Collapse doubled quotes ("" -> "), which would otherwise end up
        # verbatim in the JSON output.
        text = text.replace('""', '"')

        return text

    def create_chat_format(self, instruction: str, response: str) -> Dict[str, Any]:
        """Build one ChatML-style sample from an instruction/response pair.

        Both texts are passed through :meth:`clean_text`; the system prompt is
        used verbatim.
        """
        return {
            "messages": [
                {
                    "role": "system",
                    "content": self.system_prompt
                },
                {
                    "role": "user",
                    "content": self.clean_text(instruction)
                },
                {
                    "role": "assistant",
                    "content": self.clean_text(response)
                }
            ]
        }

    def process_data(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
        """Convert every complete row of ``df`` into a chat sample.

        Rows whose ``instruction`` or ``response`` is missing are skipped with
        a warning; the remaining rows keep their original order.
        """
        processed_data = []

        for idx, row in df.iterrows():
            if pd.isna(row['instruction']) or pd.isna(row['response']):
                logger.warning(f"Skipping row {idx} due to missing data")
                continue

            chat_data = self.create_chat_format(
                instruction=row['instruction'],
                response=row['response']
            )
            processed_data.append(chat_data)

        logger.info(f"Processed {len(processed_data)} valid records")
        return processed_data

    def split_data(self, data: List[Dict[str, Any]],
                   train_ratio: float = 0.8,
                   val_ratio: float = 0.1,
                   test_ratio: float = 0.1) -> tuple:
        """Shuffle and split ``data`` into ``(train, val, test)`` lists.

        The three ratios must sum to 1.0 (an ``AssertionError`` is raised
        otherwise). Both splits use ``random_state=42`` so the result is
        reproducible.
        """
        assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum to 1.0"

        # First split: (train + val) vs test.
        train_val, test = train_test_split(
            data,
            test_size=test_ratio,
            random_state=42,
            shuffle=True
        )

        # Second split: train vs val. The validation share is re-expressed
        # relative to the remaining (train + val) portion.
        val_size = val_ratio / (train_ratio + val_ratio)
        train, val = train_test_split(
            train_val,
            test_size=val_size,
            random_state=42,
            shuffle=True
        )

        logger.info(f"Data split - Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")
        return train, val, test

    def save_jsonl(self, data: List[Dict[str, Any]], filename: str):
        """Write ``data`` to ``<output_dir>/<filename>``, one JSON object per line.

        The output directory is created if needed and non-ASCII characters are
        written as-is (``ensure_ascii=False``).
        """
        filepath = os.path.join(self.output_dir, filename)
        os.makedirs(self.output_dir, exist_ok=True)

        with open(filepath, 'w', encoding='utf-8') as f:
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        logger.info(f"Saved {len(data)} records to {filepath}")

    def generate_statistics(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Return sample count and character-length statistics for ``data``.

        Lengths are measured in characters of the user and assistant message
        contents. For an empty dataset every statistic is reported as zero
        instead of failing on a division by zero.
        """
        total_samples = len(data)

        if not data:
            # No samples: averages, minima and maxima are undefined, so report
            # zeros rather than raising from the division / min() / max().
            return {
                'total_samples': 0,
                'avg_instruction_length': 0.0,
                'avg_response_length': 0.0,
                'max_instruction_length': 0,
                'max_response_length': 0,
                'min_instruction_length': 0,
                'min_response_length': 0
            }

        instruction_lengths = []
        response_lengths = []

        for item in data:
            messages = item['messages']
            # Each sample is expected to hold exactly one user and one
            # assistant message; the first match of each role is used.
            user_msg = next(msg for msg in messages if msg['role'] == 'user')
            assistant_msg = next(msg for msg in messages if msg['role'] == 'assistant')

            instruction_lengths.append(len(user_msg['content']))
            response_lengths.append(len(assistant_msg['content']))

        stats = {
            'total_samples': total_samples,
            'avg_instruction_length': sum(instruction_lengths) / len(instruction_lengths),
            'avg_response_length': sum(response_lengths) / len(response_lengths),
            'max_instruction_length': max(instruction_lengths),
            'max_response_length': max(response_lengths),
            'min_instruction_length': min(instruction_lengths),
            'min_response_length': min(response_lengths)
        }

        return stats

    def run_preprocessing(self):
        """Run the full pipeline: load, convert, split, save, and write stats.

        Writes ``train.jsonl``, ``validation.jsonl``, ``test.jsonl`` and
        ``dataset_stats.json`` into ``self.output_dir`` and returns the
        ``(train, val, test)`` lists. Statistics cover the whole processed
        dataset, not a single split.
        """
        logger.info("Starting data preprocessing...")

        # Load and convert the raw CSV.
        df = self.load_data()
        processed_data = self.process_data(df)

        # Split into train / validation / test.
        train_data, val_data, test_data = self.split_data(processed_data)

        # Persist the three splits.
        self.save_jsonl(train_data, 'train.jsonl')
        self.save_jsonl(val_data, 'validation.jsonl')
        self.save_jsonl(test_data, 'test.jsonl')

        # Compute statistics over all processed samples and persist them.
        stats = self.generate_statistics(processed_data)
        stats_path = os.path.join(self.output_dir, 'dataset_stats.json')
        with open(stats_path, 'w', encoding='utf-8') as f:
            json.dump(stats, f, ensure_ascii=False, indent=2)

        logger.info("Preprocessing completed successfully!")
        logger.info(f"Dataset statistics: {stats}")

        return train_data, val_data, test_data


 ## **Cấu hình đường dẫn & chạy tiền xử lý**

In [5]:
from pathlib import Path

# 👉 EDIT this path to match the dataset attached to the notebook.
# Example: a dataset named hdbank-finetune-data containing final_sua_mapped_v2.csv
CSV_PATH = "/kaggle/input/data-banking-processed/final_sua_mapped_v2.csv"

# Outputs should live under /kaggle/working
OUTPUT_DIR = "/kaggle/working/data/processed/train_split"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Run the whole pipeline once and keep the three splits as notebook globals.
preprocessor = BankingDataPreprocessor(csv_path=CSV_PATH, output_dir=OUTPUT_DIR)
splits = preprocessor.run_preprocessing()
train_data, val_data, test_data = splits

print("Số mẫu Train/Val/Test:", *(len(part) for part in splits))


Số mẫu Train/Val/Test: 20434 2555 2555


 ## **Xem nhanh dữ liệu & thống kê**

In [6]:
import json, os, itertools

def peek_jsonl(path, n=3):
    """Print a ``==> path`` header followed by the first ``n`` parsed JSONL records.

    Stops early when the file has fewer than ``n`` lines.
    """
    print(f"==> {path}")
    with open(path, 'r', encoding='utf-8') as handle:
        for _ in range(n):
            raw_line = handle.readline()
            if not raw_line:
                # End of file reached before n records were shown.
                break
            print(json.loads(raw_line))

# Show two records from each split written by the preprocessing step.
base = OUTPUT_DIR
for split_file in ("train.jsonl", "validation.jsonl", "test.jsonl"):
    split_path = os.path.join(base, split_file)
    peek_jsonl(split_path, n=2)

# Then dump the saved dataset statistics.
print("\n==> dataset_stats.json")
stats_path = os.path.join(base, "dataset_stats.json")
with open(stats_path, "r", encoding="utf-8") as f:
    loaded_stats = json.load(f)
    print(loaded_stats)


==> /kaggle/working/data/processed/train_split/train.jsonl
{'messages': [{'role': 'system', 'content': 'Bạn là trợ lý tư vấn tài chính ngân hàng chuyên nghiệp của HDBank. Bạn có kiến thức sâu về các sản phẩm và dịch vụ ngân hàng, luôn hỗ trợ khách hàng một cách tận tình và chính xác. Hãy trả lời các câu hỏi một cách chi tiết, dễ hiểu và thân thiện.'}, {'role': 'user', 'content': 'I cant find my master card will uhlep me lock it (tạm dịch: Tôi sẽ tìm thấy thẻ chủ của tôi sẽ uhlep tôi khóa nó)'}, {'role': 'assistant', 'content': 'Xin vui lòng chấp nhận lời xin lỗi của tôi cho bất kỳ khó chịu gây ra bởi mất thẻ Master của bạn. Tôi ở đây để giúp bạn đóng nó cho sự bình an của bạn. Để giúp bạn với điều này, xin vui lòng làm theo các bước sau đây: 1. Kiểm tra ví của bạn, túi, hoặc bất kỳ địa điểm gần đây bạn đã truy cập để xem nếu bạn có thể tìm thấy thẻ Master của bạn. Nếu bạn tìm thấy nó, đảm bảo bạn giữ nó an toàn. 2. Nếu bạn không thể tìm thấy thẻ của bạn, điều quan trọng là để báo cáo m

## Session 2 — LoRA Configuration
Import nguyên `lora_config.py` và in tóm tắt cấu hình + ước lượng bộ nhớ.

In [7]:
"""
LoRA Configuration for Qwen3-4B Banking Chatbot Fine-tuning
Cấu hình LoRA được tối ưu cho VRAM và chất lượng training
"""

from peft import LoraConfig, TaskType
from dataclasses import dataclass
from typing import List, Optional
import torch

@dataclass
class LoRAConfigManager:
    """
    Holds LoRA hyper-parameters for Qwen3-4B and builds PEFT ``LoraConfig`` objects.

    Parameter rationale (author's intent):
    - r=16: moderate rank, trading adapter capacity against memory
    - lora_alpha=32: scaling factor chosen as 2*r
    - target_modules: the attention and MLP projection layers
    - lora_dropout=0.1: moderate regularization against overfitting
    """

    # Core LoRA parameters
    r: int = 16                    # Rank of the low-rank update
    lora_alpha: int = 32           # Scaling factor (2*r)
    lora_dropout: float = 0.1      # Dropout applied in the LoRA layers
    bias: str = "none"             # Bias terms are not trained

    # Names of the modules to adapt; None means "use the default list"
    # filled in by __post_init__.
    target_modules: Optional[List[str]] = None

    # Task configuration
    task_type: TaskType = TaskType.CAUSAL_LM

    def __post_init__(self):
        """Fill in the default target modules when none were provided."""
        if self.target_modules is None:
            self.target_modules = [
                # Attention projections
                "q_proj",      # Query projection
                "k_proj",      # Key projection
                "v_proj",      # Value projection
                "o_proj",      # Output projection

                # MLP projections
                "gate_proj",   # Gate projection in MLP
                "up_proj",     # Up projection in MLP
                "down_proj",   # Down projection in MLP
            ]

    def get_lora_config(self) -> LoraConfig:
        """Build the standard ``LoraConfig`` from this instance's fields."""
        return LoraConfig(
            r=self.r,
            lora_alpha=self.lora_alpha,
            target_modules=self.target_modules,
            lora_dropout=self.lora_dropout,
            bias=self.bias,
            task_type=self.task_type,
            inference_mode=False,
        )

    def get_memory_efficient_config(self) -> LoraConfig:
        """Build a smaller ``LoraConfig`` (r=8, three modules) for limited VRAM.

        Ignores ``self.r``, ``self.lora_alpha`` and ``self.target_modules``;
        only ``self.task_type`` is taken from the instance.
        """
        return LoraConfig(
            r=8,                    # Lower rank to save memory
            lora_alpha=16,          # Alpha lowered accordingly (2*r)
            target_modules=[
                "q_proj", "v_proj",  # Query and value projections only
                "down_proj"          # Plus the MLP down projection
            ],
            lora_dropout=0.1,
            bias="none",
            task_type=self.task_type,
            inference_mode=False,
        )

    def get_high_quality_config(self) -> LoraConfig:
        """Build a larger ``LoraConfig`` (r=32) that also adapts embeddings and LM head.

        The module list is ``self.target_modules`` plus ``embed_tokens`` and
        ``lm_head``; the instance's own list is not modified.
        """
        return LoraConfig(
            r=32,                   # Higher rank
            lora_alpha=64,          # Alpha raised accordingly (2*r)
            target_modules=self.target_modules + [
                "embed_tokens",     # Embedding layer
                "lm_head"           # Language-model head
            ],
            lora_dropout=0.05,      # Lower dropout
            bias="none",
            task_type=self.task_type,
            inference_mode=False,
        )

    @staticmethod
    def _lora_memory_gb(rank: int, num_modules: int,
                        hidden_size: int, num_layers: int) -> float:
        """Rough fp32 size in GiB of the LoRA weights for one configuration.

        Each adapted module adds an ``A`` (rank x in) and a ``B`` (out x rank)
        matrix, i.e. ``rank * (in + out)`` parameters. Every module is
        approximated as ``hidden_size x hidden_size``, so this is only an
        order-of-magnitude figure.
        """
        params = rank * (hidden_size + hidden_size) * num_layers * num_modules
        return params * 4 / (1024 ** 3)  # 4 bytes per fp32 parameter

    def estimate_memory_usage(self, model_size_gb: float = 8.0,
                              hidden_size: int = 2560,
                              num_layers: int = 36) -> dict:
        """Estimate memory (GB) of the base model plus LoRA weights per profile.

        Args:
            model_size_gb: Memory taken by the base model weights.
            hidden_size: Model hidden dimension used for the rough estimate
                (default follows the published Qwen3-4B config).
            num_layers: Number of transformer layers (default follows the
                published Qwen3-4B config).

        Returns:
            Dict with the base model memory, the LoRA weight memory for the
            standard / efficient / quality profiles, and the three totals.
            Optimizer state, gradients and activations are not included.
        """
        base_memory = model_size_gb
        num_target_modules = len(self.target_modules)

        # Standard profile: this instance's rank and module list.
        standard_memory = self._lora_memory_gb(
            self.r, num_target_modules, hidden_size, num_layers
        )

        # Memory-efficient profile: r=8 on 3 modules
        # (mirrors get_memory_efficient_config).
        efficient_memory = self._lora_memory_gb(8, 3, hidden_size, num_layers)

        # High-quality profile: r=32 on the module list plus 2 extra modules
        # (mirrors get_high_quality_config).
        quality_memory = self._lora_memory_gb(
            32, num_target_modules + 2, hidden_size, num_layers
        )

        return {
            "base_model_memory_gb": base_memory,
            "standard_lora_memory_gb": standard_memory,
            "efficient_lora_memory_gb": efficient_memory,
            "quality_lora_memory_gb": quality_memory,
            "total_standard_gb": base_memory + standard_memory,
            "total_efficient_gb": base_memory + efficient_memory,
            "total_quality_gb": base_memory + quality_memory
        }

    def print_config_summary(self):
        """Print the LoRA hyper-parameters and the estimated memory totals."""
        print("=== LoRA Configuration Summary ===")
        print(f"Rank (r): {self.r}")
        print(f"Alpha: {self.lora_alpha}")
        print(f"Dropout: {self.lora_dropout}")
        print(f"Target modules: {len(self.target_modules)}")
        print(f"Modules: {', '.join(self.target_modules)}")

        memory_info = self.estimate_memory_usage()
        print("\n=== Memory Estimation ===")
        print(f"Standard config total: {memory_info['total_standard_gb']:.2f} GB")
        print(f"Efficient config total: {memory_info['total_efficient_gb']:.2f} GB")
        print(f"Quality config total: {memory_info['total_quality_gb']:.2f} GB")

def get_recommended_config(available_vram_gb: float) -> LoraConfig:
    """Pick a LoRA profile from the available VRAM.

    16 GB or more selects the high-quality profile, 12 GB or more the
    standard profile, anything else the memory-efficient profile. The chosen
    profile name is printed before the config is built.
    """
    manager = LoRAConfigManager()

    if available_vram_gb >= 16:
        print("Using high-quality LoRA configuration")
        return manager.get_high_quality_config()

    if available_vram_gb >= 12:
        print("Using standard LoRA configuration")
        return manager.get_lora_config()

    print("Using memory-efficient LoRA configuration")
    return manager.get_memory_efficient_config()


## Session 3 — Training Configuration
Import nguyên `training_config.py`, in tóm tắt cấu hình và lịch train ước tính dựa trên số mẫu đã tiền xử lý.

In [8]:
"""
Training Configuration for Qwen3-4B Banking Chatbot Fine-tuning
Cấu hình training được tối ưu cho chất lượng và hiệu suất
"""

from transformers import TrainingArguments
from dataclasses import dataclass
from typing import Optional
import torch
import os

@dataclass
class TrainingConfigManager:
    """
    Holds training hyper-parameters and builds Hugging Face ``TrainingArguments``.

    Parameter rationale (author's intent):
    - learning_rate=2e-4: chosen for LoRA fine-tuning
    - batch_size=4: balance between memory usage and gradient stability
    - gradient_accumulation=4: effective batch size = 16
    - warmup_steps=100: warm-up at the start of training
    - max_steps: derived from dataset size and epochs (see calculate_training_steps)
    """

    # Core training parameters
    output_dir: str = "./qwen-banking-lora"
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 4
    per_device_eval_batch_size: int = 4
    gradient_accumulation_steps: int = 4

    # Learning rate and optimization
    learning_rate: float = 2e-4
    weight_decay: float = 0.01
    warmup_steps: int = 100
    lr_scheduler_type: str = "cosine"

    # Mixed precision and optimization
    fp16: bool = True
    bf16: bool = False  # Switched on automatically in __post_init__ on Ampere+ GPUs
    gradient_checkpointing: bool = True
    dataloader_pin_memory: bool = False

    # Evaluation and logging
    eval_strategy: str = "steps"
    eval_steps: int = 100
    logging_steps: int = 10
    save_strategy: str = "steps"
    save_steps: int = 200
    save_total_limit: int = 3

    # Best-model selection
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False

    # Memory optimization
    remove_unused_columns: bool = False
    dataloader_num_workers: int = 0

    def __post_init__(self):
        """Select the mixed-precision mode and create the output directory.

        When CUDA is available and the device's major compute capability is
        8 or higher, BF16 replaces FP16; otherwise the configured flags are
        left untouched. The chosen mode is printed.
        """
        if torch.cuda.is_available():
            device_capability = torch.cuda.get_device_capability()
            if device_capability[0] >= 8:  # Ampere or newer
                self.bf16 = True
                self.fp16 = False
                print("Using BF16 precision (Ampere+ GPU detected)")
            else:
                print("Using FP16 precision")

        # Side effect: the output directory is created as soon as the
        # config object exists.
        os.makedirs(self.output_dir, exist_ok=True)

    def get_training_args(self, max_steps: Optional[int] = None) -> TrainingArguments:
        """Build ``TrainingArguments`` from the current field values.

        Args:
            max_steps: When given, training length is controlled by this step
                count and ``num_train_epochs`` is not passed at all.
        """
        args_dict = {
            "output_dir": self.output_dir,
            "num_train_epochs": self.num_train_epochs,
            "per_device_train_batch_size": self.per_device_train_batch_size,
            "per_device_eval_batch_size": self.per_device_eval_batch_size,
            "gradient_accumulation_steps": self.gradient_accumulation_steps,
            "learning_rate": self.learning_rate,
            "weight_decay": self.weight_decay,
            "warmup_steps": self.warmup_steps,
            "lr_scheduler_type": self.lr_scheduler_type,
            "fp16": self.fp16,
            "bf16": self.bf16,
            "gradient_checkpointing": self.gradient_checkpointing,
            "dataloader_pin_memory": self.dataloader_pin_memory,
            "eval_strategy": self.eval_strategy,
            "eval_steps": self.eval_steps,
            "logging_steps": self.logging_steps,
            "save_strategy": self.save_strategy,
            "save_steps": self.save_steps,
            "save_total_limit": self.save_total_limit,
            "load_best_model_at_end": self.load_best_model_at_end,
            "metric_for_best_model": self.metric_for_best_model,
            "greater_is_better": self.greater_is_better,
            "remove_unused_columns": self.remove_unused_columns,
            "dataloader_num_workers": self.dataloader_num_workers,
            "report_to": "none",  # Disable wandb/tensorboard by default
            "seed": 42,
        }

        if max_steps is not None:
            args_dict["max_steps"] = max_steps
            args_dict.pop("num_train_epochs")  # Epoch count is dropped when max_steps is used

        return TrainingArguments(**args_dict)

    def get_memory_efficient_args(self, max_steps: Optional[int] = None) -> TrainingArguments:
        """Switch this config to the low-memory profile and build the arguments.

        Note: this mutates the instance (batch size 2, accumulation 8), so
        later calls on the same object see the changed values.
        """
        self.per_device_train_batch_size = 2
        self.per_device_eval_batch_size = 2
        self.gradient_accumulation_steps = 8  # 2 x 8 keeps the effective batch size at 16
        self.gradient_checkpointing = True
        self.dataloader_pin_memory = False

        return self.get_training_args(max_steps)

    def get_high_performance_args(self, max_steps: Optional[int] = None) -> TrainingArguments:
        """Switch this config to the throughput-oriented profile and build the arguments.

        Uses multiple dataloader workers with pinned memory. Note: this
        mutates the instance, so later calls on the same object see the
        changed values.
        """
        self.per_device_train_batch_size = 4
        self.per_device_eval_batch_size = 4
        # 4 x 4 = effective batch size 16. Callers compute max_steps for an
        # effective batch of 16 before calling this method; a larger product
        # here would silently multiply the number of epochs actually trained.
        self.gradient_accumulation_steps = 4
        self.dataloader_num_workers = 4
        self.dataloader_pin_memory = True

        return self.get_training_args(max_steps)

    def calculate_training_steps(self, dataset_size: int) -> dict:
        """Derive the step schedule for ``dataset_size`` training samples.

        Uses the current batch size and accumulation values (single device
        assumed by the formula). Steps per epoch are floored. The time
        estimate is a rough figure based on 1.5 seconds per step.
        """
        effective_batch_size = (
            self.per_device_train_batch_size *
            self.gradient_accumulation_steps
        )

        steps_per_epoch = dataset_size // effective_batch_size
        total_steps = steps_per_epoch * self.num_train_epochs

        # Rough approximation: ~1.5 seconds per optimizer step
        # (the original author's figure for an RTX 4090).
        estimated_time_hours = (total_steps * 1.5) / 3600

        return {
            "dataset_size": dataset_size,
            "effective_batch_size": effective_batch_size,
            "steps_per_epoch": steps_per_epoch,
            "total_training_steps": total_steps,
            "estimated_time_hours": estimated_time_hours,
            # Guard against a zero-step schedule (dataset smaller than one batch).
            "warmup_ratio": self.warmup_steps / total_steps if total_steps > 0 else 0
        }

    def print_config_summary(self, dataset_size: Optional[int] = None):
        """Print the hyper-parameters and, if a dataset size is given, the schedule.

        A ``dataset_size`` of ``None`` or 0 skips the schedule section.
        """
        print("=== Training Configuration Summary ===")
        print(f"Learning rate: {self.learning_rate}")
        print(f"Batch size per device: {self.per_device_train_batch_size}")
        print(f"Gradient accumulation: {self.gradient_accumulation_steps}")
        print(f"Effective batch size: {self.per_device_train_batch_size * self.gradient_accumulation_steps}")
        print(f"Epochs: {self.num_train_epochs}")
        print(f"Warmup steps: {self.warmup_steps}")
        print(f"Weight decay: {self.weight_decay}")
        print(f"Precision: {'BF16' if self.bf16 else 'FP16' if self.fp16 else 'FP32'}")
        print(f"Gradient checkpointing: {self.gradient_checkpointing}")

        if dataset_size:
            training_info = self.calculate_training_steps(dataset_size)
            print("\n=== Training Schedule ===")
            print(f"Dataset size: {training_info['dataset_size']}")
            print(f"Steps per epoch: {training_info['steps_per_epoch']}")
            print(f"Total training steps: {training_info['total_training_steps']}")
            print(f"Estimated time: {training_info['estimated_time_hours']:.1f} hours")
            print(f"Warmup ratio: {training_info['warmup_ratio']:.3f}")

def get_recommended_training_config(available_vram_gb: float, dataset_size: int) -> TrainingArguments:
    """Pick a training profile from the available VRAM and build its arguments.

    The total step count is computed from the default configuration first and
    passed as ``max_steps``. 16 GB or more selects the high-performance
    profile, 12 GB or more the standard one, anything else the
    memory-efficient one. The chosen profile name is printed.
    """
    manager = TrainingConfigManager()

    # Fix the schedule length up-front so every profile trains for the same
    # number of optimizer steps.
    schedule = manager.calculate_training_steps(dataset_size)
    max_steps = schedule['total_training_steps']

    if available_vram_gb >= 16:
        print("Using high-performance training configuration")
        return manager.get_high_performance_args(max_steps)

    if available_vram_gb >= 12:
        print("Using standard training configuration")
        return manager.get_training_args(max_steps)

    print("Using memory-efficient training configuration")
    return manager.get_memory_efficient_args(max_steps)


## Compat + Memory-Efficient

In [9]:
# === Compat + Memory-Efficient TrainingArguments builder ===
import os, json, torch
from dataclasses import fields as dataclass_fields
from transformers import TrainingArguments as HFTrainingArguments

# 1) dataset_size: prefer the in-memory split, else count lines of the saved
#    train.jsonl, else fall back to a placeholder of 1000 samples.
if 'train_data' in globals():
    dataset_size = len(train_data)
else:
    dataset_size = 0
    guess = "/kaggle/working/data/processed/train_split/train.jsonl"
    if os.path.exists(guess):
        with open(guess, "r", encoding="utf-8") as f:
            for _ in f:
                dataset_size += 1
    else:
        dataset_size = 1000
print(f"Detected dataset_size = {dataset_size}")

# 2) VRAM of GPU 0 in GB (informational only; 8.0 is reported without CUDA)
available_vram = 8.0
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    available_vram = props.total_memory / 1e9
print(f"Available VRAM ≈ {available_vram:.1f} GB")

# 3) Config summary (printed with the default batch settings, before the
#    overrides in step 4)
tcm = TrainingConfigManager()
tcm.print_config_summary(dataset_size=dataset_size)

# 4) Force a memory-efficient profile (avoid large per-device batches).
#    Same idea as get_memory_efficient_args, with an even smaller batch.
tcm.per_device_train_batch_size = 1    # aggressive, to make sure it fits
tcm.per_device_eval_batch_size = 1
tcm.gradient_accumulation_steps = 16   # effective batch size stays 16
tcm.gradient_checkpointing = True
tcm.dataloader_pin_memory = False

# 5) Use the paged 8-bit AdamW optimizer to cut optimizer-state memory
#    (requires bitsandbytes)
extra_optim = {"optim": "paged_adamw_8bit"}

# 6) Build args_dict with the same keys as TrainingConfigManager.get_training_args
args_dict = {
    "output_dir": tcm.output_dir,
    "num_train_epochs": tcm.num_train_epochs,
    "per_device_train_batch_size": tcm.per_device_train_batch_size,
    "per_device_eval_batch_size": tcm.per_device_eval_batch_size,
    "gradient_accumulation_steps": tcm.gradient_accumulation_steps,
    "learning_rate": tcm.learning_rate,
    "weight_decay": tcm.weight_decay,
    "warmup_steps": tcm.warmup_steps,
    "lr_scheduler_type": tcm.lr_scheduler_type,
    "fp16": tcm.fp16,
    "bf16": tcm.bf16,
    "gradient_checkpointing": tcm.gradient_checkpointing,
    "dataloader_pin_memory": tcm.dataloader_pin_memory,
    "eval_strategy": tcm.eval_strategy,
    "eval_steps": tcm.eval_steps,
    "logging_steps": tcm.logging_steps,
    "save_strategy": tcm.save_strategy,
    "save_steps": tcm.save_steps,
    "save_total_limit": tcm.save_total_limit,
    "load_best_model_at_end": tcm.load_best_model_at_end,
    "metric_for_best_model": tcm.metric_for_best_model,
    "greater_is_better": tcm.greater_is_better,
    "remove_unused_columns": tcm.remove_unused_columns,
    "dataloader_num_workers": tcm.dataloader_num_workers,
    "report_to": "none",
    "seed": 42,
    **extra_optim,
}

# Control training length through max_steps, computed with the overridden
# batch settings; keep num_train_epochs only when no positive step count
# could be derived.
sched = tcm.calculate_training_steps(dataset_size)
max_steps = sched['total_training_steps']
if max_steps and max_steps > 0:
    args_dict["max_steps"] = max_steps
    args_dict.pop("num_train_epochs", None)

# 7) Adapt to the installed transformers version.
ta_field_names = {f.name for f in dataclass_fields(HFTrainingArguments)}
# Older releases name the field `evaluation_strategy`; rename the key there
# so the setting is kept instead of being filtered out below (which would
# disable evaluation while load_best_model_at_end is still requested).
if "eval_strategy" not in ta_field_names and "evaluation_strategy" in ta_field_names:
    args_dict["evaluation_strategy"] = args_dict.pop("eval_strategy", "steps")
# Drop any key this transformers version does not know.
filtered_args = {k: v for k, v in args_dict.items() if k in ta_field_names}

training_args = HFTrainingArguments(**filtered_args)

print("\n== Preview key TrainingArguments ==")
print("output_dir:", training_args.output_dir)
print("per_device_train_batch_size:", training_args.per_device_train_batch_size)
print("gradient_accumulation_steps:", training_args.gradient_accumulation_steps)
print("learning_rate:", training_args.learning_rate)
print("eval/log/save steps:",
      getattr(training_args, "eval_steps", None),
      getattr(training_args, "logging_steps", None),
      getattr(training_args, "save_steps", None))
print("precision fp16/bf16:", getattr(training_args, "fp16", None), getattr(training_args, "bf16", None))
print("optim:", getattr(training_args, "optim", None))


Detected dataset_size = 20434
Available VRAM ≈ 17.1 GB
Using FP16 precision
=== Training Configuration Summary ===
Learning rate: 0.0002
Batch size per device: 4
Gradient accumulation: 4
Effective batch size: 16
Epochs: 3
Warmup steps: 100
Weight decay: 0.01
Precision: FP16
Gradient checkpointing: True

=== Training Schedule ===
Dataset size: 20434
Steps per epoch: 1277
Total training steps: 3831
Estimated time: 1.6 hours
Warmup ratio: 0.026

== Preview key TrainingArguments ==
output_dir: ./qwen-banking-lora
per_device_train_batch_size: 1
gradient_accumulation_steps: 16
learning_rate: 0.0002
eval/log/save steps: 100 10 200
precision fp16/bf16: True False
optim: OptimizerNames.PAGED_ADAMW_8BIT


## Session 4 — Fine-tune Qwen
Import `QwenBankingFineTuner` từ `fine_tune_qwen.py` (không chạy `main()`), rồi chạy train bằng đường dẫn Kaggle.

> **Ghi chú:** Nếu không bật GPU, tham số `attn_implementation='flash_attention_2'` trong code chỉ được set khi CUDA khả dụng. Nếu dùng CPU-only, bạn vẫn có thể import lớp và huấn luyện chậm hơn.

In [10]:
!pip -q install "flash-attn>=2.5.9" --no-build-isolation || echo "flash-attn install failed; will fallback to SDPA"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone


In [11]:
"""
Fine-tuning Script for Qwen3-4B Banking Chatbot
Script chính để fine-tune model với LoRA
"""

import os
import json
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    Trainer
)
from peft import get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
from typing import Dict, List, Any
import logging
from datetime import datetime

# === LƯU Ý ===
# Trong notebook, bạn đã có:
#  - LoRAConfigManager, get_recommended_config (từ phần LoRA)
#  - TrainingConfigManager, get_recommended_training_config (từ phần Training)
# Nếu muốn import như file .py: hãy %%writefile lora_config.py & training_config.py trước, rồi mở 2 dòng dưới:
# from lora_config import LoRAConfigManager, get_recommended_config
# from training_config import TrainingConfigManager, get_recommended_training_config

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class QwenBankingFineTuner:
    """End-to-end LoRA fine-tuning pipeline for a causal LM on chat-formatted data.

    Typical use is ``QwenBankingFineTuner(...).run_fine_tuning(vram_gb)``, which
    loads the tokenizer/model, reads ``train.jsonl`` / ``validation.jsonl`` from
    ``data_dir``, wraps the model with LoRA adapters, trains with ``Trainer``
    and saves the result to ``output_dir``.

    Depends on ``get_recommended_config`` and
    ``get_recommended_training_config`` being available in the enclosing
    namespace (they are defined in earlier notebook cells).
    """

    def __init__(self,
                 model_name: str = "Qwen/Qwen3-4B",
                 data_dir: str = "/kaggle/working/data/processed/train_split",
                 output_dir: str = "/kaggle/working/qwen-banking-lora"):
        """Store the configuration and log the detected device.

        Args:
            model_name: Model identifier passed to ``from_pretrained``.
            data_dir: Directory containing ``train.jsonl`` and ``validation.jsonl``.
            output_dir: Directory where the trained model and tokenizer are saved.
        """
        self.model_name = model_name
        self.data_dir = data_dir
        self.output_dir = output_dir
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Populated later by load_tokenizer_and_model() / load_datasets()
        self.tokenizer = None
        self.model = None
        self.train_dataset = None
        self.eval_dataset = None

        logger.info(f"Initializing fine-tuner for {model_name}")
        logger.info(f"Device: {self.device}")
        if torch.cuda.is_available():
            logger.info(f"GPU: {torch.cuda.get_device_name()}")
            logger.info(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    @staticmethod
    def _pick_attn_implementation():
        """Return ``"flash_attention_2"`` only when it can actually run, else ``None``.

        ``None`` lets the library choose its default attention backend.
        """
        if not torch.cuda.is_available():
            return None
        # FlashAttention-2 kernels need compute capability >= 8.0 (Ampere or
        # newer); older cards such as the P100 (6.0) cannot run them.
        major, _minor = torch.cuda.get_device_capability(0)
        if major < 8:
            logger.info("GPU compute capability < 8.0; not using FlashAttention-2.")
            return None
        try:
            import flash_attn  # noqa: F401
        except Exception:
            # A missing or broken flash-attn build must not abort model loading
            logger.info("flash-attn is not importable; not using FlashAttention-2.")
            return None
        return "flash_attention_2"

    @staticmethod
    def _load_jsonl(filepath: str) -> List[Dict[str, Any]]:
        """Read a JSON Lines file into a list of records, skipping blank lines.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            # Blank lines (e.g. a trailing newline at EOF) carry no record
            return [json.loads(line) for line in stripped_lines if line]

    def load_tokenizer_and_model(self):
        """Load the tokenizer and the fp16 base model, then prepare it for training.

        Sets ``self.tokenizer`` and ``self.model``.
        """
        logger.info("Loading tokenizer and model...")

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=True,
            padding_side="right"
        )

        # Batching needs a pad token; reuse EOS when the tokenizer defines none
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
            attn_implementation=self._pick_attn_implementation()
        )

        # NOTE(review): this helper targets k-bit (quantized) models; on a plain
        # fp16 model it may upcast the weights to fp32 and raise memory use.
        # Confirm against the PEFT documentation before relying on this path.
        self.model = prepare_model_for_kbit_training(self.model)

        logger.info("Model and tokenizer loaded successfully")

    def load_datasets(self):
        """Load the training and validation splits from ``self.data_dir``.

        Sets ``self.train_dataset`` and ``self.eval_dataset``.
        """
        logger.info("Loading datasets...")

        train_data = self._load_jsonl(os.path.join(self.data_dir, "train.jsonl"))
        val_data = self._load_jsonl(os.path.join(self.data_dir, "validation.jsonl"))

        self.train_dataset = Dataset.from_list(train_data)
        self.eval_dataset = Dataset.from_list(val_data)

        logger.info(f"Loaded {len(self.train_dataset)} training samples")
        logger.info(f"Loaded {len(self.eval_dataset)} validation samples")

    def preprocess_function(self, examples):
        """Render each conversation with the chat template and tokenize it.

        Args:
            examples: A batch whose ``"messages"`` entry is a list of
                conversations, each one accepted by ``apply_chat_template``.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and ``labels`` lists
            (one entry per conversation, unpadded, truncated to 1024 tokens).
        """
        model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}

        for messages in examples["messages"]:
            # Render the full conversation as text; no generation prompt is
            # appended because the assistant reply is already part of the sample
            text = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False
            )

            # Padding is deferred to the data collator
            tokenized = self.tokenizer(
                text,
                truncation=True,
                max_length=1024,
                padding=False,
                return_tensors=None
            )

            model_inputs["input_ids"].append(tokenized["input_ids"])
            model_inputs["attention_mask"].append(tokenized["attention_mask"])

            # Labels mirror input_ids, so every token of the rendered
            # conversation (not only the assistant reply) contributes to the loss
            model_inputs["labels"].append(tokenized["input_ids"].copy())

        return model_inputs

    def setup_lora(self, available_vram_gb: float = 12.0):
        """Wrap ``self.model`` with LoRA adapters.

        Args:
            available_vram_gb: Passed to ``get_recommended_config`` to pick
                the LoRA configuration.
        """
        logger.info("Setting up LoRA...")

        # get_recommended_config is defined in the LoRA section of the notebook
        lora_config = get_recommended_config(available_vram_gb)

        self.model = get_peft_model(self.model, lora_config)

        self.model.print_trainable_parameters()

        logger.info("LoRA setup completed")

    def train(self, available_vram_gb: float = 12.0):
        """Tokenize both splits, train, and save the model and tokenizer.

        Args:
            available_vram_gb: Passed to ``get_recommended_training_config``
                together with the number of training samples.

        Returns:
            The ``Trainer`` instance after training has finished.
        """
        logger.info("Starting training...")

        # Replace the raw columns with the tokenized fields
        train_dataset = self.train_dataset.map(
            self.preprocess_function,
            batched=True,
            remove_columns=self.train_dataset.column_names
        )

        eval_dataset = self.eval_dataset.map(
            self.preprocess_function,
            batched=True,
            remove_columns=self.eval_dataset.column_names
        )

        # get_recommended_training_config is defined in the Training section
        training_args = get_recommended_training_config(
            available_vram_gb,
            len(train_dataset)
        )
        # The preset's output_dir is overridden with this instance's directory
        training_args.output_dir = self.output_dir

        # Samples are unpadded, so the collator pads each batch dynamically.
        # NOTE(review): presumably chosen because it also pads `labels` —
        # confirm against the transformers documentation.
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            padding=True,
            pad_to_multiple_of=8,   # added: pad batch length up to a multiple of 8
            return_tensors="pt"
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
        )

        start_time = datetime.now()
        logger.info(f"Training started at {start_time}")

        trainer.train()

        end_time = datetime.now()
        training_duration = end_time - start_time
        logger.info(f"Training completed at {end_time}")
        logger.info(f"Training duration: {training_duration}")

        # Persist the model and tokenizer side by side in output_dir
        trainer.save_model()
        self.tokenizer.save_pretrained(self.output_dir)

        logger.info(f"Model saved to {self.output_dir}")

        return trainer

    def run_fine_tuning(self, available_vram_gb: float = 12.0):
        """Run the complete pipeline: load model, load data, set up LoRA, train.

        Args:
            available_vram_gb: Forwarded to ``setup_lora`` and ``train``.

        Returns:
            The ``Trainer`` returned by ``train``.

        Raises:
            Exception: Any error from a pipeline step is logged with its
                traceback and re-raised unchanged.
        """
        try:
            self.load_tokenizer_and_model()
            self.load_datasets()
            self.setup_lora(available_vram_gb)
            trainer = self.train(available_vram_gb)

            logger.info("Fine-tuning completed successfully!")
            return trainer

        except Exception as e:
            # logger.exception keeps the traceback in the log before re-raising
            logger.exception("Error during fine-tuning: %s", e)
            raise


## **QLoRA 4-bit hay fp16 fallback**

In [12]:
# Safe patch: ưu tiên QLoRA 4-bit, nếu fail thử 8-bit, cuối cùng fallback fp16
import torch, logging
from transformers import AutoTokenizer, AutoModelForCausalLM

def _safe_load_tokenizer_and_model(self):
    """Load tokenizer and model, trying 4-bit, then 8-bit, then fp16.

    Replacement for ``QwenBankingFineTuner.load_tokenizer_and_model``.
    Sets ``self.tokenizer``, ``self.model``, ``self.load_mode``
    (``"4bit"``, ``"8bit"`` or ``"fp16"``) and ``self.attn_impl_chosen``
    (``"flash_attention_2"``, ``"sdpa"`` or ``None`` without CUDA).

    Raises:
        Exception: Whatever the final fp16 load raises; failures of the
            quantized attempts are logged and trigger the next fallback.
    """
    logger = logging.getLogger(__name__)
    logger.info("Loading tokenizer/model with auto QLoRA(4b)->8b->fp16 fallback...")

    # Tokenizer
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.model_name, trust_remote_code=True, padding_side="right"
    )
    # Batching needs a pad token; reuse EOS when the tokenizer defines none
    if self.tokenizer.pad_token is None:
        self.tokenizer.pad_token = self.tokenizer.eos_token

    # Attention backend
    attn_impl = None
    if torch.cuda.is_available():
        # FlashAttention-2 kernels need compute capability >= 8.0 (Ampere or
        # newer). An installed flash-attn wheel is not enough: on older GPUs
        # such as the P100 (6.0) it imports fine but cannot run.
        use_flash = torch.cuda.get_device_capability(0)[0] >= 8
        if use_flash:
            try:
                import flash_attn  # noqa: F401
            except Exception:
                use_flash = False
        if use_flash:
            attn_impl = "flash_attention_2"
            logger.info("FlashAttention-2 detected.")
        else:
            attn_impl = "sdpa"
            logger.info("Using SDPA attention.")
    self.attn_impl_chosen = attn_impl

    # Quantized loading needs both the transformers config class and bitsandbytes
    use_bnb = False
    try:
        from transformers import BitsAndBytesConfig
        import bitsandbytes as bnb  # noqa: F401
        use_bnb = True
    except Exception as e:
        logger.warning(f"bitsandbytes/BitsAndBytesConfig not available -> fallback fp16. Detail: {e}")

    def _load(**extra):
        # Arguments shared by every attempt; `extra` carries the precision choice
        return AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map="auto",
            trust_remote_code=True,
            attn_implementation=attn_impl,
            **extra
        )

    # 4-bit -> 8-bit -> fp16
    model = None
    if use_bnb:
        try:
            # Preferred: NF4 with double quantization, fp16 compute
            bnb_cfg = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.float16
            )
            model = _load(quantization_config=bnb_cfg)
            self.load_mode = "4bit"
            logger.info("Loaded model with QLoRA (4-bit).")
        except Exception as e4:
            logger.warning(f"QLoRA 4-bit failed -> try 8-bit. Detail: {e4}")
            try:
                bnb_cfg8 = BitsAndBytesConfig(load_in_8bit=True)
                model = _load(quantization_config=bnb_cfg8)
                self.load_mode = "8bit"
                logger.info("Loaded model with 8-bit quantization.")
            except Exception as e8:
                logger.warning(f"8-bit failed -> fallback fp16. Detail: {e8}")

    if model is None:
        # Reached when bitsandbytes is unavailable or both quantized loads failed
        model = _load(torch_dtype=torch.float16)
        self.load_mode = "fp16"
        logger.info("Loaded model in fp16.")
    self.model = model

    # The KV cache is switched off for compatibility with gradient checkpointing
    if hasattr(self.model, "config") and getattr(self.model.config, "use_cache", False):
        self.model.config.use_cache = False

    # NOTE(review): this helper targets k-bit models; in the fp16 fallback it
    # may upcast the weights to fp32 and raise memory use — confirm against
    # the PEFT documentation.
    from peft import prepare_model_for_kbit_training
    self.model = prepare_model_for_kbit_training(self.model)
    logger.info(f"Model ready. load_mode={self.load_mode}, attn_impl={self.attn_impl_chosen}")

# Monkey-patch: rebind the class attribute so QwenBankingFineTuner instances
# use the quantization-fallback loader defined above instead of the original one.
QwenBankingFineTuner.load_tokenizer_and_model = _safe_load_tokenizer_and_model


In [13]:
import gc
import os

import torch

# Drop references left over from a previous run so their tensors become collectable
for obj in ("trainer", "fine_tuner"):
    globals().pop(obj, None)
gc.collect()
if torch.cuda.is_available():
    # Reduce memory fragmentation. Set before touching the CUDA allocator:
    # NOTE(review): the setting is presumably read when the allocator
    # initializes, so it may have no effect if CUDA memory was already used in
    # this session — restart the kernel if in doubt.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    torch.cuda.empty_cache()
print("GPU cache cleared.")


GPU cache cleared.


## **Cấu hình Kaggle & chạy fine-tuning (thay cho main())**

In [18]:
# Kaggle run configuration
MODEL_NAME = "Qwen/Qwen3-4B"
DATA_DIR   = "/kaggle/working/data/processed/train_split"  # created by the preprocessing step
OUTPUT_DIR = "/kaggle/working/qwen-banking-lora"

# The presets are sized from the GPU's *total* memory; 8 GB is assumed without CUDA
if not torch.cuda.is_available():
    available_vram = 8.0
    logger.warning("CUDA not available, using CPU")
else:
    available_vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    logger.info(f"Available VRAM: {available_vram:.1f} GB")

# Build the fine-tuner and run the whole pipeline (replaces the script's main())
fine_tuner = QwenBankingFineTuner(model_name=MODEL_NAME, data_dir=DATA_DIR, output_dir=OUTPUT_DIR)
trainer = fine_tuner.run_fine_tuning(available_vram)

# (Optional) summarise the last logged losses
if trainer:
    hist = trainer.state.log_history

    def _latest_logged(key):
        """Return the most recent logged value for `key`, or 'N/A' if it was never logged."""
        for record in reversed(hist):
            if key in record:
                return record.get(key)
        return 'N/A'

    final_train_loss = _latest_logged('train_loss')
    final_eval_loss = _latest_logged('eval_loss')
    print("\n=== Training Statistics ===")
    print("Final training loss:", final_train_loss)
    print("Final eval loss:", final_eval_loss)


ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [6]:
import torch, platform

# Report the PyTorch build and, when a GPU is visible, its name and compute capability
print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    for label, query in (
        ("device:", torch.cuda.get_device_name),
        ("capability:", torch.cuda.get_device_capability),  # (major, minor)
    ):
        print(label, query(0))

# Smoke test: import bitsandbytes and construct one 4-bit linear layer.
# Any failure along the way is reported and recorded as "4-bit not OK".
try:
    import bitsandbytes as bnb
    from bitsandbytes.nn import Linear4bit
    print("bitsandbytes:", bnb.__version__)
    probe_layer = Linear4bit(16, 16, quant_type='nf4', compute_dtype=torch.float16)
    print("Linear4bit init: OK -> 4-bit likely supported")
    del probe_layer
except Exception as err:
    print("4-bit check failed:", repr(err))
    fourbit_ok = False
else:
    fourbit_ok = True


torch: 2.6.0+cu124
cuda available: True
device: Tesla P100-PCIE-16GB
capability: (6, 0)
bitsandbytes: 0.47.0
Linear4bit init: OK -> 4-bit likely supported


In [None]:
def print_quantization_status(ft):
    """Print how the fine-tuner's model was loaded and whether it is quantized.

    Args:
        ft: Object exposing ``model`` and, optionally, ``load_mode`` and
            ``attn_impl_chosen`` (as set by the patched model loader).

    The quantization config is looked up on the model first and then on
    ``model.config``, which is where Transformers attaches it for bitsandbytes
    models. The layer scan is best effort: any failure is printed, not raised.
    """
    print("load_mode:", getattr(ft, "load_mode", "<unknown>"))
    print("attn_impl:", getattr(ft, "attn_impl_chosen", "<unknown>"))

    model = getattr(ft, "model", None)
    qc = getattr(model, "quantization_config", None)
    if qc is None:
        # Fall back to the config object, where the setting normally lives
        qc = getattr(getattr(model, "config", None), "quantization_config", None)
    print("has quantization_config:", qc is not None)
    if qc:
        for k in ["load_in_4bit", "bnb_4bit_quant_type", "bnb_4bit_use_double_quant", "bnb_4bit_compute_dtype"]:
            # The config may be a plain dict or an object with attributes
            value = qc.get(k) if isinstance(qc, dict) else getattr(qc, k, None)
            print(f"  {k}:", value)
    try:
        from bitsandbytes.nn import Linear4bit
        has_4bit = any(isinstance(m, Linear4bit) for m in model.modules())
        print("contains Linear4bit layers:", has_4bit)
    except Exception as e:
        # bitsandbytes missing, or the model is not loaded yet
        print("bitsandbytes check skipped:", e)

# Inspect the fine-tuner created by the run cell; `fine_tuner` must already exist.
print_quantization_status(fine_tuner)