# RAG Baseline Fine-tuning with Bench-RAG Evaluation

This notebook fine-tunes a Korean LLM (kanana-nano-2.1b) for RAG tasks with comprehensive evaluation metrics.

**Features:**
- LoRA fine-tuning for efficient training
- Weights & Biases (wandb) tracking
- ROUGE and BERTScore evaluation
- Bench-RAG evaluation system
- Modular evaluation functions

## 1. Setup & Installation

In [None]:
# Install required packages
!pip install -q transformers peft datasets accelerate
!pip install -q wandb rouge-score bert-score
!pip install -q sentencepiece  # For tokenizer

## 2. Environment Check

In [None]:
import os
import sys
import torch
import json
from pathlib import Path

# Expose only GPU 0 to this process
os.environ.update(CUDA_VISIBLE_DEVICES="0")

# Make the parent of the working directory importable (project root)
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# Report framework / CUDA versions
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"cuDNN version: {torch.backends.cudnn.version()}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Report the device name and its total memory (in GiB) when a GPU is present
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    device_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {device_name}")
    print(f"GPU Memory: {device_total_bytes / 1024**3:.2f} GB")

## 3. Initialize Weights & Biases

In [None]:
import wandb

# Authenticate with Weights & Biases before opening a run
wandb.login()

# Hyperparameters and run metadata shared by the rest of the notebook
config = dict(
    model_name="kakaocorp/kanana-nano-2.1b-base",
    dataset="jecheon_rag_training",
    task="rag_finetuning",
    lora_r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    learning_rate=2e-4,
    batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=100,
    evaluation_strategy="steps",
    eval_steps=20,
)

# Open the tracking run and attach the configuration to it
wandb.init(
    config=config,
    project="goodganglabs-rag",
    name="rag-baseline-kanana-nano",
    tags=["rag", "kanana-nano", "lora", "bench-rag"],
)

## 4. Load Model & Tokenizer

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = config["model_name"]

# Load the base causal LM in bfloat16; device placement is delegated to
# device_map="auto"
print(f"Loading model: {model_name}")
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto"
)

# Load the matching tokenizer (left padding)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="left",
    trust_remote_code=True
)

# Fall back to EOS for padding only when the tokenizer ships without a pad
# token; a dedicated pad token, if present, must not be overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("‚úì Model loaded successfully")
print(f"‚úì Tokenizer vocabulary size: {len(tokenizer)}")

## 5. Load Training Data

In [None]:
from datasets import Dataset
import json

# Load training data
data_path = project_root / "data" / "processed" / "training_data.jsonl"
print(f"Loading training data from: {data_path}")

# Read the JSONL file: one JSON object per line. Blank lines (e.g. an extra
# newline at the end of the file) are skipped instead of crashing json.loads.
with open(data_path, 'r', encoding='utf-8') as f:
    data_samples = [json.loads(line) for line in f if line.strip()]

print(f"‚úì Loaded {len(data_samples)} training samples")

# Fail with a clear message rather than an IndexError when the file is empty
if not data_samples:
    raise ValueError(f"No training samples found in {data_path}")

# Show example
print("\nExample training sample:")
print(json.dumps(data_samples[0], ensure_ascii=False, indent=2))

## 6. Data Preprocessing

In [None]:
def format_rag_prompt(sample, include_answer=True, eos_token=None):
    """
    Format a RAG sample into the instruction-following prompt.

    The context is built from the documents flagged ``is_correct``; when no
    document is flagged, the content of the first three documents is used.

    Args:
        sample: Dict with a ``question`` key, an ``answer`` key (required only
            when ``include_answer`` is true) and an optional ``documents``
            list of dicts (``title``, ``content``, ``is_correct``).
        include_answer: When True (default) the reference answer and the
            end-of-sequence token are appended, giving a training example.
            When False the prompt stops right after the answer header, which
            is the form needed to generate an answer.
        eos_token: End-of-sequence string appended to training examples.
            Defaults to the notebook-level ``tokenizer.eos_token``.

    Returns:
        The formatted prompt string.

    Raises:
        KeyError: If ``question`` (or ``answer`` when it is required) is
            missing from ``sample``.
    """
    # Get relevant documents
    documents = sample.get('documents', [])

    # Find correct document(s)
    correct_docs = [doc for doc in documents if doc.get('is_correct', False)]

    # Build context from correct documents
    if correct_docs:
        context_parts = []
        for doc in correct_docs:
            title = doc.get('title', '')
            content = doc.get('content', '')
            if title and content:
                context_parts.append(f"[{title}]\n{content}")
            elif content:
                context_parts.append(content)
        context = "\n\n".join(context_parts)
    else:
        # Use the first three documents if no correct document is marked
        context = "\n\n".join([doc.get('content', '') for doc in documents[:3]])

    question = sample['question']

    # Shared prefix of training and inference prompts; it ends right after
    # the answer header so the answer (or the generation) follows directly.
    prompt = f"""Îã§ÏùåÏùÄ ÏßàÎ¨∏Ïóê ÎãµÌïòÍ∏∞ ÏúÑÌïú Î¨∏ÏÑúÏûÖÎãàÎã§:

{context}

### ÏßàÎ¨∏:
{question}

### ÎãµÎ≥Ä:
"""

    if not include_answer:
        return prompt

    answer = sample['answer']
    if eos_token is None:
        eos_token = tokenizer.eos_token

    return f"{prompt}{answer}{eos_token}"


def preprocess_dataset(samples):
    """
    Turn raw samples into a ``Dataset`` ready for tokenization.

    Every sample is rendered with ``format_rag_prompt`` into the ``text``
    column; the question, answer and question type (``'unknown'`` when
    absent) are kept alongside as metadata columns.
    """
    # Rendered prompts, one per sample
    prompts = list(map(format_rag_prompt, samples))

    # Metadata columns, aligned with the prompts
    questions = [item['question'] for item in samples]
    answers = [item['answer'] for item in samples]
    question_types = [item.get('question_type', 'unknown') for item in samples]

    return Dataset.from_dict({
        'text': prompts,
        'question': questions,
        'answer': answers,
        'question_type': question_types,
    })


# Build the training dataset from the raw samples
train_dataset = preprocess_dataset(data_samples)
print(f"‚úì Created dataset with {len(train_dataset)} samples")
print(f"\nDataset features: {train_dataset.features}")

# Preview the first 500 characters of the first rendered prompt
separator = "=" * 60
first_prompt = train_dataset[0]['text']
print("\n" + separator)
print("Example formatted prompt:")
print(separator)
print(first_prompt[:500] + "...")

## 7. Tokenize Dataset

In [None]:
def tokenize_function(examples, max_length=1024, tok=None):
    """
    Tokenize a batch of text samples and attach causal-LM labels.

    Args:
        examples: Batch mapping with a ``text`` entry (list of strings).
        max_length: Length every sequence is padded / truncated to.
        tok: Tokenizer to call; defaults to the notebook-level ``tokenizer``.

    Returns:
        The tokenizer output with an added ``labels`` entry: a copy of
        ``input_ids`` in which padding positions are replaced by -100.
    """
    if tok is None:
        tok = tokenizer

    tokens = tok(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    # Padding must not contribute to the loss. Mask by attention mask rather
    # than by token id: the pad token can be identical to EOS, and the real
    # end-of-sequence token has to stay a training target. -100 is the label
    # value ignored by the cross-entropy loss.
    labels = tokens["input_ids"].clone()
    labels[tokens["attention_mask"] == 0] = -100
    tokens["labels"] = labels
    return tokens


# Tokenize the dataset in batches and drop the raw text / metadata columns
print("Tokenizing dataset...")
raw_columns = ["text", "question", "answer", "question_type"]
tokenized_dataset = train_dataset.map(
    tokenize_function,
    remove_columns=raw_columns,
    batched=True,
)

print("‚úì Tokenization complete")
print(f"  Tokenized features: {tokenized_dataset.features}")

## 8. Configure LoRA

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# Adapter settings: rank / alpha / dropout come from the shared config and
# the adapters target the q/k/v projection modules
lora_config = LoraConfig(
    r=config["lora_r"],
    lora_alpha=config["lora_alpha"],
    lora_dropout=config["lora_dropout"],
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj"],
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model with the LoRA adapters and report the parameter counts
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

print("‚úì LoRA configuration applied")

## 9. Setup Evaluation Metrics

In [None]:
# Import evaluation modules
from src.evaluation.metrics import (
    GenerationMetrics,
    BenchRAGEvaluator,
    create_evaluator
)

# Instantiate the generation metrics and the Bench-RAG evaluator (K = 1, 3, 5)
generation_metrics = GenerationMetrics()
bench_rag_evaluator = create_evaluator(k_values=[1, 3, 5])

print("‚úì Evaluation metrics initialized")
for metric_name in (
    "ROUGE scores",
    "BERTScore",
    "Bench-RAG metrics (Recall@K, NDCG@K, MRR)",
):
    print(f"  - {metric_name}")

## 10. Custom Evaluation Callback

In [None]:
from transformers import TrainerCallback
from typing import Dict, Any
import numpy as np

class RAGEvaluationCallback(TrainerCallback):
    """
    Trainer callback that scores generated answers after each evaluation.

    After every evaluation phase it generates an answer for the leading
    evaluation samples, computes ROUGE and BERTScore against the reference
    answers, logs the averages to wandb and prints a short summary.

    Args:
        eval_samples: Raw samples (dicts with ``question``, ``answer`` and an
            optional ``documents`` list).
        generation_metrics: Object providing ``rouge_scores(pred, ref)`` and
            ``bert_score(predictions, references)``.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        max_eval_samples: Number of leading samples scored per evaluation.
        max_new_tokens: Maximum number of tokens generated per answer.
    """

    def __init__(self, eval_samples, generation_metrics, tokenizer,
                 max_eval_samples=10, max_new_tokens=100):
        self.eval_samples = eval_samples
        self.generation_metrics = generation_metrics
        self.tokenizer = tokenizer
        self.max_eval_samples = max_eval_samples
        self.max_new_tokens = max_new_tokens

    @staticmethod
    def _build_context(sample):
        """Build the document context the same way the training prompt does."""
        documents = sample.get('documents', [])
        correct_docs = [doc for doc in documents if doc.get('is_correct', False)]

        if not correct_docs:
            # No document is marked correct: fall back to the first three
            return "\n\n".join([doc.get('content', '') for doc in documents[:3]])

        context_parts = []
        for doc in correct_docs:
            title = doc.get('title', '')
            content = doc.get('content', '')
            if title and content:
                context_parts.append(f"[{title}]\n{content}")
            elif content:
                context_parts.append(content)
        return "\n\n".join(context_parts)

    def _build_prompt(self, sample):
        """Return the training prompt template cut right after the answer header."""
        context = self._build_context(sample)
        return f"""Îã§ÏùåÏùÄ ÏßàÎ¨∏Ïóê ÎãµÌïòÍ∏∞ ÏúÑÌïú Î¨∏ÏÑúÏûÖÎãàÎã§:

{context}

### ÏßàÎ¨∏:
{sample['question']}

### ÎãµÎ≥Ä:
"""

    def on_evaluate(self, args, state, control, model, metrics=None, **kwargs):
        """
        Generate answers, score them and log the results.

        Does nothing when the Trainer passes no ``metrics`` or when there are
        no evaluation samples to score.
        """
        if metrics is None:
            return

        samples = self.eval_samples[:self.max_eval_samples]
        if not samples:
            # Nothing to score; averaging empty lists would only yield NaN
            return

        predictions = []
        references = []

        # Generate in eval mode, then put the model back the way it was
        was_training = model.training
        model.eval()
        with torch.no_grad():
            for sample in samples:
                prompt = self._build_prompt(sample)

                inputs = self.tokenizer(prompt, return_tensors="pt").to(model.device)
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=self.max_new_tokens,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )

                # Decode only the newly generated tokens (skip the prompt)
                prompt_length = inputs['input_ids'].shape[1]
                generated_text = self.tokenizer.decode(
                    outputs[0][prompt_length:],
                    skip_special_tokens=True
                )

                predictions.append(generated_text.strip())
                references.append(sample['answer'])
        if was_training:
            model.train()

        # Per-sample ROUGE scores and their averages
        rouge_scores = [
            self.generation_metrics.rouge_scores(pred, ref)
            for pred, ref in zip(predictions, references)
        ]
        avg_rouge = {
            key: np.mean([scores[key] for scores in rouge_scores])
            for key in ('rouge1', 'rouge2', 'rougeL')
        }

        # Corpus-level BERTScore
        bert_scores = self.generation_metrics.bert_score(predictions, references)

        # The training step is logged as a value instead of being passed as
        # `step=`: the Trainer's own wandb logging has already advanced the
        # run's internal step by the time this callback runs, and wandb
        # discards rows whose explicit step is lower than the current one.
        log_payload = {
            "eval/rouge1": avg_rouge['rouge1'],
            "eval/rouge2": avg_rouge['rouge2'],
            "eval/rougeL": avg_rouge['rougeL'],
            "eval/bert_f1": bert_scores['bert_f1'],
            "eval/bert_precision": bert_scores['bert_precision'],
            "eval/bert_recall": bert_scores['bert_recall'],
            "train/global_step": state.global_step,
        }

        # Attach up to three example predictions every 40 steps
        if state.global_step % 40 == 0:
            log_payload["eval/examples"] = wandb.Table(
                columns=["Question", "Reference", "Prediction", "ROUGE-L"],
                data=[
                    [sample['question'], ref, pred, scores['rougeL']]
                    for sample, ref, pred, scores in zip(
                        samples[:3],
                        references[:3],
                        predictions[:3],
                        rouge_scores[:3]
                    )
                ]
            )

        wandb.log(log_payload)

        print(f"\nüìä Evaluation Metrics (Step {state.global_step}):")
        print(f"  ROUGE-1: {avg_rouge['rouge1']:.4f}")
        print(f"  ROUGE-2: {avg_rouge['rouge2']:.4f}")
        print(f"  ROUGE-L: {avg_rouge['rougeL']:.4f}")
        print(f"  BERTScore F1: {bert_scores['bert_f1']:.4f}")


# Instantiate the callback on the first 20 raw samples
eval_callback = RAGEvaluationCallback(data_samples[:20], generation_metrics, tokenizer)

print("‚úì Custom evaluation callback created")

## 11. Training Configuration

In [None]:
from transformers import Trainer, TrainingArguments

import inspect

# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`. The install cell does not pin a version, so use whichever
# keyword the installed TrainingArguments accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Training arguments with wandb integration
training_args = TrainingArguments(
    output_dir="./outputs/rag_baseline",

    # Training hyperparameters
    per_device_train_batch_size=config["batch_size"],
    gradient_accumulation_steps=config["gradient_accumulation_steps"],
    learning_rate=config["learning_rate"],
    warmup_steps=config["warmup_steps"],
    max_steps=config["max_steps"],

    # Optimization
    bf16=True,
    weight_decay=0.01,
    lr_scheduler_type="linear",

    # Logging
    logging_dir="./logs",
    logging_steps=1,
    logging_first_step=True,

    # Evaluation; checkpoints are saved on the same schedule as evaluation
    **{_eval_strategy_key: config["evaluation_strategy"]},
    eval_steps=config["eval_steps"],
    save_strategy="steps",
    save_steps=config["eval_steps"],
    save_total_limit=3,

    # Wandb integration
    report_to="wandb",
    run_name="rag-baseline-kanana-nano",

    # Misc
    seed=1234,
    load_best_model_at_end=True,
)

print("‚úì Training arguments configured")
print(f"  Total steps: {config['max_steps']}")
print(f"  Batch size: {config['batch_size']} x {config['gradient_accumulation_steps']} (accumulation)")
print(f"  Effective batch size: {config['batch_size'] * config['gradient_accumulation_steps']}")
print(f"  Learning rate: {config['learning_rate']}")
print(f"  Evaluation every: {config['eval_steps']} steps")

## 12. Initialize Trainer

In [None]:
# Small evaluation set: the first (up to) 20 tokenized examples
eval_size = min(20, len(tokenized_dataset))
small_eval_dataset = tokenized_dataset.select(range(eval_size))

# Trainer wired with the custom RAG evaluation callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=small_eval_dataset,
    callbacks=[eval_callback],
)

print("‚úì Trainer initialized with RAG evaluation callback")

## 13. GPU Memory Check

In [None]:
# Report GPU memory before training starts and record it in wandb
if torch.cuda.is_available():
    bytes_per_gb = 1024 ** 3
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
    max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)

    print(f"GPU: {gpu_stats.name}")
    print(f"Max memory: {max_memory} GB")
    print(f"Reserved memory: {start_gpu_memory} GB")
    print(f"Available memory: {max_memory - start_gpu_memory:.3f} GB")

    # Log to wandb
    system_info = {
        "system/gpu_name": gpu_stats.name,
        "system/gpu_memory_total_gb": max_memory,
        "system/gpu_memory_reserved_gb": start_gpu_memory,
    }
    wandb.log(system_info)

## 14. Start Training

In [None]:
# Run the training loop and report runtime / loss from the returned metrics
banner = "=" * 60

print("\n" + banner)
print("üöÄ Starting training...")
print(banner + "\n")

trainer_stats = trainer.train()

print("\n" + banner)
print("‚úÖ Training completed!")
print(banner)

train_metrics = trainer_stats.metrics
print(f"Total training time: {train_metrics['train_runtime']:.2f}s")
print(f"Training loss: {train_metrics['train_loss']:.4f}")

## 15. Final Evaluation with Bench-RAG

In [None]:
print("\n" + "="*60)
print("üìä Running final Bench-RAG evaluation...")
print("="*60 + "\n")

# Generate predictions for test set
# NOTE: these samples are a slice of `data_samples`, the list the training
# set was built from, so the scores below are not held-out results.
test_samples = data_samples[:30]  # Use first 30 for comprehensive evaluation
predictions = []

model.eval()
with torch.no_grad():
    for i, sample in enumerate(test_samples):
        documents = sample.get('documents', [])
        correct_docs = [doc for doc in documents if doc.get('is_correct', False)]

        # Build the context exactly like the training prompt does: every
        # correct document, titled when a title exists; otherwise fall back
        # to the first three documents.
        if correct_docs:
            context_parts = []
            for doc in correct_docs:
                title = doc.get('title', '')
                content = doc.get('content', '')
                if title and content:
                    context_parts.append(f"[{title}]\n{content}")
                elif content:
                    context_parts.append(content)
            context = "\n\n".join(context_parts)
        else:
            context = "\n\n".join([doc.get('content', '') for doc in documents[:3]])

        prompt = f"""Îã§ÏùåÏùÄ ÏßàÎ¨∏Ïóê ÎãµÌïòÍ∏∞ ÏúÑÌïú Î¨∏ÏÑúÏûÖÎãàÎã§:

{context}

### ÏßàÎ¨∏:
{sample['question']}

### ÎãµÎ≥Ä:
"""

        # Generate
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

        # Decode only the newly generated tokens (skip the prompt)
        generated = tokenizer.decode(
            outputs[0][inputs['input_ids'].shape[1]:],
            skip_special_tokens=True
        ).strip()

        # For Bench-RAG, the first three documents of the sample (in stored
        # order) stand in for retrieval results; documents without a doc_id
        # are skipped instead of raising KeyError.
        # (In real scenario, you'd use actual retrieval results)
        retrieved_doc_ids = [doc['doc_id'] for doc in documents[:3] if 'doc_id' in doc]

        predictions.append({
            'answer': generated,
            'retrieved_doc_ids': retrieved_doc_ids
        })

        if (i + 1) % 10 == 0:
            print(f"  Processed {i + 1}/{len(test_samples)} samples...")

# Run Bench-RAG evaluation
bench_rag_results = bench_rag_evaluator.evaluate_dataset(
    dataset=test_samples,
    model_predictions=predictions
)

# Print results
print("\n" + bench_rag_evaluator.format_results(bench_rag_results))

# Log to wandb
wandb.log({f"bench_rag/{k}": v for k, v in bench_rag_results.items()})

# Log example predictions (first 10 samples)
examples_table = wandb.Table(
    columns=["Question", "Reference", "Prediction", "Question Type"],
    data=[
        [
            sample['question'],
            sample['answer'],
            prediction['answer'],
            sample.get('question_type', 'unknown')
        ]
        for sample, prediction in zip(test_samples[:10], predictions[:10])
    ]
)
wandb.log({"final_evaluation/examples": examples_table})

## 16. Save Model

In [None]:
# Write the fine-tuned model and then the tokenizer to the same directory
output_dir = "./models/rag_baseline_finetuned"
for saveable in (model, tokenizer):
    saveable.save_pretrained(output_dir)

print(f"‚úì Model saved to: {output_dir}")

# Upload the saved directory to wandb as a model artifact
artifact = wandb.Artifact(
    type="model",
    name="rag-baseline-model",
    description="Fine-tuned RAG baseline model (kanana-nano-2.1b)",
)
artifact.add_dir(output_dir)
wandb.log_artifact(artifact)

print("‚úì Model uploaded to wandb")

## 17. Finish Wandb Run

In [None]:
# Close the wandb run
wandb.finish()

# Closing banner
closing_rule = "=" * 60
print("\n" + closing_rule)
print("‚úÖ All done! Check your wandb dashboard for detailed metrics.")
print(closing_rule)

---

## Summary

This notebook:
1. ✅ Fine-tuned kanana-nano-2.1b for RAG tasks using LoRA
2. ✅ Tracked training with Weights & Biases
3. ✅ Evaluated with ROUGE and BERTScore metrics
4. ✅ Implemented Bench-RAG evaluation system
5. ✅ Used modular, reusable evaluation functions

**Next Steps:**
- Compare with baseline (untrained) model
- Analyze performance by question type
- Upload model to Hugging Face Hub
- Create detailed evaluation report