# Baseline Response Generation

This notebook generates baseline responses using Llama-3.1-8B-Instruct for DPO training.

**Environment**: Google Colab with T4 GPU (15GB VRAM)

## Workflow
1. Install dependencies, then restart the runtime before continuing
2. Load cleaned dataset (from GitHub or upload)
3. Load Llama-3.1-8B-Instruct with 4-bit quantization
4. Generate baseline responses (simple prompt, no professional guidance)
5. Save results with checkpoint support
6. Download results

## 1. Setup and Installation

In [None]:
# Install required packages
# Fix torch/torchvision version conflict and accelerate compatibility:
# any existing torchvision/torchaudio are removed first, then torch, torchvision
# and torchaudio are installed together from the CUDA 12.1 wheel index so all
# three come from the same index.
!pip uninstall -y torchvision torchaudio -q
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# transformers, accelerate and bitsandbytes are pinned to exact versions;
# datasets is left unpinned.
!pip install -q transformers==4.44.0 accelerate==0.33.0 bitsandbytes==0.43.0 datasets

# IMPORTANT: After running this cell, RESTART the runtime!
# Go to Runtime -> Restart runtime, then skip this cell and continue from the next one
print("Installation complete! Please restart runtime now.")

In [None]:
# Report whether CUDA is usable and, if so, which device 0 is and how much
# total memory it has.
import torch

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    device_name = torch.cuda.get_device_name(0)
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {device_name}")
    print(f"Memory: {total_bytes / 1024**3:.1f} GB")

In [None]:
import json
import os
from pathlib import Path
from datetime import datetime
from tqdm import tqdm
import gc

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
import torch

## 2. Configuration

In [None]:
# Configuration: every tunable value for the run lives in this one dict.
CONFIG = {
    # Model settings
    "model_name": "meta-llama/Llama-3.1-8B-Instruct",

    # Data paths
    "input_file": "data/processed/counsel_chat_cleaned.jsonl",
    "output_file": "data/baseline/responses.jsonl",
    "checkpoint_file": "data/baseline/checkpoint.json",

    # Generation settings (OPTIMIZED for speed)
    "batch_size": 2,              # Smaller batch for stability
    "max_new_tokens": 256,        # Reduced from 512 - still sufficient for responses
    "temperature": 0.7,
    "top_p": 0.9,
    "do_sample": True,

    # Checkpoint frequency (save every N records)
    "checkpoint_freq": 50
}

# Create the parent directory of every configured path. Deriving the
# directories from CONFIG (instead of repeating them as literals) keeps them in
# sync when a path above is edited. `or "."` covers a bare file name, whose
# dirname is the empty string.
for _path_key in ("input_file", "output_file", "checkpoint_file"):
    os.makedirs(os.path.dirname(CONFIG[_path_key]) or ".", exist_ok=True)

print("Config loaded. Estimated time: ~3-4 hours for full dataset")

## 3. Load Dataset

**Option A**: Clone from GitHub  
**Option B**: Upload file directly

In [None]:
# Option A: Clone from GitHub
# Clone the project repository into the scratch directory temp_repo, copy only
# the cleaned dataset into data/processed/, then delete the clone.
!git clone https://github.com/yuchangyuan1/6895_project_Agent.git temp_repo
!cp temp_repo/data/processed/counsel_chat_cleaned.jsonl data/processed/
!rm -rf temp_repo

# Option B: Upload file directly
# (alternative to Option A: uncomment the three lines below)
# from google.colab import files
# uploaded = files.upload()  # Upload counsel_chat_cleaned.jsonl
# !mv counsel_chat_cleaned.jsonl data/processed/

In [None]:
def load_dataset(filepath: str) -> list:
    """Load dataset from a JSONL file.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example an extra newline at the end of the
    file) are skipped instead of being handed to the JSON parser, where they
    would raise.

    Args:
        filepath: Path to a UTF-8 encoded JSONL file.

    Returns:
        The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))
    print(f"Loaded {len(records)} records from {filepath}")
    return records

# Read the cleaned dataset from the configured input path and preview its
# first record.
dataset = load_dataset(filepath=CONFIG["input_file"])
print(f"Sample record: {dataset[0]}")

## 4. Load Model with 4-bit Quantization

In [None]:
# HuggingFace login (required for Llama models)
from huggingface_hub import login

# Read the token from the HF_TOKEN environment variable rather than pasting it
# into this cell, so the secret is never saved or shared with the notebook.
# Get token from: https://huggingface.co/settings/tokens
# Set os.environ["HF_TOKEN"] before running this cell.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()

if HF_TOKEN:
    login(token=HF_TOKEN)
    print("Logged in to HuggingFace")
else:
    print("WARNING: No HuggingFace token provided. You may need to login manually.")
    # login()  # Interactive login

In [None]:
def load_model_and_tokenizer(model_name: str):
    """Load a causal LM in 4-bit NF4 precision together with its tokenizer.

    The tokenizer is configured to pad with the EOS token on the left side.
    The model is switched to eval mode before being returned.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # NF4 4-bit weights, float16 compute, double quantization enabled
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    print(f"Loading tokenizer: {model_name}")
    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tok.pad_token = tok.eos_token
    tok.padding_side = "left"

    print(f"Loading model with 4-bit quantization: {model_name}")
    load_kwargs = {
        "quantization_config": quant_cfg,
        "low_cpu_mem_usage": True,
        "trust_remote_code": True,
    }
    llm = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
    llm.eval()

    # Report how much GPU memory is allocated after loading
    if torch.cuda.is_available():
        allocated_gb = torch.cuda.memory_allocated() / 1024**3
        print(f"GPU memory used: {allocated_gb:.2f} GB")

    return llm, tok

# Instantiate the quantized model and its tokenizer for the configured model name
model, tokenizer = load_model_and_tokenizer(model_name=CONFIG["model_name"])

## 5. Baseline Prompt Template

**Important**: This is a simple prompt WITHOUT professional mental health guidance.  
This simulates a basic model response that will serve as the "rejected" response in DPO.

In [None]:
def create_baseline_prompt(question: str) -> str:
    """
    Build the plain baseline prompt for one question.

    The prompt is a generic one-line instruction followed by a blank line and
    the question. It deliberately contains no:
    - Empathy instructions
    - Professional counseling guidelines
    - Safety considerations
    """
    instruction = "You are a helpful assistant. Please respond to the following question:"
    return f"{instruction}\n\n{question}"


def format_for_llama(prompt: str) -> str:
    """Render ``prompt`` as a single user turn through the chat template.

    Uses the module-level ``tokenizer``. The result is returned as text
    (not token ids) and ends with the generation prompt.
    """
    user_turn = {"role": "user", "content": prompt}
    rendered = tokenizer.apply_chat_template(
        [user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    return rendered

# Preview: build the baseline prompt for the first record, run it through the
# chat template, and show the first 500 characters.
sample_prompt = create_baseline_prompt(dataset[0]["question"])
formatted = format_for_llama(sample_prompt)
print("Sample formatted prompt:", formatted[:500], sep="\n")

## 6. Generation Functions

In [None]:
def generate_response(prompts: list, model, tokenizer, config: dict) -> list:
    """Generate responses for a batch of prompts.

    Args:
        prompts: Prompt strings that have already been rendered by the chat
            template (as ``format_for_llama`` does), i.e. text that already
            contains its own special tokens.
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable tokenizer exposing ``eos_token_id`` and ``decode``.
        config: Must contain "max_new_tokens" and "do_sample"; "temperature"
            and "top_p" are read only when "do_sample" is truthy.

    Returns:
        One whitespace-stripped response string per prompt, in input order.
        An empty ``prompts`` list returns ``[]`` without calling the model.
    """
    if not prompts:
        return []

    # Tokenize. The prompts are chat-template output, which already carries the
    # template's special tokens; add_special_tokens=False stops the tokenizer
    # from prepending a second beginning-of-text token.
    # NOTE(review): truncation to 512 tokens presumably cuts from the end of
    # the prompt, which would drop the trailing generation prompt for very long
    # questions - confirm the tokenizer's truncation side.
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
        add_special_tokens=False
    ).to(model.device)

    generation_kwargs = {
        "max_new_tokens": config["max_new_tokens"],
        "do_sample": config["do_sample"],
        "pad_token_id": tokenizer.eos_token_id,
    }
    # Sampling knobs only apply when sampling; forwarding them for greedy
    # decoding is at best ignored with a warning.
    if config["do_sample"]:
        generation_kwargs["temperature"] = config["temperature"]
        generation_kwargs["top_p"] = config["top_p"]

    # Generate
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Every row of the padded batch has the same prompt length, so everything
    # after that column is newly generated text.
    prompt_len = inputs["input_ids"].shape[1]
    return [
        tokenizer.decode(output[prompt_len:], skip_special_tokens=True).strip()
        for output in outputs
    ]


def load_checkpoint(checkpoint_file: str) -> int:
    """Return the saved resume position.

    Reads "last_index" from the JSON checkpoint file. Returns 0 when the file
    does not exist or the key is absent.
    """
    if not os.path.exists(checkpoint_file):
        return 0
    with open(checkpoint_file, "r") as f:
        state = json.load(f)
    return state.get("last_index", 0)


def save_checkpoint(checkpoint_file: str, last_index: int):
    """Save checkpoint atomically.

    The JSON payload ({"last_index", "timestamp"}) is written to a sibling
    ``<checkpoint_file>.tmp`` file and then moved over the real checkpoint
    with ``os.replace``. If the process is interrupted mid-write, the previous
    checkpoint is left intact rather than half-overwritten.

    Args:
        checkpoint_file: Destination path of the checkpoint JSON.
        last_index: Index to resume from on the next run.
    """
    payload = {"last_index": last_index, "timestamp": str(datetime.now())}
    tmp_file = f"{checkpoint_file}.tmp"
    with open(tmp_file, "w") as f:
        json.dump(payload, f)
    os.replace(tmp_file, checkpoint_file)

## 7. Run Baseline Generation

In [None]:
def _trim_output_to_checkpoint(output_file: str, confirmed: int) -> int:
    """Cut output_file back to at most `confirmed` complete lines; return how many remain."""
    if not os.path.exists(output_file):
        return 0
    with open(output_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    # A line without its trailing newline is a record whose write was cut off.
    kept = [line for line in lines if line.endswith("\n")][:confirmed]
    if len(kept) != len(lines):
        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(kept)
    return len(kept)


def run_baseline_generation(dataset: list, model, tokenizer, config: dict):
    """
    Run baseline generation with checkpoint support.

    Records are processed in order of question length, in batches of
    config["batch_size"], and each batch is appended to config["output_file"]
    and flushed as soon as it is generated. The checkpoint is written every
    config["checkpoint_freq"] records, on any error or interruption, and at
    the end.

    Because output is flushed more often than the checkpoint is saved, the
    output file can be ahead of the checkpoint after an interruption. On
    resume the output is first trimmed back to the checkpointed record count,
    so appended records never duplicate earlier ones. If the output holds
    fewer records than the checkpoint claims, generation resumes from what is
    actually on disk.

    Resuming assumes the same dataset is passed again, since positions refer
    to the length-sorted order.

    Args:
        dataset: Records with "id", "question" and "answer" keys ("topic" is
            optional and defaults to "general").
        model: Passed through to ``generate_response``.
        tokenizer: Passed through to ``generate_response``.
        config: Needs "output_file", "checkpoint_file", "batch_size",
            "checkpoint_freq" plus the keys ``generate_response`` reads.

    Raises:
        Whatever the generation or write step raises; the checkpoint is saved
        at the start of the failing batch before re-raising.
    """
    output_file = config["output_file"]
    checkpoint_file = config["checkpoint_file"]
    batch_size = config["batch_size"]
    checkpoint_freq = config["checkpoint_freq"]

    if not dataset:
        print("Dataset is empty; nothing to generate.")
        return

    # Sort dataset by question length to minimize padding waste
    print("Sorting dataset by question length for efficient batching...")
    sorted_dataset = sorted(dataset, key=lambda x: len(x["question"]))
    print(f"Length range: {len(sorted_dataset[0]['question'])} - {len(sorted_dataset[-1]['question'])} chars")

    total = len(sorted_dataset)

    # Load checkpoint and make the output file agree with it
    start_index = load_checkpoint(checkpoint_file)
    if start_index > 0:
        start_index = _trim_output_to_checkpoint(output_file, start_index)
    if start_index > 0:
        print(f"Resuming from index {start_index}")

    # Open output file in append mode if resuming
    mode = "a" if start_index > 0 else "w"

    last_saved = start_index

    with open(output_file, mode, encoding="utf-8") as f:
        batch_starts = range(start_index, total, batch_size)
        for batch_no, i in enumerate(tqdm(batch_starts, desc="Generating"), start=1):
            # Get batch (slicing clamps at the end of the list)
            batch = sorted_dataset[i:i + batch_size]

            # Prepare prompts
            prompts = [
                format_for_llama(create_baseline_prompt(record["question"]))
                for record in batch
            ]

            try:
                responses = generate_response(prompts, model, tokenizer, config)

                # Save results
                for record, response in zip(batch, responses):
                    result = {
                        "id": record["id"],
                        "question": record["question"],
                        "original_answer": record["answer"],
                        "baseline_response": response,
                        "topic": record.get("topic", "general")
                    }
                    f.write(json.dumps(result, ensure_ascii=False) + "\n")

                # Flush to disk
                f.flush()

            except BaseException as e:
                # BaseException so a manual interrupt is checkpointed too.
                # Records of this batch that were already written are trimmed
                # away on the next resume.
                print(f"Error at batch starting {i}: {e}")
                save_checkpoint(checkpoint_file, i)
                raise

            # Save checkpoint once at least checkpoint_freq new records are on
            # disk; counting records works for any batch size.
            completed = i + len(batch)
            if completed - last_saved >= checkpoint_freq:
                save_checkpoint(checkpoint_file, completed)
                last_saved = completed

            # Every 10 batches: collect garbage first so the CUDA cache can
            # actually release the freed blocks.
            if batch_no % 10 == 0:
                gc.collect()
                torch.cuda.empty_cache()

    # Final checkpoint
    save_checkpoint(checkpoint_file, total)
    print(f"\nGeneration complete! Output saved to {output_file}")

# Kick off generation over the loaded dataset with the loaded model, tokenizer
# and run configuration
run_baseline_generation(
    dataset=dataset,
    model=model,
    tokenizer=tokenizer,
    config=CONFIG,
)

## 8. Verify Results

In [None]:
# Check output file: count its lines, then show its first three lines.
# {CONFIG["output_file"]} is interpolated by IPython before the shell command runs.
!wc -l {CONFIG["output_file"]}
!head -3 {CONFIG["output_file"]}

In [None]:
# Load and inspect results
def inspect_results(filepath: str, n: int = 3):
    """Print a short preview of the first ``n`` records in a results JSONL file.

    For each record a divider and the ID are printed, followed by the first
    200 characters of the question, the original answer and the baseline
    response, each suffixed with "...".
    """
    preview_fields = (
        ("Question", "question"),
        ("Original Answer", "original_answer"),
        ("Baseline Response", "baseline_response"),
    )
    with open(filepath, "r", encoding="utf-8") as f:
        # zip stops once n lines have been taken
        for _, raw_line in zip(range(n), f):
            rec = json.loads(raw_line)
            print(f"\n{'='*60}")
            print(f"ID: {rec['id']}")
            for label, key in preview_fields:
                print(f"\n{label}: {rec[key][:200]}...")

inspect_results(filepath=CONFIG["output_file"])

## 9. Download Results

In [None]:
# Option A: Download directly
# Trigger a browser download of the generated responses file
from google.colab import files

results_path = CONFIG["output_file"]
files.download(results_path)

In [None]:
# Option B: Save to Google Drive
# from google.colab import drive
# drive.mount('/content/drive')
# !cp {CONFIG["output_file"]} /content/drive/MyDrive/

## 10. Cleanup

In [None]:
# Free GPU memory
# Drop the notebook's references first, then run the garbage collector BEFORE
# emptying the CUDA cache: empty_cache() can only hand back blocks whose
# tensors have already been freed.
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

print("Cleanup complete!")