# Baseline Response Generation (Unsloth Version)

This notebook uses **Unsloth** for faster inference with Llama-3.1-8B-Instruct.

**Environment**: Google Colab with T4 GPU (15GB VRAM)

## Advantages over standard transformers
- 2-5x faster inference
- Lower GPU memory usage
- Native 4-bit quantization support

## Workflow
1. Install Unsloth
2. Load cleaned dataset
3. Load Llama-3.1-8B-Instruct with Unsloth
4. Generate baseline responses
5. Download results

## 1. Install Unsloth

In [None]:
%%capture
# NOTE: the %%capture magic above hides all output of this cell, including pip
# errors; remove it temporarily when debugging a failed install.
# Install Unsloth (optimized for Colab)
!pip install unsloth
# Install xformers for additional speedup
!pip install xformers

In [None]:
# Report whether a CUDA device is visible and, if so, its name and total memory.
import torch

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Device 0 is the only device inspected here.
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {gpu_name}")
    print(f"Memory: {gpu_total_bytes / 1024**3:.1f} GB")

In [None]:
import json
import os
from datetime import datetime
from tqdm import tqdm
import gc

## 2. Configuration

In [None]:
# Configuration: single source of truth for the model, data paths and
# generation settings used by the cells below.
CONFIG = {
    # Model settings
    "model_name": "unsloth/Meta-Llama-3.1-8B-Instruct",  # Unsloth optimized version

    # Data paths
    "input_file": "data/processed/counsel_chat_cleaned.jsonl",
    "output_file": "data/baseline/responses.jsonl",
    "checkpoint_file": "data/baseline/checkpoint.json",

    # Generation settings
    # NOTE(review): "batch_size" is not read by any code visible in this
    # notebook; generation processes one prompt per call.
    "batch_size": 1,              # Unsloth works best with batch_size=1
    "max_new_tokens": 256,
    "temperature": 0.7,
    "top_p": 0.9,
    "do_sample": True,

    # Checkpoint frequency (records between checkpoint writes)
    "checkpoint_freq": 100
}

# Create the parent directory of every configured path. Deriving the
# directories from CONFIG (instead of hard-coding them) keeps this cell correct
# when the paths above are edited.
for _path_key in ("input_file", "output_file", "checkpoint_file"):
    _parent_dir = os.path.dirname(CONFIG[_path_key])
    if _parent_dir:  # a bare filename has no directory to create
        os.makedirs(_parent_dir, exist_ok=True)

print("Config loaded. Unsloth will be 2-5x faster than standard transformers!")

## 3. Load Dataset

In [None]:
# Clone from GitHub
!git clone https://github.com/yuchangyuan1/6895_project_Agent.git temp_repo
!cp temp_repo/data/processed/counsel_chat_cleaned.jsonl data/processed/
!rm -rf temp_repo
print("Dataset loaded from GitHub!")

In [None]:
def load_dataset(filepath: str) -> list:
    """Load a dataset from a JSONL file.

    Each non-blank line is parsed as one JSON value and appended to the
    result in file order. Blank or whitespace-only lines (for example a
    trailing empty line) are skipped instead of crashing the parser.

    Args:
        filepath: Path to a UTF-8 encoded JSONL file.

    Returns:
        List of parsed records, one per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON; the
            message includes the file path and 1-based line number.
    """
    records = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Re-raise the same exception type with location context so
                # a bad record can be found quickly in a large file.
                raise json.JSONDecodeError(
                    f"{filepath}:{line_no}: {err.msg}", err.doc, err.pos
                ) from err
    print(f"Loaded {len(records)} records")
    return records

# Load the dataset from the configured input path.
dataset = load_dataset(CONFIG["input_file"])
# Preview the first 100 characters of the first record's "question" field.
print(f"Sample: {dataset[0]['question'][:100]}...")

## 4. Load Model with Unsloth

In [None]:
from unsloth import FastLanguageModel

# Load model with 4-bit quantization
# The model identifier comes from CONFIG["model_name"]; both the model and its
# tokenizer are returned by the same call.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=CONFIG["model_name"],
    max_seq_length=2048,
    dtype=None,  # Auto-detect
    load_in_4bit=True,
)

# Enable optimized inference mode
FastLanguageModel.for_inference(model)

# Report the CUDA memory currently allocated by tensors, converted from bytes
# to GiB (1024**3 bytes).
print(f"Model loaded! GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

## 5. Prompt Template

In [None]:
def create_baseline_prompt(question: str) -> str:
    """Build the plain baseline prompt for *question*.

    The prompt is a fixed generic-assistant instruction, a blank line, and
    then the question text; no professional guidance is added.
    """
    instruction = "You are a helpful assistant. Please respond to the following question:"
    return f"{instruction}\n\n{question}"


def format_for_llama(prompt: str, tokenizer) -> str:
    """Wrap *prompt* as a single user turn and render it with the chat template.

    The tokenizer's ``apply_chat_template`` is asked for text (not token ids)
    with the generation prompt appended, so the returned string is ready to
    be tokenized and continued by the model.
    """
    user_turn = {"role": "user", "content": prompt}
    conversation = [user_turn]
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    return rendered

# Smoke test: render the first dataset question through both prompt helpers
# and show the first 300 characters of the templated prompt.
sample = format_for_llama(create_baseline_prompt(dataset[0]["question"]), tokenizer)
print(sample[:300])

## 6. Generation Functions

In [None]:
def generate_single(prompt: str, model, tokenizer, config: dict) -> str:
    """Generate a response for a single, already formatted prompt.

    Args:
        prompt: Prompt text. When it was rendered by a chat template it
            already starts with the tokenizer's BOS token.
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        config: Generation settings. Required keys: ``max_new_tokens``,
            ``do_sample``, and (read only when ``do_sample`` is true)
            ``temperature`` and ``top_p``. Optional key ``max_input_tokens``
            caps the encoded prompt length (default 512).

    Returns:
        The decoded continuation only (prompt tokens removed), stripped of
        surrounding whitespace.
    """
    # A chat-templated prompt already contains the BOS token; letting the
    # tokenizer add its special tokens again would prepend a second BOS.
    bos_token = getattr(tokenizer, "bos_token", None)
    prompt_has_bos = bool(bos_token) and prompt.startswith(bos_token)

    # NOTE(review): truncation cuts the end of the prompt, which for a
    # chat-templated prompt is where the generation header sits — raise
    # "max_input_tokens" if long questions are expected; confirm desired policy.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=config.get("max_input_tokens", 512),
        add_special_tokens=not prompt_has_bos,
    ).to(model.device)

    generation_kwargs = {
        "max_new_tokens": config["max_new_tokens"],
        "do_sample": config["do_sample"],
        "use_cache": True,
        "pad_token_id": tokenizer.eos_token_id,
    }
    # Sampling parameters only apply when sampling; omitting them for greedy
    # decoding avoids passing settings that would be ignored.
    if config["do_sample"]:
        generation_kwargs["temperature"] = config["temperature"]
        generation_kwargs["top_p"] = config["top_p"]

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Decode only new tokens: the output sequence starts with the prompt ids.
    input_len = inputs["input_ids"].shape[1]
    response = tokenizer.decode(
        outputs[0][input_len:],
        skip_special_tokens=True
    ).strip()

    return response


def load_checkpoint(checkpoint_file: str) -> int:
    """Return the saved resume index, or 0 when there is nothing to resume.

    Reads the ``"last_index"`` entry of the JSON checkpoint file; a missing
    file or a missing key both yield 0.
    """
    if not os.path.exists(checkpoint_file):
        return 0
    with open(checkpoint_file, "r") as f:
        state = json.load(f)
    return state.get("last_index", 0)


def save_checkpoint(checkpoint_file: str, last_index: int):
    """Persist the resume index to *checkpoint_file* as JSON.

    The payload holds ``"last_index"`` and a ``"timestamp"`` string. It is
    written to a sibling temporary file and then moved over the target with
    ``os.replace``, so an interruption mid-write cannot leave a truncated
    checkpoint that would fail to parse on resume.

    Args:
        checkpoint_file: Destination path of the checkpoint.
        last_index: Index of the next record to process on resume.
    """
    payload = {"last_index": last_index, "timestamp": str(datetime.now())}
    tmp_file = f"{checkpoint_file}.tmp"
    with open(tmp_file, "w") as f:
        json.dump(payload, f)
    # Replace in one step; readers see either the old or the new checkpoint.
    os.replace(tmp_file, checkpoint_file)

## 7. Run Generation

In [None]:
# Optional: Delete old checkpoint to start fresh
# !rm -f data/baseline/checkpoint.json data/baseline/responses.jsonl
# print("Old files deleted!")

In [None]:
def _trim_output_to(output_file: str, max_records: int) -> int:
    """Keep at most *max_records* complete lines in *output_file*; return the count kept."""
    if not os.path.exists(output_file):
        return 0
    with open(output_file, "r", encoding="utf-8") as f:
        # A line without a trailing newline was cut off mid-write; drop it.
        complete_lines = [line for line in f if line.endswith("\n")]
    kept = complete_lines[:max_records]
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(kept)
    return len(kept)


def run_baseline_generation(dataset: list, model, tokenizer, config: dict):
    """Generate a baseline response for every record, with resumable progress.

    One JSON line is written to ``config["output_file"]`` per record and
    flushed immediately. A checkpoint with the next index to process is saved
    every ``config["checkpoint_freq"]`` records, on error, and at the end.

    Because output is written more often than the checkpoint, the output file
    can be ahead of the checkpoint after a crash. On resume the output is
    trimmed to the resume point so that regenerated records are not appended
    a second time.

    Args:
        dataset: Records with at least ``"id"``, ``"question"`` and
            ``"answer"`` keys; ``"topic"`` is optional (defaults to "general").
        model: Model passed through to ``generate_single``.
        tokenizer: Tokenizer passed to the prompt and generation helpers.
        config: Settings dict; reads ``output_file``, ``checkpoint_file``,
            ``checkpoint_freq`` plus whatever ``generate_single`` reads.

    Raises:
        Exception: Any error raised while processing a record is re-raised
            after the checkpoint has been saved at that record's index.
    """
    output_file = config["output_file"]
    checkpoint_file = config["checkpoint_file"]
    checkpoint_freq = config["checkpoint_freq"]

    # Load checkpoint, then reconcile it with what is actually on disk: resume
    # from whichever is smaller and drop any output lines past that point.
    start_index = load_checkpoint(checkpoint_file)
    if start_index > 0:
        start_index = _trim_output_to(output_file, start_index)
    if start_index > 0:
        print(f"Resuming from index {start_index}")

    mode = "a" if start_index > 0 else "w"
    total = len(dataset)

    with open(output_file, mode, encoding="utf-8") as f:
        for i in tqdm(range(start_index, total), desc="Generating"):
            record = dataset[i]

            try:
                # Prepare prompt (inside the try so a malformed record also
                # leaves a checkpoint pointing at it)
                prompt = format_for_llama(
                    create_baseline_prompt(record["question"]),
                    tokenizer
                )

                # Generate response
                response = generate_single(prompt, model, tokenizer, config)

                # Save result
                result = {
                    "id": record["id"],
                    "question": record["question"],
                    "original_answer": record["answer"],
                    "baseline_response": response,
                    "topic": record.get("topic", "general")
                }
                f.write(json.dumps(result, ensure_ascii=False) + "\n")
                f.flush()

            except Exception as e:
                print(f"Error at index {i}: {e}")
                # Record i was not written, so it is the next one to process.
                save_checkpoint(checkpoint_file, i)
                raise

            # Save checkpoint
            if (i + 1) % checkpoint_freq == 0:
                save_checkpoint(checkpoint_file, i + 1)

    save_checkpoint(checkpoint_file, total)
    print(f"\nDone! Output: {output_file}")

# Run generation over the loaded dataset using the settings in CONFIG.
run_baseline_generation(dataset, model, tokenizer, CONFIG)

## 8. Verify Results

In [None]:
# Check output: number of lines written, then the first two lines of the file.
!wc -l data/baseline/responses.jsonl
!head -2 data/baseline/responses.jsonl

In [None]:
# Inspect results
def inspect_results(filepath: str, n: int = 2):
    """Print a short preview of the first *n* records of a JSONL results file.

    For each record a divider line is printed, followed by the first 150
    characters of ``"question"`` and the first 200 characters of
    ``"baseline_response"``, each followed by ``...``.
    """
    divider = "=" * 50
    with open(filepath, "r", encoding="utf-8") as f:
        # zip with range(n) stops after n lines (and yields nothing for n <= 0).
        for _, raw_line in zip(range(n), f):
            entry = json.loads(raw_line)
            print(f"\n{divider}")
            print(f"Q: {entry['question'][:150]}...")
            print(f"\nBaseline: {entry['baseline_response'][:200]}...")

# Preview the first records (default n=2) of the configured output file.
inspect_results(CONFIG["output_file"])

## 9. Download Results

In [None]:
# Download the generated responses file through the Colab `files` helper.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — this cell is expected to fail elsewhere.
from google.colab import files
files.download(CONFIG["output_file"])

## 10. Cleanup

In [None]:
# Free GPU memory
# Drop the notebook's references to the model and tokenizer.
del model
del tokenizer
# Run the garbage collector first so objects kept alive only by reference
# cycles are destroyed and their CUDA memory returns to PyTorch's caching
# allocator; only then can empty_cache() hand that memory back to the driver.
gc.collect()
torch.cuda.empty_cache()
print("Cleanup complete!")