In [2]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires 

Testing 5 configs

In [5]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
import os
import gc

# Memory optimization
torch.cuda.empty_cache()
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Base Model and Dataset
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
dataset_name = "qwedsacf/grade-school-math-instructions"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load subset of dataset
dataset = load_dataset(dataset_name, split="train[:5000]")

def format_prompt(example):
    prompt = (
        f"### Instruction:\n{example['INSTRUCTION']}\n\n"
        f"### Response:\n{example['RESPONSE']}"
    )
    tokens = tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    # Set labels for causal language modeling
    tokens["labels"] = tokens["input_ids"].clone()
    
    # Convert to regular Python objects (not tensors)
    return {
        "input_ids": tokens["input_ids"].squeeze().tolist(),
        "attention_mask": tokens["attention_mask"].squeeze().tolist(),
        "labels": tokens["labels"].squeeze().tolist()
    }

tokenized_dataset = dataset.map(format_prompt, remove_columns=dataset.column_names)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Extended trial configurations - 5 diverse experiments (memory-optimized)
trial_configs = [
    # Trial 1: Low rank, moderate learning rate (your original)
    {
        "rank": 4, 
        "lr": 2e-4,
        "target_modules": ["q_proj", "v_proj", "gate_proj", "down_proj"],
        "epochs": 3,
        "batch_size": 2,
        "lora_alpha": 8,
        "dropout": 0.1,
        "description": "Low rank baseline"
    },
    
    # Trial 2: High rank, low learning rate
    {
        "rank": 16,  # Reduced from 32
        "lr": 5e-5,
        "target_modules": ["q_proj", "v_proj"],  # Reduced modules
        "epochs": 2,
        "batch_size": 1,
        "lora_alpha": 32,
        "dropout": 0.05,
        "description": "High rank attention-only"
    },
    
    # Trial 3: Medium rank, high learning rate, FFN focus
    {
        "rank": 8,  # Reduced from 16
        "lr": 5e-4,
        "target_modules": ["gate_proj", "down_proj"],  # Reduced modules
        "epochs": 3,  # Reduced from 4
        "batch_size": 2,
        "lora_alpha": 16,
        "dropout": 0.15,
        "description": "FFN-focused with aggressive LR"
    },
    
    # Trial 4: Comprehensive adaptation, balanced settings
    {
        "rank": 4,  # Reduced from 8
        "lr": 1e-4,
        "target_modules": ["q_proj", "v_proj", "gate_proj", "down_proj"],  # Reduced modules
        "epochs": 3,
        "batch_size": 1,
        "lora_alpha": 8,
        "dropout": 0.08,
        "description": "Comprehensive adaptation"
    },
    
    # Trial 5: Minimal adaptation, very low rank
    {
        "rank": 2,
        "lr": 3e-4,
        "target_modules": ["q_proj", "v_proj"],
        "epochs": 3,  # Reduced from 5
        "batch_size": 2,
        "lora_alpha": 4,
        "dropout": 0.2,
        "description": "Minimal rank, attention-only"
    }
]

# Run trials
for i, config in enumerate(trial_configs):
    rank = config["rank"]
    lr = config["lr"]
    target_modules = config["target_modules"]
    epochs = config["epochs"]
    batch_size = config["batch_size"]
    lora_alpha = config["lora_alpha"]
    dropout = config["dropout"]
    
    print(f"\n🚀 Trial {i+1}: {config['description']}")
    print(f"   LoRA Rank: {rank}, LR: {lr}, Epochs: {epochs}, Batch Size: {batch_size}")
    print(f"   Target Modules: {target_modules}")

    # Load base model fresh for each trial with memory optimization
    model = AutoModelForCausalLM.from_pretrained(
        model_name, 
        torch_dtype=torch.float16, 
        device_map={"": 1},  # Force everything to GPU 1
        low_cpu_mem_usage=True,
        use_cache=False  # Disable cache for gradient checkpointing
    )

    # LoRA config - now using trial-specific parameters
    lora_config = LoraConfig(
        r=rank,
        lora_alpha=lora_alpha,
        target_modules=target_modules,
        lora_dropout=dropout,
        bias="none",  # Back to none for stability
        task_type=TaskType.CAUSAL_LM,
        init_lora_weights="gaussian"
    )
    model = get_peft_model(model, lora_config)
    
    # Enable gradient checkpointing on the model
    model.enable_input_require_grads()
    
    # Ensure model is in training mode
    model.train()

    # Print trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"   Trainable params: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")

    # Unique output directory per trial
    output_dir = f"./sft_trial_{i+1}_rank{rank}_lr{str(lr).replace('.', '')}"

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=max(4, 8 // batch_size),  # Increased accumulation
        learning_rate=lr,
        logging_dir=os.path.join(output_dir, "logs"),
        logging_steps=20,
        save_strategy="epoch",
        fp16=True,
        report_to="none",
        warmup_steps=25,
        weight_decay=0.01,
        dataloader_drop_last=True,
        gradient_checkpointing=True,  # Enable gradient checkpointing
        remove_unused_columns=False,
        dataloader_pin_memory=False,  # Reduce memory usage
        save_total_limit=1,  # Keep only latest checkpoint
        max_grad_norm=1.0  # Gradient clipping
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        processing_class=tokenizer,  # Use processing_class instead of tokenizer
        data_collator=data_collator
    )

    # Train
    try:
        trainer.train()
        
        # Save model + tokenizer
        final_path = os.path.join(output_dir, "final")
        trainer.save_model(final_path)
        tokenizer.save_pretrained(final_path)
        
        print(f"✅ Trial {i+1} complete — Model saved to: {final_path}")
        
    except Exception as e:
        print(f"❌ Trial {i+1} failed: {e}")
        print("Continuing to next trial...")
    
    # Clean up memory
    del model, trainer
    torch.cuda.empty_cache()
    import gc
    gc.collect()

print("\n🎯 All 5 trials completed! Ready for evaluation.")
print("\nTrial Summary:")
for i, config in enumerate(trial_configs):
    print(f"Trial {i+1}: {config['description']} (Rank {config['rank']}, LR {config['lr']})")

2025-06-11 13:37:03.841078: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749649024.035174      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749649024.089887      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/852 [00:00<?, ?B/s]

(…)-00000-of-00001-3f5d416810641542.parquet:   0%|          | 0.00/2.55M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8792 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]


🚀 Trial 1: Low rank baseline
   LoRA Rank: 4, LR: 0.0002, Epochs: 3, Batch Size: 2
   Target Modules: ['q_proj', 'v_proj', 'gate_proj', 'down_proj']


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


   Trainable params: 1,914,880 (0.17%)


Step,Training Loss
20,1.4468
40,1.1842
60,1.1414
80,1.1642
100,1.1255
120,1.1072
140,1.1196
160,1.0928
180,1.0944
200,1.1103


✅ Trial 1 complete — Model saved to: ./sft_trial_1_rank4_lr00002/final

🚀 Trial 2: High rank attention-only
   LoRA Rank: 16, LR: 5e-05, Epochs: 2, Batch Size: 1
   Target Modules: ['q_proj', 'v_proj']


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


   Trainable params: 2,252,800 (0.20%)


Step,Training Loss
20,1.6751
40,1.4832
60,1.3171
80,1.2866
100,1.2402
120,1.2091
140,1.2151
160,1.1767
180,1.1756
200,1.1842


✅ Trial 2 complete — Model saved to: ./sft_trial_2_rank16_lr5e-05/final

🚀 Trial 3: FFN-focused with aggressive LR
   LoRA Rank: 8, LR: 0.0005, Epochs: 3, Batch Size: 2
   Target Modules: ['gate_proj', 'down_proj']


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


   Trainable params: 2,703,360 (0.25%)


Step,Training Loss
20,1.4023
40,1.1717
60,1.1304
80,1.1583
100,1.1286
120,1.1088
140,1.1274
160,1.0961
180,1.1001
200,1.1147


✅ Trial 3 complete — Model saved to: ./sft_trial_3_rank8_lr00005/final

🚀 Trial 4: Comprehensive adaptation
   LoRA Rank: 4, LR: 0.0001, Epochs: 3, Batch Size: 1
   Target Modules: ['q_proj', 'v_proj', 'gate_proj', 'down_proj']


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


   Trainable params: 1,914,880 (0.17%)


Step,Training Loss
20,1.5056
40,1.2144
60,1.1589
80,1.1715
100,1.1288
120,1.11
140,1.1208
160,1.0928
180,1.0917
200,1.1081


✅ Trial 4 complete — Model saved to: ./sft_trial_4_rank4_lr00001/final

🚀 Trial 5: Minimal rank, attention-only
   LoRA Rank: 2, LR: 0.0003, Epochs: 3, Batch Size: 2
   Target Modules: ['q_proj', 'v_proj']


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


   Trainable params: 281,600 (0.03%)


Step,Training Loss
20,1.5834
40,1.2947
60,1.2406
80,1.256
100,1.2162
120,1.2
140,1.2118
160,1.1845
180,1.1805
200,1.1944


✅ Trial 5 complete — Model saved to: ./sft_trial_5_rank2_lr00003/final

🎯 All 5 trials completed! Ready for evaluation.

Trial Summary:
Trial 1: Low rank baseline (Rank 4, LR 0.0002)
Trial 2: High rank attention-only (Rank 16, LR 5e-05)
Trial 3: FFN-focused with aggressive LR (Rank 8, LR 0.0005)
Trial 4: Comprehensive adaptation (Rank 4, LR 0.0001)
Trial 5: Minimal rank, attention-only (Rank 2, LR 0.0003)


In [6]:
import json
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from evaluate import load as load_metric

# -------------------------------
# 6. Evaluate SFT Trials with BLEU
# -------------------------------

# Trial metadata (should match your training config)
trial_configs = [
    {"rank": 4, "lr": 2e-4, "description": "Low rank baseline"},
    {"rank": 32, "lr": 5e-5, "description": "High rank attention-only"},
    {"rank": 16, "lr": 5e-4, "description": "FFN-focused with aggressive LR"},
    {"rank": 8, "lr": 1e-4, "description": "Comprehensive adaptation"},
    {"rank": 2, "lr": 3e-4, "description": "Minimal rank, attention-only"}
]

# Paths to each trained model
sft_models = [
    f"./sft_trial_{i+1}_rank{cfg['rank']}_lr{str(cfg['lr']).replace('.', '')}/final"
    for i, cfg in enumerate(trial_configs)
]

# Load BLEU metric
bleu = load_metric("bleu")

# Prompts and references for evaluation
eval_data = [
    {
        "INSTRUCTION": "Liam baked 36 cookies. He gave 1/3 of them to his classmates and shared the rest equally between two friends. How many cookies did each of his two friends get?\nCan you help me out?",
        "RESPONSE": "Liam gave away 1/3 of 36 = 12 cookies.\nHe had 36 - 12 = 24 cookies left.\nHe split the 24 cookies equally between 2 friends, so each friend got 24 / 2 = 12 cookies."
    },
    {
        "INSTRUCTION": "A pack of 5 pencils costs $3. How much would 4 packs cost, and how many pencils would you get in total?\nCan you walk me through it?",
        "RESPONSE": "Each pack costs $3, so 4 packs cost 3 × 4 = $12.\nEach pack has 5 pencils, so 4 packs have 5 × 4 = 20 pencils."
    },
    {
        "INSTRUCTION": "A train travels 60 miles in 1.5 hours. What's its average speed in miles per hour?\nHelp me solve this?",
        "RESPONSE": "Average speed is distance ÷ time.\nSo, 60 ÷ 1.5 = 40 miles per hour."
    },
    {
        "INSTRUCTION": "A class has 24 students. 3/4 of them brought lunch from home. How many students brought lunch?\nCan you explain it to me?",
        "RESPONSE": "3/4 of 24 = 24 × 3 ÷ 4 = 18 students brought lunch from home."
    },
    {
        "INSTRUCTION": "Sarah had $120. She spent 1/3 of it on books and the rest on a backpack. How much did the backpack cost?\nCould you break it down for me?",
        "RESPONSE": "1/3 of $120 is 120 ÷ 3 = $40.\nShe spent the rest, which is 120 - 40 = $80 on the backpack."
    },
    {
        "INSTRUCTION": "A rectangle has a length of 10 cm and width of 4 cm. What's its perimeter?\nCan you help me understand this?",
        "RESPONSE": "Perimeter of a rectangle = 2 × (length + width)\n= 2 × (10 + 4) = 2 × 14 = 28 cm"
    },
    {
        "INSTRUCTION": "James read 45 pages of a book on Monday and twice as many on Tuesday. How many pages did he read in total?\nMind helping me out?",
        "RESPONSE": "On Tuesday, he read 45 × 2 = 90 pages.\nSo total pages = 45 + 90 = 135 pages."
    },
    {
        "INSTRUCTION": "A movie ticket costs $9. If 5 friends go to the movies, how much do they spend in total?\nCan you show me how to work it out?",
        "RESPONSE": "Each friend pays $9, and there are 5 friends.\nSo total = 9 × 5 = $45"
    },
    {
        "INSTRUCTION": "Anna had $75. She bought a shirt for $28 and jeans for $36. How much money does she have left?\nCan you help me calculate?",
        "RESPONSE": "Total spent = 28 + 36 = $64\nMoney left = 75 - 64 = $11"
    },
    {
        "INSTRUCTION": "A water tank holds 500 liters. If 125 liters are used, what fraction of the tank's capacity remains full?\nCan you explain?",
        "RESPONSE": "Water left = 500 - 125 = 375 liters\nFraction remaining = 375 / 500 = 3/4"
    }
]

references = [[ex["RESPONSE"]] for ex in eval_data]
prompts = [ex["INSTRUCTION"] for ex in eval_data]

# Evaluate BLEU for each model
bleu_scores = []
detailed_results = []

print("🔍 Evaluating all 5 SFT trials...")

for i, (path, config) in enumerate(zip(sft_models, trial_configs)):
    print(f"\n📊 Evaluating Trial {i+1}: {config['description']}")
    
    try:
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map="auto")
        model.eval()

        preds = []
        for j, prompt in enumerate(prompts):
            full_prompt = f"### Instruction:\n{prompt}\n\n### Response:\n"
            inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
            
            with torch.no_grad():
                output = model.generate(
                    **inputs, 
                    max_new_tokens=100,
                    do_sample=False,
                    temperature=1.0,
                    pad_token_id=tokenizer.eos_token_id
                )
            
            text = tokenizer.decode(output[0], skip_special_tokens=True)
            response = text.split("### Response:")[-1].strip()
            preds.append(response)
            
            if j == 0:  # Show first prediction as example
                print(f"   Sample prediction: {response[:100]}...")

        result = bleu.compute(predictions=preds, references=references)
        bleu_score = result["bleu"]
        bleu_scores.append(bleu_score)
        
        detailed_results.append({
            "trial": i+1,
            "description": config["description"],
            "rank": config["rank"],
            "lr": config["lr"],
            "bleu_score": bleu_score,
            "model_path": path
        })
        
        print(f"   ✅ BLEU Score: {bleu_score:.4f}")
        
        # Clean up GPU memory
        del model, tokenizer
        torch.cuda.empty_cache()
        
    except Exception as e:
        print(f"   ❌ Error evaluating Trial {i+1}: {e}")
        bleu_scores.append(0.0)
        detailed_results.append({
            "trial": i+1,
            "description": config["description"],
            "rank": config["rank"],
            "lr": config["lr"],
            "bleu_score": 0.0,
            "model_path": path,
            "error": str(e)
        })

# Create comprehensive results DataFrame
bleu_df = pd.DataFrame({
    "Trial": [f"Trial {i+1}" for i in range(len(trial_configs))],
    "Description": [c["description"] for c in trial_configs],
    "LoRA Rank": [c["rank"] for c in trial_configs],
    "Learning Rate": [c["lr"] for c in trial_configs],
    "BLEU Score": bleu_scores
})

# Sort by BLEU score (descending)
bleu_df_sorted = bleu_df.sort_values("BLEU Score", ascending=False).reset_index(drop=True)

# Save results
bleu_df.to_csv("sft_bleu_scores.csv", index=False)
bleu_df_sorted.to_csv("sft_bleu_scores_ranked.csv", index=False)

# Save detailed results as JSON
with open("detailed_evaluation_results.json", "w") as f:
    json.dump(detailed_results, f, indent=2)

# Display results
print("\n" + "="*80)
print("📊 FINAL RESULTS - All 5 SFT Trials")
print("="*80)
print("\nBLEU Scores (Original Order):")
print(bleu_df.to_string(index=False))

print("\n🏆 BLEU Scores (Ranked by Performance):")
print(bleu_df_sorted.to_string(index=False))

# Select and highlight best model
if bleu_scores and max(bleu_scores) > 0:
    best_sft_index = bleu_scores.index(max(bleu_scores))
    best_sft_path = sft_models[best_sft_index]
    best_config = trial_configs[best_sft_index]
    
    print(f"\n🏆 BEST PERFORMING MODEL:")
    print(f"   Trial {best_sft_index+1}: {best_config['description']}")
    print(f"   LoRA Rank: {best_config['rank']}")
    print(f"   Learning Rate: {best_config['lr']}")
    print(f"   BLEU Score: {bleu_scores[best_sft_index]:.4f}")
    print(f"   Model Path: {best_sft_path}")
    
    # Performance analysis
    print(f"\n📈 PERFORMANCE ANALYSIS:")
    avg_score = sum(s for s in bleu_scores if s > 0) / len([s for s in bleu_scores if s > 0])
    print(f"   Average BLEU Score: {avg_score:.4f}")
    print(f"   Best vs Average: +{((bleu_scores[best_sft_index] - avg_score) / avg_score * 100):.1f}%")
    
    # Configuration insights
    ranks = [c["rank"] for i, c in enumerate(trial_configs) if bleu_scores[i] > 0]
    lrs = [c["lr"] for i, c in enumerate(trial_configs) if bleu_scores[i] > 0]
    print(f"   Rank range tested: {min(ranks)} - {max(ranks)}")
    print(f"   LR range tested: {min(lrs):.0e} - {max(lrs):.0e}")
    
else:
    print("\n❌ No successful evaluations completed.")

print("\n✅ Evaluation complete! Results saved to:")
print("   • sft_bleu_scores.csv (original order)")
print("   • sft_bleu_scores_ranked.csv (ranked by performance)")
print("   • detailed_evaluation_results.json (full details)")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

🔍 Evaluating all 5 SFT trials...

📊 Evaluating Trial 1: Low rank baseline
   Sample prediction: Liam gave 1/3*36 = 12 cookies to his classmates.
So, Liam has 36-12 = 24 cookies left.
Hence, each o...
   ✅ BLEU Score: 0.1176

📊 Evaluating Trial 2: High rank attention-only
   ❌ Error evaluating Trial 2: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './sft_trial_2_rank32_lr5e-05/final'. Use `repo_type` argument if needed.

📊 Evaluating Trial 3: FFN-focused with aggressive LR
   ❌ Error evaluating Trial 3: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './sft_trial_3_rank16_lr00005/final'. Use `repo_type` argument if needed.

📊 Evaluating Trial 4: Comprehensive adaptation
   ❌ Error evaluating Trial 4: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './sft_trial_4_rank8_lr00001/final'. Use `repo_type` argument if needed.

📊 Evaluating Trial 5: Minimal rank, attention-only
   Sample prediction: Liam gave 36/3=12 cookies to his classma

In [None]:
pip install trl>=0.7.0  # Make sure TRL is up to date

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import DPOTrainer
from peft import get_peft_model, LoraConfig, TaskType
import os
import gc
import json
import pandas as pd

# Memory optimization
torch.cuda.empty_cache()
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Configuration - Use your best SFT model as starting point
# Update this path based on your evaluation results
BEST_SFT_MODEL_PATH = "./sft_trial_1_rank4_lr0002/final"  # Update with your best model
dataset_name = "Anthropic/hh-rlhf"

# DPO trial configurations - optimized for your setup
dpo_trial_configs = [
    # Trial 1: Conservative DPO settings
    {
        "rank": 4,
        "lr": 5e-7,
        "beta": 0.1,
        "epochs": 2,
        "batch_size": 1,
        "lora_alpha": 8,
        "dropout": 0.1,
        "target_modules": ["q_proj", "v_proj"],
        "description": "Conservative DPO baseline"
    },
    
    # Trial 2: Higher learning rate
    {
        "rank": 4,
        "lr": 1e-6,
        "beta": 0.1,
        "epochs": 2,
        "batch_size": 1,
        "lora_alpha": 8,
        "dropout": 0.1,
        "target_modules": ["q_proj", "v_proj"],
        "description": "Higher LR DPO"
    },
    
    # Trial 3: Higher beta (stronger KL penalty)
    {
        "rank": 4,
        "lr": 5e-7,
        "beta": 0.3,
        "epochs": 2,
        "batch_size": 1,
        "lora_alpha": 8,
        "dropout": 0.1,
        "target_modules": ["q_proj", "v_proj"],
        "description": "High beta DPO"
    },
    
    # Trial 4: More comprehensive adaptation
    {
        "rank": 8,
        "lr": 5e-7,
        "beta": 0.1,
        "epochs": 2,
        "batch_size": 1,
        "lora_alpha": 16,
        "dropout": 0.05,
        "target_modules": ["q_proj", "v_proj", "gate_proj", "down_proj"],
        "description": "Comprehensive DPO"
    },
    
    # Trial 5: Low beta (weaker KL penalty)
    {
        "rank": 4,
        "lr": 1e-6,
        "beta": 0.05,
        "epochs": 3,
        "batch_size": 1,
        "lora_alpha": 8,
        "dropout": 0.1,
        "target_modules": ["q_proj", "v_proj"],
        "description": "Low beta DPO"
    }
]

def load_and_prepare_hh_dataset():
    """Load and prepare the Anthropic/hh-rlhf dataset for DPO training"""
    print("Loading Anthropic/hh-rlhf dataset...")
    
    # Load a subset for faster training
    dataset = load_dataset(dataset_name, split="train[:2000]")
    test_dataset = load_dataset(dataset_name, split="test[:500]")
    
    def prepare_dpo_dataset(examples):
        """Prepare dataset in the format expected by DPOTrainer"""
        formatted_examples = {
            "prompt": [],
            "chosen": [],
            "rejected": []
        }
        
        for i in range(len(examples["chosen"])):
            try:
                # Extract the human message (prompt)
                human_msg = examples["chosen"][i][0]["content"]
                prompt = f"### Instruction:\n{human_msg}\n\n### Response:\n"
                
                # Extract chosen and rejected responses
                chosen_response = examples["chosen"][i][1]["content"]
                rejected_response = examples["rejected"][i][1]["content"]
                
                formatted_examples["prompt"].append(prompt)
                formatted_examples["chosen"].append(chosen_response)
                formatted_examples["rejected"].append(rejected_response)
                
            except (IndexError, KeyError, TypeError) as e:
                # Skip malformed examples
                continue
        
        return formatted_examples
    
    # Process the datasets
    train_dataset = dataset.map(
        prepare_dpo_dataset,
        batched=True,
        remove_columns=dataset.column_names,
        batch_size=100
    )
    
    eval_dataset = test_dataset.map(
        prepare_dpo_dataset,
        batched=True,
        remove_columns=test_dataset.column_names,
        batch_size=100
    )
    
    # Filter out empty examples
    train_dataset = train_dataset.filter(lambda x: len(x["prompt"]) > 0)
    eval_dataset = eval_dataset.filter(lambda x: len(x["prompt"]) > 0)
    
    print(f"Training samples: {len(train_dataset)}")
    print(f"Evaluation samples: {len(eval_dataset)}")
    
    return train_dataset, eval_dataset

def load_sft_model_and_tokenizer(model_path):
    """Load the SFT model and tokenizer"""
    print(f"Loading SFT model from: {model_path}")
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # Load the main model (policy model)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map={"": 1},  # Force to GPU 1 like your SFT training
        low_cpu_mem_usage=True,
        use_cache=False
    )
    
    # Load reference model (copy of the SFT model for KL penalty)
    ref_model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map={"": 1},
        low_cpu_mem_usage=True,
        use_cache=False
    )
    
    return model, ref_model, tokenizer

# Load dataset once
train_dataset, eval_dataset = load_and_prepare_hh_dataset()

# Run DPO trials
dpo_results = []

for i, config in enumerate(dpo_trial_configs):
    rank = config["rank"]
    lr = config["lr"]
    beta = config["beta"]
    target_modules = config["target_modules"]
    epochs = config["epochs"]
    batch_size = config["batch_size"]
    lora_alpha = config["lora_alpha"]
    dropout = config["dropout"]
    
    print(f"\n🚀 DPO Trial {i+1}: {config['description']}")
    print(f"   LoRA Rank: {rank}, LR: {lr}, Beta: {beta}, Epochs: {epochs}")
    print(f"   Target Modules: {target_modules}")

    try:
        # Load models fresh for each trial
        model, ref_model, tokenizer = load_sft_model_and_tokenizer(BEST_SFT_MODEL_PATH)
        
        # Apply LoRA to the policy model
        lora_config = LoraConfig(
            r=rank,
            lora_alpha=lora_alpha,
            target_modules=target_modules,
            lora_dropout=dropout,
            bias="none",
            task_type=TaskType.CAUSAL_LM,
            init_lora_weights="gaussian"
        )
        model = get_peft_model(model, lora_config)
        
        # Enable gradient requirements
        model.enable_input_require_grads()
        model.train()
        
        # Print trainable parameters
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in model.parameters())
        print(f"   Trainable params: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
        
        # Unique output directory per trial
        output_dir = f"./dpo_trial_{i+1}_rank{rank}_lr{str(lr).replace('.', '')}_beta{str(beta).replace('.', '')}"
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            gradient_accumulation_steps=8,  # Increased for stability
            learning_rate=lr,
            lr_scheduler_type="cosine",
            warmup_steps=50,
            weight_decay=0.01,
            logging_dir=os.path.join(output_dir, "logs"),
            logging_steps=20,
            save_strategy="epoch",
            evaluation_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            fp16=True,
            gradient_checkpointing=True,
            remove_unused_columns=False,
            dataloader_pin_memory=False,
            dataloader_drop_last=True,
            report_to="none",
            max_grad_norm=1.0
        )
        
        # Initialize DPO trainer
        dpo_trainer = DPOTrainer(
            model=model,
            ref_model=ref_model,
            args=training_args,
            beta=beta,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            max_length=512,
            max_prompt_length=256,
        )
        
        # Train
        print("   Starting DPO training...")
        train_result = dpo_trainer.train()
        
        # Save model
        final_path = os.path.join(output_dir, "final")
        dpo_trainer.save_model(final_path)
        tokenizer.save_pretrained(final_path)
        
        # Record results
        final_eval_loss = train_result.log_history[-1].get("eval_loss", float('inf'))
        dpo_results.append({
            "trial": i+1,
            "description": config["description"],
            "rank": rank,
            "lr": lr,
            "beta": beta,
            "epochs": epochs,
            "final_eval_loss": final_eval_loss,
            "model_path": final_path,
            "success": True
        })
        
        print(f"✅ DPO Trial {i+1} complete — Model saved to: {final_path}")
        print(f"   Final eval loss: {final_eval_loss:.4f}")
        
    except Exception as e:
        print(f"❌ DPO Trial {i+1} failed: {e}")
        dpo_results.append({
            "trial": i+1,
            "description": config["description"],
            "rank": rank,
            "lr": lr,
            "beta": beta,
            "epochs": epochs,
            "final_eval_loss": float('inf'),
            "model_path": None,
            "success": False,
            "error": str(e)
        })
    
    # Clean up memory
    try:
        del model, ref_model, dpo_trainer
    except:
        pass
    torch.cuda.empty_cache()
    gc.collect()

# Save DPO results
dpo_df = pd.DataFrame(dpo_results)
dpo_df.to_csv("dpo_training_results.csv", index=False)

# Save detailed results
with open("dpo_detailed_results.json", "w") as f:
    json.dump(dpo_results, f, indent=2)

# Display results
print("\n" + "="*80)
print("📊 DPO TRAINING RESULTS")
print("="*80)

successful_trials = [r for r in dpo_results if r["success"]]
if successful_trials:
    # Sort by eval loss (lower is better)
    successful_trials.sort(key=lambda x: x["final_eval_loss"])
    
    print("\n🏆 SUCCESSFUL DPO TRIALS (Ranked by Eval Loss):")
    for i, result in enumerate(successful_trials):
        print(f"{i+1}. Trial {result['trial']}: {result['description']}")
        print(f"   Rank: {result['rank']}, LR: {result['lr']}, Beta: {result['beta']}")
        print(f"   Final Eval Loss: {result['final_eval_loss']:.4f}")
        print(f"   Model: {result['model_path']}")
        print()
    
    best_dpo = successful_trials[0]
    print(f"🥇 BEST DPO MODEL:")
    print(f"   Trial {best_dpo['trial']}: {best_dpo['description']}")
    print(f"   Path: {best_dpo['model_path']}")
    print(f"   Eval Loss: {best_dpo['final_eval_loss']:.4f}")
    
else:
    print("❌ No successful DPO trials completed.")

failed_trials = [r for r in dpo_results if not r["success"]]
if failed_trials:
    print(f"\n❌ FAILED TRIALS ({len(failed_trials)}):")
    for result in failed_trials:
        print(f"   Trial {result['trial']}: {result['description']} - {result.get('error', 'Unknown error')}")

print(f"\n✅ DPO training complete! Results saved to:")
print("   • dpo_training_results.csv")
print("   • dpo_detailed_results.json")
print(f"\nNext step: Run evaluation on your best DPO model!")

In [None]:
import json
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from evaluate import load as load_metric
import os
import gc

# Load DPO results to get model paths
try:
    with open("dpo_detailed_results.json", "r") as f:
        dpo_results = json.load(f)
    successful_dpo_models = [r for r in dpo_results if r["success"]]
except FileNotFoundError:
    print("❌ DPO results file not found. Run DPO training first.")
    exit()

if not successful_dpo_models:
    print("❌ No successful DPO models found.")
    exit()

# Load BLEU metric
bleu = load_metric("bleu")

# Evaluation data (same as your SFT evaluation)
eval_data = [
    {
        "INSTRUCTION": "Liam baked 36 cookies. He gave 1/3 of them to his classmates and shared the rest equally between two friends. How many cookies did each of his two friends get?\nCan you help me out?",
        "RESPONSE": "Liam gave away 1/3 of 36 = 12 cookies.\nHe had 36 - 12 = 24 cookies left.\nHe split the 24 cookies equally between 2 friends, so each friend got 24 / 2 = 12 cookies."
    },
    {
        "INSTRUCTION": "A pack of 5 pencils costs $3. How much would 4 packs cost, and how many pencils would you get in total?\nCan you walk me through it?",
        "RESPONSE": "Each pack costs $3, so 4 packs cost 3 × 4 = $12.\nEach pack has 5 pencils, so 4 packs have 5 × 4 = 20 pencils."
    },
    {
        "INSTRUCTION": "A train travels 60 miles in 1.5 hours. What's its average speed in miles per hour?\nHelp me solve this?",
        "RESPONSE": "Average speed is distance ÷ time.\nSo, 60 ÷ 1.5 = 40 miles per hour."
    },
    {
        "INSTRUCTION": "A class has 24 students. 3/4 of them brought lunch from home. How many students brought lunch?\nCan you explain it to me?",
        "RESPONSE": "3/4 of 24 = 24 × 3 ÷ 4 = 18 students brought lunch from home."
    },
    {
        "INSTRUCTION": "Sarah had $120. She spent 1/3 of it on books and the rest on a backpack. How much did the backpack cost?\nCould you break it down for me?",
        "RESPONSE": "1/3 of $120 is 120 ÷ 3 = $40.\nShe spent the rest, which is 120 - 40 = $80 on the backpack."
    },
    {
        "INSTRUCTION": "A rectangle has a length of 10 cm and width of 4 cm. What's its perimeter?\nCan you help me understand this?",
        "RESPONSE": "Perimeter of a rectangle = 2 × (length + width)\n= 2 × (10 + 4) = 2 × 14 = 28 cm"
    },
    {
        "INSTRUCTION": "James read 45 pages of a book on Monday and twice as many on Tuesday. How many pages did he read in total?\nMind helping me out?",
        "RESPONSE": "On Tuesday, he read 45 × 2 = 90 pages.\nSo total pages = 45 + 90 = 135 pages."
    },
    {
        "INSTRUCTION": "A movie ticket costs $9. If 5 friends go to the movies, how much do they spend in total?\nCan you show me how to work it out?",
        "RESPONSE": "Each friend pays $9, and there are 5 friends.\nSo total = 9 × 5 = $45"
    },
    {
        "INSTRUCTION": "Anna had $75. She bought a shirt for $28 and jeans for $36. How much money does she have left?\nCan you help me calculate?",
        "RESPONSE": "Total spent = 28 + 36 = $64\nMoney left = 75 - 64 = $11"
    },
    {
        "INSTRUCTION": "A water tank holds 500 liters. If 125 liters are used, what fraction of the tank's capacity remains full?\nCan you explain?",
        "RESPONSE": "Water left = 500 - 125 = 375 liters\nFraction remaining = 375 / 500 = 3/4"
    }
]

references = [[ex["RESPONSE"]] for ex in eval_data]
prompts = [ex["INSTRUCTION"] for ex in eval_data]

# Also add some preference-based evaluation prompts
preference_prompts = [
    "Explain why helping others is important.",
    "What's the best way to study for an exam?",
    "How should someone handle a disagreement with a friend?",
    "What makes a good leader?",
    "Why is it important to be honest?",
]

def evaluate_model_responses(model_path, model_info):
    """Evaluate a single DPO model"""
    print(f"\n📊 Evaluating: {model_info['description']}")
    print(f"   Path: {model_path}")
    
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, 
            torch_dtype=torch.float16, 
            device_map="auto"
        )
        model.eval()

        # Math problem evaluation (BLEU)
        math_preds = []
        for j, prompt in enumerate(prompts):
            full_prompt = f"### Instruction:\n{prompt}\n\n### Response:\n"
            inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
            
            with torch.no_grad():
                output = model.generate(
                    **inputs, 
                    max_new_tokens=150,
                    do_sample=False,
                    temperature=1.0,
                    pad_token_id=tokenizer.eos_token_id
                )
            
            text = tokenizer.decode(output[0], skip_special_tokens=True)
            response = text.split("### Response:")[-1].strip()
            math_preds.append(response)
            
            if j == 0:  # Show first prediction
                print(f"   Sample math response: {response[:100]}...")

        # Calculate BLEU score
        bleu_result = bleu.compute(predictions=math_preds, references=references)
        bleu_score = bleu_result["bleu"]
        
        # Preference-based evaluation (qualitative)
        preference_responses = []
        for prompt in preference_prompts[:3]:  # Limit to 3 for brevity
            full_prompt = f"### Instruction:\n{prompt}\n\n### Response:\n"
            inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
            
            with torch.no_grad():
                output = model.generate(
                    **inputs, 
                    max_new_tokens=100,
                    do_sample=True,
                    temperature=0.7,
                    pad_token_id=tokenizer.eos_token_id
                )
            
            text = tokenizer.decode(output[0], skip_special_tokens=True)
            response = text.split("### Response:")[-1].strip()
            preference_responses.append(response)
        
        # Clean up
        del model, tokenizer
        torch.cuda.empty_cache()
        gc.collect()
        
        return {
            "success": True,
            "bleu_score": bleu_score,
            "math_predictions": math_preds,
            "preference_responses": preference_responses
        }
        
    except Exception as e:
        print(f"   ❌ Error: {e}")
        return {
            "success": False,
            "error": str(e),
            "bleu_score": 0.0
        }

# Evaluate all successful DPO models
print("🔍 Evaluating DPO models...")
evaluation_results = []

for model_info in successful_dpo_models:
    model_path = model_info["model_path"]
    if model_path and os.path.exists(model_path):
        eval_result = evaluate_model_responses(model_path, model_info)
        
        # Combine model info with evaluation results
        combined_result = {**model_info, **eval_result}
        evaluation_results.append(combined_result)
        
        if eval_result["success"]:
            print(f"   ✅ BLEU Score: {eval_result['bleu_score']:.4f}")
        else:
            print(f"   ❌ Evaluation failed")
    else:
        print(f"   ⚠️ Model path not found: {model_path}")

# Create results summary
if evaluation_results:
    # Filter successful evaluations
    successful_evals = [r for r in evaluation_results if r.get("success", False)]
    
    if successful_evals:
        # Sort by BLEU score
        successful_evals.sort(key=lambda x: x["bleu_score"], reverse=True)
        
        # Create summary DataFrame
        summary_df = pd.DataFrame([
            {
                "Trial": f"DPO Trial {r['trial']}",
                "Description": r["description"],
                "LoRA Rank": r["rank"],
                "Learning Rate": r["lr"],
                "Beta": r["beta"],
                "BLEU Score": r["bleu_score"],
                "Final Eval Loss": r["final_eval_loss"],
                "Model Path": r["model_path"]
            }
            for r in successful_evals
        ])
        
        # Save results
        summary_df.to_csv("dpo_evaluation_results.csv", index=False)
        
        # Save detailed results
        with open("dpo_evaluation_detailed.json", "w") as f:
            json.dump(evaluation_results, f, indent=2)
        
        print("\n" + "="*80)
        print("📊 DPO EVALUATION RESULTS")
        print("="*80)
        print("\n🏆 DPO Models Ranked by BLEU Score:")
        print(summary_df[["Trial", "Description", "Beta", "BLEU Score"]].to_string(index=False))
        
        # Best model analysis
        best_dpo = successful_evals[0]
        print(f"\n🥇 BEST DPO MODEL:")
        print(f"   {best_dpo['description']}")
        print(f"   Beta: {best_dpo['beta']}, LR: {best_dpo['lr']}, Rank: {best_dpo['rank']}")
        print(f"   BLEU Score: {best_dpo['bleu_score']:.4f}")
        print(f"   DPO Eval Loss: {best_dpo['final_eval_loss']:.4f}")
        print(f"   Model Path: {best_dpo['model_path']}")
        
        # Show sample responses from best model
        print(f"\n💬 SAMPLE RESPONSES FROM BEST MODEL:")
        for i, response in enumerate(best_dpo.get("preference_responses", [])[:2]):
            print(f"   Q: {preference_prompts[i]}")
            print(f"   A: {response[:200]}...")
            print()
        
        # Performance comparison
        if len(successful_evals) > 1:
            print(f"\n📈 PERFORMANCE INSIGHTS:")
            bleu_scores = [r["bleu_score"] for r in successful_evals]
            print(f"   Best BLEU: {max(bleu_scores):.4f}")
            print(f"   Worst BLEU: {min(bleu_scores):.4f}")
            print(f"   Average BLEU: {sum(bleu_scores)/len(bleu_scores):.4f}")
            
            # Beta analysis
            betas = [r["beta"] for r in successful_evals]
            best_beta = best_dpo["beta"]
            print(f"   Beta range: {min(betas)} - {max(betas)}")
            print(f"   Best beta: {best_beta}")
        
        print(f"\n✅ Evaluation complete! Results saved to:")
        print("   • dpo_evaluation_results.csv")
        print("   • dpo_evaluation_detailed.json")
        
    else:
        print("❌ No successful DPO evaluations completed.")
        
else:
    print("❌ No DPO models to evaluate.")

print(f"\n🎯 DPO Pipeline Complete!")
print("Your best DPO model is ready for deployment or further fine-tuning.")