### Cell 1: Verify GPU availability for LLM inference


In [None]:
import torch

# Query CUDA once and reuse the answer for both the report and the guard.
cuda_ready = torch.cuda.is_available()
print("CUDA Available:", cuda_ready)
if cuda_ready:
    # Device index 0 is the first visible GPU.
    print("GPU Name:", torch.cuda.get_device_name(0))


### Cell 2: Install and update Hugging Face and acceleration libraries


In [None]:
# IPython shell magics: these lines only run inside a notebook.
# First upgrade the quantization / model-loading stack to the latest releases ...
!pip install -q --upgrade bitsandbytes accelerate transformers
# ... then make sure the remaining runtime dependencies are present (no forced upgrade).
!pip install -q torch sentencepiece tqdm pandas huggingface_hub

### Cell 3: Add HuggingFace Token and Verify.

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the access token into the notebook source: notebooks get shared
# and committed. Read it from the HF_TOKEN environment variable when present,
# otherwise prompt for it without echoing.
HF_TOKEN = (os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")).strip()
if not HF_TOKEN:
    raise ValueError("No Hugging Face token provided.")

login(token=HF_TOKEN)
print("Logged in to Hugging Face")

# Alternative: HuggingFace GUI token input.

# from huggingface_hub import notebook_login
# notebook_login()

### Cell 4: Define all the models.
• We load three instruction-tuned LLMs to evaluate mathematical reasoning performance.

• DeepSeek-Math-7B-Instruct serves as a high-capacity model for complex, multi-step math problems.

• Qwen2.5-Math-7B-Instruct provides a comparable large model with a different training strategy for fair comparison.

• Phi-3.5-mini is a lightweight Microsoft model used to analyze efficiency vs accuracy trade-offs.

• All models are downloaded once and cached locally, ensuring faster execution and reproducible results.

In [None]:
"""
ONE-TIME SETUP: Download all models to cache without loading into RAM
Run this once to download all models, then use the memory-efficient loader
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import gc

CACHE_DIR = "/content/gdrive/MyDrive/LLM_Math_Cache"

# Google Drive must be mounted BEFORE the cache directory is created.
# Otherwise os.makedirs builds a plain local /content/gdrive tree: downloads
# land on the ephemeral runtime disk, and a later drive.mount() on the same
# path is expected to refuse the non-empty mount point.
if not os.path.isdir("/content/gdrive/MyDrive"):
    from google.colab import drive

    drive.mount("/content/gdrive")
os.makedirs(CACHE_DIR, exist_ok=True)

# Models fetched by this one-time download cell.
# The DeepSeek entry uses the same display name and repo-id spelling as the
# loader and inference cells: the Hugging Face cache folder is derived from the
# repo id string, and `model_loaders` is keyed by the display name, so any
# difference in casing makes the later cells miss what was downloaded here.
MODELS = [
    {"name": "Deepseek-Math-7B", "repo": "deepseek-ai/deepseek-math-7b-instruct"},
    {"name": "Qwen2.5-Math-7B-Instruct", "repo": "Qwen/Qwen2.5-Math-7B-Instruct"},
    {"name": "Phi-3.5-mini", "repo": "microsoft/Phi-3.5-mini-instruct"},
]

def is_model_cached(cache_path):
    """Return True when ``cache_path`` looks like a downloaded model.

    A model counts as cached when a ``config.json`` and at least one weight
    file (``*.safetensors`` or ``*.bin``) are found either directly inside
    ``cache_path`` or inside any directory under ``cache_path/snapshots``.
    The two markers may come from different directories.

    Args:
        cache_path: Directory to inspect.

    Returns:
        bool: False when the path is missing, is not a directory, cannot be
        read, or lacks either marker.
    """
    weight_suffixes = (".safetensors", ".bin")

    def scan(directory):
        # -> (has config.json, has at least one weight file)
        names = os.listdir(directory)
        return (
            "config.json" in names,
            any(n.endswith(weight_suffixes) for n in names),
        )

    if not os.path.isdir(cache_path):
        return False

    try:
        has_config, has_weights = scan(cache_path)

        snapshots_dir = os.path.join(cache_path, "snapshots")
        if os.path.isdir(snapshots_dir):
            for snapshot in os.listdir(snapshots_dir):
                snapshot_path = os.path.join(snapshots_dir, snapshot)
                if os.path.isdir(snapshot_path):
                    snap_config, snap_weights = scan(snapshot_path)
                    has_config = has_config or snap_config
                    has_weights = has_weights or snap_weights
        return has_config and has_weights
    except OSError:
        # Only filesystem problems mean "treat as not cached"; anything else
        # (KeyboardInterrupt, programming errors) must not be swallowed.
        return False

# Banner announcing the one-time download step.
print(
    "=" * 70,
    "DOWNLOADING ALL MODELS TO CACHE (One-time Setup)",
    "=" * 70,
    "This will download models WITHOUT loading them into RAM",
    "After this completes, use the memory-efficient loader to test them\n",
    sep="\n",
)

# Touching the name is enough to detect that the login cell was skipped;
# the NameError is re-raised after printing a hint.
try:
    HF_TOKEN
except NameError:
    print("ERROR: HF_TOKEN not found! Run the login cell first.")
    raise

from huggingface_hub import snapshot_download

# Display names of models whose download raised, reported in the summary.
failed_models = []

for i, m in enumerate(MODELS, 1):
    repo = m["repo"]
    name = m["name"]
    # Downloads made with cache_dir=CACHE_DIR land in
    # "<CACHE_DIR>/models--<org>--<name>". Without the "models--" prefix the
    # "already cached" check below can never succeed.
    cache_subdir = "models--" + repo.replace("/", "--")
    cache_path = os.path.join(CACHE_DIR, cache_subdir)

    print(f"\n[{i}/{len(MODELS)}] {name}")
    print("-"*70)

    if is_model_cached(cache_path):
        print("✓ Already cached, skipping download")
        continue

    try:
        print("Downloading model files (this may take several minutes)...")
        # snapshot_download copies the repository files (weights, config and
        # tokenizer files alike) to disk without instantiating anything, so
        # no separate tokenizer load (and no remote-code execution) is needed.
        snapshot_download(
            repo_id=repo,
            cache_dir=CACHE_DIR,
            token=HF_TOKEN,
            ignore_patterns=["*.msgpack", "*.h5", "*.ot", "*.md"]  # Skip unnecessary files
        )
    except Exception as e:
        # Best effort: report the failure and carry on with the next model.
        print(f"✗ FAILED: {str(e)[:200]}")
        if "gated" in str(e).lower() or "access" in str(e).lower():
            print(f"   → Visit https://huggingface.co/{repo} and accept the license")
        failed_models.append(name)
        continue

    print(f"✓ {name} downloaded successfully")

print("\n" + "="*70)
if failed_models:
    # Do not claim success when at least one model is missing.
    print(f"DOWNLOAD FINISHED WITH ERRORS: {', '.join(failed_models)}")
else:
    print("DOWNLOAD COMPLETE!")
print("="*70)
print("\nNext step: Use the memory-efficient loader to test models one at a time")
print("Example:")
print("  pipe = model_loaders['Deepseek-Math-7B']()")
print("  result = pipe('What is 2+2?')")
print("="*70)

### Cell 5: Check the cached model and initialize the 4-bit quantization
• Configures memory-efficient 4-bit quantization to load large language models with reduced GPU and CPU usage.

• Checks whether the DeepSeek-Math-7B model is already cached locally and avoids re-downloading if present.

• Implements lazy loading, ensuring the model is loaded only when required for inference.

• Creates a deterministic text-generation pipeline, enabling reproducible mathematical reasoning experiments.

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from tqdm import tqdm
import os

# 4-bit loading: NF4 weights, bfloat16 compute, nested (double) quantization.
quant_config = BitsAndBytesConfig(
    **{
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_use_double_quant": True,
    }
)

# Hugging Face cache directory on the mounted Google Drive.
CACHE_DIR = "/content/gdrive/MyDrive/LLM_Math_Cache"

# Models exposed through `model_loaders` (display name -> Hub repo id).
MODELS = [
    dict(name="Deepseek-Math-7B", repo="deepseek-ai/deepseek-math-7b-instruct"),
]

def is_model_cached(repo, cache_dir=None):
    """Report whether ``repo`` has a usable snapshot in the Hugging Face cache.

    The cache folder is expected at
    ``<cache_dir>/models--<org>--<name>/snapshots/<revision>/``. The model is
    considered cached as soon as ANY snapshot directory contains, all
    together, a ``config.json``, at least one file with "tokenizer" in its
    name, and at least one ``*.safetensors`` / ``*.bin`` weight file.

    Args:
        repo: Hub repo id such as ``"org/model"``. The folder name is derived
            from this exact string, so casing matters.
        cache_dir: Cache root to inspect. Defaults to the module-level
            ``CACHE_DIR`` when omitted.

    Returns:
        bool: True when a complete-looking snapshot exists. False otherwise,
        including when the directory cannot be read (a warning is printed).
    """
    root = CACHE_DIR if cache_dir is None else cache_dir
    # HuggingFace uses models--org--name format
    snapshots_dir = os.path.join(
        root, f"models--{repo.replace('/', '--')}", "snapshots"
    )

    try:
        if not os.path.isdir(snapshots_dir):
            return False

        for snapshot in os.listdir(snapshots_dir):
            snapshot_path = os.path.join(snapshots_dir, snapshot)
            if not os.path.isdir(snapshot_path):
                continue

            snapshot_files = os.listdir(snapshot_path)
            has_config = "config.json" in snapshot_files
            has_tokenizer = any("tokenizer" in f for f in snapshot_files)
            has_weights = any(
                f.endswith((".safetensors", ".bin")) for f in snapshot_files
            )
            if has_config and has_tokenizer and has_weights:
                return True

        return False
    except OSError as e:
        # Unreadable cache (permissions, Drive hiccup): report and fall back
        # to "not cached" so the caller can still download.
        print(f"  Warning checking cache: {e}")
        return False

def load_single_model(repo, name, local_files_only=False):
    """Load one 4-bit quantized model and wrap it in a text-generation pipeline.

    Args:
        repo: Hub repo id passed to ``from_pretrained``.
        name: Display name; only used to select model-specific load options.
        local_files_only: When True, load strictly from ``CACHE_DIR`` without
            contacting the Hub.

    Returns:
        The ``transformers`` pipeline, or None when anything failed (the
        error is printed, truncated to 200 characters).
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            repo,
            token=HF_TOKEN,
            trust_remote_code=True,
            cache_dir=CACHE_DIR,
            local_files_only=local_files_only,
            force_download=False,
        )

        # Generation needs a pad token; reuse EOS when the tokenizer has none.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Options shared by every model.
        model_kwargs = {
            "quantization_config": quant_config,
            "device_map": "auto",
            "torch_dtype": torch.bfloat16,
            "trust_remote_code": True,
            "token": HF_TOKEN,
            "cache_dir": CACHE_DIR,
            "local_files_only": local_files_only,
            "low_cpu_mem_usage": True,
        }

        # Special handling for Phi-3.5-mini. The original condition tested for
        # "Deepseek-Math-7B", so these overrides were applied to the wrong
        # model and never to Phi-3.5-mini.
        if "Phi-3.5-mini" in name:
            print(f"  → Using eager attention for Phi-3.5-mini")
            model_kwargs["trust_remote_code"] = False
            model_kwargs["attn_implementation"] = "eager"

        model = AutoModelForCausalLM.from_pretrained(repo, **model_kwargs)

        # Greedy decoding (do_sample=False) is already deterministic;
        # `temperature` only applies when sampling, so it is not passed.
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=2048,
            do_sample=False,
        )

        return pipe
    except Exception as e:
        # Deliberate best effort: callers treat None as "model unavailable".
        print(f"  → Error: {str(e)[:200]}")
        return None

def get_model_loader(model_name):
    """Build a zero-argument loader for the model registered as ``model_name``.

    Nothing is loaded here. The returned callable loads the model when it is
    invoked and returns the pipeline (or whatever ``load_single_model``
    returns on failure). The cache lookup happens once, when the loader is
    created, and its result is reused on every call.

    Raises:
        ValueError: ``model_name`` is not present in ``MODELS``.
    """
    # First registry entry with a matching display name, if any.
    repo = next((entry["repo"] for entry in MODELS if entry["name"] == model_name), None)
    if repo is None:
        raise ValueError(f"Model {model_name} not found")

    # A cached model is loaded strictly from disk.
    local_files_only = is_model_cached(repo)

    def loader():
        """Load the model now and return its pipeline."""
        print(f"Loading {model_name}...")
        loaded = load_single_model(repo, model_name, local_files_only)
        if loaded:
            print(f"✓ {model_name} loaded successfully")
        return loaded

    return loader

# Abort early when the login cell has not been run.
try:
    HF_TOKEN
except NameError:
    print("⚠️  ERROR: HF_TOKEN not found! Run the login cell first.")
    raise
else:
    print("✓ HF_TOKEN is set\n")

os.makedirs(CACHE_DIR, exist_ok=True)

print("=" * 70, "MEMORY-EFFICIENT MODEL LOADER", "=" * 70, sep="\n")
print(f"Cache: {CACHE_DIR}\n")

# Report the cache state of every registered model and register its loader.
print("Checking cache status...\n")
model_loaders = {}
cached_count = 0

for m in MODELS:
    repo, name = m["repo"], m["name"]

    is_cached = is_model_cached(repo)
    if is_cached:
        status = "✓ CACHED"
    else:
        status = "✗ NOT CACHED"

    # Folder-level existence is shown separately to help debugging.
    cache_folder = f"models--{repo.replace('/', '--')}"
    cache_path = os.path.join(CACHE_DIR, cache_folder)
    if os.path.exists(cache_path):
        exists = "EXISTS"
    else:
        exists = "MISSING"

    print(f"{status:15} {name:20} ({exists})")

    cached_count += int(is_cached)

    # Lazy loader: the model itself is only loaded when the entry is called.
    model_loaders[name] = get_model_loader(name)

print(f"\n{cached_count}/{len(MODELS)} models are cached")
print("=" * 70)
print("\n✓ model_loaders dictionary created!")
print(f"✓ Ready to run inference on {cached_count} cached models")
print("\nNext: Run the inference cell to test all models")
print("=" * 70)

### Cell 6: Mount GDrive and Load JSON File.

In [None]:
from google.colab import drive
from google.colab import files
import os, shutil

# Step 1: attach Google Drive at /content/gdrive.
drive.mount('/content/gdrive')

# Step 2: destination folder inside Drive (My Drive / LLM_Math_Cache),
# created when it does not exist yet.
gdrive_folder = "/content/gdrive/MyDrive/LLM_Math_Cache"
os.makedirs(gdrive_folder, exist_ok=True)

# Step 3: let the user pick the JSON file; this opens a file picker.
print("➡️ Choose your 00_clean.json file from your computer in the uploader UI")
uploaded = files.upload()

# Step 4: relocate every uploaded file into the Drive folder. Each key is the
# uploaded file's name, used here as its path relative to the working directory.
for local_name in uploaded:
    dst_path = os.path.join(gdrive_folder, local_name)
    shutil.move(local_name, dst_path)
    print(f"✅ Moved {local_name} → {dst_path}")


### Cell 7: Verify that the JSON file was uploaded successfully.

In [None]:
import os
from pathlib import Path
import json

gdrive_folder = "/content/gdrive/MyDrive/LLM_Math_Cache"
json_path = Path(gdrive_folder) / "00_clean.json"

print(f"Checking for: {json_path}")

# 1) Check existence
if json_path.exists():
    print("✅ File exists in Google Drive!")

    # 2) Optional: list the folder contents
    print("\nFiles in folder:")
    !ls -l "/content/gdrive/MyDrive/LLM_Math_Cache"

    # 3) Optional: try loading it to be 100% sure it's valid JSON
    with open(json_path) as f:
        data = json.load(f)
    print(f"\n✅ Successfully loaded JSON with {len(data)} top-level items.")
else:
    print("❌ File NOT found. Double-check the folder name and that you uploaded to this location.")

### Cell 8: Final Inference Code

• This is the final working inference pipeline, used to run mathematical questions through the selected LLM (currently DeepSeek-Math-7B) using 4-bit quantization for memory-efficient execution on limited GPU resources.

• It loads a cleaned JSON dataset of math questions, performs deterministic step-by-step inference, and records model responses along with performance metrics such as response time and tokens per second.

• The code includes aggressive GPU and memory cleanup, ensuring stable long-running execution and preventing out-of-memory errors during batch inference.

• All results are saved to CSV files, including intermediate checkpoints and a final consolidated output, providing the definitive output for evaluation and analysis in this project.

In [None]:
# ==================== FINAL WORKING INFERENCE CELL (Default HF Cache) ====================
import json
import time
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import gc
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ------------------- CONFIG -------------------
# Input questions (on Drive) and the final results file (relative to the
# current working directory).
JSON_PATH = Path("/content/gdrive/MyDrive/LLM_Math_Cache") / "00_clean.json"
OUTPUT_CSV = Path("00_clean_results_final.csv")

# System message sent with every question.
SYSTEM_PROMPT = "You are a precise, concise, and truthful math assistant. Solve step-by-step."

# Upper bound on generated tokens per answer.
MAX_NEW_TOKENS = 1500

# 4-bit quantization (fits 7B models easily on free Colab T4)
quant_config = BitsAndBytesConfig(
    **{
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_use_double_quant": True,
    }
)

# ------------------- MODELS TO TEST -------------------
# Uncomment an entry to include that model in the run.
MODELS = [
    dict(name="Deepseek-Math-7B", repo="deepseek-ai/deepseek-math-7b-instruct"),
    # {"name": "Qwen2.5-Math-7B-Instruct", "repo": "Qwen/Qwen2.5-Math-7B-Instruct"},
    # {"name": "Phi-3.5-mini", "repo": "microsoft/Phi-3.5-mini-instruct"},
]

# ------------------- HELPERS -------------------
def aggressive_cleanup():
    """Release as much Python and GPU memory as possible.

    Runs the garbage collector, then asks CUDA to drop its cached blocks and
    collect IPC memory. The CUDA calls are skipped when no GPU is available,
    so the helper is also safe on a CPU-only runtime instead of raising while
    trying to initialise CUDA.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

def load_questions(path):
    """Load the question dataset from a JSON file.

    Args:
        path: Location of the JSON file (``str`` or ``Path``).

    Returns:
        The parsed JSON content. The number of top-level items is printed.

    Raises:
        OSError: The file cannot be opened.
        json.JSONDecodeError: The file is not valid JSON.
    """
    # Pin the encoding so non-ASCII maths symbols decode identically on every
    # platform instead of depending on the locale default.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    print(f"Loaded {len(data)} questions")
    return data

questions = load_questions(JSON_PATH)

# ------------------- MAIN LOOP -------------------
# One row per (model, question); failed generations are recorded as rows with
# response_time_sec == -1 so they can be filtered out later.
all_results = []

for model_info in MODELS:
    name = model_info["name"]
    repo = model_info["repo"]

    print("\n" + "="*80)
    print(f"LOADING & RUNNING: {name}")
    print("="*80)

    # Start every model from a clean GPU state.
    aggressive_cleanup()
    time.sleep(2)

    try:
        print(f"Loading {repo} (4-bit) ...")
        tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            repo,
            quantization_config=quant_config,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )

        # Greedy decoding (do_sample=False) is deterministic on its own;
        # `temperature` only applies when sampling, so it is not passed.
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )
        print(f"Model {name} loaded successfully!")

    except Exception as e:
        # A model that cannot be loaded is skipped; the others still run.
        print(f"Failed to load {name}: {e}")
        continue

    # ------------------- Inference -------------------
    for idx, item in enumerate(tqdm(questions, desc=f"{name[:20]:<20}")):
        qid = item.get("id", idx)
        question = item["question"]

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": question}
        ]

        # perf_counter is monotonic, unlike the wall clock, so intervals are
        # not affected by system clock adjustments.
        start = time.perf_counter()
        try:
            out = pipe(messages)
            # With chat input the pipeline returns the whole conversation;
            # the last message is the newly generated assistant turn.
            response = out[0]["generated_text"][-1]["content"].strip()

            elapsed = time.perf_counter() - start
            # Count generated text only: special tokens the tokenizer would
            # add (e.g. BOS) are not part of the model's output.
            out_tokens = len(tokenizer.encode(response, add_special_tokens=False))

            all_results.append({
                "id": qid,
                "model": name,
                "question": question,
                "response": response,
                "response_time_sec": round(elapsed, 3),
                "tokens_output": out_tokens,
                "tokens_per_sec": round(out_tokens / max(elapsed, 0.01), 2),
            })
        except Exception as e:
            # Keep going: record the failure so the row count stays aligned
            # with the dataset.
            all_results.append({
                "id": qid, "model": name, "question": question,
                "response": f"ERROR: {e}", "response_time_sec": -1,
                "tokens_output": 0, "tokens_per_sec": 0,
            })

        # Light cleanup every 10 questions
        if (idx + 1) % 10 == 0:
            gc.collect()
            torch.cuda.empty_cache()

    # ------------------- Cleanup model -------------------
    del model, tokenizer, pipe
    aggressive_cleanup()

    # Save checkpoint after each model (cumulative: contains all models so far)
    pd.DataFrame(all_results).to_csv(f"results_up_to_{name}.csv", index=False)
    print(f"Checkpoint saved for {name}")

# ------------------- FINAL SAVE & SUMMARY -------------------
# Persist every collected row, successful or not.
df = pd.DataFrame(all_results)
df.to_csv(OUTPUT_CSV, index=False)
print(f"\nALL DONE! Saved {len(df)} rows → {OUTPUT_CSV}")

# Per-model averages over successful generations only (failed rows carry a
# response time of -1 and are excluded by the > 0 filter).
if len(df) > 0 and "response_time_sec" in df.columns:
    succeeded = df["response_time_sec"] > 0
    valid = df[succeeded]
    if len(valid) > 0:
        per_model = valid.groupby("model")
        summary = per_model.agg(
            tokens_per_sec=("tokens_per_sec", "mean"),
            response_time_sec=("response_time_sec", "mean"),
            completed=("id", "count"),
        ).round(2)
        print("\nPerformance Summary:")
        print(summary.sort_values("tokens_per_sec", ascending=False))

### Socratic Debugging Code

### Cell 1: Setup — Drive, packages, imports, and GPU check.

In [None]:
# Create cache folder
!mkdir -p /content/gdrive/MyDrive/LLM_Math_Cache

# Upgrade critical packages
!pip install -q --upgrade bitsandbytes accelerate transformers
!pip install -q torch sentencepiece tqdm pandas huggingface_hub

import os, re, time, json, random
import pandas as pd
from tqdm import tqdm

import torch
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
import torch
# Environment report. Each fact is printed once (the original printed the
# CUDA and GPU lines twice), and the Drive status is read from the filesystem
# rather than asserted unconditionally.
gpu_available = torch.cuda.is_available()
print("Torch:", torch.__version__)
print("CUDA available:", gpu_available)
print("GPU:", torch.cuda.get_device_name(0) if gpu_available else "NO GPU!")
print("Drive mounted:", os.path.isdir("/content/gdrive/MyDrive"))
print("Packages updated")


### Cell 2: Login to HuggingFace

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: an access token used to be hardcoded on this line. Treat that
# token as compromised and revoke it at https://huggingface.co/settings/tokens.
# The token is now taken from the HF_TOKEN environment variable, or asked for
# interactively without echo, so it never lands in the notebook source.
HF_TOKEN = (os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")).strip()
if not HF_TOKEN:
    raise ValueError("No Hugging Face token provided.")

login(token=HF_TOKEN)
print("Logged in to Hugging Face")

### Cell 3: Load Path, Model, and quantization information

In [None]:
# Drive is mounted at /content/gdrive everywhere else in this notebook (the
# setup cell creates /content/gdrive/MyDrive/LLM_Math_Cache). The previous
# /content/drive paths pointed at a directory that is never mounted, so the
# model cache and the results were written to the ephemeral runtime disk.
CACHE_DIR = "/content/gdrive/MyDrive/LLM_Math_Cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# Raw experiment outputs (CSV) are written here.
OUT_DIR = "/content/gdrive/MyDrive/Socratic_Debugging_Raw"
os.makedirs(OUT_DIR, exist_ok=True)

# Repo id spelled exactly as in the other cells: the cache folder name is
# derived from this string, so a different casing forces a second download.
MODELS = [
    {"name": "Deepseek-Math-7B", "repo": "deepseek-ai/deepseek-math-7b-instruct"},
    # {"name": "Qwen2.5-Math-7B-Instruct", "repo": "Qwen/Qwen2.5-Math-7B-Instruct"},
    # {"name": "Phi-3.5-mini", "repo": "microsoft/Phi-3.5-mini-instruct"},
]

# 4-bit loading: NF4 weights, bfloat16 compute, nested (double) quantization.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

### Cell 4: Load questions from a JSON file, then clean and normalize the content.

In [None]:
QUESTIONS_PATH = "/content/00_clean.json"  # change if needed

# Decode explicitly as UTF-8: relying on the platform default is a classic
# source of mis-decoded punctuation in the question text.
with open(QUESTIONS_PATH, "r", encoding="utf-8") as f:
    QUESTIONS = json.load(f)

print("Loaded questions:", len(QUESTIONS))
# Guard the preview so an empty dataset is reported instead of raising.
if QUESTIONS:
    print("First item:", QUESTIONS[0])

# Mojibake produced when UTF-8 punctuation is decoded as Windows-1252, mapped
# back to the intended character. Keys are written as escapes so the table
# cannot itself be damaged by an encoding round trip of this file.
_MOJIBAKE_FIXES = {
    "\u00e2\u20ac\u2122": "\u2019",  # right single quote
    "\u00e2\u20ac\u02dc": "\u2018",  # left single quote
    "\u00e2\u20ac\u0153": "\u201c",  # left double quote
    "\u00e2\u20ac\ufffd": "\u201d",  # right double quote (third byte lost to U+FFFD)
    "\u00e2\u20ac\u009d": "\u201d",  # right double quote (third byte kept as C1 control)
    "\u00e2\u20ac\u201c": "\u2013",  # en dash
    "\u00e2\u20ac\u2013": "\u2013",  # en dash (partially repaired variant)
    "\u00e2\u20ac\u201d": "\u2014",  # em dash
    "\u00e2\u20ac\u2014": "\u2014",  # em dash (partially repaired variant)
    "\u00e2\u20ac\u00a6": "\u2026",  # ellipsis
    "\u00cb\u0161": "\u00b0",        # ring above, normalised to a degree sign
}
_MOJIBAKE_RE = re.compile("|".join(re.escape(bad) for bad in _MOJIBAKE_FIXES))
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(s: str) -> str:
    """Repair common mojibake and normalise whitespace.

    Args:
        s: Text to clean. Non-string values are converted with ``str()`` and
            returned without further processing.

    Returns:
        The text with known mojibake sequences replaced by the intended
        punctuation, every whitespace run collapsed to one space, and
        leading/trailing whitespace removed.
    """
    if not isinstance(s, str):
        return str(s)
    # Single pass: text produced by one replacement is never re-scanned, so a
    # fix cannot combine with neighbouring characters into another pattern.
    s = _MOJIBAKE_RE.sub(lambda match: _MOJIBAKE_FIXES[match.group(0)], s)
    return _WHITESPACE_RE.sub(" ", s).strip()

# Normalise the question text in place so QUESTIONS and PROBLEMS agree.
for q in QUESTIONS:
    q["question"] = clean_text(q["question"])

# Flatten each question into the record shape used by the experiment loop.
PROBLEMS = [
    {
        "id": f"Q{q['id']}",          # string id
        "qid": int(q["id"]),          # numeric id
        "difficulty": q.get("difficulty", "Unknown"),
        "human_type": q.get("human_type", "Unknown"),
        "text": q["question"],
    }
    for q in QUESTIONS
]

print("Total problems:", len(PROBLEMS))
# Show the third problem when there is one, otherwise the last available;
# a dataset with fewer than three entries no longer raises IndexError.
if PROBLEMS:
    print("Example:", PROBLEMS[min(2, len(PROBLEMS) - 1)])



### CELL 5 — Socratic Debugging prompts

In [None]:
# Turn 1 prompt: asks for a step-by-step solution. `{problem}` is filled in
# with the cleaned question text via str.format.
TURN1_TEMPLATE = """Solve the following problem step-by-step.
Explain each reasoning step clearly, including why you chose specific methods or variables.
If applicable, identify and justify any assumptions.

Problem: {problem}
"""

# Turn 2 prompt: asks the model to critique its own Turn 1 answer.
# NOTE(review): the experiment cell assigns TURN2_TEMPLATE again with a
# slightly different rule list, so this definition is only in effect until
# that cell runs — confirm which wording is the intended one.
TURN2_TEMPLATE = """You must critique and re-evaluate ONLY your previous solution above.

Rules:
- Do NOT restate the problem.
- Do NOT repeat your full Turn 1 solution.
- Then give the corrected solution concisely.
- Finish with a single line: FINAL: <final answer only> and if it needed to be corrected or not.

Now perform the re-evaluation.

"""

### Cell 6: Load model/tokenizer

In [None]:
def load_model(repo: str):
    """Load the 4-bit quantized model and its fast tokenizer for ``repo``.

    Both are fetched through ``CACHE_DIR``. The model is switched to eval
    mode, and a tokenizer without a pad token gets its EOS token as pad.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    hub_cache = {"cache_dir": CACHE_DIR}

    tokenizer = AutoTokenizer.from_pretrained(repo, use_fast=True, **hub_cache)

    model = AutoModelForCausalLM.from_pretrained(
        repo,
        quantization_config=quant_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        **hub_cache,
    )
    model.eval()

    # Align pad with EOS only when pad is missing and an EOS token exists;
    # this avoids padding warnings during generation.
    pad_missing = tokenizer.pad_token_id is None
    eos_present = tokenizer.eos_token_id is not None
    if pad_missing and eos_present:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


### Cell 7: Chat Formatting & Response Generation
• Formats user, system, and assistant messages into a chat-style prompt, using the model’s native chat template when available for compatibility and correctness.

• Provides a fallback prompt construction method to ensure robustness across models that do not support chat templates.

• Generates model responses in inference-only mode, disabling gradients to improve performance and reduce memory usage.

• Produces clean final answers by removing the prompt text from the generated output, returning only the assistant’s response.

In [None]:
def build_chat_input(tokenizer, messages):
    """Render chat ``messages`` into a single prompt string.

    Args:
        tokenizer: Object that may provide ``apply_chat_template``.
        messages: ``[{"role": "user"/"assistant"/"system", "content": "..."}]``

    Returns:
        The prompt produced by the tokenizer's chat template (untokenized,
        with the generation prompt appended) when that works. Otherwise a
        plain ``ROLE: content`` transcript, one message per line, ending
        with ``ASSISTANT:``.
    """
    apply_template = getattr(tokenizer, "apply_chat_template", None)
    if callable(apply_template):
        try:
            return apply_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
        except ValueError:
            # Merely having the method is not enough: a tokenizer that ships
            # without a chat template is expected to raise ValueError here.
            # The original `hasattr` check could never reach the fallback in
            # that case, so fall through to it explicitly.
            pass

    # Fallback: simple concatenation.
    transcript = [f"{m['role'].upper()}: {m['content']}\n" for m in messages]
    return "".join(transcript) + "ASSISTANT:"

@torch.inference_mode()
def generate_response(model, tokenizer, messages, max_new_tokens=512, temperature=0.0, top_p=1.0):
    """Generate the assistant reply for ``messages`` and return only that reply.

    Args:
        model: Causal LM providing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        messages: Chat history as a list of ``{"role", "content"}`` dicts.
        max_new_tokens: Upper bound on generated tokens.
        temperature: ``0`` selects greedy decoding; any positive value enables
            sampling with this temperature.
        top_p: Nucleus-sampling threshold, only used when sampling.

    Returns:
        The newly generated text, stripped, without the prompt.
    """
    prompt = build_chat_input(tokenizer, messages)

    # A chat template may already have written the BOS token into the prompt;
    # letting the tokenizer add special tokens again would duplicate it.
    bos_token = getattr(tokenizer, "bos_token", None)
    add_special_tokens = not (bos_token and prompt.startswith(bos_token))
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=add_special_tokens,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    do_sample = temperature > 0

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    if do_sample:
        gen_kwargs.update(dict(temperature=temperature, top_p=top_p))

    out = model.generate(**inputs, **gen_kwargs)

    # The returned sequence starts with the prompt tokens, so slice them off
    # by position. Comparing decoded text against the prompt string is
    # unreliable: decoding drops special tokens the prompt contains, the
    # prefix test fails, and the whole prompt leaks into the "response".
    new_tokens = out[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


### Cell 8: Socratic Evaluation & Experiment Execution

• Runs the final experiment loop where each model is evaluated over multiple shuffled runs to ensure robustness and reduce order bias.

• Performs two-turn generation per problem:

1.   Turn 1: the model generates an initial solution.
2.   Turn 2: the model critiques and corrects its own previous answer without restating the problem.


• Records detailed metadata for each run, including model details, difficulty level, prompts, responses, decoding parameters, and timestamps.

• Saves all raw model outputs into a CSV file, producing the final experimental dataset used for quantitative and qualitative analysis.

In [None]:
# Generation budgets for the two turns (initial solution / self-critique).
MAX_NEW_TOKENS_T1 = 1024
MAX_NEW_TOKENS_T2 = 768
# Decoding parameters passed to generate_response and recorded with each row.
TEMPERATURE = 0.0
TOP_P = 1.0
# Number of passes over the problem set per model.
N_RUNS = 3

# Set USE_SUBSET to True to run only the first SUBSET_N problems.
USE_SUBSET = False
SUBSET_N = 10
problems_to_run = PROBLEMS[:SUBSET_N] if USE_SUBSET else PROBLEMS

# System message placed first in the conversation for both turns.
SYSTEM_MSG = {
    "role": "system",
    "content": "Follow instructions exactly. Do not repeat the problem unless asked. In Turn 2, critique the prior answer rather than restarting."
}

# Turn 2 prompt actually used by the loop below. This assignment replaces the
# TURN2_TEMPLATE defined in the prompts cell.
# NOTE(review): the last rule reads "<final answer only>and" with no space
# before "and" — left untouched because the prompt text is experiment input;
# confirm whether it should be corrected.
TURN2_TEMPLATE = """You must critique and re-evaluate ONLY your previous solution above.

Rules:
- Do NOT restate the problem.
- Do NOT repeat your full Turn 1 solution.
- Start by identifying the first incorrect step (if any), or say "No conceptual error found".
- Provide a corrected solution concisely.
- Finish with a single line: FINAL: <final answer only>and if it needed to be corrected or not.

Now perform the re-evaluation.
"""

import gc
import zlib

# One row per (model, run, problem) holding both turns and their settings.
RESULTS = []

for m in MODELS:
    print(f"\n=== Loading: {m['name']} ({m['repo']}) ===")
    model, tokenizer = load_model(m["repo"])

    for run_id in range(1, N_RUNS + 1):
        probs = problems_to_run.copy()
        # Derive the shuffle seed from a stable checksum. The built-in hash()
        # of a string is salted per interpreter process, so a seed based on
        # it gives a different problem order in every session.
        seed = zlib.crc32(f"{m['name']}:{run_id}".encode("utf-8"))
        rnd = random.Random(seed)
        rnd.shuffle(probs)

        for p in tqdm(probs, desc=f"{m['name']} run {run_id}", leave=False):
            q_text = p["text"]

            turn1_prompt = TURN1_TEMPLATE.format(problem=q_text)

            # Turn 1: initial solution.
            messages1 = [SYSTEM_MSG, {"role": "user", "content": turn1_prompt}]
            t1 = generate_response(
                model, tokenizer, messages1,
                max_new_tokens=MAX_NEW_TOKENS_T1,
                temperature=TEMPERATURE,
                top_p=TOP_P,
            )

            # Turn 2: the model sees its own Turn 1 answer and critiques it.
            turn2_prompt = TURN2_TEMPLATE
            messages2 = [
                SYSTEM_MSG,
                {"role": "user", "content": turn1_prompt},
                {"role": "assistant", "content": t1},
                {"role": "user", "content": turn2_prompt},
            ]
            t2 = generate_response(
                model, tokenizer, messages2,
                max_new_tokens=MAX_NEW_TOKENS_T2,
                temperature=TEMPERATURE,
                top_p=TOP_P,
            )

            RESULTS.append({
                "model": m["name"],
                "repo": m["repo"],
                "run_id": run_id,
                "shuffle_seed": seed,

                "question_id": p["qid"],
                "problem_id": p["id"],
                "difficulty": p["difficulty"],
                "human_type": p["human_type"],

                "question_text": q_text,

                "turn1_prompt": turn1_prompt,
                "turn1_response": t1,

                "turn2_prompt": turn2_prompt,
                "turn2_response": t2,

                "max_new_tokens_t1": MAX_NEW_TOKENS_T1,
                "max_new_tokens_t2": MAX_NEW_TOKENS_T2,
                "temperature": TEMPERATURE,
                "top_p": TOP_P,

                "timestamp_utc": int(time.time()),
            })

    # Drop every reference to the finished model before clearing the CUDA
    # cache; memory still referenced from Python cannot be released.
    del model, tokenizer
    gc.collect()
    torch.cuda.empty_cache()

df = pd.DataFrame(RESULTS)
print("Rows:", len(df))

# The timestamp in the file name keeps earlier result files from being overwritten.
csv_path = os.path.join(OUT_DIR, f"socratic_debugging_raw_{int(time.time())}.csv")
df.to_csv(csv_path, index=False)
print("Saved raw responses to:", csv_path)

df.head()