In [1]:
!pip install -q transformers accelerate sentencepiece

import json
import math
import random
import gc
from dataclasses import dataclass
from typing import Dict, List, Optional

import torch
import pandas as pd
from torch.nn.functional import log_softmax
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm.auto import tqdm

# Configurations
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

SEED = 1234
FEW_SHOT_K = 3
EVAL_ZERO_SHOT = True
EVAL_FEW_SHOT = True

# Normalize by length to compare shorter and longer words fairly.
NORMALIZE_BY_LENGTH = True

random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("Using device:", DEVICE)

# Data
try:
    df = pd.read_csv("dataset.csv").reset_index(drop=True)
    print(f"Dataset Shape: {df.shape}")
except FileNotFoundError:
    print("dataset.csv not found. Please upload it.")
    df = pd.DataFrame()

# Helper functions

def parse_candidates(row) -> (List[str], List[str]):
    """
    Splits the 'Correct Words' and 'Wrong Words' strings into individual valid answers.
    Example: "working together" -> ['working', 'together']
    """
    # Cleanup to handle CSV formatting
    c_str = str(row["Correct Words"]).replace("[", "").replace("]", "").replace("'", "").replace(",", " ")
    w_str = str(row["Wrong Words"]).replace("[", "").replace("]", "").replace("'", "").replace(",", " ")

    # Split by whitespace
    return c_str.split(), w_str.split()

def get_few_shot_demo_indices(n_items: int, target_index: int, k: int = FEW_SHOT_K, seed: int = SEED) -> List[int]:
    assert n_items > 1
    rng = random.Random(seed + target_index)
    pool = [j for j in range(n_items) if j != target_index]
    return rng.sample(pool, k)

def append_think_directive(question: str, think_mode: Optional[str]) -> str:
    if think_mode == "think": return question + " /think"
    elif think_mode == "no_think": return question + " /no_think"
    else: return question

def build_zero_shot_prompt(row, think_mode: Optional[str] = None) -> str:
    context = str(row["Context"]).strip()
    question = append_think_directive(str(row["Question"]).strip(), think_mode)
    return f"{context}\n{question}\nAnswer with one word only. Answer:"

def build_few_shot_prompt(target_row, demo_rows: List, think_mode: Optional[str] = None) -> str:
    parts = ["Consider the following examples:\n\n"]
    for demo in demo_rows:
        correct_words, _ = parse_candidates(demo)
        correct_primary = correct_words[0] if correct_words else ""
        parts.append(f"{str(demo['Context']).strip()}\n")
        parts.append(f"{str(demo['Question']).strip()}\n")
        parts.append(f"The correct answer is: {correct_primary}\n\n")

    target_context = str(target_row["Context"]).strip()
    target_question = append_think_directive(str(target_row["Question"]).strip(), think_mode)

    parts.append(f"{target_context}\n")
    parts.append(f"{target_question}\n")
    parts.append("Answer with one word only. Answer:")
    return "".join(parts)

# Scoring engine

def score_candidates_for_prompt(
    model,
    tokenizer,
    prompt: str,
    candidates: List[str],
    device: torch.device = DEVICE,
    normalize: bool = True
) -> Dict[str, float]:
    """
    Calculates the probability for every candidate word.
    1. Tests " " + word (Space)
    2. Tests "" + word (No Space)
    3. Takes the max score of those two.
    """
    scores: Dict[str, float] = {}

    # Prefix encoding (Prompt without the answer)
    prompt_enc = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    prompt_ids = prompt_enc["input_ids"].to(device)
    prefix_len = prompt_ids.shape[-1]

    for cand_raw in candidates:
        # We test both variants to find what the model prefers
        # Variant A: " working"
        # Variant B: "working"
        variants = [(" " + cand_raw.strip()), (cand_raw.strip())]
        variant_scores = []

        for cand_text in variants:
            full_text = prompt + cand_text
            enc = tokenizer(full_text, return_tensors="pt", add_special_tokens=True)
            input_ids = enc["input_ids"].to(device)

            if input_ids.shape[-1] <= prefix_len:
                variant_scores.append(-9999.0)
                continue

            cand_ids = input_ids[0, prefix_len:]
            n_tokens = len(cand_ids)

            with torch.no_grad():
                outputs = model(input_ids)
            logits = outputs.logits

            logprob = 0.0
            for j, tok_id in enumerate(cand_ids):
                # Logit for token `j` is at index `prefix_len + j - 1`
                logit_pos = prefix_len + j - 1
                if logit_pos < 0: logit_pos = 0

                step_log_probs = log_softmax(logits[0, logit_pos], dim=-1)
                logprob += float(step_log_probs[tok_id])

            # Normalize by length (Average log-prob per token)
            if normalize and n_tokens > 0:
                logprob /= n_tokens

            variant_scores.append(logprob)

        scores[cand_raw] = max(variant_scores)

    return scores

# Models

@dataclass
class ModelSpec:
    label: str
    hf_name: str
    supports_think: bool = False

MODEL_SPECS: List[ModelSpec] = [
    # ---------- Pythia Suite ----------
    ModelSpec("Pythia-70m-deduped",  "EleutherAI/pythia-70m-deduped"),
    ModelSpec("Pythia-160m-deduped", "EleutherAI/pythia-160m-deduped"),
    ModelSpec("Pythia-410m-deduped", "EleutherAI/pythia-410m-deduped"),
    ModelSpec("Pythia-1b-deduped",   "EleutherAI/pythia-1b-deduped"),
    ModelSpec("Pythia-1.4b-deduped", "EleutherAI/pythia-1.4b-deduped"),
    ModelSpec("Pythia-2.8b-deduped", "EleutherAI/pythia-2.8b-deduped"),
    ModelSpec("Pythia-6.9b-deduped", "EleutherAI/pythia-6.9b-deduped"),
    ModelSpec("Pythia-12b-deduped",  "EleutherAI/pythia-12b-deduped"),

    # ---------- Olmo 3 ----------
    ModelSpec("Olmo-3-7B-Base",     "allenai/Olmo-3-1025-7B"),
    ModelSpec("Olmo-3-7B-Instruct", "allenai/Olmo-3-7B-Instruct"),
    ModelSpec("Olmo-3-7B-Think",    "allenai/Olmo-3-7B-Think"),

    # ---------- Mistral ----------
    ModelSpec("Mistral-7B-v0.3",          "mistralai/Mistral-7B-v0.3"),
    ModelSpec("Mistral-7B-Instruct-v0.3", "mistralai/Mistral-7B-Instruct-v0.3"),

    # ---------- Qwen3 ----------
    ModelSpec("Qwen3-4B-Base",     "Qwen/Qwen3-4B-Base"),
    ModelSpec("Qwen3-4B-Instruct", "Qwen/Qwen3-4B", supports_think=True),
    ModelSpec("Qwen3-4B-Thinking", "Qwen/Qwen3-4B-Thinking-2507"),

    # ---------- SmolLM3 ----------
    ModelSpec("SmolLM3-3B-Base",     "HuggingFaceTB/SmolLM3-3B-Base"),
    ModelSpec("SmolLM3-3B-Instruct", "HuggingFaceTB/SmolLM3-3B", supports_think=True),
]

MODEL_LABELS_TO_RUN = [
    # Pythia
    "Pythia-70m-deduped",
    "Pythia-160m-deduped",
    "Pythia-410m-deduped",
    "Pythia-1b-deduped",
    "Pythia-1.4b-deduped",
    "Pythia-2.8b-deduped",
    "Pythia-6.9b-deduped",
    "Pythia-12b-deduped",
    # Olmo 3
    "Olmo-3-7B-Base",
    "Olmo-3-7B-Instruct",
    "Olmo-3-7B-Think",
    # Mistral
    "Mistral-7B-v0.3",
    "Mistral-7B-Instruct-v0.3",
    # Qwen3
    "Qwen3-4B-Base",
    "Qwen3-4B-Instruct",
    "Qwen3-4B-Thinking",
    # SmolLM3
    "SmolLM3-3B-Base",
    "SmolLM3-3B-Instruct",
]


def load_model_and_tokenizer(spec: ModelSpec):
    print(f"\n=== Loading {spec.label} ({spec.hf_name}) ===")
    try:
        tokenizer = AutoTokenizer.from_pretrained(spec.hf_name)
        if tokenizer.pad_token is None:
            if tokenizer.eos_token is not None:
                tokenizer.pad_token = tokenizer.eos_token
            else:
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        model = AutoModelForCausalLM.from_pretrained(
            spec.hf_name,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto",
            trust_remote_code=True
        )
        model.eval()
        return tokenizer, model
    except Exception as e:
        print(f"Failed to load {spec.label}: {e}")
        return None, None

def evaluate_model_on_dataset(spec: ModelSpec, df: pd.DataFrame) -> pd.DataFrame:
    tokenizer, model = load_model_and_tokenizer(spec)
    if model is None: return pd.DataFrame()

    n_items = len(df)
    results: List[Dict] = []
    think_modes = ["no_think", "think"] if spec.supports_think else [None]

    for idx in tqdm(range(n_items), desc=f"Items for {spec.label}"):
        row = df.iloc[idx]
        correct_words, wrong_words = parse_candidates(row)
        options = sorted(set(correct_words + wrong_words))

        demo_indices = get_few_shot_demo_indices(n_items, idx, FEW_SHOT_K, SEED)
        demo_rows = [df.iloc[j] for j in demo_indices]

        for think_mode in think_modes:
            think_label = think_mode if think_mode else "standard"

            shot_settings = []
            if EVAL_ZERO_SHOT: shot_settings.append(("zero_shot", build_zero_shot_prompt(row, think_mode)))
            if EVAL_FEW_SHOT: shot_settings.append(("few_shot", build_few_shot_prompt(row, demo_rows, think_mode)))

            for prompt_type, prompt_str in shot_settings:
                setting_name = f"{prompt_type}_{think_label}"

                # Get Scores
                cand_scores = score_candidates_for_prompt(
                    model, tokenizer, prompt_str, options, DEVICE,
                    normalize=NORMALIZE_BY_LENGTH
                )

                # Pick Winner
                best_cand = max(cand_scores.items(), key=lambda kv: kv[1])[0]

                # Check Correctness
                is_correct = best_cand in set(correct_words)

                # Normalized probabilities
                try:
                    vals = torch.tensor(list(cand_scores.values()))
                    denom = torch.logsumexp(vals, dim=0)
                    probs = {k: math.exp(v - denom) for k, v in cand_scores.items()}
                except:
                    probs = {k: 0.0 for k in cand_scores}

                results.append(dict(
                    model_label=spec.label,
                    model_hf_name=spec.hf_name,
                    prompt_type=prompt_type,
                    think_mode=think_label,
                    setting=setting_name,
                    item_index=idx,
                    chosen_candidate=best_cand,
                    chosen_is_correct=is_correct,
                    candidate_logits=json.dumps(cand_scores),
                    candidate_probs_normalized=json.dumps(probs)
                ))

    del model; del tokenizer; gc.collect(); torch.cuda.empty_cache()
    return pd.DataFrame(results)

# Run

all_results = []
for spec in MODEL_SPECS:
    if spec.label not in MODEL_LABELS_TO_RUN: continue

    print(f"Starting evaluation for {spec.label}...")
    df_model = evaluate_model_on_dataset(spec, df)

    if not df_model.empty:
        settings = df_model['setting'].unique()
        for setting in settings:
            df_subset = df_model[df_model['setting'] == setting]
            out_name = f"{spec.label}_{setting}.csv"
            df_subset.to_csv(out_name, index=False)
            print(f"Saved {out_name}")
        all_results.append(df_model)

print("Done.")

Using device: cuda
Dataset Shape: (50, 4)
Starting evaluation for Pythia-70m-deduped...

=== Loading Pythia-70m-deduped (EleutherAI/pythia-70m-deduped) ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

Items for Pythia-70m-deduped:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Pythia-70m-deduped_zero_shot_standard.csv
Saved Pythia-70m-deduped_few_shot_standard.csv
Starting evaluation for Pythia-160m-deduped...

=== Loading Pythia-160m-deduped (EleutherAI/pythia-160m-deduped) ===


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/375M [00:00<?, ?B/s]

Items for Pythia-160m-deduped:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Pythia-160m-deduped_zero_shot_standard.csv
Saved Pythia-160m-deduped_few_shot_standard.csv
Starting evaluation for Pythia-410m-deduped...

=== Loading Pythia-410m-deduped (EleutherAI/pythia-410m-deduped) ===


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]

Items for Pythia-410m-deduped:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Pythia-410m-deduped_zero_shot_standard.csv
Saved Pythia-410m-deduped_few_shot_standard.csv
Starting evaluation for Pythia-1b-deduped...

=== Loading Pythia-1b-deduped (EleutherAI/pythia-1b-deduped) ===


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

Items for Pythia-1b-deduped:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Pythia-1b-deduped_zero_shot_standard.csv
Saved Pythia-1b-deduped_few_shot_standard.csv
Starting evaluation for Pythia-1.4b-deduped...

=== Loading Pythia-1.4b-deduped (EleutherAI/pythia-1.4b-deduped) ===


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.93G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.93G [00:00<?, ?B/s]

Items for Pythia-1.4b-deduped:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Pythia-1.4b-deduped_zero_shot_standard.csv
Saved Pythia-1.4b-deduped_few_shot_standard.csv
Starting evaluation for Pythia-2.8b-deduped...

=== Loading Pythia-2.8b-deduped (EleutherAI/pythia-2.8b-deduped) ===


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

Items for Pythia-2.8b-deduped:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Pythia-2.8b-deduped_zero_shot_standard.csv
Saved Pythia-2.8b-deduped_few_shot_standard.csv
Starting evaluation for Pythia-6.9b-deduped...

=== Loading Pythia-6.9b-deduped (EleutherAI/pythia-6.9b-deduped) ===


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.91G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Items for Pythia-6.9b-deduped:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Pythia-6.9b-deduped_zero_shot_standard.csv
Saved Pythia-6.9b-deduped_few_shot_standard.csv
Starting evaluation for Pythia-12b-deduped...

=== Loading Pythia-12b-deduped (EleutherAI/pythia-12b-deduped) ===


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/9.81G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/4.11G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Items for Pythia-12b-deduped:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Pythia-12b-deduped_zero_shot_standard.csv
Saved Pythia-12b-deduped_few_shot_standard.csv
Starting evaluation for Olmo-3-7B-Base...

=== Loading Olmo-3-7B-Base (allenai/Olmo-3-1025-7B) ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

`rope_scaling`'s beta_fast field must be a float, got 32
`rope_scaling`'s beta_slow field must be a float, got 1


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.64G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

Items for Olmo-3-7B-Base:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Olmo-3-7B-Base_zero_shot_standard.csv
Saved Olmo-3-7B-Base_few_shot_standard.csv
Starting evaluation for Olmo-3-7B-Instruct...

=== Loading Olmo-3-7B-Instruct (allenai/Olmo-3-7B-Instruct) ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/581 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/204 [00:00<?, ?B/s]

Items for Olmo-3-7B-Instruct:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Olmo-3-7B-Instruct_zero_shot_standard.csv
Saved Olmo-3-7B-Instruct_few_shot_standard.csv
Starting evaluation for Olmo-3-7B-Think...

=== Loading Olmo-3-7B-Think (allenai/Olmo-3-7B-Think) ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/581 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

`rope_scaling`'s beta_fast field must be a float, got 32
`rope_scaling`'s beta_slow field must be a float, got 1


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.64G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/182 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Items for Olmo-3-7B-Think:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Olmo-3-7B-Think_zero_shot_standard.csv
Saved Olmo-3-7B-Think_few_shot_standard.csv
Starting evaluation for Mistral-7B-v0.3...

=== Loading Mistral-7B-v0.3 (mistralai/Mistral-7B-v0.3) ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Items for Mistral-7B-v0.3:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Mistral-7B-v0.3_zero_shot_standard.csv
Saved Mistral-7B-v0.3_few_shot_standard.csv
Starting evaluation for Mistral-7B-Instruct-v0.3...

=== Loading Mistral-7B-Instruct-v0.3 (mistralai/Mistral-7B-Instruct-v0.3) ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Items for Mistral-7B-Instruct-v0.3:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Mistral-7B-Instruct-v0.3_zero_shot_standard.csv
Saved Mistral-7B-Instruct-v0.3_few_shot_standard.csv
Starting evaluation for Qwen3-4B-Base...

=== Loading Qwen3-4B-Base (Qwen/Qwen3-4B-Base) ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Items for Qwen3-4B-Base:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Qwen3-4B-Base_zero_shot_standard.csv
Saved Qwen3-4B-Base_few_shot_standard.csv
Starting evaluation for Qwen3-4B-Instruct...

=== Loading Qwen3-4B-Instruct (Qwen/Qwen3-4B) ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Items for Qwen3-4B-Instruct:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Qwen3-4B-Instruct_zero_shot_no_think.csv
Saved Qwen3-4B-Instruct_few_shot_no_think.csv
Saved Qwen3-4B-Instruct_zero_shot_think.csv
Saved Qwen3-4B-Instruct_few_shot_think.csv
Starting evaluation for Qwen3-4B-Thinking...

=== Loading Qwen3-4B-Thinking (Qwen/Qwen3-4B-Thinking-2507) ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Items for Qwen3-4B-Thinking:   0%|          | 0/50 [00:00<?, ?it/s]

Saved Qwen3-4B-Thinking_zero_shot_standard.csv
Saved Qwen3-4B-Thinking_few_shot_standard.csv
Starting evaluation for SmolLM3-3B-Base...

=== Loading SmolLM3-3B-Base (HuggingFaceTB/SmolLM3-3B-Base) ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/943 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/126 [00:00<?, ?B/s]

Items for SmolLM3-3B-Base:   0%|          | 0/50 [00:00<?, ?it/s]

Saved SmolLM3-3B-Base_zero_shot_standard.csv
Saved SmolLM3-3B-Base_few_shot_standard.csv
Starting evaluation for SmolLM3-3B-Instruct...

=== Loading SmolLM3-3B-Instruct (HuggingFaceTB/SmolLM3-3B) ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/182 [00:00<?, ?B/s]

Items for SmolLM3-3B-Instruct:   0%|          | 0/50 [00:00<?, ?it/s]

Saved SmolLM3-3B-Instruct_zero_shot_no_think.csv
Saved SmolLM3-3B-Instruct_few_shot_no_think.csv
Saved SmolLM3-3B-Instruct_zero_shot_think.csv
Saved SmolLM3-3B-Instruct_few_shot_think.csv
Done.


In [2]:
from google.colab import files
import glob

!zip -r final_results.zip *.csv

files.download('final_results.zip')

  adding: dataset.csv (deflated 55%)
  adding: Mistral-7B-Instruct-v0.3_few_shot_standard.csv (deflated 74%)
  adding: Mistral-7B-Instruct-v0.3_zero_shot_standard.csv (deflated 75%)
  adding: Mistral-7B-v0.3_few_shot_standard.csv (deflated 73%)
  adding: Mistral-7B-v0.3_zero_shot_standard.csv (deflated 74%)
  adding: Olmo-3-7B-Base_few_shot_standard.csv (deflated 72%)
  adding: Olmo-3-7B-Base_zero_shot_standard.csv (deflated 73%)
  adding: Olmo-3-7B-Instruct_few_shot_standard.csv (deflated 73%)
  adding: Olmo-3-7B-Instruct_zero_shot_standard.csv (deflated 73%)
  adding: Olmo-3-7B-Think_few_shot_standard.csv (deflated 73%)
  adding: Olmo-3-7B-Think_zero_shot_standard.csv (deflated 73%)
  adding: Pythia-12b-deduped_few_shot_standard.csv (deflated 74%)
  adding: Pythia-12b-deduped_zero_shot_standard.csv (deflated 74%)
  adding: Pythia-1.4b-deduped_few_shot_standard.csv (deflated 74%)
  adding: Pythia-1.4b-deduped_zero_shot_standard.csv (deflated 74%)
  adding: Pythia-160m-deduped_few_shot

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>