In [None]:
import sys
import os

def find_repo_root(marker_file="requirements.txt"):
    """Locate the nearest ancestor directory that contains ``marker_file``.

    The search starts at the current working directory and climbs one
    parent at a time until the filesystem root has been checked.

    Args:
        marker_file: Name of the file whose presence identifies the root.

    Returns:
        The absolute path (``str``) of the first directory containing
        ``marker_file``, or ``None`` if no directory on the way up has it.
    """
    directory = os.path.abspath(os.getcwd())
    while True:
        candidate = os.path.join(directory, marker_file)
        if os.path.exists(candidate):
            return directory
        parent = os.path.dirname(directory)
        # At the filesystem root, dirname() returns its argument unchanged,
        # so there is nothing left to climb.
        if parent == directory:
            return None
        directory = parent

# Put the repository root at the front of sys.path, but only when a root was
# found and it is not already listed, so modules located there are importable.
repo_root = find_repo_root()
if repo_root:
    if repo_root not in sys.path:
        sys.path.insert(0, repo_root)



# Chapter 3: Evaluating Reasoning Models

  ## Learning Objectives
  - Extract and parse final answers from LLM text responses reliably
  - Verify answer correctness using symbolic math solvers (calculator-like verification)
  - Build an evaluation pipeline: load model → generate outputs → grade against dataset
  - Implement a verifiable rewards system (the foundation for reinforcement learning in Chapter 6)

 <img src="figure1.png" alt="Figure 1" width="600">

In [ ]:
import sys
import os

def find_repo_root(marker_file="requirements.txt"):
    """Locate the nearest ancestor directory that contains ``marker_file``.

    The search starts at the current working directory and climbs one
    parent at a time until the filesystem root has been checked.

    Args:
        marker_file: Name of the file whose presence identifies the root.

    Returns:
        The absolute path (``str``) of the first directory containing
        ``marker_file``, or ``None`` if no directory on the way up has it.
    """
    directory = os.path.abspath(os.getcwd())
    while True:
        candidate = os.path.join(directory, marker_file)
        if os.path.exists(candidate):
            return directory
        parent = os.path.dirname(directory)
        # At the filesystem root, dirname() returns its argument unchanged,
        # so there is nothing left to climb.
        if parent == directory:
            return None
        directory = parent

# Put the repository root at the front of sys.path, but only when a root was
# found and it is not already listed, so modules located there are importable.
repo_root = find_repo_root()
if repo_root:
    if repo_root not in sys.path:
        sys.path.insert(0, repo_root)

# Report which interpreter is running and where it searches for modules.
print("Python executable:", sys.executable)
print("Python version:", sys.version)
print("Python path:")
for p in sys.path:
    print(f"  {p}")

# Diagnostic 1: can the top-level package be imported at all?
try:
    import reasoning_from_scratch
except ImportError as err:
    print(f"\nImport error: {err}")
else:
    print(f"\nreasoning_from_scratch found at: {reasoning_from_scratch.__file__}")

# Diagnostic 2: can the specific helper be imported from the qwen3 submodule?
try:
    from reasoning_from_scratch.qwen3 import download_qwen3_small
except ImportError as err:
    print(f"qwen3 import failed: {err}")
else:
    print("qwen3 import successful!")

from pathlib import Path
import torch

from reasoning_from_scratch.qwen3 import (
    download_qwen3_small,
    Qwen3Tokenizer,
    Qwen3Model,
    QWEN_CONFIG_06_B
)

def load_model_and_tokenizer(
    which_model, device, use_compile, local_dir="qwen3"
):
    """Download (if needed) and load a Qwen3 0.6B model plus its tokenizer.

    Args:
        which_model: ``"base"`` or ``"reasoning"``; selects which weight and
            tokenizer files are downloaded and loaded.
        device: Target device handed to ``model.to``.
        use_compile: When truthy, the model is wrapped with
            ``torch.compile`` before being returned.
        local_dir: Directory passed to ``download_qwen3_small`` as
            ``out_dir`` and from which the tokenizer/weight files are read.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``which_model`` is neither ``"base"`` nor
            ``"reasoning"``.
    """
    # The two variants differ only in file names and tokenizer options, so
    # resolve those first and share the rest of the loading path.
    if which_model == "base":
        kind = "base"
        tokenizer_kwargs = {}
    elif which_model == "reasoning":
        kind = "reasoning"
        tokenizer_kwargs = {
            "apply_chat_template": True,
            "add_generation_prompt": True,
            "add_thinking": True,
        }
    else:
        raise ValueError(f"Invalid choice: which_model={which_model}")

    download_qwen3_small(
        kind=kind, tokenizer_only=False, out_dir=local_dir
    )

    base_dir = Path(local_dir)
    tokenizer_path = base_dir / f"tokenizer-{kind}.json"
    model_path = base_dir / f"qwen3-0.6B-{kind}.pth"
    tokenizer = Qwen3Tokenizer(
        tokenizer_file_path=tokenizer_path, **tokenizer_kwargs
    )

    model = Qwen3Model(QWEN_CONFIG_06_B)
    # Deserialize onto the CPU first: without map_location, a checkpoint
    # saved from a CUDA device fails to load on a host without CUDA. The
    # model is moved to the requested device right after.
    # NOTE(review): torch.load unpickles the downloaded file; consider
    # passing weights_only=True explicitly if older torch versions (where it
    # is not the default) must be supported.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)

    model.to(device)

    if use_compile:
        # NOTE(review): private dynamo config flag; presumably needed so
        # integer attributes on the module do not force specialization —
        # confirm against the torch version in use.
        torch._dynamo.config.allow_unspec_int_on_nn_module = True
        model = torch.compile(model)

    return model, tokenizer