In [1]:
%env CUDA_VISIBLE_DEVICES=5
%env HF_HOME=/mnt/LLM
%env OMP_NUM_THREADS=16

import sys; sys.path.insert(0, "../.");

import torch
import transformers

from evals.baseline_solver import BaselineSolver as Solver
from evals.tts_evaluator import TTSEvaluator
from utils.answer_processing import find_last_valid_expression, check_equality_judge, check_equality_local_model

# Hugging Face checkpoint identifier used for both the tokenizer and the model.
MODEL_NAME = "Qwen/Qwen3-32B"  # for 48GB gpus, use "Qwen/Qwen3-32B-AWQ" instead

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the causal LM directly onto `device`; the dtype is left to the library
# ("auto") and low_cpu_mem_usage is requested while loading.
model = transformers.AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=device,
    low_cpu_mem_usage=True,
    torch_dtype="auto",
)


env: CUDA_VISIBLE_DEVICES=5
env: HF_HOME=/mnt/LLM
env: OMP_NUM_THREADS=16


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]

In [2]:
# Toy arithmetic task given to the solver; the prompt asks for all four values in one \boxed{ }.
problem = """Calculate x - x^2 + x^3 for x = 5,6,7,8. Return all 4 answers in \\boxed{ }."""

# Reference answer used by the equality check in a later cell:
#   x=5: 5 - 25 + 125 = 105
#   x=6: 6 - 36 + 216 = 186
#   x=7: 7 - 49 + 343 = 301
#   x=8: 8 - 64 + 512 = 456
answer = "105, 186, 301, 456"

# Baseline solver with the thinker disabled.
solver = Solver(model, tokenizer, thinker_enabled=False)

# solve() returns the writer text, the thinker text, per-token timings (consumed by
# the TTS evaluation cell) and an EOS flag; the budget is 1024
# (presumably generation steps/tokens — confirm against BaselineSolver).
writer_output_str, thinker_output_str, token_times, eos_generated = solver.solve(
    problem, budget=1024, display_generation_in_real_time=True
)



## Thinker mode



## Writer mode

We are asked to calculate the expression:

$$
x - x^2 + x^3
$$

for $ values $ x = 5, 6, 7, 8 $. Let's compute each one step by step.

---

### For $ x = 5 $:

$$
5 - 5^2 + 5^3 = 5 - 25 + 125 = 105
$$

---

### For $ x = 6 $:
$$
6 - 6^2 + 6^3 = 6 - 36 + 216 = 186
$$

---

### For $ x = 7 $:
$$
7 - 7^2 + 7^3 = 7 - 49 + 343 = 291
$$

---

### For $ x = 8 $:
$$
8 - 8^2 + 8^3 = 8 - 64 + 512 = 456
$$

---

### Final Answers:

$$
\boxed{105,\ 186,\ 221,\ 456}
$$<|im_end|>

In [None]:
# The extracted expression is trimmed by dropping its first 7 characters and its
# last one, i.e. the length of the "\boxed{" wrapper plus the closing brace.
response = find_last_valid_expression(
    writer_output_str,
    extract_result=lambda x: x[len("\\boxed{"):-1],
)

# Toggle between the two equality checkers: check_equality_judge (presumably an
# API-backed judge — confirm in utils.answer_processing) or the locally loaded model.
use_api_not_local = True
is_equal = (
    check_equality_judge(response, answer)
    if use_api_not_local
    else check_equality_local_model(model, tokenizer, response, answer)
)
print(f"Answer is correct: {is_equal}")

Answer is correct: False


In [6]:
# Split the generation timeline into chunks with synthesized speech, then score it.
evaluator = TTSEvaluator()
chunks, audio = evaluator.get_chunks_with_tts(token_times, k_chunks=5, return_audio=True)
# NOTE: the keyword spelling `add_tts_in_parrallel` is kept exactly as written here;
# it is part of the evaluator's call interface, not something this cell can rename.
metrics = evaluator(**chunks, add_tts_in_parrallel=True, return_delays=False)


def pretty_dict(d, indent=2):
    """Print a (possibly nested) dict, one entry per line.

    Each line is prefixed with ``indent`` copies of ``>``. A value that is itself
    a dict is printed as a ``key:`` header followed by its entries, prefixed
    with four additional ``>`` characters.

    Defined in this cell so the notebook does not depend on a helper left over
    from earlier kernel state (the original cell called ``pretty_dict`` without
    defining it anywhere in the notebook).
    """
    pad = ">" * indent
    for k, v in d.items():
        if isinstance(v, dict):
            print(f"{pad} {k}:")
            pretty_dict(v, indent + 4)
        else:
            print(f"{pad} {k}: {v}")


indent = 2
pretty_dict(metrics, indent)



  WeightNorm.apply(module, name, dim)


>> delay_to_first: 1.427943931426853
>> total_delay: 1.8959130305772027
>> total_delay_mius1: 0.42794393142685294
>> duration_no_delay: 97.16070833333333
>> duration_with_delay: 99.05662136391054
>> steps_to_first: 0
>> delay_steps: 0
>> delay_minus10steps: 0
