In [None]:
# Enable IPython's autoreload extension in mode 2 so that edited project modules
# are re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

# Search for search strategies and hyper-parameters


In [None]:
import argparse
import torch
import json
import os

import time
from collections import defaultdict
from datetime import datetime
from vllm import LLM, SamplingParams
from transformers import set_seed

from agent_search.agent import VLLMPythonMathAgent, AgentSearchCfg

In [None]:
# Environment tweak first: set TOKENIZERS_PARALLELISM to "false" for this process.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Memory size (GB) of the local GPU; used below to rescale the memory fraction.
LOCAL_GPU_MEM_GB = 80
# Precision handed to the inference engine as its `dtype`.
GPU_DTYPE = torch.float16

In [None]:
# Command-line style configuration for the search run. Every option has a
# default so the notebook also works without any command-line arguments.
parser = argparse.ArgumentParser(
    description="Search for search strategies and hyper-parameters", allow_abbrev=False
)

# --- Model / runtime -------------------------------------------------------
parser.add_argument(
    "--model_id_or_path",
    type=str,
    default="deepseek-ai/deepseek-math-7b-rl",
    help="The HF model ID or path to the model",
)
parser.add_argument(
    "--seed",
    type=int,
    default=42,
    help="The random seed to use for reproducibility",
)
parser.add_argument(
    "--gpu_mem_util",
    type=float,
    default=0.9,
    help="The fraction of GPU memory to use",
)
parser.add_argument(
    "--gen_dset_fpath",
    type=str,
    help="The path to the dataset to generate the search space from",
)

# --- Sampling --------------------------------------------------------------
parser.add_argument(
    "--temperature",
    type=float,
    default=0.6,
    help="The temperature to use for sampling",
)

parser.add_argument(
    "--code_temperature",
    type=float,
    default=0.6,
    help="The temperature to use for sampling code",
)

parser.add_argument(
    "--init_temperature",
    type=float,
    default=1.0,
    help="The temperature to use for init sampling",
)

parser.add_argument(
    "--top_p",
    type=float,
    default=0.95,
    help="The top_p to use for sampling",
)

# --- Code-execution tolerance flags (all default to False) -----------------
parser.add_argument(
    "--allow_timeout",
    action="store_true",
    help="Whether to allow timeout when executing code",
)

parser.add_argument(
    "--allow_err",
    action="store_true",
    help="Whether to allow errors when executing code",
)

parser.add_argument(
    "--allow_empty_output",
    action="store_true",
    help="Whether to allow empty outputs when executing code",
)

parser.add_argument(
    "--spec_neg",
    action="store_true",
    help="Whether to speculate negative answers",
)

# --- Search budget ---------------------------------------------------------
parser.add_argument(
    "--n_fin_ans",
    type=int,
    default=8,
    help="The number of answers to sample per problem",
)


parser.add_argument(
    "--n_max_out_tok",
    type=int,
    default=4096,
    help="The maximum number of tokens to generate. Few completions are longer than 1024 tokens",
)

# NOTE(review): this help text duplicates --n_max_out_tok; the name suggests a
# total (prompt + output) token budget -- confirm against AgentSearchCfg.
parser.add_argument(
    "--n_max_tot_tok",
    type=int,
    default=4096,
    help="The maximum number of tokens to generate",
)

parser.add_argument(
    "--n_init_gen",
    type=int,
    default=1,
    help="The number of init generations",
)

parser.add_argument(
    "--n_init_sample",
    type=int,
    default=16,
    help="The number of samples when init sampling",
)

parser.add_argument(
    "--n_max_sample",
    type=int,
    default=64,
    help="The maximum number of samples for trial per problem",
)

parser.add_argument(
    "--beam_width",
    type=int,
    default=1,
    help="The beam width to use for sampling",
)

# NOTE(review): the help text records an observation rather than describing
# the option; presumably this caps the number of calls per trajectory and
# None means "no cap" -- confirm against AgentSearchCfg.
parser.add_argument(
    "--n_max_call",
    type=int,
    default=None,
    help="DeepSeekMath-7B-RL usually can not correctly solve problems with more than 2 calls",
)

# --- Prompting -------------------------------------------------------------
parser.add_argument(
    "--instr",
    type=str,
    default="The final answer must be a non-negative integer.",
    # default="""To accomplish this, first determine a sympy-based approach for solving the problem by listing each step to take and what functions need to be called in each step.
    # Be clear so even an idiot can follow your instructions.
    # Write the entire script covering all the steps (use comments and document it well) and print the result.
    # The intermediary calculations may be free-form, but the final answer should always be an non-negative integer.
    # Your final answer should always be a non-negative integer, not an algebraic expression!
    # Your final answer should always be a non-negative integer, not an algebraic expression!
    # Your final answer should always be a non-negative integer, not an algebraic expression!
    # After solving the problem, output the final answer within \\boxed{}.""",
    help="The instruction to append to the problem.",
)

parser.add_argument(
    "--rag",
    action="store_true",
    help="Whether to use RAG model for generation",
)

# parse_known_args (rather than parse_args) tolerates unrecognised entries in
# sys.argv, e.g. the arguments the Jupyter kernel itself is launched with.
args, _ = parser.parse_known_args()

In [None]:
# Seed the RNGs through transformers' helper, using the configured seed.
set_seed(seed=args.seed)

In [None]:
# In-context example(s): maps a problem statement to a worked solution.
# The solution text interleaves ```python code cells, their ```output blocks
# and prose, and ends with a \boxed{} answer. This map is later passed to the
# agent as `rag_eg_qa_map` when `args.rag` is set.
# Both key and value are raw string literals consumed at runtime, so their
# content (including whitespace) must not be reformatted.
ICL_EG_QA_MAP = {
    r"""
The points $\left(x, y\right)$ satisfying $((\vert x + y \vert - 10)^2 + ( \vert x - y \vert - 10)^2)((\vert x \vert - 8)^2 + ( \vert y \vert - 8)^2) = 0$ enclose a convex polygon.
What is the area of this convex polygon?
    """: r'''
```python
from sympy import symbols, Eq, solve

def polygon_vertices():
    """The points $\left(x, y\right)$ satisfying $((\vert x + y \vert - 10)^2 + ( \vert x - y \vert - 10)^2)((\vert x \vert - 8)^2 + ( \vert y \vert - 8)^2) = 0$ enclose a convex polygon. What is the area of this convex polygon?"""
    # Define the variables
    x, y = symbols("x y")
    # Define the equations from the first factor
    eq1_cases = [Eq(x + y, 10), Eq(x + y, -10)]
    eq2_cases = [Eq(x - y, 10), Eq(x - y, -10)]
    # Define the equations from the second factor
    eq3_cases = [Eq(x, 8), Eq(x, -8)]
    eq4_cases = [Eq(y, 8), Eq(y, -8)]
    # Solve each combination of equations
    solutions = []
    # Solving combinations from the first factor
    for eq1 in eq1_cases:
        for eq2 in eq2_cases:
            sol = solve([eq1, eq2], (x, y))
            if sol:
                solutions.append(sol)
    # Solving combinations from the second factor
    for eq3 in eq3_cases:
        for eq4 in eq4_cases:
            sol = solve([eq3, eq4], (x, y))
            if sol:
                solutions.append(sol)
    # Extract unique solutions
    unique_solutions = {tuple(sol.items()) for sol in solutions}
    return unique_solutions

result = polygon_vertices()
print(result)
```

```output
{((x, 10), (y, 0)), ((x, 0), (y, -10)), ((x, -8), (y, 8)), ((x, 8), (y, 8)), ((x, -10), (y, 0)), ((x, -8), (y, -8)), ((x, 8), (y, -8)), ((x, 0), (y, 10))}
```

Now we have the coordinates of all the vertices. To find the area of this polygon, we can use the Shoelace formula (Gauss's area formula for polygons): $\text{Area} = \frac{1}{2} \left| \sum_{i=1}^{n} (x_i y_{i+1} - y_i x_{i+1}) \right|$

```python
def polygon_area():
    # Reorder the vertices in a logical sequence
    vertices_ordered = [(10, 0), (8, 8), (0, 10), (-8, 8), (-10, 0), (-8, -8), (0, -10), (8, -8)]
    # Repeat the first vertex at the end
    vertices = vertices_ordered + [vertices_ordered[0]]
    # Calculate the area using the Shoelace formula
    area = 0
    for i in range(len(vertices) - 1):
        area += (
            vertices[i][0] * vertices[i + 1][1] - vertices[i + 1][0] * vertices[i][1]
        )
    return abs(area) / 2

result = polygon_area()
print(result)
```

```output
320.0
```

Therefore, the area of the convex polygon is $320$. The answer is $\boxed{320}$.
''',
}

# Normalise the examples: each question is flattened onto a single line
# (newlines become spaces) and both question and solution are stripped of
# surrounding whitespace.
ICL_EG_QA_MAP = {
    " ".join(question.split("\n")).strip(): solution.strip()
    for question, solution in ICL_EG_QA_MAP.items()
}

# Preview every example so the exact text handed to the agent can be checked.
for i, (q, a) in enumerate(ICL_EG_QA_MAP.items()):
    print(f"### Problem {i}", "#### Problem", q, "#### Solution", a, sep="\n")

### Problem 0
#### Problem
Let $ABCD$ be a unit square. Let $P$ be the point on $AB$ such that $|AP| = 1/{20}$ and let $Q$ be the point on $AD$ such that $|AQ| = 1/{24}$. The lines $DP$ and $BQ$ divide the square into four regions. Find the ratio between the areas of the largest region and the smallest region.
#### Solution
```python
import numpy as np

def area_ratio():
    """Let $ABCD$ be a unit square. Let $P$ be the point on $AB$ such that $|AP| = 1/{20}$ and let $Q$ be the point on $AD$ such that $|AQ| = 1/{24}$. The lines $DP$ and $BQ$ divide the square into four regions. Find the ratio between the areas of the largest region and the smallest region. The final answer must be a non-negative integer."""
    # Define points of the square
    A = np.array([0, 0])
    B = np.array([1, 0])
    C = np.array([1, 1])
    D = np.array([0, 1])
    # Define points P and Q
    P = np.array([1 / 20, 0])
    Q = np.array([0, 1 / 24])
    # Calculate intersection of DP and BQ
    # Line DP: y =

In [None]:
# Notebook-local overrides of the parsed arguments.

# Restrict this process to the first GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
args.gen_dset_fpath = (
    "/ssddata/tongyx/projects/agent-search/data/iid_hardest_non_neg_int_ans_math.json"
)
args.allow_timeout = True
args.allow_err = True
args.allow_empty_output = True

args.rag = True

# Scale the memory fraction by 32 / LOCAL_GPU_MEM_GB.
# NOTE(review): presumably this emulates a 32 GB GPU budget on the local card
# -- confirm the intent behind the 32.
TARGET_GPU_MEM_GB = 32
# The unscaled fraction is remembered on `args` the first time this cell runs,
# so re-running the cell recomputes the same value instead of compounding the
# scaling (0.9 -> 0.36 -> 0.144 -> ...).
base_gpu_mem_util = vars(args).setdefault("base_gpu_mem_util", args.gpu_mem_util)
args.gpu_mem_util = base_gpu_mem_util * (TARGET_GPU_MEM_GB / LOCAL_GPU_MEM_GB)

In [None]:
# Engine options are collected first so they can be inspected before the
# (slow) model load; the same path is used for both model and tokenizer.
llm_kwargs = dict(
    model=args.model_id_or_path,
    tokenizer=args.model_id_or_path,
    # Shard across every CUDA device visible to this process.
    tensor_parallel_size=torch.cuda.device_count(),
    dtype=GPU_DTYPE,
    seed=args.seed,
    trust_remote_code=True,
    gpu_memory_utilization=args.gpu_mem_util,
    # Disabled option kept for reference:
    # swap_space=args.gpu_mem_util * LOCAL_GPU_MEM_GB,
)
llm = LLM(**llm_kwargs)



INFO 06-26 22:05:18 llm_engine.py:161] Initializing an LLM engine (v0.5.0.post1) with config: model='deepseek-ai/deepseek-math-7b-rl', speculative_config=None, tokenizer='deepseek-ai/deepseek-math-7b-rl', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=42, served_model_name=deepseek-ai/deepseek-math-7b-rl)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-26 22:05:19 weight_utils.py:218] Using model weights format ['*.safetensors']
INFO 06-26 22:05:24 model_runner.py:160] Loading model weights took 12.8725 GB
INFO 06-26 22:05:25 gpu_executor.py:83] # GPU blocks: 1982, # CPU blocks: 546
INFO 06-26 22:05:30 model_runner.py:889] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-26 22:05:30 model_runner.py:893] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-26 22:05:35 model_runner.py:965] Graph capturing finished in 6 secs.


In [None]:
# Load the evaluation problems. The encoding is given explicitly because
# open() otherwise falls back to the platform's locale encoding, which can
# mis-decode non-ASCII characters in the JSON file.
with open(args.gen_dset_fpath, "r", encoding="utf-8") as f:
    dset = json.load(f)
print(f"Loaded dataset with {len(dset)} problems")

Loaded dataset with 151 problems


In [None]:
# Base sampling configuration shared by the search.
sampling_params = SamplingParams(
    n=args.beam_width,
    temperature=args.temperature,
    top_p=args.top_p,
    max_tokens=args.n_max_out_tok,
    skip_special_tokens=True,
    # prompt_logprobs=1,
    logprobs=2,
    seed=args.seed,
)
print(f"{sampling_params=}")

# Search options that are forwarded unchanged from the parsed arguments;
# each name is both the attribute on `args` and the AgentSearchCfg keyword.
search_cfg = AgentSearchCfg(
    sampling_params=sampling_params,
    **{
        opt_name: getattr(args, opt_name)
        for opt_name in (
            "n_fin_ans",
            "n_init_sample",
            "n_max_sample",
            "n_init_gen",
            "init_temperature",
            "code_temperature",
            "n_max_tot_tok",
            "n_max_call",
            "allow_timeout",
            "allow_err",
            "allow_empty_output",
            "spec_neg",
        )
    },
)
print(f"{search_cfg.__dict__=}")


# The in-context examples are only supplied when --rag is enabled.
agent = VLLMPythonMathAgent(
    llm,
    prompt_template="deepseekmath-tool",
    search_cfg=search_cfg,
    code_cell_template="deepseekmath",
    rag_eg_qa_map=ICL_EG_QA_MAP if args.rag else None,
)

# Prepend the extra instruction (separated from the query by a space) to
# whatever the template already places after the query.
# NOTE(review): if the template object is shared between agents, re-running
# this cell would prepend the instruction again -- confirm.
agent.ctx_template.prompt_template.prompt_after_query = "".join(
    (" ", args.instr, agent.ctx_template.prompt_template.prompt_after_query)
)

sampling_params=SamplingParams(n=1, best_of=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.6, top_p=0.95, top_k=-1, min_p=0.0, seed=42, use_beam_search=False, length_penalty=1.0, early_stopping=False, stop=[], stop_token_ids=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=4096, min_tokens=0, logprobs=2, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None)
search_cfg.__dict__={'sampling_params': SamplingParams(n=1, best_of=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.6, top_p=0.95, top_k=-1, min_p=0.0, seed=42, use_beam_search=False, length_penalty=1.0, early_stopping=False, stop=[], stop_token_ids=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=4096, min_tokens=0, logprobs=2, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None), 'n_fin_ans': 8, 'n_init

In [None]:
# Evaluate the agent on every problem and persist the metrics.
# Outputs go to <dataset path without extension>/test-<timestamp>/, with one
# sub-directory per problem passed to agent.search as `output_dir`.
test_home = os.path.splitext(args.gen_dset_fpath)[0]
test_timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
test_dir = os.path.join(test_home, f"test-{test_timestamp}")
# domain name -> indices of correctly answered problems
domain_correct_idxs = defaultdict(list)

# Wall-clock seconds spent in agent.search, one entry per finished problem.
problem_times = []

# try/finally: the metrics are written even if the run is interrupted or a
# problem raises; the exception itself still propagates.
try:
    for idx, dp in enumerate(dset):
        search_timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        search_dir = os.path.join(test_dir, f"problem-{idx}-{search_timestamp}")
        print(f"### {idx}")
        print("#### Problem")
        print(dp["query"])
        print("#### Solution")
        problem_start = time.time()
        ans = agent.search(init_input=dp["query"], output_dir=search_dir)
        problem_time = time.time() - problem_start
        problem_times.append(problem_time)
        eq = agent.eq(ans, dp["ref_ans"])
        if eq:
            # Problems without a "domain" field are still counted as correct
            # (under "unknown"); otherwise accuracy would be under-reported.
            domain_correct_idxs[dp.get("domain", "unknown")].append(idx)
        print(f"Pred. ({ans}) {'=' if eq else '!'}= Ref. ({dp['ref_ans']})")
        print(f"Problem time: {problem_time:.2f}s")
        print(
            f"Avg. Time so far: {sum(problem_times)/len(problem_times):.2f}s (= {sum(problem_times):.2f} / {len(problem_times)})"
        )
        all_correct_idxs = [
            i for idxs in domain_correct_idxs.values() for i in idxs
        ]
        print(
            f"Acc. so far: {len(all_correct_idxs) / (idx + 1):.2%} (= {len(all_correct_idxs)} / {idx + 1})"
        )
finally:
    all_correct_idxs = [i for idxs in domain_correct_idxs.values() for i in idxs]
    domain_correct_cnt = {k: len(v) for k, v in domain_correct_idxs.items()}
    n_evaluated = len(problem_times)

    metrics = {
        "search_cfg": {
            **{k: v for k, v in search_cfg.__dict__.items() if k != "sampling_params"},
            # Stored as its repr string rather than as a structured object.
            "sampling_params": search_cfg.sampling_params.__repr__(),
        },
        # Accuracy is relative to the whole dataset; for an interrupted run
        # use "n_evaluated" to renormalise. None when the dataset is empty.
        "acc": len(all_correct_idxs) / len(dset) if len(dset) else None,
        "n_evaluated": n_evaluated,
        "domain_correct_cnt": domain_correct_cnt,
        # None (instead of a ZeroDivisionError that would mask the original
        # exception) when no problem finished.
        "avg_time": sum(problem_times) / n_evaluated if n_evaluated else None,
        "domain_correct_idxs": domain_correct_idxs,
        "problem_times": problem_times,
    }

    # This cell never creates test_dir itself, so make sure it exists.
    os.makedirs(test_dir, exist_ok=True)
    # ensure_ascii=False may emit non-ASCII text, so write explicitly as UTF-8.
    with open(os.path.join(test_dir, "metrics.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, ensure_ascii=False, indent=2)

### 0
#### Problem
Suppose $a$ and $b$ are positive real numbers with $a > b$ and $ab = 8.$  Find the minimum value of $\frac{a^2 + b^2}{a - b}.$
#### Solution
len(trajs)=1 * sampling_params.n=16 = 16 traj.s


Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.48s/it, est. speed input: 187.46 toks/s, output: 845.94 toks/s]

After completion,       # of trajectories: finished: 0 | dropped: 0 | continue: 16
After single selection, # of trajectories: finished: 0 | dropped: 0 | continue: 16
After multi-selection,  # of trajectories: finished: 0 | dropped: 0 | continue: 16



Executing: 100%|██████████| 16/16 [00:04<00:00,  3.56it/s]


After execution, # of trajectories: finished: 0 | dropped: 0 | continue: 16
len(trajs)=16 * sampling_params.n=1 = 16 traj.s


Processed prompts: 100%|██████████| 16/16 [00:43<00:00,  2.71s/it, est. speed input: 495.95 toks/s, output: 293.47 toks/s] 

After completion,       # of trajectories: finished: 0 | dropped: 0 | continue: 16
After single selection, # of trajectories: finished: 6 | dropped: 3 | continue: 7
After multi-selection,  # of trajectories: finished: 6 | dropped: 3 | continue: 7



Executing: 100%|██████████| 7/7 [00:04<00:00,  1.73it/s]


After execution, # of trajectories: finished: 6 | dropped: 3 | continue: 7
len(trajs)=7 * sampling_params.n=1 = 7 traj.s


Processed prompts: 100%|██████████| 7/7 [00:06<00:00,  1.06it/s, est. speed input: 1755.26 toks/s, output: 163.52 toks/s]


After completion,       # of trajectories: finished: 6 | dropped: 3 | continue: 7
After single selection, # of trajectories: finished: 8 | dropped: 3 | continue: 5
After multi-selection,  # of trajectories: finished: 8 | dropped: 3 | continue: 5
ans2tok_avg_prob=defaultdict(<class 'list'>, {18: [0.8389669786233007, 0.9166454608881364, 0.9507009586102121], 8: [0.9893709467212803, 0.9787260923131383, 0.9708379501237893, 0.9745693668277444], 5: [0.7892704505244805]})
sorted_ans2agg_tok_avg_prob=[(8, 3.9135043559859524), (18, 2.706313398121649), (5, 0.7892704505244805)] -> fin_ans=8
Pred. (8) == Ref. (8)
Problem time: 65.78s
Avg. Time so far: 65.78s (= 65.78 / 1)
Acc. so far: 100.00% (= 1 / 1)
### 1
#### Problem
If $f(x)$, whose graph is shown below, is defined on $1 \le x \le 6$, what is the maximum value of $f^{-1}(x)$? [asy]

import graph; size(7.94cm); real lsf=0.5; pen dps=linewidth(0.7)+fontsize(10); defaultpen(dps); pen ds=black; real xmin=-0.96,xmax=8.96,ymin=-2.66,ymax=4.38;

Labe

Processed prompts: 100%|██████████| 1/1 [00:09<00:00,  9.28s/it, est. speed input: 148.05 toks/s, output: 836.34 toks/s]

After completion,       # of trajectories: finished: 0 | dropped: 0 | continue: 16
After single selection, # of trajectories: finished: 1 | dropped: 0 | continue: 15
After multi-selection,  # of trajectories: finished: 1 | dropped: 0 | continue: 15



Executing: 100%|██████████| 15/15 [00:00<00:00, 301.35it/s]


After execution, # of trajectories: finished: 1 | dropped: 0 | continue: 15
len(trajs)=15 * sampling_params.n=1 = 15 traj.s


Processed prompts: 100%|██████████| 15/15 [00:02<00:00,  6.18it/s, est. speed input: 11510.24 toks/s, output: 148.41 toks/s]


After completion,       # of trajectories: finished: 1 | dropped: 0 | continue: 15
After single selection, # of trajectories: finished: 16 | dropped: 0 | continue: 0
After multi-selection,  # of trajectories: finished: 16 | dropped: 0 | continue: 0
ans2tok_avg_prob=defaultdict(<class 'list'>, {5: [0.8793645505429308, 0.9768417643902756, 0.9784069777061792], 3: [0.969654307922498, 0.9720533671141908, 0.9756380231001192, 0.9840014616688503, 0.9888450252234219, 0.9879715711530092, 0.9886018361598493, 0.9832731364418864, 0.9809245226678402, 0.9704903135947331, 0.9764445152415129, 0.9732775391190699], 6: [0.9807452928118641]})
sorted_ans2agg_tok_avg_prob=[(3, 11.75117561940698), (5, 2.8346132926393857), (6, 0.9807452928118641)] -> fin_ans=3
Pred. (3) != Ref. (6)
Problem time: 12.15s
Avg. Time so far: 38.96s (= 77.93 / 2)
Acc. so far: 50.00% (= 1 / 2)
### 2
#### Problem
Let $a,$ $b,$ and $c$ be distinct real numbers.  Find the degree of the polynomial
\[p(x) = \frac{(x - b)(x - c)}{(a - b)(a

Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.42s/it, est. speed input: 242.14 toks/s, output: 836.84 toks/s]

After completion,       # of trajectories: finished: 0 | dropped: 0 | continue: 16
After single selection, # of trajectories: finished: 2 | dropped: 0 | continue: 14
After multi-selection,  # of trajectories: finished: 2 | dropped: 0 | continue: 14



Executing: 100%|██████████| 14/14 [00:00<00:00, 19.79it/s]


After execution, # of trajectories: finished: 2 | dropped: 0 | continue: 14
len(trajs)=14 * sampling_params.n=1 = 14 traj.s


Processed prompts: 100%|██████████| 14/14 [00:06<00:00,  2.25it/s, est. speed input: 2989.19 toks/s, output: 227.43 toks/s]


After completion,       # of trajectories: finished: 2 | dropped: 0 | continue: 14
After single selection, # of trajectories: finished: 15 | dropped: 0 | continue: 1
After multi-selection,  # of trajectories: finished: 15 | dropped: 0 | continue: 1
ans2tok_avg_prob=defaultdict(<class 'list'>, {1: [0.7156893929731961, 0.7524068850951818, 0.7442906931830023, 0.9327018712871544, 0.8850735487287252], 2: [0.8402308349648631, 0.8987274901593588, 0.9299886053445137, 0.844788006870976, 0.8959512551503738, 0.9317633417838372, 0.9224508111354368, 0.8236884782298348, 0.9214658807506991, 0.8115033479299575]})
sorted_ans2agg_tok_avg_prob=[(2, 8.82055805231985), (1, 4.030162391267259)] -> fin_ans=2
Pred. (2) != Ref. (0)
Problem time: 12.19s
Avg. Time so far: 30.04s (= 90.12 / 3)
Acc. so far: 33.33% (= 1 / 3)
### 3
#### Problem
Find the smallest positive integer $n$ such that there exists $r \in (0, \tfrac{1}{1000})$ such that the number $(n+r)^3$ is an integer.
#### Solution
len(trajs)=1 * sampling_

Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it, est. speed input: 151.87 toks/s, output: 676.90 toks/s]

After completion,       # of trajectories: finished: 0 | dropped: 0 | continue: 16
After single selection, # of trajectories: finished: 0 | dropped: 0 | continue: 16
After multi-selection,  # of trajectories: finished: 0 | dropped: 0 | continue: 16



Executing: 100%|██████████| 16/16 [00:05<00:00,  2.85it/s]


After execution, # of trajectories: finished: 0 | dropped: 0 | continue: 16
len(trajs)=16 * sampling_params.n=1 = 16 traj.s


Processed prompts: 100%|██████████| 16/16 [00:08<00:00,  1.89it/s, est. speed input: 2498.26 toks/s, output: 367.67 toks/s]


After completion,       # of trajectories: finished: 0 | dropped: 0 | continue: 16
After single selection, # of trajectories: finished: 12 | dropped: 0 | continue: 4
After multi-selection,  # of trajectories: finished: 12 | dropped: 0 | continue: 4
ans2tok_avg_prob=defaultdict(<class 'list'>, {17146: [0.9973095459759531], 7: [0.9069285940567761, 0.948087055705256, 0.9472575216932998, 0.8386003300932895, 0.9514657190251148], 1: [0.9925245500815543, 0.9957095059377339, 0.9684953703213288, 0.892216656876095, 0.9926951541652946], 9: [0.9483248084110646]})
sorted_ans2agg_tok_avg_prob=[(1, 4.841641237382007), (7, 4.592339220573736), (17146, 0.9973095459759531), (9, 0.9483248084110646)] -> fin_ans=1
Pred. (1) != Ref. (19)
Problem time: 21.49s
Avg. Time so far: 27.90s (= 111.61 / 4)
Acc. so far: 25.00% (= 1 / 4)
### 4
#### Problem
The four-digit number $25AB$ is divisible by nine, with $A$ the tens digit and $B$ the units digit. How many different such four-digit numbers could $25AB$ represent

Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.25s/it, est. speed input: 316.07 toks/s, output: 811.11 toks/s]

After completion,       # of trajectories: finished: 0 | dropped: 0 | continue: 16
After single selection, # of trajectories: finished: 0 | dropped: 0 | continue: 16
After multi-selection,  # of trajectories: finished: 0 | dropped: 0 | continue: 16



Executing: 100%|██████████| 16/16 [00:00<00:00, 27.84it/s]


After execution, # of trajectories: finished: 0 | dropped: 0 | continue: 16
len(trajs)=16 * sampling_params.n=1 = 16 traj.s


Processed prompts: 100%|██████████| 16/16 [00:02<00:00,  7.80it/s, est. speed input: 9346.05 toks/s, output: 264.85 toks/s]


After completion,       # of trajectories: finished: 0 | dropped: 0 | continue: 16
After single selection, # of trajectories: finished: 16 | dropped: 0 | continue: 0
After multi-selection,  # of trajectories: finished: 16 | dropped: 0 | continue: 0
ans2tok_avg_prob=defaultdict(<class 'list'>, {11: [0.9950227988670534, 0.9950227988670534, 0.9937649750834674, 0.9012079577939697, 0.8352117288673242, 0.8352117288673242, 0.9923779718262087, 0.9927673479274142, 0.9912373203333409, 0.9950961548718285, 0.9928236807232774, 0.9921862079374711, 0.9964716650795776, 0.9938409084187336, 0.99103050087723, 0.7829097352827475]})
sorted_ans2agg_tok_avg_prob=[(11, 15.27618348162402)] -> fin_ans=11
Pred. (11) == Ref. (11)
Problem time: 6.24s
Avg. Time so far: 23.57s (= 117.84 / 5)
Acc. so far: 40.00% (= 2 / 5)
### 5
#### Problem
I have two arithmetic sequences. The first term of the first sequence is $0$. The second term of the first sequence is the first term of the first sequence plus the first term of 

Processed prompts: 100%|██████████| 1/1 [00:39<00:00, 39.09s/it, est. speed input: 27.35 toks/s, output: 247.11 toks/s]

After completion,       # of trajectories: finished: 0 | dropped: 0 | continue: 16
After single selection, # of trajectories: finished: 0 | dropped: 1 | continue: 15
After multi-selection,  # of trajectories: finished: 0 | dropped: 1 | continue: 15



Executing: 100%|██████████| 15/15 [00:00<00:00, 25.66it/s]


After execution, # of trajectories: finished: 0 | dropped: 1 | continue: 15
len(trajs)=15 * sampling_params.n=1 = 15 traj.s


Processed prompts: 100%|██████████| 15/15 [00:40<00:00,  2.72s/it, est. speed input: 560.26 toks/s, output: 275.25 toks/s] 


After completion,       # of trajectories: finished: 0 | dropped: 1 | continue: 15
After single selection, # of trajectories: finished: 13 | dropped: 3 | continue: 0
After multi-selection,  # of trajectories: finished: 13 | dropped: 3 | continue: 0
ans2tok_avg_prob=defaultdict(<class 'list'>, {9: [0.9915495560628245], 3: [0.9159044781533415, 0.9595200450450888, 0.8167765573692837, 0.8335565343346075, 0.9783537402436743, 0.6770810207263254], 6: [0.8847653078401057, 0.8958603733035883], 12: [0.9918630742842286], 1: [0.9332032498385547], 2: [0.9272441542444555], 15: [0.9233172833985136]})
sorted_ans2agg_tok_avg_prob=[(3, 5.181192375872321), (6, 1.780625681143694), (12, 0.9918630742842286), (9, 0.9915495560628245), (1, 0.9332032498385547), (2, 0.9272441542444555), (15, 0.9233172833985136)] -> fin_ans=3
Pred. (3) != Ref. (12)
Problem time: 81.13s
Avg. Time so far: 33.16s (= 198.98 / 6)
Acc. so far: 33.33% (= 2 / 6)
### 6
#### Problem
Let
\[x^8 + 98x^4 + 1 = p(x) q(x),\]where $p(x)$ and $q(x

Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.41s/it, est. speed input: 236.67 toks/s, output: 854.91 toks/s]

After completion,       # of trajectories: finished: 0 | dropped: 0 | continue: 16
After single selection, # of trajectories: finished: 0 | dropped: 0 | continue: 16
After multi-selection,  # of trajectories: finished: 0 | dropped: 0 | continue: 16



Executing: 100%|██████████| 16/16 [00:00<00:00, 258.20it/s]


After execution, # of trajectories: finished: 0 | dropped: 0 | continue: 16
len(trajs)=16 * sampling_params.n=1 = 16 traj.s


Processed prompts: 100%|██████████| 16/16 [00:01<00:00,  8.30it/s, est. speed input: 10670.70 toks/s, output: 232.49 toks/s]


After completion,       # of trajectories: finished: 0 | dropped: 0 | continue: 16
After single selection, # of trajectories: finished: 16 | dropped: 0 | continue: 0
After multi-selection,  # of trajectories: finished: 16 | dropped: 0 | continue: 0
ans2tok_avg_prob=defaultdict(<class 'list'>, {20: [0.9816965571851494, 0.9632492811021509, 0.9898536625984343, 0.9805986846638503, 0.967383981024876, 0.9812027586098919, 0.9654137306405626, 0.9697689260526788, 0.9733874702517687, 0.9757432025484618, 0.9736935629146422, 0.9789894229092315, 0.9799756134064459, 0.9530886456701233, 0.9642168867907362, 0.974056947445252]})
sorted_ans2agg_tok_avg_prob=[(20, 15.572319333814253)] -> fin_ans=20
Pred. (20) == Ref. (20)
Problem time: 6.97s
Avg. Time so far: 29.42s (= 205.95 / 7)
Acc. so far: 42.86% (= 3 / 7)
### 7
#### Problem
Let $a,$ $b,$ and $c$ be real numbers such that $ab + ac + bc = 0$ and $(a + b + c + 1)^2 = abc.$  Find all possible values of
\[(ab - c)(ac - b)(bc - a).\]Enter the possible val

Processed prompts: 100%|██████████| 1/1 [00:09<00:00,  9.42s/it, est. speed input: 115.42 toks/s, output: 663.72 toks/s]

After completion,       # of trajectories: finished: 0 | dropped: 0 | continue: 16
After single selection, # of trajectories: finished: 1 | dropped: 0 | continue: 15
After multi-selection,  # of trajectories: finished: 1 | dropped: 0 | continue: 15



Executing: 100%|██████████| 15/15 [00:05<00:00,  2.67it/s]


After execution, # of trajectories: finished: 1 | dropped: 0 | continue: 15
len(trajs)=15 * sampling_params.n=1 = 15 traj.s


Processed prompts:  80%|████████  | 12/15 [00:08<00:02,  1.36it/s, est. speed input: 2110.88 toks/s, output: 236.93 toks/s]