In [1]:
import os
import json
from tqdm import tqdm
import torch
import numpy as np
from openai import OpenAI
import backoff

In [2]:
# Connection settings for the local OpenAI-compatible inference server.
port = 8002
model = 'Qwen/QwQ-32B'

# Client-level request timeout handed to the OpenAI client (60*60*24 = one day if read as seconds).
OPENAI_REQUEST_TIMEOUT = 60*60*24
# Build the URL from `port` so that editing the setting above actually changes the endpoint.
client = OpenAI(base_url=f"http://localhost:{port}/v1", api_key="EMPTY", timeout=OPENAI_REQUEST_TIMEOUT)
# Listing the served models doubles as a connectivity check.
print(client.models.list())

# NOTE: the retry decorator is intentionally left disabled here, so despite its name
# this helper currently performs a single attempt.
# @backoff.on_exception(backoff.constant, Exception, interval=5)
def run_chat_completion_with_backoff(client, **kwargs):
    """Issue one chat-completion request and return the response.

    Args:
        client: object exposing `chat.completions.create`.
        **kwargs: forwarded unchanged to `client.chat.completions.create`.

    Returns:
        Whatever `client.chat.completions.create` returns.
    """
    chat_endpoint = client.chat.completions
    response = chat_endpoint.create(**kwargs)
    return response


def _is_non_retryable_error(exc):
    """Return True when `exc` carries a 4xx HTTP status that retrying cannot fix.

    Assumes HTTP errors raised by the client expose an integer `status_code`
    attribute (TODO confirm for the installed client version); exceptions
    without one are treated as retryable. 408/409/429 are kept retryable
    because they are typically transient.
    """
    status = getattr(exc, 'status_code', None)
    return isinstance(status, int) and 400 <= status < 500 and status not in (408, 409, 429)


# Retry every 5 seconds on failure, but stop on permanent client errors (e.g. a
# request the server rejects as invalid) instead of looping forever on them.
@backoff.on_exception(backoff.constant, Exception, interval=5, giveup=_is_non_retryable_error)
def run_generate_with_backoff(client, **kwargs):
    """Issue a text-completion request, retrying on transient failures.

    Args:
        client: object exposing `completions.create`.
        **kwargs: forwarded unchanged to `client.completions.create`.

    Returns:
        Whatever `client.completions.create` returns.

    Raises:
        The underlying exception when `_is_non_retryable_error` reports that a
        retry would not help.
    """
    return client.completions.create(**kwargs)

SyncPage[Model](data=[Model(id='Qwen/QwQ-32B', created=1751830298, object='model', owned_by='vllm', root='Qwen/QwQ-32B', parent=None, max_model_len=40960, permission=[{'id': 'modelperm-60b82168111b4793b94396f8a9068320', 'object': 'model_permission', 'created': 1751830298, 'allow_create_engine': False, 'allow_sampling': True, 'allow_logprobs': True, 'allow_search_indices': False, 'allow_view': True, 'allow_fine_tuning': False, 'organization': '*', 'group': None, 'is_blocking': False}])], object='list')


In [3]:
# load direct pred logs
def load_direct_pred_logs(task):
    """Load the saved direct-prediction log for `task`.

    Args:
        task: one of 'gpqa', 'aime', 'amc', 'math500', 'livecode', 'bamboogle'.

    Returns:
        The parsed JSON content of the task's log file.

    Raises:
        NotImplementedError: if no log file is registered for `task`.
    """
    log_paths = {
        'gpqa': '/fsx-comem/diwu0162/Search-o1/outputs/gpqa.qwq.direct/diamond.7.1,20:8.json',
        'aime': '/fsx-comem/diwu0162/Search-o1/outputs/aime.qwq.direct/test.7.1,19:42.json',
        'amc': '/fsx-comem/diwu0162/Search-o1/outputs/amc.qwq.direct/test.7.1,19:24.json',
        'math500': '/fsx-comem/diwu0162/Search-o1/outputs/math500.qwq.direct/test.7.1,20:17.json',
        'livecode': '/fsx-comem/diwu0162/Search-o1/outputs/livecode.qwq.direct/test_1to4.7.1,22:30.json',
        'bamboogle': '/fsx-comem/diwu0162/Search-o1/outputs/runs.qa/bamboogle.qwq.direct/test.7.1,16:22.json',
    }
    try:
        path = log_paths[task]
    except (KeyError, TypeError):
        raise NotImplementedError(f'No direct-prediction log registered for task {task!r}') from None
    # Use a context manager so the file handle is closed as soon as parsing finishes.
    with open(path) as f:
        return json.load(f)

# Analysis: thought segmentation

In [4]:
# Thought segmentation

def segment_thoughts_v1(x):
    """Split a reasoning trace into paragraphs.

    Leading/trailing whitespace is stripped first, then the text is cut at
    every blank line (two consecutive newlines).
    """
    trimmed = x.strip()
    paragraphs = trimmed.split('\n\n')
    return paragraphs

def segment_thoughts_v2(x):
    """Group the paragraphs of a reasoning trace into coarser "thought" segments.

    The text is first split into paragraphs with `segment_thoughts_v1`. A
    paragraph opens a new segment when it starts (case-insensitively) with one
    of the marker phrases below; otherwise it is appended to the current
    segment, re-joined with a blank line. The first paragraph always opens a
    segment.

    Matching is a plain prefix test on the lower-cased paragraph, so a marker
    such as 'oh' or 'maybe I' also matches longer words sharing that prefix
    (e.g. "Ohm", "Maybe it").

    Args:
        x: the reasoning text.

    Returns:
        A list of segment strings (never empty; `['']` for empty input).
    """
    # note: excluding things like "so" "therefore", "but", "let me"
    reasoning_word_list = [
        'okay', 'hmm', 'wait', 'but wait', 'oh wait', 'no wait', 'no, wait', 'but let me', 'but actually', 'alternatively',
        'now', 'the question', 'ah', 'oh', 'next', 'another angle', 'another approach', 'also', 'hold on', 'looking it up',
        'another point', 'I don\'t think', 'perhaps I', 'putting this together', 'Putting it all together', 'i\'m', 'but i\'m',
        'let me think again', 'I don\'t see', 'maybe I', 'alternative', "I wonder if", "another way", 'an alternative',
    ]
    # Lower-case the markers once; str.startswith accepts a tuple of prefixes.
    segment_prefixes = tuple(w.lower() for w in reasoning_word_list)

    final_thoughts = []
    for t in segment_thoughts_v1(x):
        if not final_thoughts or t.lower().startswith(segment_prefixes):
            final_thoughts.append(t)
        else:
            final_thoughts[-1] += '\n\n' + t
    return final_thoughts

def segment_thoughts_v3(x):
    """Placeholder for a third segmentation strategy; returns `x` unchanged.

    TODO: maybe implement with a small auxiliary model?
    """
    return x


def visualize_thoughts_v2(x):
    """Print the v2 segmentation of `x` as indented JSON.

    Each segment is keyed by a 1-based label ("Step 1", "Step 2", ...).
    """
    numbered_steps = {}
    for step_number, thought in enumerate(segment_thoughts_v2(x), start=1):
        numbered_steps[f'Step {step_number}'] = thought
    print(json.dumps(numbered_steps, indent=4))


In [7]:
# analyses (direct prediction thought length)

# Metric key that flags an example as correct, for the tasks that do not use 'math_equal'.
CORRECTNESS_METRIC_BY_TASK = {'bamboogle': 'em', 'livecode': 'pass@1'}


def _rounded_mean(values):
    """Return the mean of `values` rounded to 0 decimals, or NaN when `values` is empty.

    The empty case is handled explicitly so that numpy does not emit
    RuntimeWarnings for the mean of an empty list (e.g. a task with no wrong
    examples); the printed result is still `nan`.
    """
    if len(values) == 0:
        return float('nan')
    return round(np.mean(values), 0)


def report_thought_lengths(task):
    """Print thought-length statistics for the direct-prediction log of `task`.

    Prints the task name, the number of correct/wrong examples, and one line
    with four rounded means: number of v1 segments, number of v2 segments,
    and number of v2 segments restricted to correct and to wrong examples.

    Everything is kept local to this function so the notebook-level `task`,
    `logs` and `entry` used by other cells are not overwritten when this
    analysis is re-run.
    """
    logs = load_direct_pred_logs(task)
    print(f'Task: {task}')

    metric = CORRECTNESS_METRIC_BY_TASK.get(task, 'math_equal')
    ids_correct = {x['id'] for x in logs if x['Metrics'][metric]}
    ids_wrong = {x['id'] for x in logs if not x['Metrics'][metric]}
    # The two sets are disjoint, so this only holds when example ids are unique.
    assert len(ids_correct) + len(ids_wrong) == len(logs)
    print(f'Loaded {len(logs)} examples: {len(ids_correct)} correct and {len(ids_wrong)} wrong.')

    n_steps_v1, n_steps_v2, n_steps_v2_correct, n_steps_v2_wrong = [], [], [], []
    for entry in logs:
        # Only the reasoning part (before the closing think tag) is segmented.
        thoughts = entry['Output'].split('</think>')[0]
        n_steps_v1.append(len(segment_thoughts_v1(thoughts)))

        n_v2 = len(segment_thoughts_v2(thoughts))
        n_steps_v2.append(n_v2)
        if entry['id'] in ids_wrong:
            n_steps_v2_wrong.append(n_v2)
        else:
            n_steps_v2_correct.append(n_v2)

    print(_rounded_mean(n_steps_v1), _rounded_mean(n_steps_v2), _rounded_mean(n_steps_v2_correct), _rounded_mean(n_steps_v2_wrong))


for task_name in ['gpqa', 'aime', 'amc', 'math500', 'livecode', 'bamboogle']:
    report_thought_lengths(task_name)

Task: gpqa
Loaded 198 examples: 126 correct and 72 wrong.
170.0 51.0 41.0 69.0
Task: aime
Loaded 30 examples: 26 correct and 4 wrong.
398.0 77.0 66.0 150.0
Task: amc
Loaded 40 examples: 40 correct and 0 wrong.
235.0 37.0 37.0 nan
Task: math500
Loaded 500 examples: 457 correct and 43 wrong.


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


89.0 19.0 17.0 46.0
Task: livecode
Loaded 112 examples: 51 correct and 61 wrong.
590.0 96.0 70.0 117.0
Task: bamboogle
Loaded 125 examples: 60 correct and 65 wrong.
15.0 10.0 6.0 13.0


# Analysis: hint injection

In [6]:
# Step 1: visualize
task = 'math500'
logs = load_direct_pred_logs(task)
# Keep only the examples marked wrong by the 'math_equal' metric
# (alternative filter tried for other metrics: x['Metrics']['f1'] < 0.3).
logs_wrong = [x for x in logs if not x['Metrics']['math_equal']]

# Pick one wrong example to inspect.
idx = 14
entry = logs_wrong[idx]

# The stored prompt contains the chat template; keep only the text between
# 'Question:' and the end-of-turn marker.
question_text = entry['Question'].split('Question:')[1].split('<|im_end|>')[0].strip()
thinking_steps = segment_thoughts_v2(entry['Output'].split('</think>')[0])

header_template = 'Task id: {}\nQuestion: {}\nAnswer: {}\nPrediction: {} (wrong)\nThinking Process:'
print(header_template.format(entry['id'], question_text, entry['answer'], entry['Pred_Answer']))
# Steps are labelled from 0 here, i.e. by their index in the segment list.
print(json.dumps({f'Step {i}': t for i, t in enumerate(thinking_steps)}, indent=4))
print(entry['Metrics'])


Task id: 154
Question: A gecko is in a room that is 12 feet long, 10 feet wide and 8 feet tall. The gecko is currently on a side wall ($10^{\prime}$ by $8^{\prime}$), one foot from the ceiling and one foot from the back wall ($12^{\prime}$ by $8^{\prime}$). The gecko spots a fly on the opposite side wall, one foot from the floor and one foot from the front wall. What is the length of the shortest path the gecko can take to reach the fly assuming that it does not jump and can only walk across the ceiling and the walls? Express your answer in simplest radical form.
Answer: 2\sqrt{113}
Prediction: 2\sqrt{106} (wrong)
Thinking Process:
{
    "Step 0": "Okay, so there's this gecko in a room that's 12 feet long, 10 feet wide, and 8 feet tall. The gecko is on a side wall which is 10 feet by 8 feet. It's one foot away from the ceiling and one foot from the back wall (which is 12 feet by 8 feet). The fly is on the opposite side wall, one foot from the floor and one foot from the front wall. We 

In [7]:
# Step 2: inject hint from o3 and rerun inference

# Sentence appended to the instruction's opening sentence to announce that hints may appear.
_HINT_NOTICE = (' Hints might be provided during your question answering wrapped within [hint] and [end of hint]. '
                'If you see hints, try to leverage them to guide your thinking process.')

# Opening sentence of the instruction prompt, for each task that supports hint injection.
_HINT_INSTRUCTION_LEAD = {
    'aime': 'Please answer the following math question.',
    'amc': 'Please answer the following math question.',
    'math500': 'Please answer the following math question.',
    'bamboogle': 'Please answer the following question.',
}


def process_question_instruct_add_hint(instruction, task_name=None):
    """Rewrite a task instruction so that it tells the model hints may be injected.

    Args:
        instruction: the original prompt text; it must contain the task's
            opening instruction sentence.
        task_name: task whose prompt format applies. When omitted, the
            notebook-level `task` variable is used (the original behaviour).

    Returns:
        `instruction` with the hint notice inserted right after the opening
        instruction sentence.

    Raises:
        NotImplementedError: if hint injection is not defined for the task
            (including unrecognised task names, which previously failed with
            an UnboundLocalError).
        AssertionError: if the opening sentence is not found, i.e. the
            instruction would be returned unchanged.
    """
    if task_name is None:
        task_name = task  # fall back to the notebook-level setting

    lead = _HINT_INSTRUCTION_LEAD.get(task_name)
    if lead is None:
        raise NotImplementedError(f'Hint injection is not implemented for task {task_name!r}')

    instruction_new = instruction.replace(lead, lead + _HINT_NOTICE)
    assert instruction_new != instruction, f'Expected instruction sentence {lead!r} not found in the prompt'
    return instruction_new


# building the prompt
intervene_step = 44  # number of leading thought segments kept before the hint
hint = 'I should list every reasonable line of attack, evaluate each one fully, and only then decide which gives the optimum result, instead of locking in the first promising outcome.'
formatted_question = process_question_instruct_add_hint(entry['Question'])
hint_str = f'[hint] {hint} [end of hint]'

# Prompt = rewritten question + the first `intervene_step` thought segments + the hint,
# separated by blank lines and ending with a blank line so the model continues from there.
original_steps = segment_thoughts_v2(entry['Output'].split('</think>')[0])
kept_steps = [step.strip() for step in original_steps[:intervene_step]]
final_prompt = formatted_question + '\n\n'.join(kept_steps + [hint_str]) + '\n\n'
print(final_prompt)

# prompting
print('==================\n')
print('Running prompting...')
completion = run_generate_with_backoff(
    client,
    model=model,
    prompt=final_prompt,
    max_tokens=30000,
    temperature=0.7,
    top_p=0.8,
    timeout=60*60,
    n=5,  # sample five continuations
    extra_body={'top_k': 20, 'repetition_penalty': 1.05},
)

print('Prompting done. Continued thoughts:')
separator = '\n==================\n\n'
print(separator)
for choice in completion.choices:
    # Show only the reasoning part of each sampled continuation.
    visualize_thoughts_v2(choice.text.split('</think>')[0])
    print(separator)


<|im_start|>user
Please answer the following math question. Hints might be provided during your question answering wrapped within [hint] and [end of hint]. If you see hints, try to leverage them to guide your thinking process. You should provide your final answer in the format \boxed{YOUR_ANSWER}.

Question:
A gecko is in a room that is 12 feet long, 10 feet wide and 8 feet tall. The gecko is currently on a side wall ($10^{\prime}$ by $8^{\prime}$), one foot from the ceiling and one foot from the back wall ($12^{\prime}$ by $8^{\prime}$). The gecko spots a fly on the opposite side wall, one foot from the floor and one foot from the front wall. What is the length of the shortest path the gecko can take to reach the fly assuming that it does not jump and can only walk across the ceiling and the walls? Express your answer in simplest radical form.

<|im_end|>
<|im_start|>assistant
<think>
Okay, so there's this gecko in a room that's 12 feet long, 10 feet wide, and 8 feet tall. The gecko i

# Analysis: rollout-based failure point identification


In [None]:
# At different positions in the reasoning trace, roll out to the end (10 samples per position).

# Check the pass rate at each position to see whether there are significant points of failure.

# let's start with first



