In [1]:
import os
import json
from tqdm import tqdm
import torch
import numpy as np
from openai import OpenAI
import backoff

In [2]:
# Endpoint configuration: an OpenAI-compatible server on localhost and the
# model name used for requests.
port = 8001
model = 'Qwen/QwQ-32B'

# Per-request timeout handed to the client, in seconds (24 hours).
OPENAI_REQUEST_TIMEOUT = 24 * 60 * 60

# The api_key is a placeholder string.
# NOTE(review): presumably the local server does not validate it -- confirm.
client = OpenAI(
    base_url=f"http://localhost:{port}/v1",
    api_key="EMPTY",
    timeout=OPENAI_REQUEST_TIMEOUT,
)

# Connectivity check: print the models the server advertises.
print(client.models.list())

# Retry on any Exception, waiting a constant 5 seconds between attempts.
# NOTE(review): no max_tries / max_time is passed to the decorator, so a
# non-transient failure (e.g. invalid kwargs) looks like it would be retried
# indefinitely -- confirm this is intended.
@backoff.on_exception(backoff.constant, Exception, interval=5)
def run_chat_completion_with_backoff(client, **kwargs):
    """Issue a chat-completion request through `client`, with retries.

    Args:
        client: object exposing `chat.completions.create`.
        **kwargs: forwarded unchanged to `client.chat.completions.create`.

    Returns:
        Whatever `client.chat.completions.create(**kwargs)` returns.
    """
    response = client.chat.completions.create(**kwargs)
    return response


# Retry on any Exception, waiting a constant 5 seconds between attempts.
# NOTE(review): no max_tries / max_time is passed to the decorator, so a
# non-transient failure (e.g. invalid kwargs) looks like it would be retried
# indefinitely -- confirm this is intended.
@backoff.on_exception(backoff.constant, Exception, interval=5)
def run_generate_with_backoff(client, **kwargs):
    """Issue a plain (non-chat) completion request through `client`, with retries.

    Args:
        client: object exposing `completions.create`.
        **kwargs: forwarded unchanged to `client.completions.create`.

    Returns:
        Whatever `client.completions.create(**kwargs)` returns.
    """
    response = client.completions.create(**kwargs)
    return response

SyncPage[Model](data=[Model(id='Qwen/QwQ-32B', created=1751487253, object='model', owned_by='vllm', root='Qwen/QwQ-32B', parent=None, max_model_len=40960, permission=[{'id': 'modelperm-3171155324c54aa6ae5ccbaed41161ff', 'object': 'model_permission', 'created': 1751487253, 'allow_create_engine': False, 'allow_sampling': True, 'allow_logprobs': True, 'allow_search_indices': False, 'allow_view': True, 'allow_fine_tuning': False, 'organization': '*', 'group': None, 'is_blocking': False}])], object='list')


In [59]:
# analyses (direct prediction without any search)
#
# For each task: load the run log, split examples into correct / wrong using
# the task's metric, then report the mean number of v2 thought segments over
# the wrong examples.
#
# NOTE: this cell calls segment_thoughts_v1 / segment_thoughts_v2, which are
# defined in a cell further down in this notebook; that cell must be executed
# before this one.

# Log file of the direct-prediction run for each task.
TASK_LOG_PATHS = {
    'gpqa': '/fsx-comem/diwu0162/Search-o1/outputs/gpqa.qwq.direct/diamond.7.1,20:8.json',
    'aime': '/fsx-comem/diwu0162/Search-o1/outputs/aime.qwq.direct/test.7.1,19:42.json',
    'amc': '/fsx-comem/diwu0162/Search-o1/outputs/amc.qwq.direct/test.7.1,19:24.json',
    'math500': '/fsx-comem/diwu0162/Search-o1/outputs/math500.qwq.direct/test.7.1,20:17.json',
    'livecode': '/fsx-comem/diwu0162/Search-o1/outputs/livecode.qwq.direct/test_1to4.7.1,22:30.json',
    'bamboogle': '/fsx-comem/diwu0162/Search-o1/outputs/runs.qa/bamboogle.qwq.direct/test.7.1,16:22.json',
}

# Key inside entry['Metrics'] whose truthiness marks an example as correct.
# Tasks not listed here use 'math_equal'.
TASK_METRIC_KEYS = {
    'bamboogle': 'em',
    'livecode': 'pass@1',
}

# for task in ['bamboogle']:
for task in ['gpqa', 'aime', 'amc', 'math500', 'livecode', 'bamboogle']:
    if task not in TASK_LOG_PATHS:
        raise NotImplementedError

    # Context manager so the file handle is closed right after loading.
    with open(TASK_LOG_PATHS[task]) as log_file:
        logs = json.load(log_file)

    print(f'Task: {task}')

    metric_key = TASK_METRIC_KEYS.get(task, 'math_equal')
    ids_correct = {x['id'] for x in logs if x['Metrics'][metric_key]}
    ids_wrong = {x['id'] for x in logs if not x['Metrics'][metric_key]}
    # Sets collapse repeated ids, so this fails if an id occurs more than once
    # within the same (correct / wrong) group.
    assert len(ids_correct) + len(ids_wrong) == len(logs)
    print(f'Loaded {len(logs)} examples: {len(ids_correct)} correct and {len(ids_wrong)} wrong.')

    # Segment counts per wrong example, one list per segmentation scheme.
    n_steps_v1, n_steps_v2 = [], []
    for entry in logs:
        if entry['id'] not in ids_wrong:
            continue

        # Keep only the text before the first '</think>' marker.
        thoughts = entry['Output'].split('</think>')[0]

        thoughts_segmented_v1 = segment_thoughts_v1(thoughts)
        n_steps_v1.append(len(thoughts_segmented_v1))

        thoughts_segmented_v2 = segment_thoughts_v2(thoughts)
        n_steps_v2.append(len(thoughts_segmented_v2))

    # np.mean([]) returns nan but also emits a RuntimeWarning; when a task has
    # no wrong examples, report nan directly instead.
    mean_steps_v2 = np.mean(n_steps_v2) if n_steps_v2 else float('nan')
    print(round(mean_steps_v2, 2))

Task: gpqa
Loaded 198 examples: 126 correct and 72 wrong.
68.46
Task: aime
Loaded 30 examples: 26 correct and 4 wrong.
147.25
Task: amc
Loaded 40 examples: 40 correct and 0 wrong.
nan
Task: math500
Loaded 500 examples: 457 correct and 43 wrong.
45.47
Task: livecode
Loaded 112 examples: 51 correct and 61 wrong.
113.85
Task: bamboogle
Loaded 125 examples: 60 correct and 65 wrong.
13.08


In [50]:
# experiment with thought segmentation

def segment_thoughts_v1(x):
    """Split a reasoning trace into paragraphs.

    Surrounding whitespace is stripped first, then the text is split on
    double newlines. An empty or whitespace-only input yields [''].

    Args:
        x: the reasoning text.

    Returns:
        List of paragraph strings (always at least one element).
    """
    return x.strip().split('\n\n')


# Paragraph openers treated as the start of a new reasoning step.
# note: excluding things like "so" "therefore", "but", "let me"
_REASONING_WORD_LIST = (
    'okay', 'hmm', 'wait', 'but wait', 'but let me', 'but actually', 'alternatively', 'now', 'the question',
    'ah', 'next', 'another angle', 'another approach', 'also', 'hold on', 'looking it up', 'another point',
    'I don\'t think', 'perhaps I', 'putting this together', 'Putting it all together', 'i\'m', 'but i\'m',
    'let me think again', 'I don\'t see',
)


def segment_thoughts_v2(x, reasoning_word_list=None):
    """Group paragraphs of a reasoning trace into coarser reasoning steps.

    The text is first split with `segment_thoughts_v1`. A paragraph opens a
    new step when it begins (case-insensitively) with one of the marker
    phrases; otherwise it is appended to the previous step, re-joined with a
    double newline. The first paragraph always opens a step.

    Matching is a plain prefix test with no word-boundary check, so the
    marker 'ah' also matches a paragraph starting with "Ahead".

    Args:
        x: the reasoning text.
        reasoning_word_list: optional iterable of marker phrases; when None,
            the built-in list is used.

    Returns:
        List of step strings (always at least one element).
    """
    if reasoning_word_list is None:
        reasoning_word_list = _REASONING_WORD_LIST
    # Lower-case the markers once per call; str.startswith accepts a tuple.
    prefixes = tuple(marker.lower() for marker in reasoning_word_list)

    final_thoughts = []
    for paragraph in segment_thoughts_v1(x):
        if not final_thoughts or paragraph.lower().startswith(prefixes):
            final_thoughts.append(paragraph)
        else:
            final_thoughts[-1] += '\n\n' + paragraph
    return final_thoughts

    




9.59
