In [50]:
import json
import os
import sys
sys.path.append("../..")
from tqdm import trange, tqdm
from langagent.base_llm import InferenceLogger, HfChatModel, VALID_ROLES_PREFIX, DETERMINISTIC_TEMPERATURE
from langagent.metrics import get_inference_cost_metrics
from langagent.eval import ResultToTxtLine, ResultDictToJsonl
import matplotlib.pyplot as plt
from langagent.langreason.common import load_qa_dataset

run_id = "rest_cot"
dataset_name ="math500"  #"gsm8k" #
run_id = f"{dataset_name}_{run_id}"
model_name =  "Meta-Llama-3-8B-Instruct" #"Qwen3-32B-AWQ" #"Meta-Llama-3-8B" #
root_dir = f"{model_name}_results/{run_id}/" #run_qwen2/
assert os.path.exists(root_dir), f"Root directory {root_dir} does not exist."

In [None]:
inference_logger, metrics = get_inference_cost_metrics(root_dir, return_metrics=['num_calls', 'input_tokens', 'output_tokens', 'total_hours'])   
print(metrics)

Result file Meta-Llama-3-8B-Instruct_results/math500_rest_cot/inferencelogger.log already exists. I will append to it. 
Efficiency metrics - Roles
default :  {'num_calls': '100', 'input_tokens': '13911', 'output_tokens': '37922', 'total_hours': '0.30664499554369185'}
dynamics :  {'num_calls': '0', 'input_tokens': '0', 'output_tokens': '0', 'total_hours': '0.0'}
policy :  {'num_calls': '0', 'input_tokens': '0', 'output_tokens': '0', 'total_hours': '0.0'}
evaluator :  {'num_calls': '0', 'input_tokens': '0', 'output_tokens': '0', 'total_hours': '0.0'}
bn_eval :  {'num_calls': '0', 'input_tokens': '0', 'output_tokens': '0', 'total_hours': '0.0'}
bn_entropy :  {'num_calls': '0', 'input_tokens': '0', 'output_tokens': '0', 'total_hours': '0.0'}
{'num_calls': 100, 'input_tokens': 13911, 'output_tokens': 37922, 'total_hours': 0.31}


In [52]:
results_file = ResultDictToJsonl(run_id='', root_dir=root_dir, override=False)
existing_results = results_file.results
print(len(results_file.results))
# full_dataset = load_qa_dataset(dataset_name)
# metrics = get_accuracy(full_dataset, results_file)
# print("Accuracy:", metrics["accuracy"])


Result file Meta-Llama-3-8B-Instruct_results/math500_rest_cot/resultdicttojsonl.jsonl already exists. Loading existing results.
100


In [53]:
def get_accuracy( existing_results, exclude_idx=[], include_idx=[]):
    num_correct = 0
    num_total = 0
    incorrect_idx = []
    num_error = 0
    for idx, rec in enumerate(existing_results):
        if idx in exclude_idx:
            continue
        if include_idx and idx not in include_idx:
            continue
        num_total += 1
        try:
            if float(rec.get("label", "")) == float(rec.get("truth", "")):
                num_correct += 1
            else: 
                incorrect_idx.append(idx)
                # print("Predicted:", float(rec.get("label", "")), "Truth:", float(rec.get("truth", "")))
        # catch TypeError, ValueError
        except (TypeError, ValueError) as e:
            num_error += 1
            # print(f"Error for record {rec}: {e}")
    print(f"Correct #: {num_correct}; Incorrect #: {len(incorrect_idx)}; Errors #: {num_error};Total #: {num_total}")
    accuracy = num_correct / num_total if num_total > 0 else 0
    return accuracy, num_correct, incorrect_idx

In [55]:
acc, num_correct, incorrect_idx = get_accuracy(existing_results) # include_idx=list(range(77))
print(acc)

Correct #: 35; Incorrect #: 49; Errors #: 16;Total #: 100
0.35


In [49]:
exclude_idx = []
with open("data/math500_float_answer_idx_by_level.jsonl", "r") as f:
    loaded_idx_by_level = json.load(f)

max_idx = 100
overall_num_correct = 0
incorrect_idx = []
for level in range(1, 6):
    include_idx = loaded_idx_by_level.get(str(level), [])
    if max_idx is not None:
        include_idx = [idx for idx in include_idx if idx < max_idx]
    print("Level:", level)
    acc, num_correct, incorrect_idx = get_accuracy(existing_results, exclude_idx=exclude_idx, include_idx=include_idx)
    overall_num_correct += num_correct
    print(f"Accuracy ({len(incorrect_idx)}/{len(include_idx)}): {acc}")
    print("Incorrect indices:", incorrect_idx)
print("Overall:", get_accuracy(existing_results, exclude_idx=exclude_idx, include_idx=list(range(max_idx)) if max_idx is not None else None))

Level: 1
Correct #: 8; Incorrect #: 3; Errors #: 0;Total #: 11
Accuracy (3/11): 0.7272727272727273
Incorrect indices: [8, 47, 54]
Level: 2
Correct #: 11; Incorrect #: 11; Errors #: 0;Total #: 22
Accuracy (11/22): 0.5
Incorrect indices: [1, 15, 18, 22, 27, 35, 37, 51, 60, 71, 95]
Level: 3
Correct #: 8; Incorrect #: 10; Errors #: 4;Total #: 22
Accuracy (10/22): 0.36363636363636365
Incorrect indices: [2, 4, 10, 19, 38, 50, 57, 63, 75, 77]
Level: 4
Correct #: 4; Incorrect #: 13; Errors #: 4;Total #: 21
Accuracy (13/21): 0.19047619047619047
Incorrect indices: [17, 23, 33, 43, 52, 56, 62, 68, 78, 79, 81, 82, 99]
Level: 5
Correct #: 2; Incorrect #: 18; Errors #: 4;Total #: 24
Accuracy (18/24): 0.08333333333333333
Incorrect indices: [5, 11, 12, 13, 28, 30, 31, 40, 49, 55, 59, 61, 74, 91, 92, 93, 96, 97]
Correct #: 33; Incorrect #: 55; Errors #: 12;Total #: 100
Overall: (0.33, 33, [1, 2, 4, 5, 8, 10, 11, 12, 13, 15, 17, 18, 19, 22, 23, 27, 28, 30, 31, 33, 35, 37, 38, 40, 43, 47, 49, 50, 51, 52,

In [None]:
# 0.31645569620253167
# 0.3227848101265823
overall_num_correct

38