In [1]:
import os
import json
import pandas as pd
from glob import glob


In [2]:
data = glob("data/eval_outputs/**/*CoT@1.json", recursive=True)
print(data)

def parse_file(path):
    splited = path.split("/")
    return {
        "task_id": splited[-1].replace(".json", ""),
        "query_file": splited[-2],
        "type": splited[-3],
        "model": splited[-4],
        "path": path
    }

def load_json(path):
    with open(path) as f:
        return json.load(f)

df = pd.DataFrame([parse_file(path) for path in data])
df["json"] = df["path"].apply(load_json)
print(f"{len(df)} Loaded")


['data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/code_as_action/G1_instruction/32807_CoT@1.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/code_as_action/G1_instruction/69206_CoT@1.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/code_as_action/G1_instruction/26892_CoT@1.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/code_as_action/G1_instruction/44482_CoT@1.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/code_as_action/G1_instruction/71402_CoT@1.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/code_as_action/G1_instruction/55489_CoT@1.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/code_as_action/G1_instruction/79053_CoT@1.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/code_as_action/G1_instruction/10160_CoT@1.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/code_as_action/G1_instruction/59954_CoT@1.json', 'data/eval_outputs/chat_completion:g

In [3]:
df["returned_final_answer"] = df["json"].apply(lambda x: x["win"])


In [4]:
df.groupby([
    "model",
    "query_file",
    "type",
])["returned_final_answer"].agg(["mean", "count"])\
    .rename(columns={"mean": "pct_valid_trajectory", "count": "count"})\
    .sort_index(level=[0, 1, 2], ascending=[True, True, True])\
    .unstack(level=[2])\
    .style.background_gradient(cmap='Blues', axis=1, subset=["pct_valid_trajectory"])


Unnamed: 0_level_0,Unnamed: 1_level_0,pct_valid_trajectory,pct_valid_trajectory,count,count
Unnamed: 0_level_1,type,code_as_action,json_as_action,code_as_action,json_as_action
model,query_file,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
chat_completion:gpt-3.5-turbo-16k-0613,G1_instruction,0.29,0.075,200,200
chat_completion:gpt-3.5-turbo-16k-0613,G2_instruction,0.36,0.055,200,200
chat_completion:gpt-3.5-turbo-16k-0613,G3_instruction,0.23,0.03,100,100


## Results

In [5]:
result_files = glob("data/eval_outputs/**/pass_rate_results/*.json", recursive=True)
print(result_files)

def parse_result_file_path(path):
    splited = path.split("/")
    return {
        "query_file": splited[-1].replace(".json", ""),
        "type": splited[-3],
        "model": splited[-4],
        "path": path
    }


result_df = pd.DataFrame([parse_result_file_path(path) for path in result_files])
result_df["json"] = result_df["path"].apply(load_json)
result_df["pass_rate"] = result_df["json"].apply(lambda x: sum([v["passed"] for v in x.values()]) / len(x))
result_df["count"] = result_df["json"].apply(lambda x: len(x))
print(f"{len(result_df)} Loaded")


['data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/code_as_action/pass_rate_results/G1_instruction.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/code_as_action/pass_rate_results/G2_instruction.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/code_as_action/pass_rate_results/G3_instruction.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/json_as_action/pass_rate_results/G1_instruction.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/json_as_action/pass_rate_results/G3_instruction.json', 'data/eval_outputs/chat_completion:gpt-3.5-turbo-16k-0613/json_as_action/pass_rate_results/G2_instruction.json']
6 Loaded


In [6]:
result_df[["query_file", "type", "model", "pass_rate", "count"]]


Unnamed: 0,query_file,type,model,pass_rate,count
0,G1_instruction,code_as_action,chat_completion:gpt-3.5-turbo-16k-0613,0.0,200
1,G2_instruction,code_as_action,chat_completion:gpt-3.5-turbo-16k-0613,0.0,200
2,G3_instruction,code_as_action,chat_completion:gpt-3.5-turbo-16k-0613,0.0,100
3,G1_instruction,json_as_action,chat_completion:gpt-3.5-turbo-16k-0613,0.0,200
4,G3_instruction,json_as_action,chat_completion:gpt-3.5-turbo-16k-0613,0.0,100
5,G2_instruction,json_as_action,chat_completion:gpt-3.5-turbo-16k-0613,0.0,200
