In [24]:
from glob import glob
import os
import re
import sys
import json
import pathlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
from collections import Counter


In [25]:
# Should be your path to the repo `mint`: the working directory is expected to
# sit four directory levels below the repository root.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))))
# Make the repo importable; the guard keeps re-runs of this cell from stacking
# duplicate entries at the front of sys.path.
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)
DATA_DIR = os.path.join(ROOT_DIR, "data", "outputs")
print(f"Data directory: {DATA_DIR}")
glob_pattern = f"{DATA_DIR}/**/*results.jsonl"
# De-duplicate and sort the matches. Set iteration order varies between kernel
# sessions, and the later `drop_duplicates(keep="first")` depends on the order
# in which files were loaded, so a stable order keeps the analysis reproducible.
filepaths = sorted(set(glob(glob_pattern, recursive=True)))
print(f"Matching glob pattern: `{glob_pattern}`. **{len(filepaths)}** files found.")


def parse_filepath(filepath):
    """Derive experiment metadata from the location of a results file.

    The path is made relative to ``DATA_DIR`` and split on "/". The indexing
    below implies this layout:

        <agent_model>/F=<feedback_model>/[<feedback_setting>/]<exp_setting>/<task_type>/<task_name>/<split>/results.jsonl

    The ``<feedback_setting>`` directory is only read when the feedback model
    is not the literal string "None".

    Returns a dict with keys ``agent_model_name``, ``feedback_model_name``,
    ``feedback_setting``, ``task_name``, ``task_type``, ``split``,
    ``exp_setting`` and ``filepath`` (the unmodified input).
    """
    relative_path = filepath.replace(DATA_DIR, "").lstrip("/")
    parts = relative_path.split("/")

    # The second component looks like "F=<model>"; keep what follows the "=".
    feedback_model = parts[1].split("=")[1]
    feedback_setting = parts[2] if feedback_model != "None" else "None"

    # Everything else is addressed from the end of the path, so the optional
    # feedback-setting directory does not shift these positions.
    exp_setting, task_type, task_name, split = parts[-5], parts[-4], parts[-3], parts[-2]

    return dict(
        agent_model_name=parts[0],
        feedback_model_name=feedback_model,
        feedback_setting=feedback_setting,
        task_name=task_name,
        task_type=task_type,
        split=split,
        exp_setting=exp_setting,
        filepath=filepath,
    )

# One row of path-derived metadata per results file.
df = pd.DataFrame([parse_filepath(path) for path in filepaths])

def load_results(filepath):
    """Load one JSON-lines results file into a DataFrame (one row per record).

    Blank or whitespace-only lines are skipped silently. Lines that are not
    valid JSON are reported on stdout and dropped (best effort); the most
    recent offending line is also stored in the module-level ``error_line``
    so it can be inspected interactively.

    Args:
        filepath: Path to a ``*.jsonl`` file.

    Returns:
        ``pd.DataFrame`` built from the successfully parsed records (empty
        when no line could be parsed).
    """
    results = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # e.g. a trailing newline at end of file -- not an error.
                continue
            try:
                results.append(json.loads(line))
            except ValueError as e:  # json.JSONDecodeError is a ValueError
                print(f"Error loading {filepath}: {e}\n{line}")
                globals()["error_line"] = line
    return pd.DataFrame(results)

# Parse every results file and store the resulting per-file DataFrame in the
# row that describes that file.
df["results"] = df.filepath.apply(load_results)


def rename_model(model_name):
    """Return ``model_name`` without a trailing "-hf" suffix.

    ``str.rstrip("-hf")`` must not be used here: it strips any run of the
    characters "-", "h" and "f" from the end, so "wizard-math-hf" would
    become "wizard-mat". Only the exact suffix is removed; names that do not
    end in "-hf" are returned unchanged.
    """
    suffix = "-hf"
    if model_name.endswith(suffix):
        model_name = model_name[: -len(suffix)]
    return model_name

# Stamp each per-file frame with the metadata parsed from its path, then stack
# all frames into a single table. The frames stored in `df` are modified in
# place, exactly as before.
_per_file_frames = []
for file_row in df.itertuples():
    frame = file_row.results
    frame["agent_model_name"] = rename_model(file_row.agent_model_name)
    frame["feedback_model_name"] = rename_model(file_row.feedback_model_name)
    for _col in ("feedback_setting", "exp_setting", "task_name", "task_type"):
        frame[_col] = getattr(file_row, _col)
    _per_file_frames.append(frame)

all_results = pd.concat(_per_file_frames)
def get_stats(row):
    """Extract per-run summary fields from one result record.

    Reads ``row["state"]`` and ``row["task"]`` and returns a dict with:
    ``task_id``, ``n_turns`` (half the history length, rounded down),
    ``success``, ``agent_action_count``, ``token_counter`` and
    ``terminate_reason``.

    ``token_counter`` is returned wrapped as ``{'a': Counter(...), 'b': 1}``;
    the caller unwraps the ``'a'`` entry afterwards. (Presumably the wrapper
    keeps the Counter intact when the row-wise apply expands the dict --
    TODO confirm.)
    """
    state, task = row["state"], row["task"]

    stats = {}
    stats["task_id"] = task["task_id"]
    stats["n_turns"] = len(state["history"]) // 2
    stats["success"] = state["success"]
    stats["agent_action_count"] = state["agent_action_count"]
    stats["token_counter"] = {'a': Counter(state["token_counter"]), 'b': 1}
    stats["terminate_reason"] = state["terminate_reason"]
    return stats


# Compute the per-run summary for every row and append it as new columns
# next to the raw record columns.
stats = all_results.apply(get_stats, axis=1, result_type="expand")
all_results = pd.concat([all_results, stats], axis=1)
# get_stats returns the token counter wrapped as {'a': Counter, 'b': 1};
# unwrap it so the column holds the Counter itself.
all_results["token_counter"] = all_results["token_counter"].apply(lambda x: x["a"])

# Turn the boolean success flag into 0/1 so it can be summed and averaged.
all_results['success'] = all_results['success'].astype(int)

# Snapshot taken BEFORE the model-name merge below.
all_results_unfiltered = all_results.copy()

# Special handling: report gpt-3.5-turbo-16k-0613 runs under gpt-3.5-turbo-0613.
# NOTE(review): the original comment said this was for code_generation only,
# but the mask below is not restricted by task type -- it renames every
# matching row. Confirm that is intended.
all_results.loc[all_results.agent_model_name == "gpt-3.5-turbo-16k-0613", "agent_model_name"] = "gpt-3.5-turbo-0613"


Data directory: /shared/nas2/xingyao6/projects/llm-agent/data/outputs
Matching glob pattern: `/shared/nas2/xingyao6/projects/llm-agent/data/outputs/**/*results.jsonl`. **46** files found.


In [4]:
# Guard against the same run being loaded twice: a run is identified by its
# task plus the full experiment configuration, and the first occurrence wins.
_RUN_KEY = [
    "task_type",
    "task_name",
    "task_id",
    "agent_model_name",
    "feedback_model_name",
    "feedback_setting",
    "exp_setting",
]
all_results_no_dup = all_results.drop_duplicates(subset=_RUN_KEY, keep="first")
_n_removed = len(all_results) - len(all_results_no_dup)
if _n_removed != 0:
    print(f"WARNING: Removed {_n_removed} duplicated rows.")
    all_results = all_results_no_dup


In [26]:
# Sanity check of experiments - check whether they are all completed.
# Rows: one per experiment configuration; columns: one per task name;
# values: number of result rows (0 where a task has no results at all).
_experiment_and_task = [
    "agent_model_name",
    "feedback_model_name",
    "feedback_setting",
    "exp_setting",
    "task_name",
]
all_results_count = (
    all_results.groupby(_experiment_and_task)["task_id"]
    .count()
    .unstack()
    .fillna(0)
)

_per_task_view = all_results_count.T.astype(int)
display(_per_task_view.style.background_gradient(cmap='Blues', axis=1))

_per_experiment_total = all_results_count.sum(axis=1).unstack().fillna(0).astype(int)
display(_per_experiment_total.style.background_gradient(cmap='Blues', axis=0))

# separate the results from gpt4
_is_gpt4 = all_results.agent_model_name.str.startswith("gpt-4")
gpt4_results = all_results[_is_gpt4]
all_results = all_results[~_is_gpt4]


agent_model_name,claude-2,claude-instant-1,gpt-3.5-turbo-0613,gpt-4-0613
feedback_model_name,None,None,None,None
feedback_setting,None,None,None,None
exp_setting,max5_p2+tool+cd,max5_p2+tool+cd,max5_p2+tool+cd,max5_p2+tool+cd
task_name,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
APPS,4439,4439,4439,595
alfworld,3553,3553,3553,0
algebra,1226,1226,1226,334
counting_and_probability,602,602,602,311
geometry,727,727,727,500
hotpotqa,3000,3000,3000,500
intermediate_algebra,1070,1070,1070,500
number_theory,691,691,691,252
prealgebra,750,750,750,172
precalculus,520,520,520,381


Unnamed: 0_level_0,Unnamed: 1_level_0,exp_setting,max5_p2+tool+cd
agent_model_name,feedback_model_name,feedback_setting,Unnamed: 3_level_1
claude-2,,,21868
claude-instant-1,,,21868
gpt-3.5-turbo-0613,,,21868
gpt-4-0613,,,3574


In [27]:
# GPT-4 runs ordered by number of turns. Only `n_turns` and `success` are
# shown, so the row-by-row expansion of `token_counter` into columns (which
# was computed and then discarded) is not needed.
gpt4_results[["n_turns", "success"]].sort_values("n_turns")


Unnamed: 0,n_turns,success
464,1,1
463,1,1
419,1,1
300,1,1
305,1,1
...,...,...
6,5,0
5,5,0
498,5,0
499,5,0


In [32]:
# GPT-4 runs ordered by how many turns they took.
_turn_cols = ["n_turns", "success"]
gpt4_results[_turn_cols].sort_values(by="n_turns")


Unnamed: 0,n_turns,success
464,1,1
463,1,1
419,1,1
300,1,1
305,1,1
...,...,...
6,5,0
5,5,0
498,5,0
499,5,0


In [28]:
# Distribution of turn counts for failed (0) vs. successful (1) GPT-4 runs.
# Only `n_turns` and `success` are used, so `token_counter` is not expanded
# into columns first (those columns were discarded immediately).
gpt4_results[["n_turns", "success"]].groupby("success")[["n_turns"]].describe()


Unnamed: 0_level_0,n_turns,n_turns,n_turns,n_turns,n_turns,n_turns,n_turns,n_turns
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
success,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,2535.0,4.344773,1.055423,2.0,4.0,5.0,5.0,5.0
1,1039.0,2.666987,1.079132,1.0,2.0,2.0,3.0,5.0


In [34]:
# Number of GPT-4 runs that succeeded but needed more than two turns.
_needed_many_turns = gpt4_results["n_turns"] > 2
_succeeded = gpt4_results["success"] == 1
int((_needed_many_turns & _succeeded).sum())


423

In [36]:
# Filter out experiments that are not completed

# Expected number of result rows per task. An experiment only counts as
# complete when its per-task counts match this table exactly.
TASK_COUNT = {
    "prealgebra": 750,
    "number_theory": 691,
    "algebra": 1226,
    "precalculus": 520,
    "hotpotqa": 3000,
    "counting_and_probability": 602,
    "strategyqa": 2290,
    "intermediate_algebra": 1070,
    "geometry": 727,
    "APPS": 4439, # previously 2156 (reason for the change is not recorded here)
    "alfworld": 3553,
    "wiki_table_questions": 3000
}

# Disabled legacy check that compared per-task-type maxima to fixed totals:
# GLOBAL_MAX = all_results_count.max()
# assert (GLOBAL_MAX == pd.Series([136, 134, 316], index=["code_generation", "decision_making", "reasoning"])).all()
# Same table as a Series sorted by task name, so it can be compared
# element-wise with a sorted row of `all_results_count`.
TASK_COUNT_ROW = pd.Series(TASK_COUNT).sort_index()
display(TASK_COUNT_ROW)
def _exp_completed(row):
    """Tell whether one experiment has the expected row count for every task.

    ``row`` is a Series of per-task counts indexed by task name. It must have
    as many entries as ``TASK_COUNT_ROW`` (checked with an assert); after
    sorting by task name it is compared element-wise against
    ``TASK_COUNT_ROW``, and the experiment is complete only if all match.
    """
    assert len(row) == len(TASK_COUNT_ROW), f"row: {row}, TASK_COUNT_ROW: {TASK_COUNT_ROW}"
    counts_by_task = row.sort_index()
    matches_expected = counts_by_task == TASK_COUNT_ROW
    return matches_expected.all()

# One boolean per experiment configuration (the row index of
# `all_results_count`): True when every task has its expected count.
# `all_results_count` was built before the gpt-4 rows were split off, so
# gpt-4 configurations are evaluated here too.
completed_exp = all_results_count.apply(_exp_completed, axis=1)
# select only completed exp (drop the index entries whose value is False)
completed_exp = completed_exp.drop(completed_exp[completed_exp == False].index)#.reset_index().drop(columns=[0])
# display(completed_exp.to_frame().style.background_gradient(cmap='Blues', axis=0))

# Turn the remaining index into a set of tuples for fast membership tests.
# `reset_index()` moves the index levels into columns and stores the boolean
# values in a column named 0, which is dropped.
completed_exp_lst = set(map(tuple, completed_exp.reset_index().drop(columns=[0]).to_numpy().tolist()))
# Tuple layout: (agent_model_name, feedback_model_name, feedback_setting, exp_setting)
# completed_exp_lst
# Row-wise mask: True when the row's experiment configuration is complete.
_completed_mask = all_results.apply(lambda row: (row["agent_model_name"], row["feedback_model_name"], row["feedback_setting"], row["exp_setting"]) in completed_exp_lst, axis=1)
print(f"Before filtering: {len(all_results)}")
not_completed = all_results[~_completed_mask]
completed_results = all_results[_completed_mask]
# Expand the per-run `agent_action_count` mapping into one column per action
# type and append those columns to the completed results.
completed_results_w_stats = pd.concat([
    completed_results,
    completed_results["agent_action_count"].apply(pd.Series)
], axis=1)
print(f"After filtering: {len(completed_results)}")
print(completed_exp)


APPS                        4439
alfworld                    3553
algebra                     1226
counting_and_probability     602
geometry                     727
hotpotqa                    3000
intermediate_algebra        1070
number_theory                691
prealgebra                   750
precalculus                  520
strategyqa                  2290
wiki_table_questions        3000
dtype: int64

Before filtering: 65604
After filtering: 65604
agent_model_name    feedback_model_name  feedback_setting  exp_setting    
claude-2            None                 None              max5_p2+tool+cd    True
claude-instant-1    None                 None              max5_p2+tool+cd    True
gpt-3.5-turbo-0613  None                 None              max5_p2+tool+cd    True
dtype: bool


In [37]:
# Expand each GPT-4 run's `agent_action_count` mapping into one column per
# action type and append those columns to the GPT-4 results.
_gpt4_action_counts = gpt4_results["agent_action_count"].apply(pd.Series)
gpt4_completed_results_w_stats = pd.concat([gpt4_results, _gpt4_action_counts], axis=1)


In [11]:
# Agent models to exclude from the remaining analysis.
FILTERED_MODELS = [
    # TODO(user): fill in models you want to filter out
]
_is_kept = ~all_results["agent_model_name"].isin(FILTERED_MODELS)
all_results = all_results[_is_kept]
# Show which agent models are left.
sorted(all_results["agent_model_name"].unique().tolist())


['claude-2', 'claude-instant-1', 'gpt-3.5-turbo-0613']

### Cost Analysis

In [38]:
def get_token_count(df, cost_per_1k_token):
    """Print and display token usage and estimated cost per task.

    Args:
        df: Results table with at least the columns ``n_turns``,
            ``task_type``, ``task_name`` and ``token_counter`` (one Counter
            of token counts per run).
        cost_per_1k_token: Mapping from token-counter key (e.g.
            ``"prompt_tokens"``) to its price per 1000 tokens.

    Returns:
        None. All results are printed or shown with ``display``.

    Raises:
        KeyError: if a token-counter key other than ``total_tokens`` /
            ``feedback_total_tokens`` has no entry in ``cost_per_1k_token``.
    """
    # Reported as "rounds of feedback": every turn after the first one.
    n_round_of_feedback = (df["n_turns"] - 1).sum()
    print(f"Total rounds of feedback: {n_round_of_feedback}")

    print(f"Cost per 1k tokens:\n{cost_per_1k_token}")
    # Per (task_type, task_name): sum all Counters and count the runs.
    token_counter = df.groupby(["task_type", "task_name"])["token_counter"].apply(lambda x: (sum(x, Counter()), len(x)))
    token_counter = pd.DataFrame(token_counter.tolist(), index=token_counter.index, columns=["token_counter", "n_tasks"])
    # expand the token_counter: one column per Counter key, next to n_tasks
    token_counter = pd.concat([token_counter.drop(columns=["token_counter"]), token_counter["token_counter"].apply(pd.Series)], axis=1)
    # normalize by n_tasks
    n_tasks = token_counter["n_tasks"]
    print("Total Tokens")
    display(token_counter.sum(axis=0))
    # The division also turns the n_tasks column into 1.0, so the saved
    # counts are written back on the next line.
    token_counter = token_counter.div(token_counter.n_tasks, axis=0)#.drop(columns=["n_tasks"])
    token_counter["n_tasks"] = n_tasks
    print(f"Number of tokens per example")

    display(token_counter)

    # Totals are dropped so they are not priced on top of the prompt and
    # completion columns they are named after.
    costs_per_ex = token_counter.drop(columns=["n_tasks"])
    if "total_tokens" in costs_per_ex.columns:
        costs_per_ex = costs_per_ex.drop(columns=["total_tokens"])
    if "feedback_total_tokens" in costs_per_ex.columns:
        costs_per_ex = costs_per_ex.drop(columns=["feedback_total_tokens"])

    # Column-wise: tokens per example / 1000 * price for that column's key.
    costs_per_ex = costs_per_ex.div(1000).apply(lambda x: x * cost_per_1k_token[x.name])
    costs_per_ex["USD_per_example"] = costs_per_ex.sum(axis=1)
    costs_per_ex["n_tasks"] = token_counter["n_tasks"]
    # display(costs_per_ex)
    # styler gradient
    display(costs_per_ex.style.background_gradient(cmap="Blues", axis=None, subset=["USD_per_example"]))

    # Total cost = per-example cost times the number of runs, summed over tasks
    total_cost = (costs_per_ex["USD_per_example"] * costs_per_ex["n_tasks"]).sum()
    print(f"Total cost: ${total_cost:.2f}")

print("** No feedback **")
# Price per 1000 tokens for each token-counter key. The figures are hard-coded
# by the notebook author -- verify against current provider pricing.
GPT35_COST_PER_1K_TOKEN = {
    # Agent model: gpt-3.5-turbo
    "prompt_tokens": 0.0015,
    "completion_tokens": 0.002,
    # chat-bison-001 reports a single `token_count`; priced at 0 here
    "token_count": 0,
    # Alternative agent pricing (gpt-4), disabled:
    # "prompt_tokens": 0.03,
    # "completion_tokens": 0.04,
    # Alternative feedback pricing (gpt-3.5-turbo-16k), disabled:
    # "feedback_prompt_tokens": 0.003,
    # "feedback_completion_tokens": 0.004,
    # Feedback model: gpt-4
    "feedback_prompt_tokens": 0.03,
    "feedback_completion_tokens": 0.04,
}
# Cost of the gpt-3.5-turbo-0613 runs that used no feedback model.
get_token_count(all_results.query("agent_model_name == 'gpt-3.5-turbo-0613' and feedback_setting == 'None'"), GPT35_COST_PER_1K_TOKEN)


** No feedback **
Total rounds of feedback: 57521
Cost per 1k tokens:
{'prompt_tokens': 0.0015, 'completion_tokens': 0.002, 'token_count': 0, 'feedback_prompt_tokens': 0.03, 'feedback_completion_tokens': 0.04}
Total Tokens


n_tasks                  21868
prompt_tokens        131130326
completion_tokens     10727062
total_tokens         141857388
dtype: int64

Number of tokens per example


Unnamed: 0_level_0,Unnamed: 1_level_0,n_tasks,prompt_tokens,completion_tokens,total_tokens
task_type,task_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
code_generation,APPS,4439,8695.328452,799.622212,9494.950665
decision_making,alfworld,3553,7361.853645,508.993527,7870.847171
reasoning,algebra,1226,4333.333605,413.539152,4746.872757
reasoning,counting_and_probability,602,4911.335548,544.951827,5456.287375
reasoning,geometry,727,6246.950481,707.078404,6954.028886
reasoning,hotpotqa,3000,5531.042333,275.697,5806.739333
reasoning,intermediate_algebra,1070,6260.781308,765.708411,7026.48972
reasoning,number_theory,691,4423.749638,408.120116,4831.869754
reasoning,prealgebra,750,3800.381333,328.548,4128.929333
reasoning,precalculus,520,6542.832692,831.525,7374.357692


Unnamed: 0_level_0,Unnamed: 1_level_0,prompt_tokens,completion_tokens,USD_per_example,n_tasks
task_type,task_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
code_generation,APPS,0.013043,0.001599,0.014642,4439
decision_making,alfworld,0.011043,0.001018,0.012061,3553
reasoning,algebra,0.0065,0.000827,0.007327,1226
reasoning,counting_and_probability,0.007367,0.00109,0.008457,602
reasoning,geometry,0.00937,0.001414,0.010785,727
reasoning,hotpotqa,0.008297,0.000551,0.008848,3000
reasoning,intermediate_algebra,0.009391,0.001531,0.010923,1070
reasoning,number_theory,0.006636,0.000816,0.007452,691
reasoning,prealgebra,0.005701,0.000657,0.006358,750
reasoning,precalculus,0.009814,0.001663,0.011477,520


Total cost: $218.15


# Dataset Analysis

## Dataset Selection

In [38]:
# Row-wise mask over the completed results: True where the run did not succeed.
is_failed = completed_results_w_stats["success"].apply(lambda success_flag: not success_flag)


In [13]:
# Success statistics per agent model and task, with models as the outer
# column level.
_groupby = ["agent_model_name", "task_name"]
_success_by_group = completed_results_w_stats.groupby(_groupby)["success"]
_success_summary = pd.concat(
    [
        _success_by_group.count().rename("n_tasks"),
        _success_by_group.sum().rename("n_success_tasks"),
        _success_by_group.mean().rename("success_rate"),
    ],
    axis=1,
)
(
    _success_summary.unstack(0)
    .swaplevel(0, 1, axis=1)
    .sort_index(axis=1)
    .style.background_gradient(cmap="Blues", axis=0)
)


agent_model_name,claude-2,claude-2,claude-2,claude-instant-1,claude-instant-1,claude-instant-1,gpt-3.5-turbo-0613,gpt-3.5-turbo-0613,gpt-3.5-turbo-0613
Unnamed: 0_level_1,n_success_tasks,n_tasks,success_rate,n_success_tasks,n_tasks,success_rate,n_success_tasks,n_tasks,success_rate
task_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
APPS,1187,4439,0.267403,247,4439,0.055643,2109,4439,0.475107
alfworld,617,3553,0.173656,1826,3553,0.513932,1975,3553,0.555868
algebra,521,1226,0.424959,557,1226,0.454323,633,1226,0.516313
counting_and_probability,160,602,0.265781,138,602,0.229236,190,602,0.315615
geometry,119,727,0.163686,73,727,0.100413,81,727,0.111417
hotpotqa,1482,3000,0.494,1497,3000,0.499,1218,3000,0.406
intermediate_algebra,151,1070,0.141121,160,1070,0.149533,219,1070,0.204673
number_theory,220,691,0.318379,218,691,0.315485,344,691,0.497829
prealgebra,423,750,0.564,360,750,0.48,406,750,0.541333
precalculus,57,520,0.109615,54,520,0.103846,78,520,0.15


In [14]:
print("Mean number of actions (Success)")
# Average count of each action type per task, over successful runs only.
_action_cols = ["propose_solution", "use_tool", "invalid_action"]
_successful_runs = completed_results_w_stats[~is_failed]
(
    _successful_runs.groupby("task_name")[_action_cols]
    .mean()
    .style.background_gradient(cmap="Blues", axis=0)
)


Mean number of actions (Success)


Unnamed: 0_level_0,propose_solution,use_tool,invalid_action
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
APPS,1.104714,0.714366,0.076207
alfworld,0.057266,2.578769,0.000453
algebra,1.17066,0.897721,0.052016
counting_and_probability,1.213115,0.864754,0.067623
geometry,1.205128,0.619048,0.07326
hotpotqa,1.111032,1.226114,0.113653
intermediate_algebra,1.237736,1.273585,0.064151
number_theory,1.193095,1.190537,0.025575
prealgebra,1.160639,0.690496,0.047939
precalculus,1.26455,1.095238,0.084656


In [15]:
print("Mean number of actions (Failed)")
# Average count of each action type per task, over failed runs only.
_action_cols = ["propose_solution", "use_tool", "invalid_action"]
_failed_runs = completed_results_w_stats[is_failed]
(
    _failed_runs.groupby("task_name")[_action_cols]
    .mean()
    .style.background_gradient(cmap="Blues", axis=0)
)


Mean number of actions (Failed)


Unnamed: 0_level_0,propose_solution,use_tool,invalid_action
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
APPS,1.900757,1.243299,0.121649
alfworld,1.542541,2.675212,0.004967
algebra,1.811388,1.27758,0.162684
counting_and_probability,1.895296,1.10091,0.167678
geometry,1.783543,1.302411,0.240566
hotpotqa,1.652925,1.974807,0.372684
intermediate_algebra,1.701866,1.562687,0.238806
number_theory,1.865221,1.349342,0.093726
prealgebra,1.895382,1.038643,0.157399
precalculus,1.715536,1.512035,0.326039


### Easy Tier - These models can succeed within <= 2 turns

In [16]:
# Easy tier: successful runs that finished in at most two turns.
easy_instances = completed_results_w_stats[~is_failed].query("n_turns <= 2")
_n_easy = len(easy_instances)
_n_runs = len(completed_results_w_stats)
print(f"Number of easy instances: {_n_easy} / {_n_runs} ({_n_easy / _n_runs:.2%})")

# An instance is identified by (task_type, task_name, task_id), regardless of
# which model ran it.
_instance_key = ["task_type", "task_name", "task_id"]
all_unique_instances_task_name_id_pairs = set(completed_results_w_stats[_instance_key].apply(tuple, axis=1))
easy_instances_task_name_id_pairs = set(easy_instances[_instance_key].apply(tuple, axis=1))
_n_easy_unique = len(easy_instances_task_name_id_pairs)
_n_unique = len(all_unique_instances_task_name_id_pairs)
print(f"Number of unique easy instances: {_n_easy_unique} / {_n_unique} ({_n_easy_unique / _n_unique:.2%})")

# Easy runs per task, largest first.
_easy_per_task = easy_instances.groupby("task_name")["success"].count().rename("n_tasks")
_easy_per_task.sort_values(ascending=False).to_frame().style.background_gradient(cmap="Blues", axis=0)


Number of easy instances: 18447 / 65604 (28.12%)


Number of unique easy instances: 11277 / 21868 (51.57%)


Unnamed: 0_level_0,n_tasks
task_name,Unnamed: 1_level_1
strategyqa,4243
APPS,2876
wiki_table_questions,2740
hotpotqa,2567
alfworld,2319
algebra,1244
prealgebra,955
number_theory,512
counting_and_probability,368
intermediate_algebra,294


In [17]:
def save_instances(df, output_dir):
    """Write ``df`` to ``output_dir`` as one JSON-lines file per task.

    The directory is created if needed (including parents). Rows are grouped
    by ``task_name`` and each group is written to
    ``<output_dir>/<task_name lower-cased>.jsonl`` with one record per line.
    A confirmation line is printed for every file written.
    """
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    for task_name, task_rows in df.groupby("task_name"):
        jsonl_path = os.path.join(output_dir, f"{task_name.lower()}.jsonl")
        task_rows.to_json(jsonl_path, orient="records", lines=True)
        print(f"Saved {len(task_rows)} instances to {jsonl_path}")

# Persist the easy-tier trajectories, one file per task.
_easy_output_dir = os.path.join(ROOT_DIR, "data", "trajectories", "easy_instances")
save_instances(easy_instances, _easy_output_dir)


Saved 2876 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/easy_instances/apps.jsonl
Saved 2319 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/easy_instances/alfworld.jsonl
Saved 1244 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/easy_instances/algebra.jsonl
Saved 368 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/easy_instances/counting_and_probability.jsonl
Saved 213 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/easy_instances/geometry.jsonl
Saved 2567 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/easy_instances/hotpotqa.jsonl
Saved 294 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/easy_instances/intermediate_algebra.jsonl
Saved 512 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/easy_instances/number_theory.jsonl
Saved 955 instances to /shared/nas2/xingyao6/projects/llm-agent/data/tra

### Medium Tier - These models can succeed, but need more than 2 turns (up to 5)

In [18]:
# Medium tier: successful runs that needed more than two turns.
medium_instances = completed_results_w_stats[~is_failed].query("n_turns > 2")
_n_medium = len(medium_instances)
_n_runs = len(completed_results_w_stats)
print(f"Number of medium instances: {_n_medium} / {_n_runs} ({_n_medium / _n_runs:.2%})")

_instance_key = ["task_type", "task_name", "task_id"]
medium_instances_task_name_id_pairs = set(medium_instances[_instance_key].apply(tuple, axis=1))
_n_medium_unique = len(medium_instances_task_name_id_pairs)
_n_unique = len(all_unique_instances_task_name_id_pairs)
print(f"Number of unique medium instances: {_n_medium_unique} / {_n_unique} ({_n_medium_unique / _n_unique:.2%})")

# Medium runs per task, largest first.
_medium_per_task = medium_instances.groupby("task_name")["success"].count().rename("n_tasks")
_medium_per_task.sort_values(ascending=False).to_frame().style.background_gradient(cmap="Blues", axis=0)


Number of medium instances: 8580 / 65604 (13.08%)
Number of unique medium instances: 6941 / 21868 (31.74%)


Unnamed: 0_level_0,n_tasks
task_name,Unnamed: 1_level_1
alfworld,2099
hotpotqa,1630
strategyqa,1595
wiki_table_questions,1129
APPS,667
algebra,467
number_theory,270
intermediate_algebra,236
prealgebra,234
counting_and_probability,120


In [19]:
# Persist the medium-tier trajectories, one file per task.
_medium_output_dir = os.path.join(ROOT_DIR, "data", "trajectories", "medium_instances")
save_instances(medium_instances, _medium_output_dir)


Saved 667 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/medium_instances/apps.jsonl
Saved 2099 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/medium_instances/alfworld.jsonl
Saved 467 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/medium_instances/algebra.jsonl
Saved 120 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/medium_instances/counting_and_probability.jsonl
Saved 60 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/medium_instances/geometry.jsonl
Saved 1630 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/medium_instances/hotpotqa.jsonl
Saved 236 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/medium_instances/intermediate_algebra.jsonl
Saved 270 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/medium_instances/number_theory.jsonl
Saved 234 instances to /shared/nas2/xingyao6/projects/llm-a

### Hard Tier - Need to involve GPT-4

In [20]:
# Hard tier: instances that appear in neither the easy nor the medium set,
# i.e. no run of any of these models solved them.
unsolvable_name_id_pairs = all_unique_instances_task_name_id_pairs - easy_instances_task_name_id_pairs - medium_instances_task_name_id_pairs
_n_unsolved = len(unsolvable_name_id_pairs)
_n_unique = len(all_unique_instances_task_name_id_pairs)
print(f"Number of unsolvable instances: {_n_unsolved} / {_n_unique} ({_n_unsolved / _n_unique:.2%})")

_instance_key = ["task_type", "task_name", "task_id"]
unsolvable_df = pd.DataFrame(list(unsolvable_name_id_pairs), columns=_instance_key)
unsolvable_df


Number of unsolvable instances: 7635 / 21868 (34.91%)


Unnamed: 0,task_type,task_name,task_id
0,reasoning,hotpotqa,9900
1,reasoning,prealgebra,875
2,code_generation,APPS,2558
3,code_generation,APPS,4544
4,decision_making,alfworld,look_at_obj_in_light-Book-None-DeskLamp-305/tr...
...,...,...,...
7630,reasoning,hotpotqa,15148
7631,code_generation,APPS,3605
7632,reasoning,precalculus,5087
7633,code_generation,APPS,1390


In [23]:
# How many unsolved instances fall under each task type.
_unsolved_task_types = unsolvable_df["task_type"]
_unsolved_task_types.value_counts()


reasoning          3836
code_generation    2003
tabular            1010
decision_making     786
Name: task_type, dtype: int64

In [48]:
# Save Raw file for unsolvable instances (for GPT-4 generation)

output_dir = os.path.join(ROOT_DIR, "data", "trajectories", "hard_instances", "raw")
for (task_type, task_name), cur_df in unsolvable_df.groupby(["task_type", "task_name"]):
    # Ids of the unsolved instances belonging to this task.
    task_ids = set(cur_df["task_id"].unique())
    print(f"Task type: {task_type}, task name: {task_name}, # of task_ids: {len(task_ids)}")

    # Both kinds of output go under <output_dir>/<task_type>/train/.
    train_dir = os.path.join(output_dir, task_type, "train")

    if task_name == "alfworld":
        # ALFWorld: only the task ids are written, one per line, to a text file.
        cur_output_filepath = os.path.join(train_dir, f"{task_name}.txt")
        pathlib.Path(train_dir).mkdir(parents=True, exist_ok=True)

        print(f"Saving {len(task_ids)} ALFWorld hard instances to {cur_output_filepath}")

        with open(cur_output_filepath, "w") as f:
            f.writelines(f"{task_id}\n" for task_id in task_ids)
        continue

    # Every other task: copy the matching records out of the processed
    # training file for that task.
    original_data_path = os.path.join(ROOT_DIR, "data", "processed", task_type, "train", f"{task_name.lower()}.jsonl")
    original_df = pd.read_json(original_data_path, orient="records", lines=True)

    is_unsolved = original_df["id"].isin(task_ids)
    filtered_df = original_df[is_unsolved]

    cur_output_filepath = os.path.join(train_dir, f"{task_name.lower()}.jsonl")
    pathlib.Path(train_dir).mkdir(parents=True, exist_ok=True)

    print(f"Saving {len(filtered_df)} instances (original: {len(original_df)}) to {cur_output_filepath}")
    filtered_df.to_json(cur_output_filepath, orient="records", lines=True)


Task type: code_generation, task name: APPS, # of task_ids: 2003
Saving 2003 instances (original: 4439) to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/hard_instances/raw/code_generation/train/apps.jsonl
Task type: decision_making, task name: alfworld, # of task_ids: 786
Saving 786 ALFWorld hard instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/hard_instances/raw/decision_making/train/alfworld.txt
Task type: reasoning, task name: algebra, # of task_ids: 334
Saving 334 instances (original: 1226) to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/hard_instances/raw/reasoning/train/algebra.jsonl
Task type: reasoning, task name: counting_and_probability, # of task_ids: 311
Saving 311 instances (original: 602) to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/hard_instances/raw/reasoning/train/counting_and_probability.jsonl
Task type: reasoning, task name: geometry, # of task_ids: 545
Saving 545 instances (original: 727) to /shared/

## GPT-4 Solved Instances

In [46]:
# GPT-4 runs on the hard tier: keep the successful ones that needed more than
# two turns. The printed labels previously said "medium instances" (copied
# from the medium-tier cell) although the numbers describe GPT-4 runs.
gpt4_is_failed = gpt4_completed_results_w_stats["success"].apply(lambda x: not bool(x))
gpt4_instances = gpt4_completed_results_w_stats[~gpt4_is_failed].query("n_turns > 2")
print(f"Number of GPT-4 solved instances (n_turns > 2): {len(gpt4_instances)} / {len(gpt4_completed_results_w_stats)} ({len(gpt4_instances) / len(gpt4_completed_results_w_stats):.2%})")

# An instance is identified by (task_type, task_name, task_id).
gpt4_all_unique_instances_task_name_id_pairs = set(gpt4_completed_results_w_stats[["task_type", "task_name", "task_id"]].apply(tuple, axis=1))

gpt4_instances_task_name_id_pairs = set(gpt4_instances[["task_type", "task_name", "task_id"]].apply(tuple, axis=1))
print(f"Number of unique GPT-4 solved instances (n_turns > 2): {len(gpt4_instances_task_name_id_pairs)} / {len(gpt4_all_unique_instances_task_name_id_pairs)} ({len(gpt4_instances_task_name_id_pairs) / len(gpt4_all_unique_instances_task_name_id_pairs):.2%})")

# Selected GPT-4 runs per task, largest first.
gpt4_instances\
    .groupby("task_name")["success"].count().rename("n_tasks")\
    .sort_values(ascending=False).to_frame().style.background_gradient(cmap="Blues", axis=0)


Number of medium instances: 423 / 3574 (11.84%)
Number of unique medium instances: 423 / 3574 (11.84%)


Unnamed: 0_level_0,n_tasks
task_name,Unnamed: 1_level_1
APPS,74
algebra,62
intermediate_algebra,58
hotpotqa,46
number_theory,45
geometry,40
counting_and_probability,34
precalculus,31
prealgebra,22
strategyqa,11


In [48]:
# Persist the GPT-4 trajectories selected above, one file per task.
_gpt4_output_dir = os.path.join(ROOT_DIR, "data", "trajectories", "hard_gpt4_instances")
save_instances(gpt4_instances, _gpt4_output_dir)


Saved 74 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/hard_gpt4_instances/apps.jsonl
Saved 62 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/hard_gpt4_instances/algebra.jsonl
Saved 34 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/hard_gpt4_instances/counting_and_probability.jsonl
Saved 40 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/hard_gpt4_instances/geometry.jsonl
Saved 46 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/hard_gpt4_instances/hotpotqa.jsonl
Saved 58 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/hard_gpt4_instances/intermediate_algebra.jsonl
Saved 45 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/hard_gpt4_instances/number_theory.jsonl
Saved 22 instances to /shared/nas2/xingyao6/projects/llm-agent/data/trajectories/hard_gpt4_instances/prealgebra.jsonl
Saved 31 instances to /shared/nas2/xingyao