In [1]:
from glob import glob
import os
import re
import sys
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
from collections import Counter
import math

In [2]:
ROOT_DIR = os.path.dirname(os.getcwd()) # Should be your path to the repo `mint`
sys.path.insert(0, ROOT_DIR)
DATA_DIR = os.path.join(ROOT_DIR, "data", "outputs")
print(f"Data directory: {DATA_DIR}")
glob_pattern = f"{DATA_DIR}/**/*results.jsonl"
filepaths = list(set(glob(glob_pattern, recursive=True)))
print(f"Matching glob pattern: `{glob_pattern}`. **{len(filepaths)}** files found.")


def parse_filepath(filepath):
    # e.g., gpt-3.5-turbo-0613/F=gpt-3.5-turbo-16k-0613/PHF=no_GT-textual/max5_p2+tool+cd/code_generation/humaneval/results.jsonl
    # e.g., gpt-3.5-turbo-0613/F=None/max5_p2+tool+cd/code_generation/humaneval/results.jsonl
    splited = filepath.replace(DATA_DIR, "").lstrip("/").split("/")
    
    agent_model_name = splited[0]
    feedback_model_name = splited[1].split("=")[1]
    if feedback_model_name != "None":
        feedback_setting = splited[2]
    else:
        feedback_setting = "None"
    task_name = splited[-2]
    task_type = splited[-3]
    exp_setting = splited[-4]
    return {
        "agent_model_name": agent_model_name,
        "feedback_model_name": feedback_model_name,
        "feedback_setting": feedback_setting,
        "task_name": task_name,
        "task_type": task_type,
        "exp_setting": exp_setting,
        "filepath": filepath,
    }

df = pd.DataFrame(list(map(parse_filepath, filepaths)))

def load_results(filepath):
    results = []
    with open(filepath) as f:
        for line in f:
            try:
                results.append(json.loads(line))
            except Exception as e:
                print(f"Error loading {filepath}: {e}\n{line}")
                globals()["error_line"] = line
    return pd.DataFrame(results)

df["results"] = df.filepath.apply(load_results)


def rename_model(model_name):
    MODEL_RENAME = {
        "llama-2-lemur-70b-chat-v1": "Lemur-70b-chat-v1",
        "llama-2-lemur-70b-v1": "Lemur-70b-v1",
    }
    model_name = MODEL_RENAME.get(model_name, model_name)
    if "-hf" in model_name:
        model_name = model_name.rstrip("-hf")
    return model_name

all_results = []
for row in df.itertuples():
    row.results["agent_model_name"] = rename_model(row.agent_model_name)
    row.results["feedback_model_name"] = row.feedback_model_name
    row.results["feedback_setting"] = row.feedback_setting
    row.results["exp_setting"] = row.exp_setting
    row.results["task_name"] = row.task_name
    row.results["task_type"] = row.task_type
    all_results.append(row.results)


all_results = pd.concat(all_results)
def get_stats(row):
    state = row["state"]
    task = row["task"]
    return {
        "task_id": task["task_id"],
        "n_turns": len(state["history"]) // 2,
        "success": state["success"],
        "agent_action_count": state["agent_action_count"],
        "token_counter": {'a': Counter(state["token_counter"]), 'b': 1},
        "terminate_reason": state["terminate_reason"],
    }


# combine this with the original dataset
stats = all_results.apply(get_stats, axis=1, result_type="expand")
all_results = pd.concat([all_results, stats], axis=1)

# turn bool to int
all_results['success'] = all_results['success'].astype(int)

all_results_unfiltered = all_results.copy()

Data directory: /shared/nas/data/m1/xingyao6/projects/mint-bench/data/outputs
Matching glob pattern: `/shared/nas/data/m1/xingyao6/projects/mint-bench/data/outputs/**/*results.jsonl`. **1208** files found.


In [3]:
# Sanity check of experiments - check whether they are all completed
all_results_count = all_results.groupby([
    "agent_model_name",
    "feedback_model_name",
    "feedback_setting",
    "exp_setting",
    "task_type",
    "task_name",
])["task_id"]\
.count().unstack().fillna(0)\
.sum(axis=1).unstack().fillna(0).astype(int)

# all_results_count.style.background_gradient(cmap='Blues', axis=0)

In [4]:
# Filter out experiments that are not completed

# find all index that are not [136, 134, 320]
GLOBAL_MAX = all_results_count.max()
assert (GLOBAL_MAX == pd.Series([136, 134, 316], index=["code_generation", "decision_making", "reasoning"])).all()
def _exp_completed(row):
    return (row == GLOBAL_MAX).all()

completed_exp = all_results_count.apply(_exp_completed, axis=1)
# select only completed exp
completed_exp = completed_exp.drop(completed_exp[completed_exp == False].index)#.reset_index().drop(columns=[0])
# display(completed_exp.to_frame().style.background_gradient(cmap='Blues', axis=0))

completed_exp_lst = set(map(tuple, completed_exp.reset_index().drop(columns=[0]).to_numpy().tolist()))
# agent_model_name	feedback_model_name	feedback_setting	exp_setting
# completed_exp_lst
_completed_mask = all_results.apply(lambda row: (row["agent_model_name"], row["feedback_model_name"], row["feedback_setting"], row["exp_setting"]) in completed_exp_lst, axis=1)
print(f"Before filtering: {len(all_results)}")
all_results = all_results[_completed_mask]
print(f"After filtering: {len(all_results)}")

Before filtering: 88486
After filtering: 88486


# Results

In [5]:
def generate_table(
    query, mode = 'sr', return_raw = False,
    consider_exceed_context = False,
    display_table = True,
):
    results = all_results.query(query)
    n_invalid_actions = results['agent_action_count'].apply(lambda x: x['invalid_action'])
    n_turns = results['n_turns']
    n_error = results['agent_action_count'].apply(lambda x: x.get('error', 0))

    results = results.assign(
        n_invalid_actions=n_invalid_actions,
        n_turns=n_turns,
        n_error=n_error,
    )
    if consider_exceed_context:
        results["final_length_exceeds_4096"] = results["final_length_exceeds_4096"].fillna(False)
        results["success"] = results.apply(lambda row: int(row["success"] and not row["final_length_exceeds_4096"]), axis=1)

    results_raw = results.copy()
    # draw a table with the performance on four tasks as the horizontal axis and the micro mean success rate and different models as the vertical axis. display it with seaborn
    # firstly calculate micro mean
    if mode == 'sr':
        micro_sr = results.groupby("agent_model_name")['success'].mean()
        if display_table:
            display(
                (results.groupby(["agent_model_name", "task_name"])['success'].mean().unstack() * 100).style.background_gradient(cmap='Blues', axis=0)
            )
        results_sr = results.groupby(["agent_model_name", "task_type"])['success'].mean().unstack()
        results_sr['avg_micro'] = micro_sr
        results = (results_sr * 100).round(2)
    elif mode == "invalid_count":
        micro_invalid_count = results.groupby("agent_model_name")['n_invalid_actions'].mean()
        results_invalid_count = results.groupby(["agent_model_name", "task_type"])['n_invalid_actions'].mean().unstack()
        results_invalid_count['avg_micro'] = micro_invalid_count

        micro_n_turns = results.groupby("agent_model_name")['n_turns'].mean()
        results_n_turns = results.groupby(["agent_model_name", "task_type"])['n_turns'].mean().unstack()
        results_n_turns['avg_micro'] = micro_n_turns

        micro_n_error = results.groupby("agent_model_name")['n_error'].mean()
        results_n_error = results.groupby(["agent_model_name", "task_type"])['n_error'].mean().unstack()
        results_n_error['avg_micro'] = micro_n_error

        results = pd.concat(
            [results_invalid_count, results_n_turns, results_n_error],
            axis=1,
            keys=["invalid_count", "n_turns", "n_error"]
        ).round(2)
    else:
        raise ValueError(f"Unknown mode: {mode}")
    if return_raw:
        return results, results_raw
    else:
        return results

def parse_modelname(name):
    if "gpt" in name or "claude" in name or "bison" in name:
        name = r"\texttt{" + name + "}"
        return {"model": name, "size": "-", "type": "-"}

    import re
    size = int(re.search(r'(\d+)b', name).group(1))
    model_variant = "1Base"
    if "Instruct" in name or "vicuna" in name or ("Lemur" in name and "chat" in name):
        model_variant = "2SIFT"
    elif "chat" in name and "Llama" in name:
        model_variant = "3RLHF"

    if "Llama-2" in name:
        model_name = "LLaMA-2"
    elif "Lemur" in name and "v1" in name:
        model_name = "Lemur-v1"
    elif "vicuna" in name and "v1.5" in name:
        if "-16k" in name:
            model_name = "Vicuna-v1.5-16k"
        else:
            model_name = "Vicuna-v1.5"
    else:
        model_name = name.split("-")[0]
    model_name = r"\texttt{" + model_name + "}"
    return {
        "model": model_name,
        "size": size,
        "type": model_variant
    }

## Success Rate @ k=5

In [6]:
print("SR")
main_table_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'None' and feedback_model_name == 'None'", mode="sr", display_table=False
)
display(main_table_sr.style.format("{:.2f}").background_gradient(cmap='Blues', axis=1))

SR


task_type,code_generation,decision_making,reasoning,avg_micro
agent_model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CodeLlama-13b,4.41,55.97,8.54,18.43
CodeLlama-13b-Instruct,2.21,50.0,4.75,14.51
CodeLlama-34b,18.38,63.43,17.41,28.16
CodeLlama-34b-Instruct,2.21,37.31,14.87,17.06
CodeLlama-7b,0.0,18.66,0.0,4.27
CodeLlama-7b-Instruct,2.21,17.16,7.91,8.7
Lemur-70b-chat-v1,0.74,0.0,13.92,7.68
Lemur-70b-v1,15.44,61.19,16.14,26.28
Llama-2-13b,5.15,50.0,3.48,14.51
Llama-2-13b-chat,2.21,3.73,19.62,11.95


## Tool-augmented Task-solving (SR vs. k)

In [7]:

def is_filtered_model(model_name):
    # do not print 16k models
    FILTER_MODEL_NAMES = [
        "gpt-3.5-turbo-16k-0613",
    ]
    # or "CodeLlama" in model_name \
    if model_name in FILTER_MODEL_NAMES \
        or ("vicuna" in model_name and "16k" in model_name):
        return True
    return False


# Combine results
interaction_turns = []
interaction_turns_raw = []
for setting in ['max1_p1+tool+cd', 'max2_p2+tool+cd', 'max3_p2+tool+cd', 'max4_p2+tool+cd', 'max5_p2+tool+cd']:
    _cur_table, _cur_table_raw = generate_table(
        f"exp_setting == '{setting}' and feedback_setting == 'None' and feedback_model_name == 'None'",
        mode="sr",
        return_raw=True,
        display_table=False
    )
    _cur_table = _cur_table.unstack().to_frame().rename(columns={0: 'success_rate'}).reset_index()
    cur_turn = int(re.search(r'max(\d+)_', setting).group(1))
    _cur_table['n_turns'] = cur_turn
    interaction_turns.append(_cur_table)
    
    _turn_vs_success = _cur_table_raw[["agent_model_name", "success"]]
    _turn_vs_success = _turn_vs_success.assign(n_turns=cur_turn)
    interaction_turns_raw.append(_turn_vs_success)
interaction_turns = pd.concat(interaction_turns)
interaction_turns = interaction_turns[~interaction_turns.agent_model_name.map(is_filtered_model)]
interaction_turns_raw = pd.concat(interaction_turns_raw)

# Run regression and generate a table
_view = interaction_turns.set_index(['agent_model_name', 'n_turns', 'task_type']).unstack() \
    .sort_index()["success_rate"][["avg_micro"]].unstack()

def run_regression_on_row(series):
    if not len(series.dropna()):
        print(series)
        return {}
    x = list(series.index)
    y = series.values
    _df = pd.DataFrame({"x": x, "y": y})
    model = sm.ols(formula='y ~ x', data=_df)
    results = model.fit()
    return {
        **{k: v for k, v in results.params.to_dict().items()},
        "rsquared": results.rsquared,
        # "pvalues": results.pvalues.to_dict()
        **{f"pvalue_{k}": v for k, v in results.pvalues.to_dict().items()}
    }
_view = pd.concat([_view['avg_micro'], _view['avg_micro'].apply(run_regression_on_row, axis=1, result_type='expand')], axis=1)

_view = pd.concat([pd.DataFrame(_view.reset_index()["agent_model_name"].apply(parse_modelname).tolist()), _view.reset_index()], axis=1)
_view.set_index(["model", "size", "type"], inplace=True)
_view.sort_index(inplace=True)
_view.index = _view.index.map(lambda x: (x[0], f"{x[1]}B" if x[1] != "-" else "-", x[2].replace('1', '').replace('2', '').replace('3', '')))

# display(_view.style.background_gradient(cmap='Blues', axis=0))

  return 1 - self.ssr/self.centered_tss


In [8]:
def index_to_name(index):
    model_name, size, model_type = index
    model_name = model_name.replace(r"\texttt{", "").replace("}", "")
    if size == "-":
        name = f"{model_name} (closed-source)"
    else:
        if "B" not in str(size):
            size = f"{size}B"
        name = f"{model_name} ({size}, {model_type})"
    return name
print("All models:")
list(map(index_to_name, _view.index.tolist()))

All models:


['CodeLlama (7B, Base)',
 'CodeLlama (7B, SIFT)',
 'CodeLlama (13B, Base)',
 'CodeLlama (13B, SIFT)',
 'CodeLlama (34B, Base)',
 'CodeLlama (34B, SIFT)',
 'LLaMA-2 (7B, Base)',
 'LLaMA-2 (7B, RLHF)',
 'LLaMA-2 (13B, Base)',
 'LLaMA-2 (13B, RLHF)',
 'LLaMA-2 (70B, Base)',
 'LLaMA-2 (70B, RLHF)',
 'Lemur-v1 (70B, Base)',
 'Lemur-v1 (70B, SIFT)',
 'Vicuna-v1.5 (7B, SIFT)',
 'Vicuna-v1.5 (13B, SIFT)',
 'chat-bison-001 (closed-source)',
 'claude-2 (closed-source)',
 'claude-instant-1 (closed-source)',
 'gpt-3.5-turbo-0613 (closed-source)',
 'gpt-4-0613 (closed-source)']

In [9]:

VISIBLE_MODELS = [
    'gpt-3.5-turbo-0613 (closed-source)',
    'claude-2 (closed-source)',
    'claude-instant-1 (closed-source)',
    # 'Vicuna-v1.5 (7B, SIFT)',
    # 'LLaMA-2 (7B, Base)',
    'LLaMA-2 (70B, Base)',
    'LLaMA-2 (70B, RLHF)',
    'CodeLlama (34B, Base)',
    'CodeLlama (34B, SIFT)',
]

d = []
for row in _view[[1, 2, 3, 4, 5]].iterrows():
    name = index_to_name(row[0])
    if "gpt-4" in name: continue    
    data = row[1].values.tolist()

    d.append({
        "label": name,
        "data": data,
    })
# print(json.dumps(d, indent=2))
# print("sr_vs_k_series = " + json.dumps(d, indent=2) + ";")
with open("../website/data/sr_vs_k_series.json", "w") as f:
    json.dump(d, f, indent=2)

## Ability to Leverage Natural Language Feedback (Delta Feedback)

In [10]:
# generate feedback ablation results (gpt-4-0613)
gpt4_feedback = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual' and feedback_model_name == 'gpt-4-0613'",
    mode="sr",
    display_table=False,
)
print('gpt4 feedback diff')
gpt4_feedback_diff = gpt4_feedback - main_table_sr
display(pd.concat([
    main_table_sr, gpt4_feedback, gpt4_feedback_diff
    ], axis=1, keys=["1SR", "2SR (gpt-4-0613)", "3Diff"]).swaplevel(axis=1).sort_index(axis=1).style.format("{:.2f}").background_gradient(cmap='Blues')
)

gpt4 feedback diff


task_type,avg_micro,avg_micro,avg_micro,code_generation,code_generation,code_generation,decision_making,decision_making,decision_making,reasoning,reasoning,reasoning
Unnamed: 0_level_1,1SR,2SR (gpt-4-0613),3Diff,1SR,2SR (gpt-4-0613),3Diff,1SR,2SR (gpt-4-0613),3Diff,1SR,2SR (gpt-4-0613),3Diff
agent_model_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
CodeLlama-13b,18.43,31.91,13.48,4.41,27.94,23.53,55.97,73.88,17.91,8.54,15.82,7.28
CodeLlama-13b-Instruct,14.51,22.35,7.84,2.21,14.71,12.5,50.0,58.96,8.96,4.75,10.13,5.38
CodeLlama-34b,28.16,42.32,14.16,18.38,30.15,11.77,63.43,84.33,20.9,17.41,29.75,12.34
CodeLlama-34b-Instruct,17.06,27.3,10.24,2.21,3.68,1.47,37.31,67.91,30.6,14.87,20.25,5.38
CodeLlama-7b,4.27,16.21,11.94,0.0,0.0,0.0,18.66,59.7,41.04,0.0,4.75,4.75
CodeLlama-7b-Instruct,8.7,25.94,17.24,2.21,10.29,8.08,17.16,62.69,45.53,7.91,17.09,9.18
Lemur-70b-chat-v1,7.68,15.19,7.51,0.74,7.35,6.61,0.0,0.75,0.75,13.92,24.68,10.76
Lemur-70b-v1,26.28,33.79,7.51,15.44,27.94,12.5,61.19,70.15,8.96,16.14,20.89,4.75
Llama-2-13b,14.51,23.21,8.7,5.15,15.44,10.29,50.0,60.45,10.45,3.48,10.76,7.28
Llama-2-13b-chat,11.95,17.58,5.63,2.21,10.29,8.08,3.73,9.7,5.97,19.62,24.05,4.43


In [11]:
def sr_df_to_json(df):
    sr_json = {
        "avg_micro": [],
        "reasoning": [],
        "decision_making": [],
        "code_generation": [],
    }

    for i, row in df.iterrows():
        agent_model_name = row["agent_model_name"]
        parsed = parse_modelname(agent_model_name)
        parsed["type"] = parsed["type"][1:]
        name = index_to_name((parsed["model"], parsed["size"], parsed["type"]))
        if "16k" in name:
            continue

        for key in sr_json.keys():
            sr_json[key].append([
                name, row[key]
            ])
    return sr_json

In [12]:
with open("../website/data/sr_without_feedback.json", "w") as f:
    no_feedback_json = sr_df_to_json(main_table_sr.reset_index())
    json.dump(no_feedback_json, f, indent=2)
with open("../website/data/sr_with_feedback.json", "w") as f:
    feedback_json = sr_df_to_json(gpt4_feedback.reset_index())
    json.dump(feedback_json, f, indent=2)

In [21]:
list(map(lambda x: x[0], no_feedback_json['avg_micro']))

['CodeLlama (13B, Base)',
 'CodeLlama (13B, SIFT)',
 'CodeLlama (34B, Base)',
 'CodeLlama (34B, SIFT)',
 'CodeLlama (7B, Base)',
 'CodeLlama (7B, SIFT)',
 'Lemur-v1 (70B, SIFT)',
 'Lemur-v1 (70B, Base)',
 'LLaMA-2 (13B, Base)',
 'LLaMA-2 (13B, RLHF)',
 'LLaMA-2 (70B, Base)',
 'LLaMA-2 (70B, RLHF)',
 'LLaMA-2 (7B, Base)',
 'LLaMA-2 (7B, RLHF)',
 'chat-bison-001 (closed-source)',
 'claude-2 (closed-source)',
 'claude-instant-1 (closed-source)',
 'gpt-3.5-turbo-0613 (closed-source)',
 'gpt-4-0613 (closed-source)',
 'Vicuna-v1.5 (13B, SIFT)',
 'Vicuna-v1.5 (7B, SIFT)']

### Efficacy of different LLM in simulating feedback

In [13]:
def plot_multiple_feedback_diff_table(feedback_perfs, nofeedback_perf, names=[]):

    to_concat = [nofeedback_perf.applymap(lambda x: f"${x:.2f}$")]
    keys = ["1w/o feedback"]
    for i, _feedback_perf in enumerate(feedback_perfs):
        _cur_name = names[i]
        diff = _feedback_perf - nofeedback_perf
        # to_concat.append(_feedback_perf.applymap(lambda x: f"${x:.2f}$"))
        # keys.append(f"{i+2}w/ {_cur_name} feedback")
        to_concat.append(diff.applymap(lambda x: f"${x:+.2f}$"))#.applymap(lambda x: "\gcb{" + f"{x:+.2f}" + "}"))
        # keys.append(str(i+3) + r"$\Delta_\texttt{feedback, "+ _cur_name + "}$")
        keys.append(str(i+3) + r"\texttt{"+ _cur_name + "}")

    feedback_table = pd.concat(
        to_concat, axis=1,
        keys=keys
    )

    # wrap all index with \texttt
    feedback_table.index = feedback_table.index.map(lambda x: "\texttt{" + x + "}")

    feedback_table.columns = feedback_table.columns.set_names(['metric', 'task_type'])
    COLUMN_NAME_MAP = {
        "code_generation": "Code Generation",
        "decision_making": "Decision-Making",
        "reasoning": "Reasoning",
        # "vision_language": "Vision-Language",
        "avg_micro": "Micro Average",
    }
    feedback_table = feedback_table.stack(level='metric').rename(columns=COLUMN_NAME_MAP)
    feedback_table = feedback_table.rename(index={
        k: k[1:] for k in keys
    }) # remove the first character for sorting

    feedback_table = feedback_table.reindex(columns=[
        "Micro Average",
    ])
    # drop the first column
    baseline_perf = main_table_sr.loc["gpt-3.5-turbo-0613"]
    feedback_baseline_perf_gap = []
    for model in names:
        perf = main_table_sr.loc[model]
        diff = perf - baseline_perf
        feedback_baseline_perf_gap.append({"model": model, "gap": diff["avg_micro"]})
    feedback_baseline_perf_gap = pd.DataFrame(feedback_baseline_perf_gap).set_index("model")
    feedback_baseline_perf_gap.index = feedback_baseline_perf_gap.index.map(lambda x: r"\texttt{" + x + "}")
    feedback_baseline_perf_gap = feedback_baseline_perf_gap.applymap(lambda x: f"${x:+.2f}$")
    feedback_baseline_perf_gap.reset_index(inplace=True)
    feedback_table = feedback_table.reset_index().drop(columns=["agent_model_name"])
    feedback_table = feedback_table.rename(columns={"metric": "model"})
    # feedback_table = pd.concat([feedback_baseline_perf_gap, feedback_table], axis=1)
    feedback_table = pd.merge(feedback_baseline_perf_gap, feedback_table, on="model")
    display(feedback_table)

    # print latex
    print(feedback_table.to_latex(
        index=False,
        escape=False,
        multirow=True,
        column_format="llllll|l"
    ))
    return feedback_table

gpt4_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual' and feedback_model_name == 'gpt-4-0613'",
    mode='sr', display_table=False
)
gpt4_feedback_sr = gpt4_feedback_sr.drop(index=list(
    set(gpt4_feedback_sr.index) - {"gpt-3.5-turbo-0613"}
))
gpt35_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual' and feedback_model_name == 'gpt-3.5-turbo-16k-0613'",
    mode='sr', display_table=False
)
gpt35_feedback_sr = gpt35_feedback_sr.drop(index=list(
    set(gpt35_feedback_sr.index) - {"gpt-3.5-turbo-0613"}
))
claude1_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual' and feedback_model_name == 'claude-instant-1'",
    mode='sr', display_table=False
)
claude1_feedback_sr = claude1_feedback_sr.drop(index=list(
    set(claude1_feedback_sr.index) - {"gpt-3.5-turbo-0613"}
))

llama2_70b_base_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual' and feedback_model_name == 'Llama-2-70b-hf'",
    mode='sr', display_table=False
)
llama2_70b_base_feedback_sr = llama2_70b_base_feedback_sr.drop(index=list(
    set(llama2_70b_base_feedback_sr.index) - {"gpt-3.5-turbo-0613"}
))

llama2_70b_chat_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual' and feedback_model_name == 'Llama-2-70b-chat-hf'",
    mode='sr', display_table=False
)
llama2_70b_chat_feedback_sr = llama2_70b_chat_feedback_sr.drop(index=list(
    set(llama2_70b_chat_feedback_sr.index) - {"gpt-3.5-turbo-0613"}
))

codellama_34b_base_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual' and feedback_model_name == 'CodeLlama-34b-hf'",
    mode='sr', display_table=False
)
codellama_34b_base_feedback_sr = codellama_34b_base_feedback_sr.drop(index=list(
    set(codellama_34b_base_feedback_sr.index) - {"gpt-3.5-turbo-0613"}
))

codellama_34b_instruct_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual' and feedback_model_name == 'CodeLlama-34b-Instruct-hf'",
    mode='sr', display_table=False
)
codellama_34b_instruct_feedback_sr = codellama_34b_instruct_feedback_sr.drop(index=list(
    set(codellama_34b_instruct_feedback_sr.index) - {"gpt-3.5-turbo-0613"}
))


feedback_efficacy_table = plot_multiple_feedback_diff_table([
    gpt4_feedback_sr,
    gpt35_feedback_sr,
    claude1_feedback_sr,
    llama2_70b_base_feedback_sr,
    llama2_70b_chat_feedback_sr,
    codellama_34b_base_feedback_sr,
    codellama_34b_instruct_feedback_sr
    ],
    main_table_sr.drop(index=list(
        set(main_table_sr.index) - {"gpt-3.5-turbo-0613"}
    )),
    names=[
        'gpt-4-0613',
        'gpt-3.5-turbo-0613',
        "claude-instant-1",
        "Llama-2-70b",
        "Llama-2-70b-chat",
        "CodeLlama-34b",
        "CodeLlama-34b-Instruct"
    ]
)
# gpt4_feedback_wsr = generate_table("exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual' and feedback_model_name == 'gpt-4-0613'", mode='rwp')
# plot_feedback_diff_table(gpt4_feedback_wsr, main_table_wsr)

Unnamed: 0,model,gap,Micro Average
0,\texttt{gpt-4-0613},$+32.93$,$+15.19$
1,\texttt{gpt-3.5-turbo-0613},$+0.00$,$-10.41$
2,\texttt{claude-instant-1},$+9.55$,$+1.53$
3,\texttt{Llama-2-70b},$-9.73$,$-0.69$
4,\texttt{Llama-2-70b-chat},$-18.26$,$-14.00$
5,\texttt{CodeLlama-34b},$-8.02$,$+2.22$
6,\texttt{CodeLlama-34b-Instruct},$-19.12$,$+3.07$


\begin{tabular}{llllll|l}
\toprule
                          model &      gap & Micro Average \\
\midrule
            \texttt{gpt-4-0613} & $+32.93$ &      $+15.19$ \\
    \texttt{gpt-3.5-turbo-0613} &  $+0.00$ &      $-10.41$ \\
      \texttt{claude-instant-1} &  $+9.55$ &       $+1.53$ \\
           \texttt{Llama-2-70b} &  $-9.73$ &       $-0.69$ \\
      \texttt{Llama-2-70b-chat} & $-18.26$ &      $-14.00$ \\
         \texttt{CodeLlama-34b} &  $-8.02$ &       $+2.22$ \\
\texttt{CodeLlama-34b-Instruct} & $-19.12$ &       $+3.07$ \\
\bottomrule
\end{tabular}



  print(feedback_table.to_latex(


In [27]:
feedback_efficacy_res = feedback_efficacy_table.applymap(lambda x: x.replace("$", '').replace(r"\texttt{", '').replace("}", ''))
d_json = []

for i, row in feedback_efficacy_res.iterrows():
    agent_model_name = row["model"]
    parsed = parse_modelname(agent_model_name)
    parsed["type"] = parsed["type"][1:]
    name = index_to_name((parsed["model"], parsed["size"], parsed["type"]))

    d_json.append({
        "feedback_provider": name,
        "SR5_difference": float(row["gap"]),
        "evaluated_LLM_feedback": float(row["Micro Average"]),
    })

with open("../website/data/feedback_comparison.json", 'w') as f:
    json.dump(d_json, f, indent=4)

# Ablation Study

## Informativeness of Feedback

#### GT vs NoGT

In [15]:
def plot_gt_feedback_diff_table(nogt_feedback_perf, gt_feedback_perf, nofeedback_perf):
    diff = gt_feedback_perf - nogt_feedback_perf
    feedback_table = pd.concat([
        nofeedback_perf.applymap(lambda x: f"${x:.2f}$"),
        (nogt_feedback_perf - nofeedback_perf).applymap(lambda x: "\gcb{" + f"{x:+.2f}" + "}"),#.applymap(lambda x: f"${x:.2f}$"),
        (gt_feedback_perf - nofeedback_perf).applymap(lambda x: "\gcb{" + f"{x:+.2f}" + "}"),#.applymap(lambda x: f"${x:.2f}$"),
        diff.applymap(lambda x: "\gcc{" + f"{x:+.2f}" + "}")
    ], axis=1, keys=[
        '1no feedback',
        '2w/o GT',
        '3w/ GT',
        r'4\Delta_{\text{GT feedback}}'
    ])

    # wrap all index with \texttt
    feedback_table.index = feedback_table.index.map(lambda x: "\texttt{" + x + "}")

    feedback_table.columns = feedback_table.columns.set_names(['metric', 'task_type'])
    COLUMN_NAME_MAP = {
        "code_generation": "Code Generation",
        "decision_making": "Decision-Making",
        "reasoning": "Reasoning",
        "vision_language": "Vision-Language",
        "avg_micro": "Micro Average",
    }
    feedback_table = feedback_table.stack(level='metric').rename(columns=COLUMN_NAME_MAP)
    feedback_table = feedback_table.rename(index={
        # "1w/o feedback": "w/o feedback",
        # "2w/ feedback": "w/ feedback",
        # r"3\Delta_{\text{feedback}}": r"$\Delta_{\text{feedback}}$",
        "1no feedback": "no feedback",
        # "2w/o GT": "feedback, w/o GT",
        "2w/o GT": r"$\Delta_{\texttt{feedback, textual, w/o GT}}$",
        # "3w/ GT": "feedback, w/ GT",
        "3w/ GT": r"$\Delta_{\texttt{feedback, textual, w/ GT}}$",
        r"4\Delta_{\text{GT feedback}}": r"$\Delta_{+\text{GT feedback}}$",
    })
    # re-order Micro Average to the last column
    feedback_table = feedback_table.reindex(columns=[
        "Reasoning",
        "Decision-Making",
        "Code Generation",
        # "Vision-Language",
        "Micro Average",
    ])
    feedback_table = feedback_table.reset_index().drop(columns=['agent_model_name'])
    display(feedback_table)

    # print latex
    print(feedback_table.to_latex(
        index=False,
        escape=False,
        multirow=True,
        column_format="llllll|l"
    ))
    return feedback_table


gpt4_nogt_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual' and feedback_model_name == 'gpt-4-0613' and agent_model_name == 'gpt-3.5-turbo-0613'", mode='sr'
)
# gpt4_nogt_feedback_sr.drop(index=["gpt-4-0613"], inplace=True)
gpt4_gt_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=GT-textual' and feedback_model_name == 'gpt-4-0613' and agent_model_name == 'gpt-3.5-turbo-0613'", mode='sr'
)
feedback_table = plot_gt_feedback_diff_table(gpt4_nogt_feedback_sr, gpt4_gt_feedback_sr, main_table_sr.drop(index=list(set(main_table_sr.index) - set(gpt4_nogt_feedback_sr.index))))

task_name,alfworld,gsm8k,hotpotqa,humaneval,math,mbpp,mmlu,theoremqa
agent_model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gpt-3.5-turbo-0613,66.41791,70.833333,32.55814,51.111111,43.0,32.967033,60.526316,44.897959


task_name,alfworld,gsm8k,hotpotqa,humaneval,math,mbpp,mmlu,theoremqa
agent_model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gpt-3.5-turbo-0613,32.835821,77.083333,44.186047,68.888889,43.0,37.362637,63.157895,44.897959


task_type,metric,Reasoning,Decision-Making,Code Generation,Micro Average
0,no feedback,$36.71$,$41.79$,$29.41$,$36.18$
1,"$\Delta_{\texttt{feedback, textual, w/o GT}}$",\gcb{+13.61},\gcb{+24.63},\gcb{+9.56},\gcb{+15.19}
2,"$\Delta_{\texttt{feedback, textual, w/ GT}}$",\gcb{+16.77},\gcb{-8.95},\gcb{+18.38},\gcb{+11.26}
3,$\Delta_{+\text{GT feedback}}$,\gcc{+3.16},\gcc{-33.58},\gcc{+8.82},\gcc{-3.93}


\begin{tabular}{llllll|l}
\toprule
                                       metric &    Reasoning & Decision-Making & Code Generation & Micro Average \\
\midrule
                                  no feedback &      $36.71$ &         $41.79$ &         $29.41$ &       $36.18$ \\
$\Delta_{\texttt{feedback, textual, w/o GT}}$ & \gcb{+13.61} &    \gcb{+24.63} &     \gcb{+9.56} &  \gcb{+15.19} \\
 $\Delta_{\texttt{feedback, textual, w/ GT}}$ & \gcb{+16.77} &     \gcb{-8.95} &    \gcb{+18.38} &  \gcb{+11.26} \\
               $\Delta_{+\text{GT feedback}}$ &  \gcc{+3.16} &    \gcc{-33.58} &     \gcc{+8.82} &   \gcc{-3.93} \\
\bottomrule
\end{tabular}



  print(feedback_table.to_latex(


#### Binary vs. Textual

In [16]:
def plot_binary_feedback_diff_table(textual_feedback_perf, binary_feedback_perf, nofeedback_perf):
    diff = binary_feedback_perf - textual_feedback_perf
    feedback_table = pd.concat([
        nofeedback_perf.applymap(lambda x: f"${x:.2f}$"),
        (binary_feedback_perf - nofeedback_perf).applymap(lambda x: "\gcb{" + f"{x:+.2f}" + "}"), #.applymap(lambda x: f"${x:.2f}$"),
        (textual_feedback_perf - nofeedback_perf).applymap(lambda x: "\gcb{" + f"{x:+.2f}" + "}"), #.applymap(lambda x: f"${x:.2f}$"),
        diff.applymap(lambda x: "\gcc{" + f"{x:+.2f}" + "}")
    ], axis=1, keys=[
        '1no feedback',
        '2w/ binary feedback',
        '3w/ textual feedback',
        '4\Delta_{\text{textual feedback}}'
    ])

    # wrap all index with \texttt
    feedback_table.index = feedback_table.index.map(lambda x: "\texttt{" + x + "}")

    feedback_table.columns = feedback_table.columns.set_names(['metric', 'task_type'])
    COLUMN_NAME_MAP = {
        "code_generation": "Code Generation",
        "decision_making": "Decision-Making",
        "reasoning": "Reasoning",
        "vision_language": "Vision-Language",
        "avg_micro": "Micro Average",
    }
    feedback_table = feedback_table.stack(level='metric').rename(columns=COLUMN_NAME_MAP)
    feedback_table = feedback_table.rename(index={
        "1no feedback": "No feedback",
        "2w/ binary feedback": r"$\Delta_{\texttt{feedback, binary}}$",
        "3w/ textual feedback": r"$\Delta_{\texttt{feedback, textual}}$",
        "4\Delta_{\text{textual feedback}}": r"$\Delta_{-\text{textual feedback}}$",
    })
    # re-order Micro Average to the last column
    feedback_table = feedback_table.reindex(columns=[
        "Reasoning",
        "Decision-Making",
        "Code Generation",
        # "Vision-Language",
        "Micro Average",
    ])
    feedback_table = feedback_table.reset_index().drop(columns=['agent_model_name'])
    display(feedback_table)

    # print latex
    print(feedback_table.to_latex(
        index=False,
        escape=False,
        multirow=True,
        column_format="ll|lll|l"
    ))
    return feedback_table


gpt4_textual_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual' and feedback_model_name == 'gpt-4-0613' and agent_model_name == 'gpt-3.5-turbo-0613'", mode='sr'
)
gpt4_binary_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-binary' and feedback_model_name == 'gpt-4-0613' and agent_model_name == 'gpt-3.5-turbo-0613'", mode='sr'
)
# remove index that is in dense feedback but not in sparse feedback
gpt4_textual_feedback_sr.drop(index=list(set(gpt4_textual_feedback_sr.index) - set(gpt4_binary_feedback_sr.index)), inplace=True)
feedback_table = plot_binary_feedback_diff_table(
    gpt4_textual_feedback_sr,
    gpt4_binary_feedback_sr,
    main_table_sr.drop(index=list(set(main_table_sr.index) - set(gpt4_textual_feedback_sr.index)))
)

task_name,alfworld,gsm8k,hotpotqa,humaneval,math,mbpp,mmlu,theoremqa
agent_model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gpt-3.5-turbo-0613,66.41791,70.833333,32.55814,51.111111,43.0,32.967033,60.526316,44.897959


task_name,alfworld,gsm8k,hotpotqa,humaneval,math,mbpp,mmlu,theoremqa
agent_model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gpt-3.5-turbo-0613,47.761194,39.583333,30.232558,46.666667,31.0,21.978022,52.631579,40.816327


task_type,metric,Reasoning,Decision-Making,Code Generation,Micro Average
0,No feedback,$36.71$,$41.79$,$29.41$,$36.18$
1,"$\Delta_{\texttt{feedback, binary}}$",\gcb{+2.21},\gcb{+5.97},\gcb{+0.74},\gcb{+2.73}
2,"$\Delta_{\texttt{feedback, textual}}$",\gcb{+13.61},\gcb{+24.63},\gcb{+9.56},\gcb{+15.19}
3,$\Delta_{-\text{textual feedback}}$,\gcc{-11.40},\gcc{-18.66},\gcc{-8.82},\gcc{-12.46}


\begin{tabular}{ll|lll|l}
\toprule
                               metric &    Reasoning & Decision-Making & Code Generation & Micro Average \\
\midrule
                          No feedback &      $36.71$ &         $41.79$ &         $29.41$ &       $36.18$ \\
 $\Delta_{\texttt{feedback, binary}}$ &  \gcb{+2.21} &     \gcb{+5.97} &     \gcb{+0.74} &   \gcb{+2.73} \\
$\Delta_{\texttt{feedback, textual}}$ & \gcb{+13.61} &    \gcb{+24.63} &     \gcb{+9.56} &  \gcb{+15.19} \\
  $\Delta_{-\text{textual feedback}}$ & \gcc{-11.40} &    \gcc{-18.66} &     \gcc{-8.82} &  \gcc{-12.46} \\
\bottomrule
\end{tabular}



  print(feedback_table.to_latex(


## Frequency of Feedback

In [17]:
def plot_sparse_feedback_diff_table(dense_feedback_perf, sparse_feedback_perf, nofeedback_perf):
    diff = sparse_feedback_perf - dense_feedback_perf
    feedback_table = pd.concat([
        nofeedback_perf.applymap(lambda x: f"${x:.2f}$"),
        (dense_feedback_perf - nofeedback_perf).applymap(lambda x: "\gcb{" + f"{x:+.2f}" + "}"), #.applymap(lambda x: f"${x:.2f}$"),
        (sparse_feedback_perf - nofeedback_perf).applymap(lambda x: "\gcb{" + f"{x:+.2f}" + "}"), #.applymap(lambda x: f"${x:.2f}$"),
        diff.applymap(lambda x: "\gcc{" + f"{x:+.2f}" + "}")
    ], axis=1, keys=[
        '1no feedback',
        '2w/ dense feedback',
        '3w/ sparse feedback',
        '4\Delta_{\text{feedback frequency}}'
    ])

    # wrap all index with \texttt
    feedback_table.index = feedback_table.index.map(lambda x: "\texttt{" + x + "}")

    feedback_table.columns = feedback_table.columns.set_names(['metric', 'task_type'])
    COLUMN_NAME_MAP = {
        "code_generation": "Code Generation",
        "decision_making": "Decision-Making",
        "reasoning": "Reasoning",
        "vision_language": "Vision-Language",
        "avg_micro": "Micro Average",
    }
    feedback_table = feedback_table.stack(level='metric').rename(columns=COLUMN_NAME_MAP)
    feedback_table = feedback_table.rename(index={
        "1no feedback": "No feedback",
        "2w/ dense feedback": r"\Delta_{\texttt{feedback, dense}}",
        "3w/ sparse feedback": r"\Delta_{\texttt{feedback, sparse}}",
        "4\Delta_{\text{feedback frequency}}": r"$\Delta_{-\text{feedback frequency}}$",
    })
    # re-order Micro Average to the last column
    feedback_table = feedback_table.reindex(columns=[
        "Reasoning",
        "Decision-Making",
        "Code Generation",
        # "Vision-Language",
        "Micro Average",
    ])
    feedback_table = feedback_table.reset_index().drop(columns=['agent_model_name'])
    display(feedback_table)

    # print latex
    print(feedback_table.to_latex(
        index=False,
        escape=False,
        multirow=True,
        column_format="ll|lll|l"
    ))
    return feedback_table


gpt4_dense_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual' and feedback_model_name == 'gpt-4-0613' and agent_model_name == 'gpt-3.5-turbo-0613'", mode='sr'
)
gpt4_sparse_feedback_sr = generate_table(
    "exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'PHF=no_GT-textual-sparse' and feedback_model_name == 'gpt-4-0613' and agent_model_name == 'gpt-3.5-turbo-0613'", mode='sr'
)
# remove index that is in dense feedback but not in sparse feedback
gpt4_dense_feedback_sr.drop(index=list(set(gpt4_dense_feedback_sr.index) - set(gpt4_sparse_feedback_sr.index)), inplace=True)
feedback_table = plot_sparse_feedback_diff_table(
    gpt4_dense_feedback_sr,
    gpt4_sparse_feedback_sr,
    main_table_sr.drop(index=list(set(main_table_sr.index) - set(gpt4_dense_feedback_sr.index)))
)

task_name,alfworld,gsm8k,hotpotqa,humaneval,math,mbpp,mmlu,theoremqa
agent_model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gpt-3.5-turbo-0613,66.41791,70.833333,32.55814,51.111111,43.0,32.967033,60.526316,44.897959


task_name,alfworld,gsm8k,hotpotqa,humaneval,math,mbpp,mmlu,theoremqa
agent_model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gpt-3.5-turbo-0613,46.268657,58.333333,30.232558,44.444444,31.0,23.076923,53.947368,40.816327


task_type,metric,Reasoning,Decision-Making,Code Generation,Micro Average
0,No feedback,$36.71$,$41.79$,$29.41$,$36.18$
1,"\Delta_{\texttt{feedback, dense}}",\gcb{+13.61},\gcb{+24.63},\gcb{+9.56},\gcb{+15.19}
2,"\Delta_{\texttt{feedback, sparse}}",\gcb{+5.38},\gcb{+4.48},\gcb{+0.74},\gcb{+4.09}
3,$\Delta_{-\text{feedback frequency}}$,\gcc{-8.23},\gcc{-20.15},\gcc{-8.82},\gcc{-11.10}


\begin{tabular}{ll|lll|l}
\toprule
                               metric &    Reasoning & Decision-Making & Code Generation & Micro Average \\
\midrule
                          No feedback &      $36.71$ &         $41.79$ &         $29.41$ &       $36.18$ \\
    \Delta_{\texttt{feedback, dense}} & \gcb{+13.61} &    \gcb{+24.63} &     \gcb{+9.56} &  \gcb{+15.19} \\
   \Delta_{\texttt{feedback, sparse}} &  \gcb{+5.38} &     \gcb{+4.48} &     \gcb{+0.74} &   \gcb{+4.09} \\
$\Delta_{-\text{feedback frequency}}$ &  \gcc{-8.23} &    \gcc{-20.15} &     \gcc{-8.82} &  \gcc{-11.10} \\
\bottomrule
\end{tabular}



  print(feedback_table.to_latex(


## Detected Issues

In [18]:
def get_assistant_traj_str(state):
    traj_str = ""
    history = state["history"]
    for d in history:
        if d["role"] == "assistant":
            traj_str += f"\n{d['content']}\n"
    return traj_str

### Why Some Numbers are Zero? Fraction of Invalid Format

In [19]:
def replace_index(df):
    df = pd.concat([
        pd.DataFrame(pd.Series(df.index).apply(parse_modelname).tolist()),
        df.reset_index(drop=True)
    ], axis=1)
    df.set_index(["model", "size", "type"], inplace=True)
    df.sort_index(inplace=True)
    df.index = df.index.map(lambda x: (x[0], f"{x[1]}B" if x[1] != "-" else "-", x[2].replace('1', '').replace('2', '').replace('3', '')))
    return df


avg_count = generate_table("exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'None' and feedback_model_name == 'None'", mode='invalid_count')
filter_models=[
    "vicuna-7b-v1.5-16k",
    "vicuna-13b-v1.5-16k",
    "gpt-3.5-turbo-16k-0613",
    "gpt-3.5-turbo-0613-human-eval-gpt",
    "gpt-3.5-turbo-0613-human-eval-human",
    # "Lemur-70b-chat-v1"
]
display(avg_count[["invalid_count", "n_turns"]].style.format("{:.2f}").background_gradient(cmap='Blues', axis=0))
avg_count.reset_index(inplace=True)
avg_count = avg_count[avg_count["agent_model_name"].apply(lambda x: x not in filter_models)]
avg_count.set_index(["agent_model_name"], inplace=True)


# merge the two tables
avg_count_table = avg_count["invalid_count"].applymap(lambda x: f"{x:.2f}") + avg_count["n_turns"].applymap(lambda x: f" / {x:.2f}")
def _bold_ratio(x, ratio=0.2):
    ele1, ele2 = x.split(" / ")
    ele1 = float(ele1)
    ele2 = float(ele2)
    if (ele1 / (ele2+1e-6)) > ratio:
        return "$\mathbf{" + f"{ele1:.2f}" + "}$ / $\mathbf{" + f"{ele2:.2f}" + "}$"
    else:
        return "$" + f"{ele1:.2f}" + "$ / $" + f"{ele2:.2f}" + "$"
avg_count_table = avg_count_table.applymap(_bold_ratio)

avg_count_table = replace_index(avg_count_table)[["reasoning", "decision_making", "code_generation", "avg_micro"]]
display(avg_count_table)
print(avg_count_table.to_latex(
    escape=False,
    multirow=True,
    column_format="l|llll|l"
))

Unnamed: 0_level_0,invalid_count,invalid_count,invalid_count,invalid_count,n_turns,n_turns,n_turns,n_turns
task_type,code_generation,decision_making,reasoning,avg_micro,code_generation,decision_making,reasoning,avg_micro
agent_model_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
CodeLlama-13b,0.0,0.0,0.5,0.27,4.93,3.36,4.55,4.36
CodeLlama-13b-Instruct,0.04,0.01,0.16,0.1,4.77,3.77,4.66,4.48
CodeLlama-34b,0.05,0.0,0.19,0.11,4.77,3.37,4.21,4.15
CodeLlama-34b-Instruct,1.09,0.04,0.23,0.39,3.27,3.83,3.68,3.62
CodeLlama-7b,2.38,0.11,3.96,2.71,4.38,4.17,4.99,4.66
CodeLlama-7b-Instruct,0.1,0.1,0.46,0.3,4.65,4.33,4.32,4.4
Lemur-70b-chat-v1,2.02,1.32,1.83,1.76,4.67,3.32,4.11,4.06
Lemur-70b-v1,0.26,0.0,0.29,0.22,4.33,3.28,4.25,4.05
Llama-2-13b,0.13,0.01,0.49,0.3,4.96,3.4,4.75,4.49
Llama-2-13b-chat,0.1,0.0,0.29,0.18,3.02,4.54,3.71,3.74


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,reasoning,decision_making,code_generation,avg_micro
model,size,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
\texttt{CodeLlama},7B,Base,$\mathbf{3.96}$ / $\mathbf{4.99}$,$0.11$ / $4.17$,$\mathbf{2.38}$ / $\mathbf{4.38}$,$\mathbf{2.71}$ / $\mathbf{4.66}$
\texttt{CodeLlama},7B,SIFT,$0.46$ / $4.32$,$0.10$ / $4.33$,$0.10$ / $4.65$,$0.30$ / $4.40$
\texttt{CodeLlama},13B,Base,$0.50$ / $4.55$,$0.00$ / $3.36$,$0.00$ / $4.93$,$0.27$ / $4.36$
\texttt{CodeLlama},13B,SIFT,$0.16$ / $4.66$,$0.01$ / $3.77$,$0.04$ / $4.77$,$0.10$ / $4.48$
\texttt{CodeLlama},34B,Base,$0.19$ / $4.21$,$0.00$ / $3.37$,$0.05$ / $4.77$,$0.11$ / $4.15$
\texttt{CodeLlama},34B,SIFT,$0.23$ / $3.68$,$0.04$ / $3.83$,$\mathbf{1.09}$ / $\mathbf{3.27}$,$0.39$ / $3.62$
\texttt{LLaMA-2},7B,Base,$0.59$ / $4.62$,$0.00$ / $3.53$,$0.25$ / $4.96$,$0.38$ / $4.45$
\texttt{LLaMA-2},7B,RLHF,$0.75$ / $4.03$,$\mathbf{1.13}$ / $\mathbf{4.40}$,$0.72$ / $3.79$,$\mathbf{0.83}$ / $\mathbf{4.06}$
\texttt{LLaMA-2},13B,Base,$0.49$ / $4.75$,$0.01$ / $3.40$,$0.13$ / $4.96$,$0.30$ / $4.49$
\texttt{LLaMA-2},13B,RLHF,$0.29$ / $3.71$,$0.00$ / $4.54$,$0.10$ / $3.02$,$0.18$ / $3.74$


\begin{tabular}{l|llll|l}
\toprule
                    &   &   &                          reasoning &                    decision_making &                    code_generation &                          avg_micro \\
model & size & type &                                    &                                    &                                    &                                    \\
\midrule
\multirow{6}{*}{\texttt{CodeLlama}} & \multirow{2}{*}{7B} & Base &  $\mathbf{3.96}$ / $\mathbf{4.99}$ &                    $0.11$ / $4.17$ &  $\mathbf{2.38}$ / $\mathbf{4.38}$ &  $\mathbf{2.71}$ / $\mathbf{4.66}$ \\
                    &   & SIFT &                    $0.46$ / $4.32$ &                    $0.10$ / $4.33$ &                    $0.10$ / $4.65$ &                    $0.30$ / $4.40$ \\
\cline{2-7}
                    & \multirow{2}{*}{13B} & Base &                    $0.50$ / $4.55$ &                    $0.00$ / $3.36$ &                    $0.00$ / $4.93$ &                    $0.27$ / $4.36

  print(avg_count_table.to_latex(


### Vicuna Escaped Underscore

In [20]:
escape_underscore_results = all_results.query("exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'None' and feedback_model_name == 'None'")

escape_underscore_count = escape_underscore_results["state"].apply(get_assistant_traj_str).apply(lambda x: x.count(r"\_") > 0)
escape_underscore_count = pd.concat([escape_underscore_results[["agent_model_name", "task_type"]], escape_underscore_count], axis=1)
escape_underscore_count = escape_underscore_count.groupby(["agent_model_name", "task_type"]).mean() * 100
escape_underscore_count.reset_index(inplace=True)

escape_underscore_count = pd.concat([
    pd.DataFrame(escape_underscore_count["agent_model_name"].apply(parse_modelname).tolist()),
    escape_underscore_count
], axis=1)
escape_underscore_count.set_index(["model", "size", "type", "task_type"], inplace=True)
escape_underscore_count.drop(columns=["agent_model_name"], inplace=True)
escape_underscore_count = escape_underscore_count.unstack()["state"]
escape_underscore_count.sort_index(inplace=True)
escape_underscore_count.index = escape_underscore_count.index.map(lambda x: 
    (x[0], f"{x[1]}B" if x[1] != "-" else "-", x[2].replace('1', '').replace('2', '').replace('3', ''))
)
escape_underscore_count = escape_underscore_count[["reasoning", "decision_making", "code_generation"]]
display(escape_underscore_count.style.background_gradient(cmap='Blues', axis=0))

escape_underscore_count = escape_underscore_count.applymap(lambda x: f"${x:.2f}$" if x < 2 else f"$\\mathbf{{{x:.2f}}}$")


print(escape_underscore_count.to_latex(
    escape=False,
    multirow=True,
    column_format="l|l",
    float_format="%.2f"
))


Unnamed: 0_level_0,Unnamed: 1_level_0,task_type,reasoning,decision_making,code_generation
model,size,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
\texttt{CodeLlama},7B,Base,0.0,0.0,0.0
\texttt{CodeLlama},7B,SIFT,0.0,0.0,0.0
\texttt{CodeLlama},13B,Base,0.0,0.0,0.0
\texttt{CodeLlama},13B,SIFT,0.0,0.0,0.0
\texttt{CodeLlama},34B,Base,0.0,0.0,0.0
\texttt{CodeLlama},34B,SIFT,0.0,0.0,0.0
\texttt{LLaMA-2},7B,Base,0.0,0.0,0.0
\texttt{LLaMA-2},7B,RLHF,0.0,0.0,0.0
\texttt{LLaMA-2},13B,Base,0.0,0.0,0.0
\texttt{LLaMA-2},13B,RLHF,0.0,0.0,0.0


\begin{tabular}{l|l}
\toprule
                    &   & task_type &         reasoning &   decision_making &   code_generation \\
model & size & type &                   &                   &                   \\
\midrule
\multirow{6}{*}{\texttt{CodeLlama}} & \multirow{2}{*}{7B} & Base &            $0.00$ &            $0.00$ &            $0.00$ \\
                    &   & SIFT &            $0.00$ &            $0.00$ &            $0.00$ \\
\cline{2-6}
                    & \multirow{2}{*}{13B} & Base &            $0.00$ &            $0.00$ &            $0.00$ \\
                    &   & SIFT &            $0.00$ &            $0.00$ &            $0.00$ \\
\cline{2-6}
                    & \multirow{2}{*}{34B} & Base &            $0.00$ &            $0.00$ &            $0.00$ \\
                    &   & SIFT &            $0.00$ &            $0.00$ &            $0.00$ \\
\cline{1-6}
\cline{2-6}
\multirow{6}{*}{\texttt{LLaMA-2}} & \multirow{2}{*}{7B} & Base &            $0.00$ &           

  print(escape_underscore_count.to_latex(


### CodeLLaMA `\[PYTHON\]` bug

In [21]:
python_bracket_results = all_results.query("exp_setting == 'max5_p2+tool+cd' and feedback_setting == 'None' and feedback_model_name == 'None'")

python_bracket_count = python_bracket_results["state"].apply(get_assistant_traj_str).apply(lambda x: x.count(r"[PYTHON]") > 0)
python_bracket_count = pd.concat([python_bracket_results[["agent_model_name", "task_type"]], python_bracket_count], axis=1)
python_bracket_count = python_bracket_count.groupby(["agent_model_name", "task_type"]).mean() * 100
python_bracket_count.reset_index(inplace=True)

python_bracket_count = pd.concat([
    pd.DataFrame(python_bracket_count["agent_model_name"].apply(parse_modelname).tolist()),
    python_bracket_count
], axis=1)
python_bracket_count.set_index(["model", "size", "type", "task_type"], inplace=True)
python_bracket_count.drop(columns=["agent_model_name"], inplace=True)
python_bracket_count = python_bracket_count.unstack()["state"]
python_bracket_count.sort_index(inplace=True)
python_bracket_count.index = python_bracket_count.index.map(lambda x: 
    (x[0], f"{x[1]}B" if x[1] != "-" else "-", x[2].replace('1', '').replace('2', '').replace('3', ''))
)
python_bracket_count = python_bracket_count[["reasoning", "decision_making", "code_generation"]]
display(python_bracket_count.style.background_gradient(cmap='Blues', axis=0))
python_bracket_count = python_bracket_count.applymap(lambda x: f"${x:.2f}$" if x < 2 else f"$\\mathbf{{{x:.2f}}}$")

print(python_bracket_count.to_latex(
    escape=False,
    multirow=True,
    column_format="l|l",
    float_format="%.2f"
))


Unnamed: 0_level_0,Unnamed: 1_level_0,task_type,reasoning,decision_making,code_generation
model,size,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
\texttt{CodeLlama},7B,Base,0.0,0.0,0.0
\texttt{CodeLlama},7B,SIFT,0.0,0.0,0.0
\texttt{CodeLlama},13B,Base,0.0,0.0,0.0
\texttt{CodeLlama},13B,SIFT,0.0,0.0,2.205882
\texttt{CodeLlama},34B,Base,0.0,0.0,0.0
\texttt{CodeLlama},34B,SIFT,0.0,0.0,100.0
\texttt{LLaMA-2},7B,Base,0.0,0.0,0.0
\texttt{LLaMA-2},7B,RLHF,0.0,0.0,0.0
\texttt{LLaMA-2},13B,Base,0.0,0.0,0.0
\texttt{LLaMA-2},13B,RLHF,0.0,0.0,0.0


\begin{tabular}{l|l}
\toprule
                    &   & task_type & reasoning & decision_making &    code_generation \\
model & size & type &           &                 &                    \\
\midrule
\multirow{6}{*}{\texttt{CodeLlama}} & \multirow{2}{*}{7B} & Base &    $0.00$ &          $0.00$ &             $0.00$ \\
                    &   & SIFT &    $0.00$ &          $0.00$ &             $0.00$ \\
\cline{2-6}
                    & \multirow{2}{*}{13B} & Base &    $0.00$ &          $0.00$ &             $0.00$ \\
                    &   & SIFT &    $0.00$ &          $0.00$ &    $\mathbf{2.21}$ \\
\cline{2-6}
                    & \multirow{2}{*}{34B} & Base &    $0.00$ &          $0.00$ &             $0.00$ \\
                    &   & SIFT &    $0.00$ &          $0.00$ &  $\mathbf{100.00}$ \\
\cline{1-6}
\cline{2-6}
\multirow{6}{*}{\texttt{LLaMA-2}} & \multirow{2}{*}{7B} & Base &    $0.00$ &          $0.00$ &             $0.00$ \\
                    &   & RLHF &    $0.00$ &      

  print(python_bracket_count.to_latex(
