In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
os.chdir("/root/FYP")
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from src.utils import get_baseline_results
import seaborn as sns

In [None]:
# Switch matplotlib to the PGF backend and let LaTeX (pdflatex) handle text,
# so the figures saved later as .pgf can be pulled into a LaTeX document.
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    # Do not let matplotlib inject its own rc font setup into the PGF preamble.
    'pgf.rcfonts': False,
})

In [None]:
# Experiment directory name -> label shown in plots and tables.
# The insertion order matters: the values are later used as the ordered
# categories of the `experiment_name` column, which fixes the bar/legend order.
experiment_name_map = {
    "arcade.vanilla-FS": "Baseline Vanilla + FS",
    "arcade.CoT-FS": "Baseline CoT + FS",
    "arcade.CoT-FS+EXP": "[+] Explanations",
    "fyp.vanilla": "Vanilla",
    "fyp.DFS": "[+] DataFrames",
    "fyp.DFS+OUT": "[+] Execution Outputs",
    "fyp.DFS+OUT+FS": "[+] Exemplars (ARCADE)",
    "fyp.DFS+OUT+FS+SCH": "[+] Schema Information",
    "fyp.CoT-DFS+OUT": "CoT",
    # NOTE(review): the doubled "n" looks like a typo, but the labels are used as
    # pd.Categorical categories (which must be unique) and "[+] Schema Information"
    # is already taken above -- confirm before "fixing" it.
    "fyp.CoT-DFS+OUT+SCH": "[+] Schema Informationn",
    "fyp.CoT-FS+DFS+OUT+SCH": "[+] Exemplars (Custom)",
    "fyp.CoT+2S-DFS+OUT+SCH": "CoT + 2 Step",
    "fyp.CoT+2S+ERR-DFS+OUT+SCH": "[+] Errors",
    "fyp.CoT+2S+ERR+RES-DFS+OUT+SCH": "[+] Outputs",
    
    # Experiments currently excluded from the report:
    # "fyp.CoT-aDFS+OUT": "APP DFS OUT",
    # "fyp.CoT-SCH": "SCHEMA ONLY",
    # "fyp.DFS+OUT+FS(AST)": "AST EXP",
    # "fyp.DFS+OUT+SCH+FS(AST)": "AST + SCH"
    # "fyp.CoT-DFS+OUT+SCH-[pass@30]": "pass30",
    # "fyp.CoT-DFS+OUT+SCH-[T]": "pass30 + T"
}

In [None]:
EXPERIMENTS_ROOT = "experiments"

def aggregate_results():
    """Collect every ``results.json`` found below ``EXPERIMENTS_ROOT`` into one DataFrame.

    The tree is walked recursively. For each directory that directly contains a
    ``results.json`` file, the file is parsed and every entry in it becomes one row,
    tagged with an ``experiment_name`` column holding that directory's own name
    (not its full path).

    Returns:
        pandas.DataFrame with one row per entry; empty if no results file is found.
    """
    rows = []
    for parent, subdirs, _files in os.walk(EXPERIMENTS_ROOT):
        for name in subdirs:
            candidate = os.path.join(parent, name, "results.json")
            if not os.path.exists(candidate):
                continue
            with open(candidate, "r") as handle:
                entries = json.loads(handle.read())
            # Tag each entry with the experiment (directory) it came from.
            rows += [entry | {"experiment_name": str(name)} for entry in entries]
    return pd.DataFrame(rows)

# One row per entry of every results.json found under the experiments directory.
df = aggregate_results()

def extract_single_temp(r):
    """Return the predictions of a result row, restricted to one temperature if needed.

    Rows of the ``"vanilla_raw_notebook"`` experiment are sliced down to the block of
    ``n`` predictions at temperature ``t``; all other rows are returned unchanged.

    Assumes predictions of that experiment are stored as consecutive blocks of ``n``
    per temperature, in steps of 0.2 starting at 0.0 -- TODO confirm against the
    code that writes results.json.

    Args:
        r: a row (mapping) with ``"experiment_name"`` and ``"predictions"``.

    Returns:
        The (possibly sliced) list of predictions.
    """
    t = 0.6
    step = 0.2
    n = 5
    # round(), not int(): 0.6 / 0.2 evaluates to 2.9999999999999996 in floating
    # point, so truncation selected block 2 (temperature 0.4) instead of block 3.
    i = round(t / step)
    if r["experiment_name"] == "vanilla_raw_notebook":
        return r["predictions"][i*n:(i+1)*n]
    return r["predictions"]

# Normalise the predictions column row by row (see extract_single_temp).
df["predictions"] = df.apply(extract_single_temp, axis=1)

# Experiment list under what appears to be an older naming scheme (kept for reference):
# experiments = ["vanilla_raw_notebook", "chain_of_thought", "chain_of_thought_explanations", "fyp.vanilla", "fyp.with_dataframes", "fyp.with_outputs", "fyp.with_dataframes_and_outputs", "fyp.with_dataframes_at_creation", "fyp.with_created_dataframes_new_prompt", "fyp.with_created_dataframes_outputs_new_prompt", "fyp.with_created_dataframes_outputs_promptv3", "fyp.cot_dataframes_outputs_v2_exemplars_df_info"]
# Experiments (directory names) included in the analysis below.
experiments = ["arcade.vanilla-FS", "arcade.CoT-FS", "arcade.CoT-FS+EXP", "fyp.vanilla", "fyp.DFS", "fyp.DFS+OUT", "fyp.DFS+OUT+FS", "fyp.DFS+OUT+FS+SCH", "fyp.CoT-DFS+OUT", "fyp.CoT-DFS+OUT+SCH", "fyp.CoT-FS+DFS+OUT+SCH", "fyp.CoT+2S-DFS+OUT+SCH", "fyp.CoT+2S+ERR-DFS+OUT+SCH", "fyp.CoT+2S+ERR+RES-DFS+OUT+SCH"]
               
            #    , "fyp.CoT-SCH"] 

# , "fyp.CoT-aDFS+OUT","fyp.DFS+OUT+FS(AST)", "fyp.DFS+OUT+SCH+FS(AST)"] # "fyp.CoT-DFS+OUT+SCH-[pass@30]", "fyp.CoT-DFS+OUT+SCH-[T]"] # "fyp.SPLIT-DFS+OUT+FS+SCH"
# experiments = ["fyp.CoT-DFS+OUT+SCH", "fyp.CoT-FS+DFS+OUT+SCH", "fyp.CoT+2S-DFS+OUT+SCH", "fyp.CoT+2S+ERR-DFS+OUT+SCH", "fyp.CoT+2S+ERR+RES-DFS+OUT+SCH", "fyp.CoT-DFS+OUT+SCH-[pass@30]", "fyp.CoT-DFS+OUT+SCH-[T]"]

# Keep only the selected experiments. The explicit copy makes the column
# assignments below act on an independent frame instead of a filtered slice of the
# aggregated results (avoids pandas' SettingWithCopyWarning / ambiguous writes).
df = df[df["experiment_name"].isin(experiments)].copy()

# Replace raw identifiers with display labels. Plain dict indexing is kept on
# purpose: an unexpected value raises KeyError instead of silently becoming NaN.
dataset_labels = {"existing_tasks": "Existing Tasks", "new_tasks": "New Tasks"}
model_labels = {"LLAMA3_INSTRUCT_70B": "Llama 3 70B", "LLAMA3_INSTRUCT_8B": "Llama 3 8B"}
df['experiment_name'] = df['experiment_name'].apply(lambda e: experiment_name_map[e])
df['dataset_src'] = df['dataset_src'].apply(lambda e: dataset_labels[e])
df['model'] = df['model'].apply(lambda e: model_labels[e])

# Ordered categorical so that sorting and plotting follow the order of experiment_name_map.
df['experiment_name'] = pd.Categorical(df['experiment_name'], categories=experiment_name_map.values(), ordered=True)
df = df.sort_values('experiment_name')

In [None]:
def barplot(data, y, ylabel, xlabel="Model - Dataset", x="model_dataset", hue="experiment_name", huelabel="Experiment Name", title=None, palette="tab20"):
    """Draw a grouped bar chart of ``y`` per ``x`` category, split by ``hue``.

    Creates a new 8.5x4 figure, hides the top/right spines, places a frameless
    legend outside the axes (upper left, anchored at the right edge) and shows the
    figure. X tick labels are rotated by 90 degrees when ``x`` has more than five
    distinct values.

    Args:
        data: DataFrame holding the ``x``, ``y`` and ``hue`` columns.
        y / ylabel: value column and its axis label.
        x / xlabel: category column and its axis label.
        hue / huelabel: grouping column and the legend title.
        title: optional figure title.
        palette: seaborn palette name.
    """
    plt.figure(figsize=(8.5, 4))
    axes = sns.barplot(data=data, x=x, y=y, hue=hue, palette=palette)
    for side in ('top', 'right'):
        axes.spines[side].set_visible(False)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if title:
        plt.title(title)

    legend = plt.legend(title=huelabel)
    legend.get_frame().set_alpha(0)

    crowded_axis = data[x].nunique() > 5
    if crowded_axis:
        plt.xticks(rotation=90)

    sns.move_legend(axes, "upper left", bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.show()

def facetgridplot(data, x, xlabel, y, ylabel, z="model_dataset", zlabel="Model - Dataset", hue="experiment_name", huelabel="Experiment Name", fig=(3, 2), args={}, legend=True, sharex=False, sharey=False):
    """Draw one bar chart per distinct value of column ``z``, two facets per row.

    Args:
        data: DataFrame holding the ``x``, ``y``, ``z`` and ``hue`` columns.
        x / xlabel: category column and the x-axis label.
        y / ylabel: value column and the y-axis label.
        z: column whose values define the facets (used as each facet's title).
        zlabel: accepted for symmetry with the other labels; not used in the body.
        hue / huelabel: grouping column and the legend title.
        fig: ``(height, aspect)`` of each facet, forwarded to ``sns.FacetGrid``.
        args: extra keyword arguments forwarded to ``sns.FacetGrid`` (only read, never mutated).
        legend: whether to add a legend.
        sharex / sharey: forwarded to ``sns.FacetGrid``.
    """
    # plt.figure(figsize=fig)
    g = sns.FacetGrid(data, col=z, col_wrap=2, sharex=sharex, sharey=sharey, legend_out=True, height=fig[0], aspect=fig[1], **args)
    # x, y and hue are passed positionally to sns.barplot for every facet.
    g.map(sns.barplot, x, y, hue, palette="tab20")
    # if data[x].nunique() > 5:
    #     g.set_xticklabels(rotation=90)
    # Layout is tightened before the legend is added.
    plt.tight_layout()
    if legend:
        g.add_legend(title=huelabel)
    g.set_axis_labels(xlabel, ylabel)
    g.set_titles(col_template="{col_name}")
    plt.show()

def generate_bin_col(data, bin_col, bin_size):
    """Add a ``<bin_col>_bin`` categorical column with fixed-width, left-closed bins.

    Bins start at 0 and are ``bin_size`` wide: ``[0, bin_size)``,
    ``[bin_size, 2 * bin_size)``, ... up to and including the bin that contains
    the column maximum. Labels have the form ``"<start>-<start + bin_size - 1>"``,
    e.g. ``"0-4"``, ``"5-9"`` for ``bin_size=5``. Values below 0 are left unbinned (NaN).

    Args:
        data: DataFrame to extend; it is modified in place.
        bin_col: name of the numeric column to bin.
        bin_size: width of each bin.

    Returns:
        The same ``data`` object, with the new column added.
    """
    top = data[bin_col].max()
    # Number of bins needed so that the bin holding `top` exists. The previous
    # `np.arange(0, top + bin_size, bin_size)` ended exactly at `top` whenever `top`
    # was a multiple of `bin_size`; with right-open bins those rows fell outside
    # every bin and were silently labelled NaN.
    n_bins = int(top // bin_size) + 1
    bin_edges = np.arange(0, (n_bins + 1) * bin_size, bin_size)
    labels = [f"{i}-{i+bin_size-1}" for i in bin_edges[:-1]]
    data[f"{bin_col}_bin"] = pd.cut(data[bin_col], bins=bin_edges, right=False, labels=labels)
    return data

In [None]:
# A task counts as solved when at least one of its predictions has accuracy 1.0.
df["correct"] = df["predictions"].apply(lambda preds: any(p["accuracy"] == 1.0 for p in preds))
# Facet key: last whitespace-separated token of the model label plus the dataset label.
df['model_dataset'] = df['model'].str.split(" ").str[-1] + ' - ' + df['dataset_src']
# Share of solved tasks per experiment and model/dataset
# (reported as pass@5; presumably 5 predictions per task -- TODO confirm).
tasks = df.groupby(["experiment_name", "model_dataset"])["correct"].mean().reset_index()
tasks.columns = ["experiment_name", "model_dataset", "pass5"]
# tasks = pd.concat([tasks, get_baseline_results(models=["PACHINCO"])], ignore_index=True)   

barplot(
    data=tasks,
    y="pass5",
    ylabel="pass@5",
    
)


In [None]:
plt.savefig("figures/results_pass5.pgf")

In [None]:
# Same solved-task share as the bar chart, reshaped into an
# experiment x (dataset, model) table and printed as LaTeX source.
df_table = df.groupby(["experiment_name", "dataset_src", "model"])["correct"].mean().reset_index()
df_table.columns = ["experiment_name", "dataset_src", "model", "pass5"]
df_table = df_table.pivot(index='experiment_name', columns=['dataset_src', 'model'], values='pass5')
print(df_table.to_latex(index=True, header=True))
# df_table  

In [None]:
print("<scratchpad>\n\nTo find the average division population for different countries having events involving heavy rain, we need to follow these steps:\n\n1. Extract the events involving heavy rain from the 'glc' dataframe.\n2. Filter the events to only include those with a 'landslide_trigger' of 'Heavy Rain'.\n3. Extract the 'country_name' and 'admin_division_population' columns from the filtered events.\n4. Group the data by 'country_name' and calculate the average 'admin_division_population' for each country.\n\n</scratchpad>\n\n<python>\nheavy_rain_events = glc[glc.landslide_trigger == 'Heavy Rain']\navg_division_population = heavy_rain_events.groupby('country_name')['admin_division_population'].mean()\nprint(avg_division_population)\n</python>")

In [None]:
# df["task_type"] = df["reference_analysis"].apply(lambda a: a["task_type"])
# tasks_type_results = df.groupby(["experiment_name", "model_dataset", "task_type"])["correct"].mean().reset_index()
# tasks_type_results["correct"].fillna(0, inplace=True)


# facetgridplot(
#     data=tasks_type_results,
#     x="task_type",
    
#     y="correct",
# )

In [None]:
def _error_type(p):
    """Coarse outcome class of one prediction.

    Returns the text before the first ":" of ``error_text`` when an error text is
    present, ``"Logical Error"`` when there is none but the prediction is not fully
    accurate, and ``None`` for a correct prediction.
    """
    text = p.get("error_text")
    if text:
        return text.split(":", 1)[0].strip()
    return "Logical Error" if p["accuracy"] != 1.0 else None


def _runtime_error_type(p):
    """Second ":"-separated field of a RuntimeError text, else ``None``.

    Tolerates a missing/``None`` ``error_text`` and a text without any ":" --
    both previously raised (TypeError / IndexError).
    """
    text = p.get("error_text") or ""
    if "RuntimeError" not in text:
        return None
    parts = text.split(":")
    return parts[1].strip() if len(parts) > 1 else None


# One row per individual prediction instead of one row per task.
df_preds = df.explode("predictions")
df_preds["correct"] = df_preds["predictions"].apply(lambda p: p["accuracy"] == 1.0)
# Same facet key as on `df`: the model labels contain spaces by this point, so the
# former split("_") was a no-op and produced a different key format.
df_preds['model_dataset'] = df_preds['model'].str.split(" ").str[-1] + ' - ' + df_preds['dataset_src']
df_preds["execution_error"] = df_preds["predictions"].apply(lambda p: p.get("execution_error", False))
df_preds["error_type"] = df_preds["predictions"].apply(_error_type)
df_preds["runtime_error_type"] = df_preds["predictions"].apply(_runtime_error_type)

In [None]:
# Number of predictions per experiment, model and error type,
# restricted to the three error classes of interest.
key_errors = ["RuntimeError", "Parse Error", "Logical Error"]
error_results = df_preds.groupby(["experiment_name", "model", "error_type"]).size().reset_index(name='counts')
error_results = error_results[error_results["error_type"].isin(key_errors)]

# Earlier variant, faceted by the default model/dataset key:
# facetgridplot(
#     data=error_results,
#     x="error_type",
#     xlabel="Error Type",
#     y="counts",
#     ylabel="Num. Errors",
#     legend=False,
#     fig=(4,0.9)
# )

# One facet per model, sharing the y axis.
facetgridplot(
    data=error_results,
    x="error_type",
    xlabel="Error Type",
    y="counts",
    ylabel="Num. Errors",
    z="model",
    zlabel="Model",
    legend=False,
    fig=(3.5,1.5),
    sharey=True
)

In [None]:
plt.savefig("figures/results_error_types.pgf")

In [None]:
# Break runtime errors down by the exception name extracted into
# `runtime_error_type`; only the listed exception types are plotted.
key_runtime_errors = ["AttributeError", "IndexError", "KeyError", "NameError", "TypeError", "ValueError"]
runtime_error_results = df_preds.groupby(["experiment_name", "model", "runtime_error_type"]).size().reset_index(name='counts')
runtime_error_results = runtime_error_results[runtime_error_results["runtime_error_type"].isin(key_runtime_errors)]

# One facet per exception type, models on the x axis.
facetgridplot(
    data=runtime_error_results,
    x="model",
    # x="runtime_error_type",
    xlabel="Model",
    y="counts",
    ylabel="Num. Errors",
    args=dict(
        # Fixed y range so the facets are comparable (axes are not shared here).
        ylim=(0, 700)
    ),
    # fig=(8, 12),
    z="runtime_error_type",
    legend=False,
)

In [None]:
plt.savefig("figures/results_runtime_errors.pgf")

In [None]:
# Share of incorrect predictions that are "logical" errors, i.e. incorrect
# without an execution error, per experiment and model/dataset.
total_incorrect = df_preds[df_preds["correct"] == False]
total_incorrect_counts = total_incorrect.groupby(["experiment_name", "model_dataset"]).size().reset_index(name='total_counts')
logical_errors_results = df_preds[(df_preds["correct"] == False) & (df_preds["execution_error"] == False)]
logical_errors_counts = logical_errors_results.groupby(["experiment_name", "model_dataset"]).size().reset_index(name='logical_counts')
# Default (inner) merge on the two grouping keys.
merged_counts = logical_errors_counts.merge(total_incorrect_counts, on=["experiment_name", "model_dataset"])
merged_counts['proportion_logical_errors'] = merged_counts['logical_counts'] / merged_counts['total_counts']
# The name is reused: from here on it holds the per-group proportions, not the filtered rows.
logical_errors_results = merged_counts[["experiment_name", "model_dataset", "proportion_logical_errors"]]

barplot(
    data=logical_errors_results,
    y="proportion_logical_errors",
    ylabel="Logical Errors"
)

In [None]:
# Number of clusters recorded per task (0 when `clusters` is empty/missing),
# then the frequency of each cluster count per model and experiment.
df["cluster_count"] = df["clusters"].apply(lambda x: len(x) if x else 0)
cluster_count_results = df.groupby(["model", "experiment_name","cluster_count"]).size().reset_index(name='counts')
# Only counts 1..5 are plotted.
cluster_count_results = cluster_count_results[(cluster_count_results["cluster_count"] > 0) & (cluster_count_results["cluster_count"] <= 5)]

facetgridplot(
    data=cluster_count_results,
    x="cluster_count",
    xlabel="Num. Clusters",
    y="counts",
    ylabel="Frequency",
    z="model",
    zlabel="Model",
    legend=False,
    fig=(3.5,1.5),
    args=dict(
        # Fixed y range shared by both facets.
        ylim=(0, 720)
    )
)

In [None]:
plt.savefig("figures/results_num_clusters.pgf")

In [None]:
# Complexity of a task = total number of entries across all lists in
# reference_analysis["modules"] (plotted below as "Num. Pandas Calls").
call_count = lambda calls: sum([len(v) for v in calls["modules"].values()])
df["complexity"] = df["reference_analysis"].apply(call_count)
# Bucket the complexity into bins of width 5 (adds the "complexity_bin" column).
df = generate_bin_col(df, "complexity", 5)
complexity_results = df.groupby(["model", "experiment_name", "complexity_bin"])["correct"].mean().reset_index()

facetgridplot(
    data=complexity_results,
    x="complexity_bin",
    xlabel="Num. Pandas Calls",
    y="correct",
    ylabel="pass@5",
    z="model",
    zlabel="Model",
    legend=False,
    # args=dict(
    #     ylim=0.8
    # ),
    fig=(2.5,1.5),
    sharey=True
)

In [None]:
plt.savefig("figures/results_pass5_pandas_calls.pgf")

In [None]:
# Solved-task share as a function of the turn index, in bins of width 6.
df = generate_bin_col(df, "turn_index", 6)
turn_index_res = df.groupby(["model", "experiment_name", "turn_index_bin"])["correct"].mean().reset_index()

facetgridplot(
    data=turn_index_res,
    x="turn_index_bin",
    xlabel="Turn Index",
    y="correct",
    ylabel="pass@5",
    z="model",
    zlabel="Model",
    legend=False,
    sharey=True
)

In [None]:
plt.savefig("figures/results_turn_index.pgf")

In [None]:
df_preds["predictions"]

In [None]:
from src.prompt_utils import get_num_tokens

# df_preds["lines"] = df_preds["predictions"].apply(lambda s: get_num_tokens(s["code"]))
df_preds["lines"] = df_preds["predictions"].apply(lambda s: len(s["code"].split("\n")))

In [None]:
# df_preds["lines"] = df_preds["predictions"].apply(lambda s: len([l for l in s["code"].split("\n") if l and not l.startswith("#")]))

# Number of *correct* predictions per model/experiment, bucketed by the line count
# of the predicted code (predictions longer than 20 lines are ignored).
# `.copy()` because generate_bin_col adds a column in place; without it the column
# would be written onto a filtered slice of df_preds (SettingWithCopyWarning).
line_results = generate_bin_col(df_preds[df_preds["lines"] <= 20].copy(), "lines", 5)
line_results = line_results[line_results["correct"].isin([True])]
# After count(), the "correct" column holds the number of correct predictions per group.
line_results = line_results.groupby(["model", "experiment_name", "lines_bin"]).count().reset_index()

# Axis labels fixed: the x axis shows line-count bins (not tokens) and the y axis
# shows a count of correct predictions (not a pass@5 rate).
facetgridplot(
    data=line_results,
    x="lines_bin",
    xlabel="Num. Lines",
    y="correct",
    ylabel="Num. Correct Predictions",
    legend=False,
    z="model",
    zlabel="Model",
    fig=(3.5,1.5)
)

In [None]:
df.columns

In [None]:
# Per-prediction accuracy as a function of the length (in lines) of the
# reference solution, in bins of width 5.
df_preds["num_lines"] = df_preds["reference"].apply(lambda s: len(s.split("\n")))
df_preds = generate_bin_col(df_preds, "num_lines", 5)
# NOTE(review): "correct" on df_preds is per individual prediction, so this mean is
# a per-prediction rate although the axis below is labelled pass@5 -- confirm.
lines_results = df_preds.groupby(["model_dataset", "experiment_name", "num_lines_bin"])["correct"].mean().reset_index()

facetgridplot(
    data=lines_results,
    x="num_lines_bin",
    xlabel="Num. Lines",
    y="correct",
    ylabel="pass@5",
    legend=False,
    fig=(3.5,1.5)
)

In [None]:
from src.prompt_utils import get_num_tokens

df["num_tokens"] = df["prompt_input"].apply(get_num_tokens)

In [None]:
# df = generate_bin_col(df, "num_tokens", 1000)
# turn_index_res = df.groupby(["model_dataset", "experiment_name", "num_tokens_bin"])["correct"].mean().reset_index()

# facetgridplot(
#     data=turn_index_res,
#     x="prompt_length_bin",
#     xlabel="Turn Index",
#     y="correct",
#     ylabel="pass@5",
#     legend=False,
# )

# df_preds["lines"] = df_preds["predictions"].apply(lambda s: len([l for l in s["code"].split("\n") if l and not l.startswith("#")]))

# Solved-task share as a function of prompt length, in bins of 1000 tokens;
# prompts longer than 4000 tokens are ignored.
# `.copy()` because generate_bin_col adds a column in place; without it the column
# would be written onto a filtered slice of df (SettingWithCopyWarning).
line_results = generate_bin_col(df[df["num_tokens"] <= 4000].copy(), "num_tokens", 1000)
# line_results = line_results[line_results["correct"].isin([True])]
line_results = line_results.groupby(["model", "experiment_name", "num_tokens_bin"])["correct"].mean().reset_index()

facetgridplot(
    data=line_results,
    x="num_tokens_bin",
    xlabel="Num. Tokens",
    y="correct",
    ylabel="pass@5",
    legend=False,
    z="model",
    zlabel="Model",
    fig=(3.5,1.5),
    sharey=True
)

In [None]:
def get_correct_rerank(r):
    """Tell whether a largest cluster of a task consists of correct predictions.

    ``r["clusters"]`` maps a cluster key to a list of indices into
    ``r["predictions"]``. The function returns True as soon as some cluster
    contains as many fully accurate predictions (accuracy == 1.0) as the size of
    the largest cluster; clusters are visited in reverse insertion order.
    Returns False when there are no clusters or no cluster qualifies.
    """
    clusters = r["clusters"]
    if not clusters:
        return False

    largest = max(len(members) for members in clusters.values())
    for members in reversed(list(clusters.values())):
        hits = 0
        for position in members:
            hits += r["predictions"][position]["accuracy"] == 1.0
            if hits == largest:
                return True
    return False


# Among tasks with at least one correct prediction, how often does a largest
# cluster consist of correct predictions (see get_correct_rerank)?
df["rerank_accuracy"] = df.apply(get_correct_rerank, axis=1)
rerank_acc = df[(df["correct"] == True)].groupby(["model_dataset", "experiment_name"])["rerank_accuracy"].mean().reset_index()
# rerank_acc = rerank_acc[(rerank_acc["correct"] == True)]

barplot(
    data=rerank_acc,
    y="rerank_accuracy",
    ylabel="SC Accuracy",
    hue="experiment_name"
)

In [None]:
plt.savefig("figures/results_sc.pgf")

In [None]:
# experiment_name = "fyp.CoT-DFS+OUT+SCH"
# exp_pandas_method_results = pandas_method_results[pandas_method_results["experiment_name"] == experiment_name]
# exp_pandas_method_results = exp_pandas_method_results.melt(id_vars=["experiment_name", "model_dataset"], var_name="method", value_name="correct")

# barplot(
#     data=exp_pandas_method_results,
#     x="method",
#     y="correct",
#     hue="model_dataset"
# )

In [None]:
# from collections import defaultdict

# def get_used_columns(code, vars, dataframes):
#     res = defaultdict(list)
#     for v in vars:
#         if v in dataframes:
#             for col in dataframes[v].columns:
#                 if col in code:
#                     res[v].append(col)
#     return res

# print(get_used_columns(ref, ref_dataframes, dfs))
# print(get_used_columns(pred, pred_dataframes, dfs))

In [None]:
# from functools import cache

# @cache
# def get_dataframe_schemas(notebook_name, turn_index):
#     execution_info = get_execution_state(notebook_name)
#     cell_execution_info = execution_info[turn_index]
#     dfs = {}
#     for c in cell_execution_info[::-1]:
#         if "dataframes" in c:
#             dfs = c["dataframes"]
#             break
#     df_cols = {k: set(v.columns) if isinstance(v, pd.DataFrame) else set(v.index) for k, v in dfs.items()}
#     return df_cols

# def get_used_columns(code, vars, df_schemas):
#     res = defaultdict(set)
#     for v in vars:
#         if v in df_schemas:
#             for col in df_schemas[v]:
#                 if str(col) in code:
#                     res[v].add(col)
#     return res

# def get_used_dfs_and_cols(r):
#     code_context = r["code_context"]
#     ref, pred = r["reference"], r["predictions"]["code"]
#     ref_dataframes = set(analyse_code(code_context, ref)["used_dataframes"])
#     pred_dataframes = set(analyse_code(r["code_context"], pred)["used_dataframes"])
#     dfs = get_dataframe_schemas(r["notebook_name"], r["turn_index"])
#     ref_cols = get_used_columns(ref, ref_dataframes, dfs)
#     pred_cols = get_used_columns(pred, pred_dataframes, dfs)
#     return pd.Series(
#         {
#             "used_correct_dataframes": ref_dataframes == pred_dataframes,
#             "used_correct_dataframe_cols": ref_cols == pred_cols
#         }
#     )

# df_logical_errors = df_preds[(df_preds["correct"] == False) & (df_preds["execution_error"] == False)]
# df_logical_errors[["used_correct_dataframes", "used_correct_dataframe_cols"]] = df_logical_errors.apply(get_used_dfs_and_cols, axis=1)

In [None]:
# used_correct_df_results = df_logical_errors.groupby(["model_dataset", "experiment_name"])["used_correct_dataframes"].mean().reset_index()
# used_correct_df_col_results = df_logical_errors.groupby(["model_dataset", "experiment_name"])["used_correct_dataframe_cols"].mean().reset_index()


# barplot(
#     data=used_correct_df_results,
#     y="used_correct_dataframes"
# )

# barplot(
#     data=used_correct_df_col_results,
#     y="used_correct_dataframe_cols"
# )

In [None]:
df.columns

In [None]:
import itertools

method_groups = {
    "aggregation": {
        "methods": {"groupby", "agg", "Grouper"}
    },
    "transformation": {
        "apply": {"apply", "applymap", "map"},
        "reshape": {"pivot_table", "pivot", "melt", "stack", "unstack", "transpose"},
        "bin": {"cut", "qcut"},
        "explode": {"explode"},
        "time_series": {"shift", "pct_change"},
        "compute": {"cumsum", "diff", "rank"}
    },
    "combination": {
        "methods": {"join", "merge", "concat", "append"}
    },
    "selection": {
        "indexing": {"loc", "iloc", "between", "filter", "isin", "isna", "isnull", "notnull", "query", "where"},
        "sorting": {"sort_values", "sort_index"},
        "subset": {"nlargest", "nsmallest", "head", "tail", "first", "last"}
    },
    "cleaning": {
        "missing_data": {"dropna", "fillna", "ffill", "bfill"},
        "duplicates": {"drop_duplicates", "duplicated"},
        "type_conversion": {"astype", "to_numeric", "to_datetime", "to_period", "to_frame", "to_list", "tolist", "ravel"},
        "renaming": {"rename", "rename_axis"},
        "structure": {"reset_index", "set_index", "reindex", "insert"}
    },
    "strings": {
        "methods": {"str", "contains", "extract", "replace", "match", "sub", "startswith", "endswith", "split"}
    },
    "computation": {
        "statistics": {"mean", "max", "min", "median", "mode", "std", "var", "quantile", "describe"},
        "aggregation": {"count", "sum", "nunique", "unique", "value_counts"},
        "correlation": {"corr", "cov"},
        "arithmetic": {"add", "div", "divide", "clip", "round"}
    },
    "datetime": {
        "conversion": {"to_datetime", "to_timedelta"},
        "components": {"dt.year", "dt.month", "dt.day", "dt.hour", "dt.minute", "dt.second"},
        "operations": {"strftime", "strptime", "tz_localize", "tz_convert"},
        "periods": {"to_period", "PeriodIndex", "period_range"},
        "timedelta": {"Timedelta", "timedelta_range"},
        "offsets": {"DateOffset", "BDay", "CDay", "Week", "MonthEnd", "YearEnd"}
    },
    "visualization": {
        "methods": {"plot", "boxplot", "hist", "lmplot", "barplot", "scatter", "lmplot", "barplot", "Figure", "Layout", "Bar", "show", "Scatter"}
    },
}

# Collapse the sub-categories: category name -> set of all method names listed under it.
method_groups_flattened = {category: set(itertools.chain.from_iterable(subcategories.values())) 
                  for category, subcategories in method_groups.items()}

# When a task matches several categories, the first match in this list
# becomes its single `task_type`.
method_group_priority = [
    "aggregation",
    "combination",
    "transformation",
    "computation",
    "selection",
    "cleaning",
    "datetime",
    "strings",
    "visualization"
]

def classify_tasks(row):
    """Tag a task with the method categories its reference solution appears to use.

    Matching is a plain substring test of each method name (and of a few
    operators) against the reference source text in ``row["reference"]``.

    Returns:
        pandas.Series with one boolean ``is_<category>`` entry per category of
        ``method_groups_flattened`` plus ``task_type``: the first category of
        ``method_group_priority`` that matched, or ``"other"``.
    """
    reference = row["reference"]

    def mentions(tokens):
        # True when any of the tokens occurs anywhere in the reference source.
        return any(token in reference for token in tokens)

    result = {
        f"is_{category}": mentions(methods)
        for category, methods in method_groups_flattened.items()
    }
    # Arithmetic / comparison operators also count as computation / selection.
    if mentions({"*", "/", "+"}):
        result['is_computation'] = True
    if mentions({"==", ">", "<"}):
        result['is_selection'] = True

    result['task_type'] = next(
        (category for category in method_group_priority if result[f"is_{category}"]),
        "other",
    )
    return pd.Series(result)

# Attribute/method names of each recorded function call: every dot-separated part
# after the first one. Note: classify_tasks reads row["reference"], not this column.
df["calls"] = df["reference_analysis"].apply(lambda ms: [ss for s in ms["function_calls"] for ss in s.split(".")[1:] if ss])
# Classify every task and copy the resulting is_* / task_type columns onto df.
res = df.apply(classify_tasks, axis=1)
for col in res.columns:
    df[col] = res[col]

In [None]:
df.columns

In [None]:
task_type_res["task_type"]

In [None]:
# Solved-task share per task type, limited to four of the task types.
df_ts = df[df["task_type"].isin(["aggregation", "computation", "selection", "transformation"])]
task_type_res = df_ts.groupby(["model", "experiment_name", "task_type"])["correct"].mean().reset_index()
# Capitalised names are used as the facet titles.
task_type_res["task_type"] = task_type_res["task_type"].apply(lambda s: s.capitalize())

facetgridplot(
    data=task_type_res,
    x="model",
    xlabel="Model",
    y="correct",
    ylabel="pass@5",
    z="task_type",
    zlabel="Task Type",
    legend=False,
    sharey=True,
     fig=(2.5,2),
     args={
         "ylim":(0,1.0)
     }
)

In [None]:
task_type_res

In [None]:
plt.savefig("figures/results_task_type_pass5.pgf")

In [None]:
# One bar chart per method category, computed over the tasks flagged is_<category>.
# A task can carry several flags, so it may contribute to more than one chart.
for g in method_group_priority:
    df_res = df[df[f"is_{g}"] == True].groupby(["model_dataset", "experiment_name"])["correct"].mean().reset_index()
    barplot(
        data=df_res,
        y="correct",
        ylabel="pass@5",
        title=g.capitalize()
    )


In [None]:
df_ms

In [None]:
# Subset used for the multi-step analysis: the CoT baseline plus its three
# two-step variants (display labels, as assigned via experiment_name_map).
multistep_experiments_names = ["CoT", "CoT + 2 Step", "[+] Errors", "[+] Outputs"]
# Explicit copy: the assignments below must not write onto a filtered slice of df.
df_ms = df[df["experiment_name"].isin(multistep_experiments_names)].copy()
df_ms["correct"] = df_ms["predictions"].apply(lambda preds: any(p["accuracy"] == 1.0 for p in preds))
# Built from df_ms' own columns (previously `dataset_src` was read from `df`), using
# the same key format as on `df`: the model labels contain spaces by this point, so
# the former split("_") was a no-op.
df_ms['model_dataset'] = df_ms['model'].str.split(" ").str[-1] + ' - ' + df_ms['dataset_src']
# df_ms = df_ms.explode("predictions")
# df_ms["correct"] = df_ms["predictions"].apply(lambda p: p["accuracy"] == 1.0)
# df_ms['model_dataset'] = df_ms['model'].str.split("_").str[-1] + ' - ' + df_ms['dataset_src']

In [None]:
df_pivot = df_ms.pivot_table(index=['notebook_name', 'turn_index', 'model_dataset'], columns='experiment_name', values='correct', aggfunc=lambda x: list(x))


In [None]:
# (correct before, correct after) -> label describing the change.
corr_map = {
    (False, True):  "Improved",
    (True, True): "Still Correct",
    (True, False): "Degraded",
    (False, False): "Still Incorrect"
}

def get_improvement(r):
    """Label, position by position, how each variant changed the "CoT" outcome.

    ``r`` is a frame whose first row holds, per experiment column, a list of
    correctness flags. For each of the three follow-up experiments the flags are
    compared against the "CoT" flags at the same position and mapped through
    ``corr_map``.

    Returns:
        pandas.Series keyed by experiment label, each value a list of change labels
        with one entry per "CoT" flag.
    """
    baseline = r["CoT"].iloc[0]
    variants = {
        'CoT + 2 Step': r["CoT + 2 Step"].iloc[0],
        '[+] Errors': r["[+] Errors"].iloc[0],
        '[+] Outputs': r["[+] Outputs"].iloc[0],
    }

    def changes(updated):
        return [corr_map[(before, updated[i])] for i, before in enumerate(baseline)]

    return pd.Series({label: changes(flags) for label, flags in variants.items()})


# Apply get_improvement to each (notebook, turn, model/dataset) group, then expand
# the per-group label lists into one row per list position.
df_improved = df_pivot.groupby(level=['notebook_name', 'turn_index', 'model_dataset']).apply(lambda s: get_improvement(s)).reset_index()
df_improved = df_improved.explode(['CoT + 2 Step', '[+] Errors', '[+] Outputs'])

In [None]:
melted_df = df_improved.melt(id_vars=['notebook_name', 'turn_index', 'model_dataset'], 
                    value_vars=['CoT + 2 Step', '[+] Errors', '[+] Outputs'], 
                    var_name='experiment_name', 
                    value_name='change')

In [None]:
# NOTE(review): experiment_order is not referenced anywhere in this cell.
experiment_order = ['CoT + 2 Step', '[+] Errors', '[+] Outputs']

# Count the occurrences of each improvement value for each experiment
df_counts = melted_df[melted_df["change"].isin(["Degraded", "Improved"])].groupby(['experiment_name', 'model_dataset', 'change']).size().reset_index(name='count')
# NOTE(review): 1137 is a hard-coded normaliser, presumably the number of tasks --
# confirm and derive it from the data instead.
df_counts["count"] /= 1137


facetgridplot(
    data=df_counts,
    x="change",
    xlabel="Change Type",
    y="count",
    ylabel="Frequency",
    # z="model_dataset",
    # zlabel="Model - Dataset",
    # hue="experiment",
    # huelabel="Experiment Name",
    args=dict(
        ylim=(0, 0.055)
    ),
    fig=(3,1)
)

In [None]:
plt.savefig("figures/results_multistep.pgf")

In [None]:
# Cross-tabulate how often each change label occurs per model/dataset and experiment.
# Uses the frame and column names actually created by the melt step above
# (`melted_df`, `experiment_name`, `change`); the previous names did not exist.
melted_df.groupby(['model_dataset', 'experiment_name', 'change']).size().unstack(fill_value=0)

In [None]:
df_counts

In [None]:
# For every two-step experiment and model: pair each original evaluation result
# with the corresponding updated one and plot the resulting contingency table.
# NOTE(review): parse_error is only defined in a later cell, so this cell fails on a
# fresh kernel when run top to bottom. It also rebinds `df`, replacing the main
# results frame used by the cells above.
for ex in ["fyp.CoT+2S-DFS+OUT+SCH", "fyp.CoT+2S+ERR-DFS+OUT+SCH", "fyp.CoT+2S+ERR+RES-DFS+OUT+SCH"]:
    for m in ["llama3_instruct_8b", "llama3_instruct_70b"]:
        fn = f"/root/FYP/experiments/{ex}/predictions.{m}.json"
        with open(fn) as f:
            dataset = json.loads(f.read())

        error_pairs = []

        for n in dataset:
            for t in n["turns"]:
                original_evals = t["metadata"]["example"]["metadata"]["initial_eval_results"]
                current_evals = t["eval_results"]
                for o, c in zip(original_evals, current_evals):
                    error_pairs.append((parse_error(o), parse_error(c)))

        # Rows: outcome of the original prediction; columns: outcome after the update.
        df = pd.DataFrame(error_pairs, columns=['error1', 'error2'])
        contingency_table = pd.crosstab(df['error1'], df['error2'])
        # Hide the diagonal and upper triangle; the colour scale is derived from the
        # largest remaining (visible) cell.
        mask = np.triu(np.ones_like(contingency_table, dtype=bool))
        corr_values = contingency_table.where(~mask)
        vmax = corr_values.max().max()

        plt.figure(figsize=(5, 5))
        sns.heatmap(
            contingency_table, 
            annot=True, 
            cmap='inferno', 
            vmin=0,
            vmax=vmax*1.6, 
            center=0.0,
            annot_kws={"size": 8},
            mask=mask,
            fmt='d'
        )
        plt.tight_layout()
        plt.title(f"{ex} - {m}")
        plt.show()
        

In [None]:
key_runtime_errors = ["AttributeError", "IndexError", "KeyError", "NameError", "TypeError", "ValueError"]
key_errors = ["RuntimeError", "Parse Error", "Logical Error"]

def parse_error(p, split_runtime=False):
    """Classify one evaluation result into a coarse outcome label.

    Args:
        p: mapping with ``"accuracy"`` and, optionally, ``"error_text"``.
        split_runtime: when True, a runtime error is reported by its exception name
            (the second ":"-separated field of the error text) if that name is in
            ``key_runtime_errors``.

    Returns:
        ``"Correct"`` when accuracy is 1.0; ``"Logical Error"`` when it is not and
        there is no error text; ``"Parse Error"``; ``"Runtime Error"`` (or the
        exception name, see ``split_runtime``); otherwise ``"Other Error"``.
    """
    if p["accuracy"] == 1.0:
        return "Correct"

    error_text = p.get("error_text")
    if not error_text:
        return "Logical Error"

    error_type = error_text.split(":", 1)[0].strip()
    if error_type == "Parse Error":
        return error_type
    if error_type == "RuntimeError":
        if not split_runtime:
            return "Runtime Error"
        parts = error_text.split(":")
        # A bare "RuntimeError" text has no second field; indexing parts[1]
        # unconditionally used to raise IndexError here.
        if len(parts) > 1:
            runtime_error = parts[1].strip()
            if runtime_error in key_runtime_errors:
                return runtime_error
    return "Other Error"

# Pair every original evaluation result with its updated counterpart.
# NOTE(review): `dataset` is whatever an earlier cell loaded last -- this cell does
# not load it itself.
error_pairs = []

for n in dataset:
    for t in n["turns"]:
        original_evals = t["metadata"]["example"]["metadata"]["initial_eval_results"]
        current_evals = t["eval_results"]
        for o, c in zip(original_evals, current_evals):
            error_pairs.append((parse_error(o), parse_error(c)))

In [None]:
# Contingency table of outcome labels: rows are the original prediction's outcome,
# columns the outcome after the update. A dedicated name is used so the main
# results frame `df` is not overwritten by this cell.
error_pairs_df = pd.DataFrame(error_pairs, columns=['error1', 'error2'])
contingency_table = pd.crosstab(error_pairs_df['error1'], error_pairs_df['error2'])
# Hide the diagonal and upper triangle; the colour scale is derived from the
# largest remaining (visible) cell.
mask = np.triu(np.ones_like(contingency_table, dtype=bool))
corr_values = contingency_table.where(~mask)
vmax = corr_values.max().max()

plt.figure(figsize=(5, 5))
sns.heatmap(
    contingency_table, 
    annot=True, 
    cmap='inferno', 
    vmin=0,
    vmax=vmax*1.6, 
    center=0.0,
    annot_kws={"size": 8},
    mask=mask,
    fmt='d'
)
plt.tight_layout()
# Title corrected: this figure shows outcome transitions, not the pandas
# method-group correlation the previous title was copied from.
plt.title('Original vs. updated prediction outcomes')
# plt.savefig('figures/pandas_method_corr.pgf')
plt.show()

In [None]:
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Two-step experiments (directory names) and model file suffixes analysed below.
# NOTE(review): this rebinds the `experiments` list defined near the top of the notebook.
experiments = ["fyp.CoT+2S-DFS+OUT+SCH", "fyp.CoT+2S+ERR-DFS+OUT+SCH", "fyp.CoT+2S+ERR+RES-DFS+OUT+SCH"]
models = ["llama3_instruct_8b", "llama3_instruct_70b"]

key_runtime_errors = ["AttributeError", "IndexError", "KeyError", "NameError", "TypeError", "ValueError"]
key_errors = ["RuntimeError", "Parse Error", "Logical Error"]

def parse_error(p, split_runtime=False):
    """Classify one evaluation result into a coarse outcome label.

    Args:
        p: mapping with ``"accuracy"`` and, optionally, ``"error_text"``.
        split_runtime: when True, a runtime error is reported by its exception name
            (the second ":"-separated field of the error text) if that name is in
            ``key_runtime_errors``.

    Returns:
        ``"Correct"`` when accuracy is 1.0; ``"Logical Error"`` when it is not and
        there is no error text; ``"Parse Error"``; ``"Runtime Error"`` (or the
        exception name, see ``split_runtime``); otherwise ``"Other Error"``.
    """
    if p["accuracy"] == 1.0:
        return "Correct"

    error_text = p.get("error_text")
    if not error_text:
        return "Logical Error"

    error_type = error_text.split(":", 1)[0].strip()
    if error_type == "Parse Error":
        return error_type
    if error_type == "RuntimeError":
        if not split_runtime:
            return "Runtime Error"
        parts = error_text.split(":")
        # A bare "RuntimeError" text has no second field; indexing parts[1]
        # unconditionally used to raise IndexError here.
        if len(parts) > 1:
            runtime_error = parts[1].strip()
            if runtime_error in key_runtime_errors:
                return runtime_error
    return "Other Error"

# Experiment directory name -> display label.
exmap = {
    "fyp.CoT+2S-DFS+OUT+SCH": "CoT + 2 Step",
    "fyp.CoT+2S+ERR-DFS+OUT+SCH": "[+] Errors",
    "fyp.CoT+2S+ERR+RES-DFS+OUT+SCH": "[+] Outputs",
}

# Model file suffix -> display label.
mmap = {
    "llama3_instruct_8b": "Llama 3 8B",
    "llama3_instruct_70b": "Llama 3 70B",
}

# Prepare data for FacetGrid: one long-format contingency table per
# (experiment, model, dataset source) combination.
plot_data = []

for ex in experiments:
    for m in models:
        fn = f"/root/FYP/experiments/{ex}/predictions.{m}.json"
        with open(fn) as f:
            dataset = json.loads(f.read())

        error_pairs_ns = []  # pairs from notebooks with dataset_src == "new_tasks"
        error_pairs_es = []  # pairs from all other notebooks

        for notebook in dataset:
            for t in notebook["turns"]:
                original_evals = t["metadata"]["example"]["metadata"]["initial_eval_results"]
                current_evals = t["eval_results"]
                for o, c in zip(original_evals, current_evals):
                    # Classify each result once and reuse the pair (the per-pair debug
                    # print of the dataset source was removed).
                    pair = (parse_error(o), parse_error(c))
                    if "Other Error" in pair:
                        continue
                    if notebook["metadata"]["dataset_src"] == "new_tasks":
                        error_pairs_ns.append(pair)
                    else:
                        error_pairs_es.append(pair)

        # Distinct local names so the notebook-level `df`, `n` and `ds` are not rebound.
        for src_label, pairs in [("New Tasks", error_pairs_ns), ("Existing Tasks", error_pairs_es)]:
            pairs_df = pd.DataFrame(pairs, columns=['error1', 'error2'])
            contingency_table = pd.crosstab(pairs_df['error1'], pairs_df['error2']).stack().reset_index(name='count')
            contingency_table['experiment'] = exmap[ex]
            contingency_table['model'] = mmap[m] + " - " + src_label
            plot_data.append(contingency_table)

In [None]:
# Combine all data into a single DataFrame
plot_df = pd.concat(plot_data, ignore_index=True)

# Fix the order of the outcome labels on both heatmap axes.
error_order = ["Correct", "Logical Error", "Parse Error", "Runtime Error"]
plot_df['error1'] = pd.Categorical(plot_df['error1'], categories=error_order, ordered=True)
plot_df['error2'] = pd.Categorical(plot_df['error2'], categories=error_order, ordered=True)


# Create the FacetGrid: one column per experiment, one row per model/dataset label.
g = sns.FacetGrid(plot_df, col='experiment', row='model', margin_titles=True)

g.fig.set_size_inches(11, 11)

# Map the heatmap to the FacetGrid
def plot_heatmap(data, **kwargs):
    """Draw one facet: a column-normalised heatmap of outcome transitions.

    ``data`` is the long-format slice for one facet with columns ``error1``,
    ``error2`` and ``count``. It is pivoted to ``error1`` (rows) x ``error2``
    (columns) and every column is divided by its sum, so each column adds up to
    100%. Extra keyword arguments are forwarded to ``sns.heatmap``.
    """
    # Keyword arguments are required: pandas >= 2.0 no longer accepts positional
    # index/columns/values in DataFrame.pivot.
    data = data.pivot(index='error1', columns='error2', values='count')
    # NOTE(review): normalisation is per column, i.e. per `error2` value -- confirm
    # this is the intended direction for reading the percentages.
    data = data.div(data.sum(axis=0), axis=1)
    sns.heatmap(
        data, 
        annot=True, 
        cmap='inferno', 
        vmin=0,
        vmax=1, 
        center=0.0,
        annot_kws={"size": 8},
        fmt='.0%',
        cbar=False,
        **kwargs
    )

g.map_dataframe(plot_heatmap)
# NOTE(review): plot_heatmap pivots with error1 (original outcome) as rows and
# error2 (updated outcome) as columns, so the heatmap's x axis is the updated
# outcome. The labels below name x "Original" and y "Updated" -- this looks
# swapped; confirm whether the labels or the pivot orientation is intended.
g.set_axis_labels('Original', 'Updated')
g.set_titles(col_template='{col_name}', row_template='{row_name}')

plt.tight_layout()
plt.show()


In [None]:
plt.savefig("figures/results_multistep_errors.pgf")

In [None]:
df = aggregate_results()
df_t = df[df["experiment_name"] == "fyp.CoT-DFS+OUT+SCH-[T]"]

In [None]:
# One row per individual prediction of the temperature experiment.
df_temps = df_t.explode("predictions")
# Temperature is derived from the position of a prediction within its task:
# (position % 6) * 0.2.
# NOTE(review): this assumes temperatures are interleaved across positions, whereas
# extract_single_temp assumes consecutive blocks per temperature -- confirm which
# layout the results file uses.
df_temps['temperature'] = df_temps.groupby(level=0).cumcount().apply(lambda i: round((i % 6) * 0.2, 1))
df_temps["correct"] = df_temps["predictions"].apply(lambda p: p["accuracy"] == 1.0)
df_temps['model_dataset'] = df_temps['model'].str.split("_").str[-1] + ' - ' + df_temps['dataset_src']
temp_results = df_temps.groupby(["model_dataset", "temperature"])["correct"].mean().reset_index()
# NOTE(review): a constant 0.1 is added to every accuracy value before plotting.
# Nothing in this notebook justifies the offset; it distorts the reported metric --
# confirm or remove.
temp_results["correct"] += 0.1
# temp_avg_results = df_temps.groupby(["model", "temperature"]).agg(pass1=('correct', np.mean)).reset_index()

In [None]:
temp_results

In [None]:
barplot(
    data=temp_results,
    y="correct",
    ylabel="pass@5",
    hue="temperature",
    huelabel="Temperature",
    palette="viridis"
)

In [None]:
temp_results

In [None]:
temp_results

In [None]:
abc = """
<scratchpad>
Step 1: Extract the relevant columns from the cars dataframe.
Step 2: Group the dataframe by 'Make' and 'Year' to get the unique models for each manufacturer and year.
Step 3: Use the groupby object to create a new dataframe with the manufacturers as the index and years as columns.
Step 4: Reset the index to create a multi-indexed dataframe with the manufacturers as the index and years as columns.
</scratchpad>

<python>
cars_model_year = cars.groupby(['Make', 'Year'])['Model'].nunique().unstack().reset_index()
cars_model_year.columns = ['Make', 'Year', 'Models']
print(cars_model_year)
"""

from src.prompt_utils import extract_code_from_response

a, b = extract_code_from_response(abc)

print(a)
print("*****")
print(b)

In [None]:
fname = "/root/FYP/experiments/fyp.CoT-FS+DFS+OUT+SCH/predictions.llama3_instruct_8b.json"


with open(fname) as f:
    ds = json.loads(f.read())

In [None]:
# with open(fname, "w") as f:
#     f.write(json.dumps(ds, indent=2))

In [None]:
# Re-extract the code from the stored raw responses and count how many of the
# first five stored predictions per turn already equal the re-extracted code.
# Side effects on `ds`: the "eval_results" key is removed from every turn and
# "predictions" is replaced by the re-extracted code.
x = 0
for n in ds:
    for t in n["turns"]:
        del t["eval_results"]
        # Hard-coded: exactly the first five predictions of each turn are compared.
        for i in range(5):
            # print(t["predictions"][i])
            # print("**********")
            # print(t["original"][i])
            # print("**********")
            print(t["predictions"][i])
            if extract_code_from_response(t["original"][i])[0] == t["predictions"][i]:
                x += 1
            # print(extract_code_from_response(t["original"][i])[0])
        t["predictions"] = [extract_code_from_response(o)[0] for o in t["original"]]
        

In [None]:
x