In [None]:
import os
os.chdir("/root/FYP")
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from src.utils import get_baseline_results
import seaborn as sns

In [None]:
EXPERIMENTS_ROOT = "experiments"


def aggregate_results(root=None):
    """Collect every ``results.json`` found below an experiments directory.

    Walks ``root`` recursively. For each sub-directory that directly contains a
    ``results.json`` file, the file is parsed as a JSON list of record dicts and
    every record is tagged with an ``experiment_name`` key holding that
    sub-directory's own name (not its full path).

    Args:
        root: Directory to scan. Defaults to ``EXPERIMENTS_ROOT``.

    Returns:
        pandas.DataFrame with one row per record across all result files;
        empty when no ``results.json`` is found.
    """
    if root is None:
        root = EXPERIMENTS_ROOT

    records = []
    for parent, dirnames, _ in os.walk(root):
        for dirname in dirnames:
            results_path = os.path.join(parent, dirname, "results.json")
            # isfile (not exists): a directory called results.json must not be opened.
            if not os.path.isfile(results_path):
                continue
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(results_path, "r", encoding="utf-8") as f:
                entries = json.load(f)
            records.extend({**entry, "experiment_name": dirname} for entry in entries)
    return pd.DataFrame(records)

# Raw experiment directory name -> label shown in tables and figures.
# The insertion order is reused further down as the display order.
experiment_name_map = dict([
    ("fyp.DFS+OUT+FS+SCH", "[+] Schema Information"),
    ("fyp.CoT-DFS+OUT+SCH", "CoT"),
    ("fyp.CoT+2S-DFS+OUT+SCH", "CoT + 2 Step"),
    ("fyp.CoT+2S+ERR-DFS+OUT+SCH", "[+] Errors"),
])

# One row per record found in the experiments' results files.
df = aggregate_results()

def extract_single_temp(r, t=0.6, step=0.2, n=5):
    """Keep only the predictions sampled at temperature ``t`` for the raw baseline.

    For a row whose ``experiment_name`` is ``"vanilla_raw_notebook"`` the
    ``predictions`` sequence is treated as consecutive chunks of ``n`` items,
    one chunk per temperature step, and the chunk at index ``t / step`` is
    returned. Every other row's predictions are returned unchanged.

    Args:
        r: Row (mapping or pandas Series) with ``experiment_name`` and
            ``predictions`` entries.
        t: Temperature to keep.
        step: Spacing between consecutive temperatures.
            NOTE(review): presumably chunk 0 is temperature 0.0 - confirm
            against the run that produced the file.
        n: Number of predictions stored per temperature.

    Returns:
        The selected slice for the raw baseline, otherwise ``r["predictions"]``.
    """
    if r["experiment_name"] != "vanilla_raw_notebook":
        return r["predictions"]
    # 0.6 / 0.2 evaluates to 2.9999999999999996 in binary floating point, so
    # int() would truncate to chunk 2; round() yields the chunk t / step denotes.
    i = round(t / step)
    return r["predictions"][i * n:(i + 1) * n]

# Reduce each row's predictions to a single temperature where applicable.
df["predictions"] = df.apply(extract_single_temp, axis=1)

# Keep only the experiments that have a display label. The keys of
# experiment_name_map are the single source of truth, so the list of names and
# the mapping cannot drift apart.
experiments = list(experiment_name_map)
# .copy() detaches the filtered rows so the column assignments below do not
# write into a slice of the unfiltered frame (SettingWithCopyWarning).
df = df[df["experiment_name"].isin(experiments)].copy()

dataset_src_labels = {"existing_tasks": "Existing Tasks", "new_tasks": "New Tasks"}
# Indexing the dict (rather than Series.map) keeps the KeyError on an unknown source.
df["dataset_src"] = df["dataset_src"].apply(lambda src: dataset_src_labels[src])

# Relabel the experiments and make the column an ordered Categorical whose
# order follows the mapping, so sorting yields the intended presentation order.
df["experiment_name"] = pd.Categorical(
    df["experiment_name"].map(experiment_name_map),
    categories=list(experiment_name_map.values()),
    ordered=True,
)
df = df.sort_values("experiment_name")

In [None]:
# import json
# import pandas as pd

# fname = "/root/FYP/experiments/fyp.CoT-FS+DFS+OUT+SCH/results.json"

# with open(fname, "r") as f:
#     df = pd.DataFrame(json.loads(f.read()))

# One row per individual prediction; a prediction is flagged correct only when
# its "accuracy" entry equals 1.0.
df_preds = df.explode("predictions").assign(
    correct=lambda exploded: exploded["predictions"].map(lambda pred: pred["accuracy"] == 1.0)
)

In [None]:
import evaluate
from codebleu import calc_codebleu
from pprint import pprint

# Load the four text-similarity metrics once; generate_metrics reuses them.
bleu, meteor, rouge, chrf = (
    evaluate.load(metric_name) for metric_name in ("bleu", "meteor", "rouge", "chrf")
)

def generate_metrics(predictions, references):
    """Score predictions against references with several text-similarity metrics.

    Args:
        predictions: Sequence of predicted strings.
        references: Sequence of reference strings, aligned with ``predictions``.

    Returns:
        dict holding ``"bleu"`` and ``"chrf"`` plus every key returned by the
        METEOR, ROUGE and CodeBLEU computations.
    """
    # Callers pass numpy arrays (``Series.values``); hand plain lists to the
    # metric implementations.
    predictions = list(predictions)
    references = list(references)

    results = {}
    results["bleu"] = bleu.compute(predictions=predictions, references=references)["bleu"]
    results |= meteor.compute(predictions=predictions, references=references)
    results |= rouge.compute(predictions=predictions, references=references)
    results["chrf"] = chrf.compute(predictions=predictions, references=references)["score"]
    # calc_codebleu's positional order is (references, predictions, ...); pass
    # both by keyword so hypothesis and reference cannot be swapped.
    results |= calc_codebleu(references=references, predictions=predictions, lang="python")
    return results

# Smoke test: score only the first ten exploded predictions.
predictions = df_preds["predictions"].map(lambda pred: pred["code"]).values[:10]
references = df_preds["reference"].values[:10]
pprint(generate_metrics(predictions=predictions, references=references))

In [None]:
import pandas as pd

In [None]:
def calc_metrics(df):
    """Compute the text-similarity metrics for one group of exploded predictions.

    Args:
        df: Frame with a ``reference`` column and a ``predictions`` column whose
            entries can be indexed with ``"code"``.

    Returns:
        pandas.Series built from the dict returned by ``generate_metrics``.
    """
    generated = df["predictions"].map(lambda pred: pred["code"]).values
    expected = df["reference"].values
    return pd.Series(generate_metrics(generated, expected))


# One row of metrics per (experiment, model, dataset, correctness) group.
# observed=True: experiment_name is a Categorical, and this restricts the result
# to key combinations that actually occur in the data. It also avoids relying
# on the implicit default, which newer pandas versions warn about.
df_metrics = (
    df_preds
    .groupby(["experiment_name", "model", "dataset_src", "correct"], observed=True)
    .apply(calc_metrics)
    .reset_index()
)
df_metrics

In [None]:
import importlib

# `import matplotlib.pyplot as plt` binds only `plt`, not `matplotlib`; import
# the package here so the reload does not raise NameError on a fresh kernel.
import matplotlib

importlib.reload(matplotlib)

In [None]:
import matplotlib

# Switch to the PGF backend and apply the rc settings used for the exported figures.
PGF_RC_SETTINGS = {
    "pgf.texsystem": "pdflatex",
    "font.family": "serif",
    "text.usetex": True,
    "pgf.rcfonts": False,
}
matplotlib.use("pgf")
matplotlib.rcParams.update(PGF_RC_SETTINGS)

In [None]:
# Show the metrics table without the columns whose name ends in "match_score".
df_metrics.loc[:, [not col.endswith("match_score") for col in df_metrics.columns]]

In [None]:
# NOTE(review): on a fresh kernel this cell raises NameError, because
# `melted_df` is first assigned in a later cell (via pd.melt). That melt keeps
# only `model`, `dataset_src`, `correct`, `metric` and `score`, so there is no
# `experiment_name` column to convert either. Looks like a leftover from
# out-of-order execution; confirm, then move or delete.

# Raw experiment name -> display label (same mapping as in the loading cell).
experiment_name_map = {
    "fyp.DFS+OUT+FS+SCH": "[+] Schema Information",
    "fyp.CoT-DFS+OUT+SCH": "CoT",
    "fyp.CoT+2S-DFS+OUT+SCH": "CoT + 2 Step",
    "fyp.CoT+2S+ERR-DFS+OUT+SCH": "[+] Errors",
}
# Make experiment_name an ordered Categorical in mapping order, then sort by it.
melted_df['experiment_name'] = pd.Categorical(melted_df['experiment_name'], categories=experiment_name_map.values(), ordered=True)
melted_df = melted_df.sort_values('experiment_name')

In [None]:
# Sanity check: dtype of the experiment_name column.
df.dtypes["experiment_name"]

In [None]:
# Metrics for the "CoT" experiment only. Selecting rows and columns in a single
# .loc call and copying the result means the column added below is written to
# an independent frame, not to a slice of df_metrics (SettingWithCopyWarning).
df_res = df_metrics.loc[
    df_metrics["experiment_name"] == "CoT",
    ["experiment_name", "model", "dataset_src", "correct", "bleu", "meteor", "chrf", "rougeL"],
].copy()
# Facet label: last "_"-separated token of the model name plus the dataset label.
df_res["model_dataset"] = df_res["model"].str.split("_").str[-1] + " - " + df_res["dataset_src"]

In [None]:
# Rescale chrF by 1/100. The unscaled value is read from df_metrics (df_res
# keeps df_metrics' row labels), which makes this cell idempotent: re-running
# it no longer divides an already-scaled column a second time.
df_res["chrf"] = df_metrics.loc[df_res.index, "chrf"] / 100

In [None]:
# NOTE(review): `melted_df` is only created in a later cell, and that melt does
# not carry an `experiment_name` column, so this lookup fails (NameError on a
# fresh kernel). Likely a leftover debugging cell; confirm and remove.
melted_df["experiment_name"].dtype

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Input: `df_res`, the per-group metrics table built in the cells above
# (columns used here: model, dataset_src, correct, bleu, meteor, chrf, rougeL).

# Long format: one row per (model, dataset_src, correct, metric) with its score.
melted_df = pd.melt(
    df_res,
    id_vars=["model", "dataset_src", "correct"],
    value_vars=["bleu", "meteor", "chrf", "rougeL"],
    var_name="metric",
    value_name="score",
)

# Collapse the raw model identifiers into the two display names.
melted_df["model"] = melted_df["model"].apply(lambda m: "Llama 3 70B" if "70B" in m else "Llama 3 8B")

# "whitegrid" style with the grid lines switched off.
sns.set_style("whitegrid", {"axes.grid": False})

# catplot creates its own figure, so no plt.figure() call precedes it (that
# would only leave an empty extra figure behind). The previously unused
# palette_70B / palette_8B variables were dropped.
g = sns.catplot(
    x="metric", y="score", hue="model", col="dataset_src",
    data=melted_df, kind="bar", height=4, aspect=1.2,
    palette=sns.color_palette("Greens_d", 2),
    legend_out=False, sharex=False,
)

g.set_axis_labels("Results", "Score")
g.set_titles("{col_name}")
# Replace the automatic legend with one anchored to the upper-right corner of
# whichever axes pyplot considers current.
g.legend.remove()
plt.legend(title="Model", loc="upper right", bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Save through the grid's own figure instead of pyplot's "current figure",
# which may no longer be the plot once the plotting cell has finished (for
# example with the inline backend).
os.makedirs("figures", exist_ok=True)  # savefig does not create missing directories
g.figure.savefig("figures/results_nlp_metrics.pgf")

In [None]:
# Per-facet correlation matrices. The correlation is computed inline because
# the `compute_corr` helper is only defined in a later cell, so calling it here
# raised NameError on a fresh kernel. Non-numeric columns are dropped first,
# since pandas >= 2.0 raises on them instead of ignoring them.
grouped = df_res.groupby('model_dataset').apply(
    lambda group: group.select_dtypes(include=["number", "bool"]).corr()
)
grouped

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Assuming df_res is your dataframe
# df_res = df_metrics[["model_dataset", "correct", "bleu", "meteor", "chrf", "rougeL"]]

# Correlation-matrix helper, applied per group further down.
def compute_corr(data):
    """Return the pairwise correlation matrix of the numeric columns of ``data``.

    Non-numeric columns (strings, categoricals) are dropped explicitly before
    calling ``DataFrame.corr``, because pandas >= 2.0 raises on them instead of
    silently ignoring them. Boolean columns are kept.

    Args:
        data: pandas.DataFrame, possibly with non-numeric columns.

    Returns:
        pandas.DataFrame indexed and labelled by the retained column names.
    """
    return data.select_dtypes(include=["number", "bool"]).corr()

# Stack the per-group correlation matrices into long format: one row per
# (model_dataset, row label, column label) with its correlation. reset_index
# exposes the unnamed inner index level as the column 'level_1'.
grouped = (
    df_res
    .groupby('model_dataset')
    .apply(compute_corr)
    .reset_index()
    .melt(id_vars=['model_dataset', 'level_1'], var_name='level_2', value_name='correlation')
)

# One facet per model_dataset value, two facets per row.
g = sns.FacetGrid(
    data=grouped,
    col='model_dataset',
    col_wrap=2,
    height=4,
    aspect=1,
)

# Heatmap drawer used for each facet.
def plot_heatmap(data, **kwargs):
    """Draw one facet's correlation matrix as an annotated heatmap.

    Args:
        data: Long-format frame with ``level_1``, ``level_2`` and
            ``correlation`` columns.
        **kwargs: Forwarded unchanged to ``seaborn.heatmap``.
    """
    # Keyword arguments: pandas >= 2.0 no longer accepts pivot's arguments
    # positionally. (The previously computed upper-triangle mask was never
    # passed to heatmap, so it is no longer built.)
    matrix = data.pivot(index='level_1', columns='level_2', values='correlation')
    sns.heatmap(
        matrix,
        annot=True,
        cmap='coolwarm',
        annot_kws={"size": 8},
        fmt='.2f',
        cbar=False,
        **kwargs
    )

# Draw every facet's heatmap, title each facet with its model_dataset value and
# blank out the axis labels.
g.map_dataframe(plot_heatmap).set_titles('{col_name}').set_axis_labels('', '')

# Tighten the layout, then render.
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation between the text metrics and the correctness flag across all groups.
df_corr = df_metrics[["bleu", "meteor", "chrf", "rougeL", "correct"]].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df_corr, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# "recall" entry of every individual prediction, as a flat array.
df_preds["predictions"].map(lambda pred: pred["recall"]).values