## Evaluate Taxonomy Completion

In [None]:
import sys
from collections import defaultdict
from copy import deepcopy

# Extend the import path *before* importing the project package: appending
# ".." after the imports below (as was done previously) cannot influence
# whether they resolve.
sys.path.append("..")

from llm_food_taxonomy.evaluation.metric import ScoreAccumulator  # noqa: E402
from llm_food_taxonomy.evaluation.supervised.parent_metric import ParentMetric  # noqa: E402

In [None]:
from llm_food_taxonomy.evaluation import PositionMetric, WuPSimilarity
from llm_food_taxonomy.data.loader import load_taxonomy, load_completion
from tqdm.auto import tqdm
import pandas as pd
from pathlib import Path
import numpy as np
tqdm.pandas()

In [None]:
# Notebook-wide pandas display settings: no limit on the number of columns,
# no wrapping of wide frames, no truncation of cell text, and floats printed
# with thousands separators and four decimals.
_display_options = {
    'display.max_columns': None,
    'display.expand_frame_repr': False,
    'max_colwidth': None,
    'display.float_format': '{:,.4f}'.format,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [None]:
# Run configuration for the whole notebook.
trial = False  # when True, only the first 20 query nodes are evaluated
mode = "test"  # value of `terms.split` that selects the query nodes
dataset = "bettybossi"  # used to build both paths below
data_path = Path(f"../data/{dataset}")  # passed to `load_taxonomy`
results_path = Path(f"../output/{dataset}")  # scanned for per-model output directories

In [None]:
# Load the term table and the edge table of the taxonomy.
# presumably `with_split=True` is what provides the `split` column used later — confirm in the loader.
terms, taxo = load_taxonomy(str(data_path), with_split=True)

# Lookup from node id to its human-readable name.
_term_records = terms.to_dict(orient="records")
id_to_name = {}
for _record in _term_records:
    id_to_name[_record["node_id"]] = _record["node_name"]

In [None]:
# Preview the first rows of the term table.
terms.head()

In [None]:
# Preview the first rows of the edge table.
taxo.head()

In [None]:
# Ids (as strings) of every node in the training split; used to carve out the
# seed taxonomy.
train_nodes = terms.loc[terms.split == "train", "node_id"].apply(str)

# Ids of the nodes in the evaluated split; a trial run keeps only the first 20.
_query_ids = terms.loc[terms.split == mode, "node_id"]
if trial:
    _query_ids = _query_ids.iloc[:20]

# The evaluation below works on node *names*, so translate the ids.
nodes_to_add = [id_to_name[_node_id] for _node_id in _query_ids.values.tolist()]

In [None]:
# Keep only the edges whose two endpoints both belong to the training split.
# The ids are put into a set once so each membership test is a hash lookup
# instead of a linear scan over `train_nodes.values` (done twice per row
# before).
train_node_ids = set(train_nodes.values)
seed_taxonomy_df = taxo[
    taxo.apply(lambda r: (str(r.hypernym) in train_node_ids) and (str(r.hyponym) in train_node_ids), axis=1)]
seed_taxonomy_df.head()

In [None]:
# Seed taxonomy as a plain list of tuples, one tuple per row of
# `seed_taxonomy_df`, which is the form handed to the metrics.
seed_taxonomy = list(map(tuple, seed_taxonomy_df.values.tolist()))

In [None]:
# Seed edges whose first element is "food".
# Iterate the list of edge tuples: iterating the DataFrame itself yields its
# column labels, so the previous filter never looked at any edge.
# NOTE(review): assumes the first tuple element is the hypernym — confirm the
# column order of `taxo`.
[t for t in seed_taxonomy if t[0] == "food"]

In [None]:
# Number of query nodes that will be evaluated.
len(nodes_to_add)

In [None]:
# Run selection for the loading cell below.
# `filter_words`: a run is skipped when its name contains any of these
# substrings. `valid_words`: only consulted when `filter_words` is empty; a run
# is then kept when its name contains any of these substrings. With both lists
# empty every run is loaded.
# filter_words = []
filter_words = ["zero"]
valid_words = []
# valid_words = ["few_basic"] #["Llama-3"]

In [None]:
import traceback
from pathlib import Path

# Maps a run name (the name of its output directory) to the pair
# (pred_terms, pred_triplets) returned by `load_completion`.
models_outs = {}


def _should_load(name):
    """Return True when the run called *name* passes the word filters.

    A non-empty `filter_words` acts as a deny-list and `valid_words` is
    ignored; otherwise a non-empty `valid_words` acts as an allow-list; with
    both empty everything is accepted.
    """
    if filter_words:
        return not any(f in name for f in filter_words)
    if valid_words:
        return any(w in name for w in valid_words)
    return True


# `is_dir` has to be *called*: the bare bound method is always truthy, which
# let plain files through to `load_completion`.
outputs = [d for d in results_path.iterdir() if d.is_dir()]

for o in outputs:
    # The run is identified by its directory name. (The earlier attempt to
    # shorten it with `o.split('_')` always raised, because `Path` has no
    # `split`, and silently fell back to exactly this value.)
    model_name = o.name
    if not _should_load(model_name):
        continue
    try:
        pred_terms, pred_triplets = load_completion(o, with_reasoning=True)
    except Exception as e:
        # Best effort: report the broken run and carry on with the others.
        print(f"Error loading {o}: {e}")
        traceback.print_exc()
    else:
        models_outs[model_name] = pred_terms, pred_triplets
        print(f"Loading {o}")

In [None]:
# Inspect 20 randomly sampled rows of the term table.
terms.sample(20)

In [None]:
# Names of the runs that were loaded.
models_outs.keys()

In [None]:
# Names of the query nodes.
nodes_to_add

In [None]:
# Spot check: rows of the term table whose name is exactly "cling".
terms[terms.node_name == "cling"]

In [None]:
# models_outs['few_basic_Meta-Llama-3-70B-Instruct_2024-08-30_04-35-16'][1]

In [None]:
# One summary line per loaded run: how many predicted terms and triplets it has.
for model, _run_outputs in models_outs.items():
    pred_terms, pred_triplets = _run_outputs
    print(f"Model: {model}, Pred Terms: {len(pred_terms)}, Pred Triplets: {len(pred_triplets)}")

In [None]:
# Evaluate every loaded run with every metric and collect
#   * `res` / `res_df`: one table row per run with the aggregate scores,
#   * `preds`: the predicted positions per run (keyed by the raw run name),
#   * `populations`: the per-metric "scores" entries popped from each result,
#     keyed by display name -> metric class name -> "all"/"nonleaf"/"leaf".
metrics = [WuPSimilarity(), PositionMetric()]
res = []
# Display names for the baseline runs; any other run keeps its own name.
pretty_names = {"tacoprompt": "TacoPrompt",
                "tmn": "TMN",
                "arborist": "Arborist",
                "temp": "TEMP",
                "qen": "QEN",
                "taxoexpan": "TaxoExpan"}
# The empty-string entries are spacer columns between the column groups.
# NOTE(review): the column order assumes the metrics return WPS first, then
# F1, P, R — confirm against the metric implementations.
metric_cols = ["WPS", "F1", "P", "R", ""]
nonleaf_cols = [f"NL-{n}" if n != "" else "" for n in metric_cols]
leaf_cols = [f"L-{n}" if n != "" else "" for n in metric_cols][:-1]
cols = ["", "Model"] + metric_cols + nonleaf_cols + leaf_cols
preds = {}
truths = {}
populations = {}

# Everything below depends neither on the run nor on the metric, so it is
# computed once instead of once per (run, metric) pair.
query_names = set(nodes_to_add)  # O(1) membership instead of a list scan
truth = {
    name: positions
    for name, positions in zip(terms["node_name"], terms["positions"])
    if name in query_names
}
missing_queries = query_names - set(truth.keys())
assert len(missing_queries) == 0, missing_queries
leaf_names = terms[terms.leaf].node_name.values.tolist()

with tqdm(total=len(models_outs) * len(metrics), desc="Evaluating...") as pbar:
    for model, (pred_terms, pred_triplets) in models_outs.items():
        pretty_model = pretty_names.get(model, model)
        scores = []
        nonleaf_scores = []
        leaf_scores = []
        populations[pretty_model] = {}

        # Predictions restricted to the query nodes; identical for every
        # metric, hence built once per run. A later row for the same query
        # node overrides an earlier one.
        pred = {
            query: positions
            for query, positions in zip(pred_triplets["query_node"], pred_triplets["predicted_positions"])
            if query in query_names
        }
        preds[model] = pred

        for m in metrics:
            # Every argument is a fresh copy so a metric cannot leak changes
            # into the next call.
            s, nleaf_s, leaf_s = m.calculate(
                pred_positions=deepcopy(pred),
                true_positions=deepcopy(truth),
                node2name=deepcopy(id_to_name),
                seed_taxonomy=deepcopy(seed_taxonomy),
                leaves=list(leaf_names),
                verbose=True
            )
            metric_key = type(m).__name__
            # The "scores" entry is removed from each result and kept for the
            # hypothesis tests; whatever remains goes into the table row.
            populations[pretty_model][metric_key] = {}
            populations[pretty_model][metric_key]["all"] = s.pop("scores")
            populations[pretty_model][metric_key]["nonleaf"] = nleaf_s.pop("scores")
            populations[pretty_model][metric_key]["leaf"] = leaf_s.pop("scores")
            scores.extend(s.values())
            nonleaf_scores.extend(nleaf_s.values())
            leaf_scores.extend(leaf_s.values())
            pbar.update(1)
        row = ["", pretty_model] + scores + [""] + nonleaf_scores + [""] + leaf_scores
        res.append(row)

res_df = pd.DataFrame(res, columns=cols).sort_values(by="Model", ascending=True)

In [None]:
# Render the results table (`display` is presumably the one supplied by the
# IPython/Jupyter session; it is not imported in this notebook).
display(res_df)

## Hypothesis testing 

In [None]:
from scipy.stats import permutation_test

In [None]:
def test_metric(x, y, axis=-1, scorer=lambda acc: acc.f1(nan=True)):
    """Absolute difference between an aggregate score of two samples.

    Shaped to be used as the ``statistic`` of
    ``scipy.stats.permutation_test``.

    Args:
        x: Array of counts. An input with at most two dimensions receives a
            leading batch axis of length one; afterwards axis 1 must hold
            exactly three entries, which are unpacked as (tp, fp, fn).
        y: Second sample, laid out like ``x``.
        axis: Axis that is summed to pool the counts, evaluated after axes 0
            and 1 have been swapped.
        scorer: Maps a ``ScoreAccumulator`` carrying the pooled counts to a
            number; F1 by default.

    Returns:
        ``|score(x) - score(y)|`` for every batch entry, with length-one axes
        squeezed away.
    """
    if len(x.shape) <= 2:
        # No batch axis present: add one so both cases share the code below.
        x = np.expand_dims(x, axis=0)
        y = np.expand_dims(y, axis=0)

    def _pooled_scores(sample):
        # Bring the (tp, fp, fn) axis to the front, then pool the counts.
        tp_all, fp_all, fn_all = np.sum(sample.swapaxes(0, 1), axis=axis)
        results = []
        for tp, fp, fn in zip(tp_all, fp_all, fn_all):
            acc = ScoreAccumulator()
            acc.tp += tp
            acc.fp += fp
            acc.fn += fn
            results.append(scorer(acc))
        return np.array(results)

    x_stat = _pooled_scores(x)
    y_stat = _pooled_scores(y)
    return np.squeeze(np.abs(x_stat - y_stat))

In [None]:
from functools import partial

# Paired permutation tests between every ordered pair of evaluated runs, for
# every score and node type. Each result row is
# [model1, model2, score column name, p-value].
p_values = []
type_prefixes = {"all": "", "nonleaf": "NL-", "leaf": "L-"}
# Test statistic per score name.
# NOTE(review): the F1/P/R statistics are absolute differences while
# `permutation_test` is left at its default two-sided alternative — check
# whether `alternative="greater"` is the intended pairing, otherwise the
# reported p-values may be overly conservative.
metric_map = {
    "F1": partial(test_metric, scorer=lambda acc: acc.f1(nan=True)),
    "P": partial(test_metric, scorer=lambda acc: acc.precision(nan=True)),
    "R": partial(test_metric, scorer=lambda acc: acc.recall(nan=True)),
    "WPS": lambda x, y: np.mean(x) - np.mean(y)
}
# Scores tested for each metric class name stored in `populations`.
_scores_per_metric = {
    "WuPSimilarity": ["WPS"],
    "PositionMetric": ["F1", "P", "R"],
}
for model1 in tqdm(populations.keys()):
    for model2 in populations.keys():
        if model1 == model2:
            continue
        for metric_name, score_names in _scores_per_metric.items():
            for node_type, _prefix in type_prefixes.items():
                _paired_samples = (
                    populations[model1][metric_name][node_type],
                    populations[model2][metric_name][node_type],
                )
                for score_name in score_names:
                    _test_result = permutation_test(
                        _paired_samples,
                        statistic=metric_map[score_name],
                        permutation_type="samples",
                        random_state=123,
                        n_resamples=1000,
                    )
                    p_values.append([model1, model2, f"{_prefix}{score_name}", _test_result.pvalue])

stat_tests = pd.DataFrame(p_values, columns=["Model1", "Model2", "Score Name", "P-Value"])
stat_tests.head()

In [None]:
# Spot check: p-value of the overall F1 difference between one few-shot run
# and the TEMP baseline.
# `stat_tests` stores display names, and `pretty_names` maps the "temp" run to
# "TEMP", so the comparison has to use "TEMP"; the literal "temp" can never
# match.
# NOTE(review): the Model1 literal must equal the loaded run's directory name
# exactly (including case) — confirm against `models_outs.keys()`.
stat_tests[(stat_tests.Model1 == "few_basic_meta-llama-3-70b-instruct_2024-06-12_16-38-57") 
                                    & (stat_tests.Model2 == "TEMP") 
                                    & (stat_tests["Score Name"] == "F1")]

In [None]:
# Spot check: p-value of the non-leaf F1 difference between TacoPrompt and one
# zero-shot run.
# NOTE(review): run names containing "zero" are excluded while `filter_words`
# is ["zero"], in which case this selection is empty — confirm the filter
# setting before relying on it.
stat_tests[(stat_tests.Model1 == "TacoPrompt") 
                                    & (stat_tests.Model2 == "zero_meta-llama-3-70b-instruct_2024-06-11_13-42-43") 
                                    & (stat_tests["Score Name"] == "NL-F1")]

In [None]:
# Print the table of p-values as LaTeX, without the index and with every
# float64 column formatted to four decimals.
p_num_cols = [
    col_name
    for col_name, col_dtype in stat_tests.dtypes.items()
    if str(col_dtype) == "float64"
]
pval_sdf = stat_tests.style.format('{:.4f}', subset=p_num_cols)
_pval_latex = pval_sdf.hide(axis="index").to_latex()
print(_pval_latex)

In [None]:
# Print the results table as LaTeX. Rows whose model name contains "gpt" are
# left out and every float64 column is formatted to four decimals. Per column:
#   * with `significance_test`: the best value is bold, and every run whose
#     p-value against the best run exceeds `alpha` is underlined;
#   * otherwise: the best value is bold and the runner-up is underlined.
alpha = 0.05
significance_test = True
df = res_df[res_df.Model.apply(lambda x: "gpt" not in x)]
cols = [c for c in df.columns if str(df.dtypes.loc[c]) == "float64"]
df_s = df.style.format('{:.4f}', subset=cols)


def _bold(x):
    return "\\textbf{" + f'{x:.4f}' + "}"


def _underline(x):
    return "\\underline{" + f'{x:.4f}' + "}"


for c in cols:
    if not significance_test:
        # Plain ranking: two largest values of the column.
        row1, row2 = df.index.values[df[c].argsort()[::-1]][:2]
        df_s = df_s.format(_bold, subset=(row1, c))
        df_s = df_s.format(_underline, subset=(row2, c))
        continue

    row1 = df[c].idxmax()
    best_row = df.loc[row1]
    best_rows = [row1]
    for other_idx, other_row in df.iterrows():
        if other_row.Model == best_row.Model:
            continue
        _pair_mask = ((stat_tests.Model1 == str(best_row.Model))
                      & (stat_tests.Model2 == str(other_row.Model))
                      & (stat_tests["Score Name"] == c))
        p_vals = stat_tests[_pair_mask]
        assert len(p_vals) == 1, f"Found {len(p_vals)} p-values for {c} with models: {best_row.Model.lower()} and {other_row.Model.lower()}"
        if p_vals.iloc[0]["P-Value"] > alpha:
            best_rows.append(other_idx)
    # Underline first, then bold: the later format replaces the earlier one on
    # the best cell.
    df_s = df_s.format(_underline, subset=(best_rows, c))
    df_s = df_s.format(_bold, subset=(row1, c))

print(df_s.hide(axis="index").to_latex())

## Inspect predictions

In [None]:
# Runs to inspect by hand; the value is used as a key into `preds` below, so
# it has to be the raw run name (the output directory name).
model1 = 'few_basic_Meta-Llama-3-70B-Instruct_2024-08-30_04-35-16'
model2 = 'none_Meta-Llama-3-70B-Instruct_2024-08-28_16-52-16'

In [None]:
# Gold positions of every query node, keyed by node name. Reads the two
# columns directly and tests membership against a set, instead of building a
# Series per row and scanning the `nodes_to_add` list each time.
_query_name_set = set(nodes_to_add)
truth = {
    name: positions
    for name, positions in zip(terms["node_name"], terms["positions"])
    if name in _query_name_set
}

In [None]:
import numpy as np


def get_score(all_pos, pred_pos, true_pos):
    """Precision, recall and F1 of one query's predicted positions.

    Args:
        all_pos: Positions to classify; every one must occur in `true_pos`
            or in `pred_pos`.
        pred_pos: Predicted positions.
        true_pos: Gold positions.

    Returns:
        Tuple ``(precision, recall, f1)``; a ratio whose denominator is zero
        is reported as 0.

    Raises:
        ValueError: If a position is in neither `true_pos` nor `pred_pos`.
    """
    tp, fp, fn = 0, 0, 0
    for pos in all_pos:
        if pos in true_pos and pos in pred_pos:
            tp += 1
        elif pos in true_pos and pos not in pred_pos:
            fn += 1
        elif pos not in true_pos and pos in pred_pos:
            fp += 1
        else:
            raise ValueError("Something went wrong!")
    p = tp / (tp + fp) if tp + fp > 0 else 0
    r = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0
    return p, r, f1


# Per-query precision/recall/F1 of the inspected run.
model = model1
scores = defaultdict(list)

for q, true_pos in truth.items():
    true_pos = set(true_pos)
    # A query the run did not answer counts as an empty prediction.
    pred_pos = set(preds[model].get(q, []))
    all_pos = true_pos.union(pred_pos)

    p, r, f1 = get_score(all_pos, pred_pos, true_pos)
    scores["node_name"].append(q)
    scores["p"].append(p)
    scores["r"].append(r)
    scores["f1"].append(f1)

# One row per query node. The assignment used to be left unfinished
# (`error_df = `), a syntax error that kept the whole cell from running.
error_df = pd.DataFrame(scores)

In [None]:
# Predicted positions of the inspected run, keyed by query node name.
preds[model]

In [None]:
# Gold positions, keyed by query node name.
truth

# Error Analysis
TODO: Majid