# Statistics for benchmark taxonomies

In [None]:
import glob
from collections import defaultdict
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd

from llm_food_taxonomy.data.loader import load_taxonomy
from llm_food_taxonomy.graph.taxonomy import Taxonomy


In [None]:
# Notebook-wide pandas display options. Floats are rendered with a thousands
# separator and two decimals; the other three options are set to None / False.
_display_options = {
    'display.max_columns': None,
    'display.expand_frame_repr': False,
    'max_colwidth': None,
    'display.float_format': '{:,.2f}'.format,
}
for _option, _value in _display_options.items():
    pd.set_option(_option, _value)

In [None]:
def load_splits(taxonomy_dir: str):
    """Read the train / validation / test term files of a taxonomy directory.

    For each of the patterns ``*.terms.train``, ``*.terms.validation`` and
    ``*.terms.test`` the first glob match inside ``taxonomy_dir`` is opened
    and read with ``readlines()``.

    Returns:
        A 3-tuple ``(train_lines, validation_lines, test_lines)``; every item
        is a list of raw lines (line endings are kept).

    Raises:
        IndexError: if one of the three patterns matches no file.
    """
    base = Path(taxonomy_dir)
    patterns = ("*.terms.train", "*.terms.validation", "*.terms.test")
    # Resolve all three paths up front so a missing split fails before any
    # file is opened.
    split_files = [glob.glob(str(base / pattern))[0] for pattern in patterns]

    def _read_lines(path):
        with open(path, "r") as handle:
            return handle.readlines()

    return tuple(_read_lines(path) for path in split_files)

In [None]:
def create_ancestries(taxo):
    """Enumerate every root-to-node path of a taxonomy.

    Args:
        taxo: DataFrame with ``hypernym`` and ``hyponym`` columns; each row
            becomes one directed edge hypernym -> hyponym.

    Returns:
        A list of paths. Each path is a list of nodes that starts at a root
        (a node without incoming edges) and ends at the node it describes.
        A node reachable along several routes appears once per route.

    Note:
        Self-loops are removed, but longer cycles are not detected; a cycle
        reachable from a root ends in ``RecursionError``.
    """
    edge_list = [tuple(edge) for edge in taxo[["hypernym", "hyponym"]].values]
    g = nx.DiGraph(edge_list)
    g.remove_edges_from(nx.selfloop_edges(g))

    roots = [n for n in g.nodes() if g.in_degree(n) == 0]

    def _get_ancestries(node, ancestors=()):
        # `ancestors` holds the nodes strictly above `node`; the path handed
        # to the children is therefore the full path down to `node`.
        path = ancestors + (node,)
        yield list(path)
        for _, child in g.out_edges(node):
            yield from _get_ancestries(child, path)

    paths = []
    for r in roots:
        paths.extend(_get_ancestries(r))

    return paths

In [None]:
def branching_factor(taxo, node2name):
    """Mean number of children over the entries that have at least one child.

    Builds a ``Taxonomy`` from ``taxo`` and ``node2name``, calls its
    ``connect()`` method and averages the child counts obtained from
    ``children()``; entries without children are left out of the average.
    """
    taxonomy = Taxonomy(taxo, node2name)
    taxonomy.connect()
    child_counts = (len(children) for _, children in taxonomy.children())
    return np.mean([count for count in child_counts if count > 0])


In [None]:
# Load the taxonomy stored under ../data/mesh; load_taxonomy returns a pair
# that is bound to `terms` and `taxo`. Then preview the first rows of `terms`.
terms, taxo = load_taxonomy("../data/mesh")
terms.head()

In [None]:
# Preview the first rows of `taxo`, the second value returned by load_taxonomy.
taxo.head()

In [None]:
# Attach the node name of both edge endpoints to every row of `taxo`.
# The node_id -> node_name lookup is built once here instead of re-indexing
# `terms` for every single row; an unknown id still raises KeyError.
_node_names = terms.set_index("node_id").node_name
taxo["hypernym_name"] = taxo.hypernym.apply(lambda x: _node_names.loc[x])
taxo["hyponym_name"] = taxo.hyponym.apply(lambda x: _node_names.loc[x])

In [None]:
# Preview `taxo` again, now including the hypernym_name / hyponym_name columns.
taxo.head()

In [None]:
# Number of distinct node ids that also occur in the hypernym column,
# divided by the number of rows in `terms`.
np.intersect1d(terms.node_id, taxo.hypernym).shape[0] / terms.shape[0]

In [None]:
# Number of distinct node ids that also occur in the hyponym column,
# divided by the number of rows in `terms`.
np.intersect1d(terms.node_id, taxo.hyponym).shape[0] / terms.shape[0]

In [None]:
# Show the edge row(s) in which "proteins" is the hyponym.
taxo.set_index("hyponym").loc["proteins"]

In [None]:
# Directed graph with one edge hypernym -> hyponym per row of `taxo`.
e = list(map(tuple, taxo[["hypernym", "hyponym"]].values))
g = nx.DiGraph(e)

In [None]:
# Nodes without incoming edges are the root candidates; keep the first one
# and display it.
root_cand = [n for n, in_deg in g.in_degree() if in_deg == 0]
root = root_cand[0]
root

In [None]:
# List the edges that leave the selected root node.
list(g.out_edges(root))

## Load all benchmark taxonomies

In [None]:
# Load every benchmark taxonomy, keyed by its directory name.
#   taxos[name]  -> (terms, taxo) as returned by load_taxonomy
#   splits[name] -> result of load_splits, only set when that call succeeds
taxos = {}
splits = {}
taxo_paths = [
    "../data/semeval_food",
    "../data/semeval_verb",
    "../data/mesh",
    "../data/wikitax",
    "../data/bettybossi",
    "../data/unsupervised_recipe1m",
    "../data/unsupervised_mesh",
    "../data/unsupervised_semeval_food",
    "../data/unsupervised_nonli",
    "../data/unsupervised_noback",
    "../data/unsupervised_nogen",
]

for tp in taxo_paths:
    ds_key = Path(tp).name
    try:
        try:
            terms, taxo = load_taxonomy(tp)
            splits[ds_key] = load_splits(tp)
        except IndexError:
            # Either call above raised IndexError (load_splits does so when a
            # split file is missing): reload the taxonomy without splits.
            terms, taxo = load_taxonomy(tp, with_split=False)
        taxos[ds_key] = terms, taxo
    except IndexError as ie:
        # The fallback failed as well: report which directory and abort.
        print(f"{tp}: {ie}")
        raise ie

In [None]:
!ls ../data/semeval_verb/*.terms.train

In [None]:
# Edge row(s) of the "mesh" entry of `taxos` in which "proteins" is the hyponym.
taxos["mesh"][1].set_index("hyponym").loc["proteins"]

In [None]:
# Per-dataset summary statistics:
#   |V|      number of term rows
#   |E|      number of edge rows
#   L        terms that occur as a hyponym but never as a hypernym (leaves)
#   L ratio  L / |V|, stored as a string rounded to two decimals
#   D        length of the longest path returned by create_ancestries
#   BF       value returned by branching_factor
stats = defaultdict(list)

for ds_name, (terms, edges) in taxos.items():
    try:
        stats["Dataset"].append(ds_name)
        stats["|V|"].append(len(terms))
        stats["|E|"].append(len(edges))

        # Build the membership sets once; testing each term against the raw
        # columns would rescan both columns for every term.
        hyponym_ids = set(edges.hyponym.tolist())
        hypernym_ids = set(edges.hypernym.tolist())
        n_leaves = sum(
            1 for node in terms.node_id
            if node in hyponym_ids and node not in hypernym_ids
        )
        stats["L"].append(n_leaves)
        stats["L ratio"].append(str(round(stats["L"][-1] / stats["|V|"][-1], 2)))

        # NOTE(review): `relations` is taken before the mesh-specific filter
        # below, so BF is computed on the unfiltered edges while D uses the
        # filtered ones -- confirm this is intended.
        relations = [tuple(r) for r in edges[["hypernym", "hyponym"]].values.tolist()]
        if "mesh" in ds_name:
            # Drop the edges that make "proteins" a child of these two nodes
            # before the recursive path enumeration.
            drop = (edges.hyponym == "proteins") & edges.hypernym.isin(['glycoproteins', 'bloodproteins'])
            edges = edges[~drop]
        try:
            d = max((len(path) for path in create_ancestries(edges)), default=np.nan)
        except AssertionError:
            d = np.nan
        # int(nan) raises ValueError, so a missing depth is kept as NaN.
        stats["D"].append(np.nan if pd.isna(d) else int(d))
        stats["BF"].append(branching_factor(relations, terms.set_index("node_id").node_name.to_dict()))
    except RecursionError as re:
        # Report which dataset failed, then abort.
        print(f"{ds_name}: {re}")
        raise re

stats_df = pd.DataFrame(stats)
stats_df

In [None]:
# Put the columns in presentation order, format every float64 column with two
# decimals, print the LaTeX source with the index hidden, then show the
# styled table.
column_order = ["Dataset", "|V|", "|E|", "D", "L", "L ratio", "BF"]
stats_df = pd.DataFrame(stats)[column_order]
cols = [name for name, dtype in stats_df.dtypes.items() if str(dtype) == "float64"]
df_s = stats_df.style.format('{:.2f}', subset=cols)
print(df_s.hide(axis="index").to_latex())
df_s

In [None]:
# Size of each train / validation / test split together with its share of
# all lines across the three splits.
split_stats = defaultdict(list)


def _fmt_split(count, total):
    """Return the count followed by its percentage share in parentheses.

    The share is the ratio rounded to two decimals, times 100, printed with
    one decimal so float artefacts such as 56.99999999999999 cannot leak into
    the table. The percent sign is preceded by a backslash for LaTeX. A zero
    total yields a share of 0.0 instead of a division error.
    """
    share = round(count / total, 2) * 100 if total else 0.0
    return f"{count} ({share:.1f}\\%)"


for ds_name, (train, val, test) in splits.items():
    tot = len(train) + len(val) + len(test)
    split_stats["Dataset"].append(ds_name)
    split_stats["Train |V|"].append(_fmt_split(len(train), tot))
    split_stats["Val |V|"].append(_fmt_split(len(val), tot))
    split_stats["Test |V|"].append(_fmt_split(len(test), tot))

splits_df = pd.DataFrame(split_stats)
print(splits_df.to_latex(index=False))
splits_df