To run this notebook, it is required that experiment results are available in `./results/` with the model results in `./results/baseline`, `./results/dataset-embeddings` and `./results/special-token`.

In [17]:
from collections import defaultdict
from functools import partial
import numpy as np
from deepsig import aso 

!pip install --upgrade pandas jinja2 tabulate --target /usr/local/lib/python3.8/site-packages/
import sys
sys.path = ['/usr/local/lib/python3.8/site-packages', '/shared-libs/python3.8/py-core/lib/python3.8/site-packages']
import pandas as pd

Collecting pandas
  Using cached pandas-2.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
Collecting jinja2
  Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting tabulate
  Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Collecting python-dateutil>=2.8.2
  Using cached python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)
Collecting tzdata>=2022.1
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Collecting numpy>=1.20.3
  Using cached numpy-1.24.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Collecting pytz>=2020.1
  Using cached pytz-2023.3-py2.py3-none-any.whl (502 kB)
Collecting MarkupSafe>=2.0
  Using cached MarkupSafe-2.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25 kB)
Collecting six>=1.5
  Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: pytz, tzdata, tabulate, six, numpy, MarkupSafe, python-dateutil, jinja2, pandas
[31mERROR: pip's dependency resolver does 

In [3]:
models = ["dataset-embeddings","special-token","baseline"]
domains = ["ai", "literature", "news", "politics", "science", "music"]
seeds = ["4012", "5096", "8878", "8857", "9908"]

M = len(domains)

data = defaultdict(partial(defaultdict, dict))
for model in models:
    for domain in domains:
        for seed in seeds:
            data[model][seed][domain] = pd.read_json(f"results/{model}/rs{seed}/{domain}-test-pred-results.json")

baseline = data['baseline']
special_tokens = data['special-token']
data_embeddings = data['dataset-embeddings']

In [4]:
baseline_scores_per_dataset = [np.array([baseline[seed][domain]['weighted avg']['f1-score'] for seed in seeds]) for domain in domains]
special_scores_per_dataset = [np.array([special_tokens[seed][domain]['weighted avg']['f1-score'] for seed in seeds]) for domain in domains]
data_embed_scores_per_dataset = [np.array([data_embeddings[seed][domain]['weighted avg']['f1-score'] for seed in seeds]) for domain in domains]

In [5]:
print("Special Tokens vs. Baseline")
special_vs_baseline = [aso(a, b, confidence_level=0.95, num_comparisons=M, seed=42) for a, b in zip(special_scores_per_dataset, baseline_scores_per_dataset)]
print("Dataset embeddings vs. Baseline")
embs_vs_baseline = [aso(a, b, confidence_level=0.95, num_comparisons=M, seed=42) for a, b in zip(data_embed_scores_per_dataset, baseline_scores_per_dataset)]

Special Tokens vs. Baseline
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 115.12it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 115.87it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:10<00:00, 91.64it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 115.62it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 116.97it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 116.79it/s]
Dataset embeddings vs. Baseline
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 115.00it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 115.32it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 116.23it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 117.18it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 117.51it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 118.48it/s]


In [6]:
print(f"""Significance testing of scores.
If the returned eps_min < 0.5, A is better than B and if eps_min > 0.5, B is better than A. 
The lower eps_min, the more confident the result
Special Tokens (A) vs. Baseline (B): {special_vs_baseline}
Dataset Embeddings (A) vs. Baseline (B) {embs_vs_baseline}""")

Significance testing of scores.
If the returned eps_min < 0.5, A is better than B and if eps_min > 0.5, B is better than A. 
The lower eps_min, the more confident the result
Special Tokens (A) vs. Baseline (B): [1.0, 0.661324495367533, 0.9504284510220415, 0.6590580577951596, 1.0, 0.7869886819598515]
Dataset Embeddings (A) vs. Baseline (B) [0.9960666166719527, 0.995270209415167, 1.0, 0.9956742251446987, 0.9953167096825037, 0.9962299841299801]


In [27]:
columns = {r"\textsc{Seed}": [r"\textsc{Special Tokens vs. Baseline}", r"\textsc{Dataset Embeddings vs. Baseline}"]}
for seed, score1, score2 in zip(seeds, special_vs_baseline, embs_vs_baseline):
    columns[seed] = [score1, score2]
print(pd.DataFrame(columns).to_latex(index=False, float_format="%.3f"))

\begin{tabular}{lrrrrr}
\toprule
\textsc{Seed} & 4012 & 5096 & 8878 & 8857 & 9908 \\
\midrule
\textsc{Special Tokens vs. Baseline} & 1.000 & 0.661 & 0.950 & 0.659 & 1.000 \\
\textsc{Dataset Embeddings vs. Baseline} & 0.996 & 0.995 & 1.000 & 0.996 & 0.995 \\
\bottomrule
\end{tabular}



In [8]:

for seed in seeds:
    base = 0
    spec = 0
    data = 0
    for domain in domains:
        b = baseline[seed][domain]['weighted avg']['f1-score']
        s = special_tokens[seed][domain]['weighted avg']['f1-score']
        d = data_embeddings[seed][domain]['weighted avg']['f1-score']
        base += b
        spec += s
        data += d
        print(domain, seed)
        print("base: ", b)
        print("spec: ", s>b,"\t",s)
        print("data: ", d>b,"\t",d)
        print("--"*20)

    #print("baseline: ", base/len(domains))
    #print("special: ", spec/len(domains))
    #print("data: ", data/len(domains))
    #print("-----------------------------")

ai 4012
base:  0.45670097683494704
spec:  False 	 0.455504318935575
data:  False 	 0.058728519097733
----------------------------------------
literature 4012
base:  0.652635996804101
spec:  False 	 0.650677808111603
data:  False 	 0.058716458223239
----------------------------------------
news 4012
base:  0.5108526950197061
spec:  True 	 0.515817925987182
data:  True 	 0.5966607507672991
----------------------------------------
politics 4012
base:  0.5603914256081131
spec:  True 	 0.5957128996282951
data:  False 	 0.24625683256972403
----------------------------------------
science 4012
base:  0.43147266209719004
spec:  True 	 0.44974953300445003
data:  False 	 0.06746868697699901
----------------------------------------
music 4012
base:  0.7266552581777831
spec:  True 	 0.7340624520705931
data:  False 	 0.141415851323511
----------------------------------------
ai 5096
base:  0.516517806060668
spec:  False 	 0.48362152177708506
data:  False 	 0.058728519097733
------------------------

In [20]:
from itertools import chain


baseline_scores = []
labels = [
    "related-to",
    "artifact",
    "cause-effect",
    "compare",
    "general-affiliation",
    "named",
    "opposite",
    "origin",
    "part-of",
    "physical",
    "role",
    "social",
    "temporal",
    "type-of",
    "usage",
    "win-defeat",
]
columns_baseline = {domain: [] for domain in domains}
columns_special = {domain: [] for domain in domains}
columns_dataset = {domain: [] for domain in domains}
model_columns = [columns_baseline, columns_special, columns_dataset]
model_results = [baseline, special_tokens, data_embeddings]

for model, columns in zip(model_results, model_columns):
    for domain in domains:
        for label in labels:
            f1 = (
                sum(model[seed][domain][label]["f1-score"] for seed in seeds)
                / len(seeds)
                if model[seed][domain][label]["support"] > 0
                else "-"
            )
            columns[domain].append(f1)

domain_to_faicon = {
    "ai": "\\faRobot",
    "music": "\\faMusic",
    "literature": "\\faBookOpen",
    "science": "\\faLeaf",
    "news": "\\faNewspaper",
    "politics": "\\faLandmark",
}

model_names = ["Baseline", "Special tokens", "Dataset embeddings"]
rows = [
    list(chain(*((fr"\textsc{{{model_name}}}",) * len(labels) for model_name in model_names))),
    list(map(lambda x: fr"\textsc{{{x}}}", labels)) * 3,
]
columns = {domain_to_faicon[domain]: [] for domain in domains}
for model_results in model_columns:
    for domain, scores in model_results.items():
        columns[domain_to_faicon[domain]].extend(scores)
print(
    pd.DataFrame(
        columns,
        index=rows,
    ).to_latex(float_format="%.2f")
)


\begin{tabular}{llrrllrr}
\toprule
 &  & \faRobot & \faBookOpen & \faNewspaper & \faLandmark & \faLeaf & \faMusic \\
\midrule
\multirow[t]{16}{*}{\textsc{Baseline}} & \textsc{related-to} & 0.41 & 0.10 & 0.00 & 0.05 & 0.40 & 0.07 \\
 & \textsc{artifact} & 0.65 & 0.89 & - & 0.36 & 0.59 & 0.86 \\
 & \textsc{cause-effect} & 0.00 & 0.00 & - & - & 0.00 & 0.00 \\
 & \textsc{compare} & 0.03 & 0.00 & - & 0.00 & 0.12 & 0.00 \\
 & \textsc{general-affiliation} & 0.14 & 0.83 & 0.32 & 0.36 & 0.44 & 0.85 \\
 & \textsc{named} & 0.79 & 0.61 & 0.41 & 0.76 & 0.65 & 0.45 \\
 & \textsc{opposite} & 0.09 & 0.00 & 0.09 & 0.22 & 0.00 & 0.00 \\
 & \textsc{origin} & 0.54 & 0.28 & - & 0.20 & 0.43 & 0.30 \\
 & \textsc{part-of} & 0.51 & 0.15 & 0.03 & 0.19 & 0.28 & 0.29 \\
 & \textsc{physical} & 0.85 & 0.82 & 0.71 & 0.78 & 0.79 & 0.94 \\
 & \textsc{role} & 0.69 & 0.64 & 0.64 & 0.56 & 0.48 & 0.64 \\
 & \textsc{social} & 0.00 & 0.48 & - & 0.24 & 0.28 & 0.07 \\
 & \textsc{temporal} & 0.70 & 0.74 & 0.24 & 0.87 & 0.63 & 

In [10]:
from IPython.display import display, HTML

display(
    HTML(
        pd.DataFrame(
            {domain_to_faicon[domain]: v for domain, v in columns_baseline.items()},
            index=list(map(lambda x: fr"\textsc{{{x}}}", labels)),
        ).to_html(float_format=lambda x: f"{x:.2f}")
    )
)

Unnamed: 0,\faRobot,\faBookOpen,\faNewspaper,\faLandmark,\faLeaf,\faMusic
\textsc{related-to},0.41,0.1,0.00,0.05,0.4,0.07
\textsc{artifact},0.65,0.89,-,0.36,0.59,0.86
\textsc{cause-effect},0.0,0.0,-,-,0.0,0.0
\textsc{compare},0.03,0.0,-,0.00,0.12,0.0
\textsc{general-affiliation},0.14,0.83,0.32,0.36,0.44,0.85
\textsc{named},0.79,0.61,0.41,0.76,0.65,0.45
\textsc{opposite},0.09,0.0,0.09,0.22,0.0,0.0
\textsc{origin},0.54,0.28,-,0.20,0.43,0.3
\textsc{part-of},0.51,0.15,0.03,0.19,0.28,0.29
\textsc{physical},0.85,0.82,0.71,0.78,0.79,0.94


### ASO comparisons between dataset-embeddings with different entity markers

In [10]:
marker_types = ["all", "generic", "none"]
dataframes = {marker: {seed: {} for seed in seeds} for marker in marker_types}
for seed in seeds:
    for marker in marker_types:
        for domain in domains:
            dataframes[marker][seed][domain] = pd.read_json(f"results_all/dataset-embeddings-{marker}/rs{seed}/{domain}-test-pred-results.json")

dataset_embeddings_all_per_dataset = [np.array([dataframes["all"][seed][domain]["weighted avg"]["f1-score"] for seed in seeds]) for domain in domains]
dataset_embeddings_generic_per_dataset = [np.array([dataframes["generic"][seed][domain]["weighted avg"]["f1-score"] for seed in seeds]) for domain in domains]
dataset_embeddings_none_per_dataset = [np.array([dataframes["none"][seed][domain]["weighted avg"]["f1-score"] for seed in seeds]) for domain in domains]

all_vs_none = [aso(a, b, confidence_level=0.95, num_comparisons=M, seed=42) for a, b in zip(dataset_embeddings_all_per_dataset, dataset_embeddings_none_per_dataset)]
generic_vs_none = [aso(a, b, confidence_level=0.95, num_comparisons=M, seed=42) for a, b in zip(dataset_embeddings_generic_per_dataset, dataset_embeddings_none_per_dataset)]
generic_vs_all = [aso(a, b, confidence_level=0.95, num_comparisons=M, seed=42) for a, b in zip(dataset_embeddings_generic_per_dataset, dataset_embeddings_all_per_dataset)]
print(f"""Significance testing of scores.
If the returned eps_min < 0.5, A is better than B and if eps_min > 0.5, B is better than A. 
The lower eps_min, the more confident the result
All entity types (A) vs. No entity types (B): {all_vs_none}
Generic entity types (A) vs. No entity types (B) {generic_vs_none}
Generic entity types (A) vs. All entity types (B) {generic_vs_all}""")

  warn("Division by zero encountered in violation ratio.")
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 116.57it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 116.46it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 115.54it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 116.05it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 116.17it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 116.09it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 113.77it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08<00:00, 113.70it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:09<00:00, 100.90it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:09<00:00, 101.04it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:13<00:00, 73.24it/s]
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:14<00:00, 69.69it/s] 
Bootstrap iterations: 100%|█████████▉| 999/1000 [00:08

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6c3be415-6b72-4ca4-8444-c76a32e3ce00' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>