In [None]:
import sys
import os

# Put the parent directory on the import path so that the project-local
# `health_causenet` package (imported in the next cell) can be resolved.
parent_path = ".."
sys.path.append(os.path.abspath(parent_path))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from health_causenet import constants
import json
import pathlib
from tqdm.autonotebook import tqdm
import operator
from nltk import agreement
import nltk
import sklearn.metrics

# Register tqdm with pandas; this is what makes `.progress_apply` available
# on groupby objects further down in the notebook.
tqdm.pandas()

# Render all figure text with a serif font.
plt.rcParams["font.family"] = "serif"

In [None]:
def p_mean_threshold_combiner(cause, effect, p):
    """Combine a cause score and an effect score with the generalized (power) mean.

    Parameters
    ----------
    cause, effect : scalar or array-like
        Scores to combine; arrays are combined element-wise.
    p : int or float
        Exponent of the power mean. The limiting cases are handled explicitly:
        ``inf`` gives the maximum, ``-inf`` the minimum and ``0`` the
        geometric mean.

    Returns
    -------
    The element-wise power mean of ``cause`` and ``effect``.
    """
    infinity = float("inf")
    # The closed-form expression below is undefined for these three exponents,
    # so each one is replaced by its limit.
    if p == infinity:
        return np.maximum(cause, effect)
    if p == -infinity:
        return np.minimum(cause, effect)
    if p == 0:
        return np.sqrt(cause * effect)
    power_sum = cause ** p + effect ** p
    return (power_sum / 2) ** (1 / p)

In [None]:
# Phrase-level relations and their pre-computed medical scores.
test_causenet = pd.read_pickle(os.path.join(constants.TEST_CAUSENET_PATH, "test_causenet.pkl"))
test_causenet_predictions = pd.read_pickle(
    os.path.join(constants.TEST_CAUSENET_PATH, "test_causenet_predictions.pkl")
)
test_causenet = test_causenet.merge(
    test_causenet_predictions, on=["cause", "effect", "dataset"], how="left"
)

# Sentence-level relations and their pre-computed medical scores.
sentence_test_causenet = pd.read_pickle(
    os.path.join(constants.TEST_CAUSENET_PATH, "sentence_test_causenet.pkl")
)
sentence_test_causenet_predictions = pd.read_pickle(
    os.path.join(constants.TEST_CAUSENET_PATH, "sentence_test_causenet_predictions.pkl")
)
sentence_test_causenet = sentence_test_causenet.merge(
    sentence_test_causenet_predictions,
    on=["cause", "effect", "dataset", "sentence"],
    how="left",
)

# The merges are left joins, so a relation without a matching prediction row
# would surface as NaN in a score column; fail early if that happens.
assert (
    not test_causenet.loc[:, test_causenet.columns.str.startswith("medical_score")]
    .isna()
    .any()
    .any()
)
assert (
    not sentence_test_causenet.loc[
        :, sentence_test_causenet.columns.str.startswith("medical_score")
    ]
    .isna()
    .any()
    .any()
)

test_causenet = test_causenet.loc[test_causenet.dataset.isin(["random_full", "random_support"])]
# Take an explicit copy: the next statement writes a column, and writing to a
# `.loc` slice raises SettingWithCopyWarning (and may not stick under
# copy-on-write).
sentence_test_causenet = sentence_test_causenet.loc[
    sentence_test_causenet.dataset == "random_full"
].copy()
# Relabel the sentence-level rows so they form their own dataset once stacked
# under the phrase-level rows.
sentence_test_causenet["dataset"] = "sentence"
test_causenet = pd.concat([test_causenet, sentence_test_causenet]).reset_index(drop=True)
test_causenet

In [None]:
# Per dataset: sum, mean and number of rows of the `evaluation` label.
test_causenet.groupby("dataset").evaluation.agg(["sum", "mean", "size"])

In [None]:
# Average phrase length per dataset: tokenize cause and effect of every row,
# add both token counts, average per dataset and halve the result to get the
# mean number of tokens per single phrase.
num_words = (
    test_causenet.apply(
        lambda x: pd.Series(x[["cause", "effect"]].values.ravel()).map(
            lambda y: len(nltk.tokenize.word_tokenize(y))
        ),
        axis=1,
    )
    .sum(axis=1)
    .groupby(test_causenet.dataset)
    .mean()
    / 2
)
print("phrases")
for dataset, word_count in num_words.round(2).to_dict().items():
    print(dataset, word_count)
print("-------------------")
print("sentences")
# Average sentence length (in tokens) per dataset of the sentence-level frame.
# NOTE: `num_words` is rebound here; the phrase statistics above are lost.
num_words = (
    sentence_test_causenet.apply(
        lambda x: pd.Series(x[["sentence"]].values.ravel()).map(
            lambda y: len(nltk.tokenize.word_tokenize(y))
        ),
        axis=1,
    )
    .sum(axis=1)
    .groupby(sentence_test_causenet.dataset)
    .mean()
)
for dataset, word_count in num_words.round(2).to_dict().items():
    print(dataset, word_count)

In [None]:
# Inter-rater agreement between the first rater (JSON file) and every
# additional rater (one `agreement_relations*.csv` file each).
with open(constants.MANUAL_EVALUATION_PATH, "r") as file:
    manual_eval_dict = json.load(file)
data = []
for key, value in manual_eval_dict.items():
    # Keys have the form "<cause>-><effect>".
    # NOTE(review): unpacking fails if a phrase itself contains "->".
    cause, effect = key.split("->")
    data.append({"cause": cause, "effect": effect, "evaluation": value})
evals = pd.DataFrame(data).rename({"evaluation": "rater"}, axis=1)
evals = evals.set_index(["cause", "effect"])
# The first rater's column becomes "rater-0".
evals = evals.add_suffix("-0")

iterator = enumerate(pathlib.Path(constants.BASE_PATH).glob("agreement_relations*.csv"))

for idx, path in iterator:
    other_eval = pd.read_csv(path, index_col=0).set_index(["cause", "effect"])
    other_eval = other_eval.rename({"health-related": "rater"}, axis=1)
    # Suffix every column of this file with the rater number ("rater-1", ...).
    other_eval = other_eval.add_suffix(f"-{idx + 1}")
    # Inner join: keep only relations that every rater so far has judged.
    evals = evals.join(other_eval, how="inner")

# Reshape to one row per (column name, row number, value), i.e. the
# (coder, item, label) triples that are passed to nltk's AnnotationTask.
rating_data = evals.reset_index(drop=True).stack().swaplevel().reset_index().values
rating_data = list(tuple(row) for row in rating_data)

rating_task = agreement.AnnotationTask(data=rating_data)
print("kappa " + str(rating_task.kappa()))
print("fleiss " + str(rating_task.multi_kappa()))
print("alpha " + str(rating_task.alpha()))
print("scotts " + str(rating_task.pi()))

In [None]:
# Agreement between the `evaluation` label and the `manual_evaluation` label on
# the sentence-level rows that have a manual evaluation.
evals = sentence_test_causenet.set_index(["cause", "effect"]).dropna(
    subset=["manual_evaluation"]
)
evals = evals.loc[:, ["evaluation", "manual_evaluation"]].astype(int)
# Same reshaping as in the previous cell: (column name, row number, value).
rating_data = evals.reset_index(drop=True).stack().swaplevel().reset_index().values
rating_data = list(tuple(row) for row in rating_data)

rating_task = agreement.AnnotationTask(data=rating_data)
print("kappa " + str(rating_task.kappa()))
print("fleiss " + str(rating_task.multi_kappa()))
print("alpha " + str(rating_task.alpha()))
print("scotts " + str(rating_task.pi()))

In [None]:
# Rows on which the two labels disagree: first the column sums over those rows,
# then the rows themselves.
print((evals.loc[evals["evaluation"] != evals["manual_evaluation"]]).sum())
evals.loc[evals["evaluation"] != evals["manual_evaluation"]]

In [None]:
def parse_test_causenet_combined(test_causenet, ops):
    """Combine the cause and effect score of every method with every operator.

    All columns whose name contains ``medical_score`` are selected and sorted
    by name; the first half of the sorted columns is treated as cause scores
    and the second half as the matching effect scores.

    Parameters
    ----------
    test_causenet : pandas.DataFrame
        Frame holding the ``medical_score`` columns.
    ops : dict
        Maps an operator name to a callable ``op(cause_values, effect_values)``
        that receives two equally shaped arrays and returns the combined scores.

    Returns
    -------
    pandas.DataFrame
        One column per (method, operator) pair, named
        ``<method>-<operator name>``, carrying the index of ``test_causenet``.
    """
    # The column selection does not depend on the operator, so do it once.
    column_names = sorted(test_causenet.filter(regex="medical_score"))
    filtered = test_causenet[column_names]
    half = filtered.shape[1] // 2
    cause_scores = filtered.iloc[:, :half]
    effect_scores = filtered.iloc[:, half:]
    # Strip the first 20 characters of the cause column names
    # (presumably the "medical_score-cause-" prefix -- TODO confirm).
    method_names = [name[20:] for name in cause_scores]

    combined_frames = []
    for op_name, op in tqdm(ops.items()):
        combined = op(cause_scores.values, effect_scores.values)
        combined_frames.append(
            pd.DataFrame(
                combined,
                # Keep the caller's index so the result stays aligned with
                # `test_causenet` even when that index is not 0..n-1.
                index=filtered.index,
                columns=[f"{method_name}-{op_name}" for method_name in method_names],
            )
        )
    return pd.concat(combined_frames, axis=1)


# A logical "or" of two thresholded scores is the same as thresholding their
# maximum, i.e. the p=inf mean, so no separate "or" operator is listed.
# Keys become the operator part of the combined column names.
ops = {
    "p=neg_inf_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, -float("inf")),
    "p=neg_10_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, -10),
    "p=neg_5_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, -5),
    "p=neg_2_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, -2),
    "p=neg_1_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, -1),
    "p=0_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, 0),
    "p=1_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, 1),
    "p=2_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, 2),
    "p=5_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, 5),
    "p=10_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, 10),
    "p=inf_mean": lambda cause, effect: p_mean_threshold_combiner(cause, effect, float("inf")),
}
test_causenet_combined = parse_test_causenet_combined(test_causenet, ops)
test_causenet_combined

In [None]:
# Cache the combined scores on disk so the cell above need not be re-run.
test_causenet_combined.to_pickle(
    os.path.join(constants.TEST_CAUSENET_PATH, "test_causenet_combined.pkl")
)

In [None]:
# Reload the cached combined scores (entry point when skipping the computation).
test_causenet_combined = pd.read_pickle(
    os.path.join(constants.TEST_CAUSENET_PATH, "test_causenet_combined.pkl")
)
test_causenet_combined

In [None]:
def parse_test_causenet_medical(test_causenet_combined, method_threshold_dict):
    """Turn combined scores into boolean "medical" decisions for a grid of thresholds.

    Parameters
    ----------
    test_causenet_combined : pandas.DataFrame
        Combined scores, one column per (method, operator) pair.
    method_threshold_dict : dict
        Maps a regular expression selecting score columns by name to the list
        of thresholds to apply to those columns.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with one column per (selected column, threshold) pair,
        named ``<column>-<threshold>-medical``; a value is True where the score
        is greater than or equal to the threshold.
    """
    total = sum(len(thresholds) for thresholds in method_threshold_dict.values())

    medical_frames = []
    # Use the bar as a context manager so it is closed when the loop ends.
    with tqdm(total=total) as pg:
        for method_name, thresholds in method_threshold_dict.items():
            # The column selection only depends on the method, not the threshold.
            column_names = sorted(test_causenet_combined.filter(regex=method_name))
            filtered = test_causenet_combined[column_names]
            for threshold in thresholds:
                medical = (filtered >= threshold).add_suffix(f"-{threshold}-medical")
                medical_frames.append(medical)
                pg.update()
    return pd.concat(medical_frames, axis=1)


# Tagger names; joined with "|" below so that a single regular expression
# selects the columns of all four.
taggers = ["quickumls", "scispacy", "metamap", "ctakes"]
# Column-name pattern -> grid of 100 evenly spaced thresholds (rounded to four
# decimals) that is swept for the matching columns.
method_threshold_dict = {
    "|".join(taggers): [round(thresh, 4) for thresh in np.linspace(0, 1, 100)],
    "term_domain_specificity": [round(thresh, 4) for thresh in np.linspace(0, 6, 100)],
    "contrastive_weight": [round(thresh, 4) for thresh in np.linspace(50, 140, 100)],
    "discriminative_weight": [round(thresh, 4) for thresh in np.linspace(0, 1100, 100)],
    "health_bert": [round(thresh, 4) for thresh in np.linspace(0, 1, 100)],
}

test_causenet_medical = parse_test_causenet_medical(
    test_causenet_combined, method_threshold_dict
)
test_causenet_medical

In [None]:
# Cache the thresholded decisions on disk.
test_causenet_medical.to_pickle(
    os.path.join(constants.TEST_CAUSENET_PATH, "test_causenet_medical.pkl")
)

In [None]:
# Reload the cached thresholded decisions.
test_causenet_medical = pd.read_pickle(
    os.path.join(constants.TEST_CAUSENET_PATH, "test_causenet_medical.pkl")
)
test_causenet_medical

In [None]:
# Hold out 800 randomly drawn rows per dataset as validation split; everything
# else stays in the test split. The same row labels are used to split the
# labels, the combined scores and the thresholded decisions, so the three
# frames must share one index.
# NOTE: this cell rebinds `test_causenet*`, so it is not idempotent -- running
# it a second time without reloading removes another 800 rows per dataset.
seed = 42
sample_idcs = test_causenet_medical.groupby(test_causenet.dataset).sample(800, random_state=seed).index

validation_causenet_medical = test_causenet_medical.loc[sample_idcs]
test_causenet_medical = test_causenet_medical.loc[~test_causenet_medical.index.isin(sample_idcs.values)]

validation_causenet = test_causenet.loc[sample_idcs]
test_causenet = test_causenet.loc[~test_causenet.index.isin(sample_idcs.values)]

validation_causenet_combined = test_causenet_combined.loc[sample_idcs]
test_causenet_combined = test_causenet_combined.loc[~test_causenet_combined.index.isin(sample_idcs.values)]

validation_causenet.shape, test_causenet.shape

In [None]:
def evaluate_test_causenet_medical(test_causenet, test_causenet_medical):
    """Score every prediction column against the gold labels, per dataset.

    Parameters
    ----------
    test_causenet : pandas.DataFrame
        Needs an ``evaluation`` column (gold label; NaN marks unlabelled rows,
        any non-zero value counts as positive) and a ``dataset`` column.
    test_causenet_medical : pandas.DataFrame
        Boolean predictions sharing the index of ``test_causenet``. Column
        names must have the form ``<method>-<operator>-<threshold>-medical``.

    Returns
    -------
    pandas.DataFrame
        One row per (prediction column, dataset) with the columns ``dataset``,
        ``method``, ``operator``, ``threshold``, ``tp``, ``fp``, ``tn``, ``fn``,
        ``precision``, ``recall``, ``f1``, ``accuracy`` and ``mcc``. Undefined
        metrics (division by zero) are NaN.
    """
    metric_names = ["tp", "fp", "tn", "fn", "precision", "recall", "f1", "accuracy", "mcc"]

    # Unlabelled rows cannot be scored.
    labelled = ~test_causenet.evaluation.isna()
    test_causenet_medical = test_causenet_medical.loc[labelled]
    test_causenet = test_causenet.loc[labelled]
    # Work on a separate boolean label series instead of writing a column back
    # into the slice (SettingWithCopyWarning). Boolean labels also make `~` a
    # logical NOT; on an integer array it is a bitwise NOT that only gave the
    # right counts for 0/1 labels because of two's complement.
    gold = test_causenet.evaluation.astype(int).astype(bool)

    _tp, _fp, _tn, _fn = [], [], [], []
    for _dataset in tqdm(test_causenet.dataset.drop_duplicates().values):
        dataset_bool = test_causenet.dataset == _dataset
        predicted = test_causenet_medical.loc[dataset_bool]
        # Column vector so it broadcasts across all prediction columns.
        actual = gold.loc[dataset_bool].values[:, np.newaxis]
        _tp.append((predicted & actual).sum().rename(_dataset))
        _fp.append((predicted & ~actual).sum().rename(_dataset))
        _tn.append((~predicted & ~actual).sum().rename(_dataset))
        _fn.append((~predicted & actual).sum().rename(_dataset))

    # Rows: prediction columns; columns: datasets.
    tp = pd.concat(_tp, axis=1)
    fp = pd.concat(_fp, axis=1)
    tn = pd.concat(_tn, axis=1)
    fn = pd.concat(_fn, axis=1)

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    numerator = tp * tn - fp * fn
    # sqrt of the product of the four marginals, evaluated in log space so the
    # product of large counts cannot overflow. A zero marginal gives log(0) =
    # -inf and therefore a zero denominator; the resulting inf is mapped to NaN
    # at the end.
    marginals = np.stack(
        [frame.to_numpy(dtype=float) for frame in (tp + fp, tp + fn, tn + fp, tn + fn)]
    )
    with np.errstate(divide="ignore"):
        denominator = np.log(marginals).sum(axis=0) * (1 / 2)
    mcc = numerator / np.exp(denominator)

    # Long format: one row per (prediction column, dataset), one column per metric.
    test_causenet_metrics = (
        pd.concat([tp, fp, tn, fn, precision, recall, f1, accuracy, mcc], keys=metric_names)
        .stack()
        .unstack(0)
        .reset_index()
    )
    test_causenet_metrics = test_causenet_metrics.rename(
        {"level_1": "dataset", "level_0": "method"}, axis=1
    )
    # Split "<method>-<operator>-<threshold>-medical" from the right; the
    # trailing "medical" part (column 3) carries no information.
    name_parts = (
        test_causenet_metrics.method.str.rsplit("-", expand=True, n=3)
        .drop(3, axis=1)
        .rename({0: "method", 1: "operator", 2: "threshold"}, axis=1)
    )
    test_causenet_metrics = pd.concat(
        [test_causenet_metrics.drop("method", axis=1), name_parts], axis=1
    )
    test_causenet_metrics = test_causenet_metrics[
        ["dataset", "method", "operator", "threshold"] + metric_names
    ]
    test_causenet_metrics = test_causenet_metrics.replace([np.inf, -np.inf], np.nan)
    return test_causenet_metrics


# Score the validation and the test split separately.
validation_causenet_metrics = evaluate_test_causenet_medical(
    validation_causenet.loc[:, ["evaluation", "dataset"]], validation_causenet_medical,
)
test_causenet_metrics = evaluate_test_causenet_medical(
    test_causenet.loc[:, ["evaluation", "dataset"]], test_causenet_medical,
)

# Keep every operator for the two phrase-level datasets; for all other datasets
# keep only the rows of the "p=inf_mean" operator.
# NOTE: the next cell applies exactly the same filter again (it is idempotent).
test_causenet_metrics = test_causenet_metrics.loc[
    test_causenet_metrics.dataset.isin(["random_support", "random_full"]) | 
    (test_causenet_metrics.operator == "p=inf_mean")
]
validation_causenet_metrics = validation_causenet_metrics.loc[
    validation_causenet_metrics.dataset.isin(["random_support", "random_full"]) | 
    (validation_causenet_metrics.operator == "p=inf_mean")
]

In [None]:
# Same filter as at the end of the previous cell (all operators for the
# phrase-level datasets, only "p=inf_mean" elsewhere); applying it twice does
# not change the result.
test_causenet_metrics = test_causenet_metrics.loc[
    test_causenet_metrics.dataset.isin(["random_support", "random_full"]) | 
    (test_causenet_metrics.operator == "p=inf_mean")
]
validation_causenet_metrics = validation_causenet_metrics.loc[
    validation_causenet_metrics.dataset.isin(["random_support", "random_full"]) | 
    (validation_causenet_metrics.operator == "p=inf_mean")
]
test_causenet_metrics

In [None]:
# Cache both metric tables on disk.
test_causenet_metrics.to_pickle(
    os.path.join(constants.TEST_CAUSENET_PATH, "test_causenet_metrics.pkl")
)
validation_causenet_metrics.to_pickle(
    os.path.join(constants.TEST_CAUSENET_PATH, "validation_causenet_metrics.pkl")
)

In [None]:
# Reload the cached metric tables.
test_causenet_metrics = pd.read_pickle(
    os.path.join(constants.TEST_CAUSENET_PATH, "test_causenet_metrics.pkl")
)
validation_causenet_metrics = pd.read_pickle(
    os.path.join(constants.TEST_CAUSENET_PATH, "validation_causenet_metrics.pkl")
)
test_causenet_metrics

In [None]:
def precision_recall_plot(methods, plot_ops, dataset, labels=None):
    """Draw one precision/recall curve per (method, operator) pair.

    Reads the notebook-level ``test_causenet_metrics`` table and the
    ``name_dict`` mapping (which has to be defined elsewhere in the notebook).

    Parameters
    ----------
    methods : iterable of str
        Method names to plot.
    plot_ops : iterable of str
        Operator names to plot for every method.
    dataset : str
        Dataset whose metric rows are used.
    labels : sequence of str, optional
        Legend labels, one per curve in plotting order (methods outer,
        operators inner). Without labels a name is derived from the method
        and the operator.

    Returns
    -------
    The matplotlib figure holding the curves.
    """
    line_styles = ["solid", "dashed", "dashdot", "dotted"]
    metrics = test_causenet_metrics
    axes = None
    curve_specs = ((method, op) for method in methods for op in plot_ops)
    for curve_idx, (method, op) in enumerate(curve_specs):
        selection = (
            (metrics.method == method)
            & (metrics.operator == op)
            & (metrics.dataset == dataset)
        )
        # Best precision reached at every recall level, ordered by recall.
        curve = metrics.loc[selection].groupby("recall").precision.max().sort_index()
        pretty_parts = [
            name_dict.get(sub_method, sub_method.title())
            for sub_method in method.split("_")
        ]
        method_name = " ".join(pretty_parts)
        if labels is not None:
            legend_label = labels[curve_idx]
        else:
            legend_label = f"{method_name} ({' '.join(op.split('-')).title()})"
        curve = curve.rename(legend_label)
        # Cycle through the line styles so neighbouring curves stay distinguishable.
        axes = curve.plot.line(
            figsize=(8, 4),
            linewidth=2,
            linestyle=line_styles[curve_idx % len(line_styles)],
        )
    axes.legend()
    axes.set_xlim((0, 1.05))
    axes.set_ylim(0.1, 1.05)
    axes.set_xlabel("Recall")
    axes.set_ylabel("Precision")
    return axes.get_figure()

In [None]:
def auc_roc_curve(methods, ops, dataset, labels=None, ax=None, x_lim=(0, 1), y_lim=(0, 1)):
    """Draw validation ROC curves and compute the matching AUC values.

    Reads the notebook-level ``validation_causenet_metrics``,
    ``validation_causenet`` and ``validation_causenet_combined`` frames.

    Parameters
    ----------
    methods : iterable of str
        Method names to plot.
    ops : iterable of str
        Operator names to plot for every method.
    dataset : str
        Dataset whose rows are used.
    labels : sequence of str, optional
        Legend labels, one per curve in plotting order (methods outer,
        operators inner). A legend is only drawn when labels are given.
    ax : matplotlib axes, optional
        Axes to draw on; defaults to the current axes.
    x_lim, y_lim : tuple
        Axis limits.

    Returns
    -------
    (figure, aucs)
        The figure owning ``ax`` and the list of ROC AUC values, one per curve.
    """
    if ax is None:
        # The default previously crashed at `ax.set_xlim`; use the current axes.
        ax = plt.gca()
    idx = 0
    styles = ["solid", "dashed", "dashdot", "dotted"]
    aucs = []
    # Row mask of the requested dataset; identical for every curve.
    in_dataset = validation_causenet.dataset == dataset
    for method in methods:
        for op in ops:
            plot_data = validation_causenet_metrics.loc[
                (validation_causenet_metrics.method == method)
                & (validation_causenet_metrics.operator == op)
                & (validation_causenet_metrics.dataset == dataset)
            ].copy()
            # The AUC is computed from the raw combined scores, not from the
            # thresholded metric rows plotted below.
            auc = sklearn.metrics.roc_auc_score(
                validation_causenet.loc[in_dataset, "evaluation"],
                validation_causenet_combined.loc[in_dataset].filter(
                    like="-".join((method, op))
                ),
            )
            aucs.append(auc)
            plot_data["false_positive_rate"] = plot_data["fp"] / (
                plot_data["fp"] + plot_data["tn"]
            )
            # Best recall (true positive rate) per false positive rate.
            plot_data = (
                plot_data.groupby("false_positive_rate").recall.max().sort_index()
            )
            if labels is not None:
                plot_data = plot_data.rename(labels[idx])
            plot_data.plot.line(
                linewidth=3, linestyle=styles[idx % len(styles)], ax=ax, xlabel=None
            )
            idx += 1
    if labels is not None:
        ax.legend()
    ax.set_xlim(*x_lim)
    ax.set_ylim(*y_lim)
    ax.set_xlabel("False Positive Rate")
    fig = ax.get_figure()
    return fig, aucs

In [None]:
# Termhood score ROC plots + AUC
# Font sizes used for tick labels, (currently unused) medium text and titles.
SMALL_SIZE = 14
MEDIUM_SIZE = 16
LARGE_SIZE = 18

# NOTE: not used below -- every active plot_dict entry carries its own "dataset".
dataset = "random_full"

# Panel title -> keyword arguments forwarded to `auc_roc_curve`.
plot_dict = {
#     "$n$-grams": {
#         "methods": [f"discriminative_weight-encyclopedia-(1, {value})-1" for value in (1, 2, 3)],
#         "ops": ["p=1_mean"],
#         "labels": ["$n$=1", "$n$=2", "$n$=3"],
#     },
#     "$p_p$-values": {
#         "methods": [f"discriminative_weight-encyclopedia-(1, 3)-{value}" for value in (1, 2, 5, 10, "inf")],
#         "ops": ["p=1_mean"],
#         "labels": ["1", "2", "5", "10", "$\infty$"],
#     },
#     "Operators": {
#         "methods": ["discriminative_weight-encyclopedia-(1, 3)-1"],
#         "ops": ["and"] + [f"p={value}_mean" for value in (1, 2, 5, 10, "inf")],
#         "labels": ["AND", "1", "2", "5", "10", "$\infty$"],
#         "x_lim": (0.0, 0.6),
#         "y_lim": (0.4, 1.0),
#     },
    "PubMedBERT": {
        "methods": [f"health_bert-pubmedbert-{corpus}-{text_format}" for corpus in ["pubmed", "encyclopedia"] for text_format in ["sentence", "noun_phrase"]],
        "ops": ["p=inf_mean"],
        "labels": ["PM, S", "PM, NP", "ENC, S", "ENC, NP"],
#         "x_lim": (0.0, 0.6),
#         "y_lim": (0.4, 1.0),
        "dataset": "sentence",
    },
    "     Discriminative Weight": {
        "methods": [f"discriminative_weight-{corpus}-(1, 1)-1" for corpus in ["pubmed", "pubmed_central", "textbook", "encyclopedia"]],
        "ops": ["p=1_mean"],
        "labels": ["PM", "PMC", "TB", "ENC"],
        "dataset": "random_full"
#         "x_lim": (0.0, 0.6),
#         "y_lim": (0.4, 1.0),
    },
}

# One panel per plot_dict entry, side by side.
num_plots = len(plot_dict)
fig, axes = plt.subplots(1, ncols=num_plots, figsize=(4 * num_plots, 3))
for idx, (ax, (title, kwargs)) in enumerate(zip(axes, plot_dict.items())):
    fig, aucs = auc_roc_curve(ax=ax, **kwargs)
    # Invisible helper lines (alpha=0) whose only purpose is to add the AUC
    # values as extra legend entries.
    for auc in aucs:
        ax.plot([0, 1], [0, 1], alpha=0.0, label=f"({auc:.2f})")
    # Panel letter "(a)", "(b)", ... above the top-left corner of the axes.
    _text = ax.text(0, 1.05, f"({chr(97 + idx)})", transform=ax.transAxes, size=LARGE_SIZE)
    if idx % 2 == 1:
        _text.set_in_layout(False)
    print(aucs)
    ax.set_xlabel(ax.get_xlabel(), size=LARGE_SIZE)
    ax.set_title(title, size=LARGE_SIZE)
    legend = ax.legend(fontsize=15, framealpha=0.0, ncol=2, columnspacing=-2.5)
    ax.tick_params(labelsize=SMALL_SIZE)

# set y_label
axes[0].set_ylabel("True Positive Rate", size=LARGE_SIZE)
    
plt.tight_layout(pad=0.2)
# The "figures" directory has to exist already.
fig.savefig("figures/roc_curves.pdf", bbox_inches="tight")

In [None]:
# NOTE(review): this cell needs `p_values`, `jaccard_thresholds` and
# `medical_corpora`, none of which are defined in the cells above.
all_plot_ops = ["and", "or"] + [f"p={value}_mean" for value in (1, 2, 5, 10, "inf")]
all_plot_op_labels = ["And", "Or", "p=1", "p=2", "p=5", "p=10", "p=inf"]
all_p_value_labels = [f"p={p_value}" for p_value in p_values]
# Figure name -> keyword arguments forwarded to `precision_recall_plot`.
# `labels` must be in the same order as the curves (methods outer, ops inner).
plot_dict = {
    "umls-subset": {
        "methods": ["mesh_1.0", "mesh_syn_1.0", "umls_1.0"],
        "plot_ops": ["and"],
        "labels": ["MeSH", "MeSH Syn", "UMLS"],
    },
    "jaccard-threshold": {
        "methods": [
            f"mesh_syn_{jaccard_threshold}"
            for jaccard_threshold in jaccard_thresholds
            if len(str(jaccard_threshold))
        ][::-1],
        "plot_ops": ["and"],
        # Reversed exactly once, like `methods`. The former `[::-1][::-1]` was a
        # no-op and attached every label to the wrong curve.
        "labels": [
            str(jaccard_threshold)
            for jaccard_threshold in jaccard_thresholds
            if len(str(jaccard_threshold))
        ][::-1],
    },
    "mesh-syn-operator": {
        "methods": ["mesh_syn_1.0"],
        "plot_ops": all_plot_ops,
        "labels": all_plot_op_labels,
    },
    "approach": {
        "methods": [
            "mesh_syn_1.0",
            "contrastive_weight_pubmed_(1, 1)_1",
            "term_domain_specificity_pubmed_(1, 1)_1",
            "discriminative_weight_pubmed_(1, 1)_1",
        ],
        "plot_ops": ["and"],
        "labels": [
            "MeSH Syn",
            "Contrastive Weight",
            "Term Domain Specificity",
            "Discriminative Weight",
        ],
    },
    "discriminative-weight-corpora": {
        "methods": [
            f"discriminative_weight_{corpus}_(1, 1)_1" for corpus in medical_corpora
        ],
        "plot_ops": ["and"],
        "labels": [" ".join(corpus.split("_")).title() for corpus in medical_corpora],
    },
    "discriminative-weight-operator": {
        "methods": ["discriminative_weight_encyclopedia_(1, 1)_1"],
        "plot_ops": all_plot_ops,
        "labels": all_plot_op_labels,
    },
    "discriminative-weight-generalized-mean": {
        "methods": [
            f"discriminative_weight_encyclopedia_(1, 1)_{p_value}"
            for p_value in p_values
        ],
        "plot_ops": ["p=1_mean"],
        "labels": all_p_value_labels,
    },
    "discriminative-weight-n-gram": {
        "methods": [
            "discriminative_weight_encyclopedia_(1, 1)_1",
            "discriminative_weight_encyclopedia_(1, 2)_1",
            "discriminative_weight_encyclopedia_(1, 3)_1",
        ],
        "plot_ops": ["p=1_mean"],
        "labels": ["(1, 1)", "(1, 2)", "(1, 3)"],
    },
}
datasets = ["random_full", "wikidata"]
for dataset in datasets:
    for name, plot_values in plot_dict.items():
        print(name)
        fig = precision_recall_plot(dataset=dataset, **plot_values)
        fig.savefig(f"figures/{dataset}-{name}-comparison.pdf", bbox_inches="tight")
        # Empty the figure so the next set of curves starts from a blank canvas.
        fig.clear(True)

In [None]:
# Scratch cell for a single precision/recall figure: exactly one `methods` and
# one `plot_ops` assignment is active, the others are kept as alternatives.
# NOTE(review): `medical_corpora` is not defined in the cells above.
# methods = ["mesh_1.0", "mesh_syn_1.0", "umls_1.0"]
# methods = [f"mesh_syn_{jaccard_threshold}" for jaccard_threshold in jaccard_thresholds if len(str(jaccard_threshold))][::-1]
# methods = ["mesh_syn_1.0"]
# methods = ["mesh_syn_1.0", "contrastive_weight_pubmed_(1, 1)_1", "term_domain_specificity_pubmed_(1, 1)_1", "discriminative_weight_pubmed_(1, 1)_1"]
methods = [f"discriminative_weight_{corpus}_(1, 1)_1" for corpus in medical_corpora]
# methods = ["discriminative_weight_encyclopedia_(1, 1)_1"]
# methods = [f"discriminative_weight_encyclopedia_(1, 1)_{p_value}" for p_value in p_values]
# methods = ["discriminative_weight_encyclopedia_(1, 1)_1", "discriminative_weight_encyclopedia_(1, 2)_1"]
# methods = ["discriminative_weight_encyclopedia_(1, 1)_1"]


# plot_ops = ["and", "or", "arithmetic_mean", "quadratic_mean"]
# plot_ops = ["and", "arithmetic_mean", "quadratic_mean"]
# plot_ops = ["and", "or", "arithmetic_mean"]
# plot_ops = ["and", "arithmetic_mean"]
# plot_ops = ["arithmetic_mean"]
plot_ops = ["and"]
# plot_ops = ["or"]
dataset = "wikidata"
fig = precision_recall_plot(methods, plot_ops, dataset)

In [None]:
# Save the figure produced by the previous cell under a hand-picked name.
name = "discriminative-weight-corpus"
fig.savefig(f"figures/{name}-comparison.pdf", bbox_inches="tight")

In [None]:
def best_approach(
    validation_metrics,
    test_metrics,
    patterns,
    datasets,
    threshold_score,
    threshold,
    optimization_score,
    eval_ops,
    macro,
    return_val=False,
):
    """Select the best configuration per pattern and dataset on validation data.

    For every pattern and dataset the validation rows are restricted to the
    configurations that reach ``threshold`` on ``threshold_score`` (or, if none
    does, to the single configuration with the highest ``threshold_score``);
    of those, the one with the highest ``optimization_score`` wins. The metrics
    of the winners are then looked up in ``test_metrics``.

    Parameters
    ----------
    validation_metrics, test_metrics : pandas.DataFrame
        Metric tables with at least the columns ``dataset``, ``method``,
        ``operator``, ``threshold`` and the two score columns.
    patterns : list of str
        Regular expressions matched against ``method``; one winner per pattern.
    datasets : list of str
        Datasets to evaluate. Every pattern must have rows for every dataset,
        otherwise the dataset-major reordering below does not line up.
    threshold_score : str
        Constraint metric; an empty string disables the constraint.
    threshold : float
        Minimum value of ``threshold_score``.
    optimization_score : str
        Metric that is maximized.
    eval_ops : list of str or None
        Operators to consider; a falsy value keeps all operators.
    macro : bool
        If True, additionally select one winner per pattern on the metrics
        averaged over the datasets; those rows get the dataset ``"macro"``.
    return_val : bool
        If True, return ``(validation_winners, test_winners)`` instead of only
        ``test_winners``.

    Returns
    -------
    pandas.DataFrame or tuple of two pandas.DataFrame
        The winners ordered by dataset first and pattern second (macro rows
        last) with an additional ``method_class`` column.
    """

    def best_run(df):
        """Return the winning row of ``df`` with floats rounded to four decimals."""
        if threshold_score:
            if ((df[threshold_score] < threshold) | df[threshold_score].isna()).all():
                # Nothing satisfies the constraint: fall back to the closest run.
                df = df.sort_values(threshold_score, ascending=False).iloc[[0]]
            else:
                df = df.loc[df[threshold_score] >= threshold]
        df = df.sort_values(optimization_score, ascending=False).iloc[0]
        df = df.map(lambda x: round(x, 4) if isinstance(x, (float)) else x)
        return df

    def candidates(pattern, report_excluded):
        """Validation rows matching ``pattern`` and the dataset/operator filters."""
        tmp = validation_metrics.loc[validation_metrics.method.str.contains(pattern)]
        if datasets:
            tmp = tmp.loc[tmp.dataset.isin(datasets)]
        if report_excluded:
            # Diagnostic output: rows that the operator filter is going to drop.
            # `or []` keeps this from crashing when eval_ops is None.
            print(tmp.loc[~tmp.operator.isin(eval_ops or [])])
        if eval_ops:
            tmp = tmp.loc[tmp.operator.isin(eval_ops)]
        return tmp

    best_approaches = []
    for pattern in patterns:
        tmp = candidates(pattern, report_excluded=True)
        # Iterate over the groups instead of using groupby.apply: apply passing
        # the grouping column to the function is deprecated in pandas 2.2,
        # while the groups yielded here always contain the `dataset` column.
        # Groups come in sorted dataset order.
        best_runs = pd.DataFrame(
            [best_run(group) for _, group in tmp.groupby("dataset")]
        ).infer_objects()
        best_approaches.append(best_runs)

    # Reorder from pattern-major (p0/d0, p0/d1, ..., p1/d0, ...) to dataset-major.
    idcs = np.tile(np.arange(len(patterns)), len(datasets)) * len(datasets) + np.repeat(
        np.arange(len(datasets)), len(patterns)
    )
    best_approaches = (
        pd.concat(best_approaches).reset_index(drop=True).iloc[idcs].reset_index()
    )

    if macro:
        macro_runs = []
        for pattern in patterns:
            tmp = candidates(pattern, report_excluded=False)
            # Average the numeric metrics over the datasets.
            tmp = (
                tmp.groupby(["method", "operator", "threshold"])
                .mean(numeric_only=True)
                .reset_index()
            )
            _best_run = best_run(tmp)
            _best_run["dataset"] = "macro"
            # A Series has to be turned into a one-row frame first; passing it
            # to concat directly would add it as a column instead of a row.
            macro_runs.append(_best_run.to_frame().T)
        best_approaches = pd.concat([best_approaches] + macro_runs).infer_objects()

    best_approaches = best_approaches.drop("index", axis=1)
    best_approaches = best_approaches.reset_index(drop=True)
    # Method class = first "-"-separated part of the method name, except for
    # "health_bert" methods, where the second part is used instead.
    best_approaches["method_class"] = best_approaches.method.apply(lambda x: x.split("-")[0])
    is_health_bert = best_approaches.method_class == "health_bert"
    best_approaches.loc[is_health_bert, "method_class"] = best_approaches.loc[
        is_health_bert
    ].method.apply(lambda x: x.split("-")[1])

    validation_best_approaches = best_approaches.copy()
    # Replace the validation metrics with the test metrics of the same
    # configuration; the validation copies get a suffix and are dropped.
    test_best_approaches = best_approaches.merge(
        test_metrics,
        how="left",
        on=["dataset", "method", "operator", "threshold"],
        suffixes=("_to_drop", ""),
    )
    test_best_approaches = test_best_approaches.drop(
        test_best_approaches.filter(like="_to_drop").columns, axis=1
    )
    if return_val:
        return validation_best_approaches, test_best_approaches
    return test_best_approaches

In [None]:
# Regular expressions matched against the method name; one winner per pattern.
patterns = [
    "quickumls",
    "scispacy",
    "ctakes",
    "metamap",
    "-bert",
    "scibert",
    "pubmedbert",
    "contrastive",
    "specificity",
    "discriminative",
]
# patterns = ["mesh_[0-9]", "mesh_syn_[0-9]", "umls_[0-9]", "contrastive_weight_1", "term_domain_specificity_1", "discriminative_weight_1"]
# patterns = ["discriminative_weight_pubmed_(1, 2)_1"]
# patterns = ["contrastive", "specificity", "discriminative"]
# Selection rule: among the runs with precision >= 0.9, maximize recall.
threshold_score = "precision"
threshold = 0.9
optimization_score = "recall"
datasets = [
    "random_full",
    "random_support",
    "sentence",
]
macro = False
# Operators that may win: p = 0, the positive and the negative exponents.
eval_ops = (
    ["p=0_mean"]
    + [f"p={p}_mean" for p in (1, 2, 5, 10, "inf")]
    + [f"p=neg_{p}_mean" for p in (1, 2, 5, 10, "inf")]
)

validation_best_approaches, test_best_approaches = best_approach(
    validation_causenet_metrics,
    test_causenet_metrics,
    patterns,
    datasets,
    threshold_score,
    threshold,
    optimization_score,
    eval_ops,
    macro,
    return_val=True
)

# The file name encodes the selection rule; later cells read these files back
# by rebuilding the same name.
if threshold_score:
    validation_best_approaches.to_csv(f"validation_best_approaches_{optimization_score}_{threshold_score}_{threshold}.csv")
    test_best_approaches.to_csv(f"test_best_approaches_{optimization_score}_{threshold_score}_{threshold}.csv")
else:
    validation_best_approaches.to_csv(f"validation_best_approaches_{optimization_score}.csv")
    test_best_approaches.to_csv(f"test_best_approaches_{optimization_score}.csv")
test_best_approaches

In [None]:
# Ad-hoc inspection: validation metrics of every method whose name contains
# "discri" on the "random_full" dataset.
tmp = validation_causenet_metrics.loc[(validation_causenet_metrics.dataset == "random_full") & validation_causenet_metrics.method.str.contains("discri")]
# tmp = tmp.loc[tmp.precision > 0.9].sort_values("recall")
tmp

In [None]:
# Show the winners together with their validation metrics.
validation_best_approaches

In [None]:
# Read back the validation winners of the opposite selection rule (maximize
# precision subject to recall >= 0.9). The file only exists if the selection
# cell above was run with these settings before.
# NOTE: this rebinds the three selection variables for all later cells.
threshold_score = "recall"
threshold = 0.9
optimization_score = "precision"
pd.read_csv(f"validation_best_approaches_{optimization_score}_{threshold_score}_{threshold}.csv")

In [None]:
def mcc(samples, labels):
    """Matthews correlation coefficient of boolean predictions against boolean labels.

    Parameters
    ----------
    samples : boolean array
        Predicted classes.
    labels : boolean array
        Gold classes, same shape as ``samples``.

    Returns
    -------
    The coefficient. If one of the four marginal counts is zero the
    denominator becomes zero and the result is inf or NaN.
    """
    not_samples = ~samples
    not_labels = ~labels
    tp = (samples & labels).sum()
    tn = (not_samples & not_labels).sum()
    fp = (samples & not_labels).sum()
    fn = (not_samples & labels).sum()
    # sqrt of the product of the marginals, evaluated in log space.
    marginals = [tp + fp, tp + fn, tn + fp, tn + fn]
    log_denominator = np.log(marginals).sum() * (1 / 2)
    return (tp * tn - fp * fn) / np.exp(log_denominator)

def precision(samples, labels):
    """Share of the positive predictions in ``samples`` that are positive in ``labels``.

    Both arguments are boolean arrays of the same shape.
    """
    true_positives = (samples & labels).sum()
    false_positives = (samples & ~labels).sum()
    predicted_positives = true_positives + false_positives
    return true_positives / predicted_positives

def recall(samples, labels):
    """Share of the positives in ``labels`` that are predicted positive in ``samples``.

    Both arguments are boolean arrays of the same shape.
    """
    true_positives = (samples & labels).sum()
    false_negatives = (~samples & labels).sum()
    actual_positives = true_positives + false_negatives
    return true_positives / actual_positives


def bootstrap(sample_x, sample_y, labels, metric, n):
    """Resampling test for the difference of a metric between two prediction sets.

    The two prediction vectors are pooled, shuffled ``n`` times with the global
    NumPy random state and split into two halves; for every shuffle the metric
    difference between the halves is recorded.

    Parameters
    ----------
    sample_x, sample_y : sequence
        Predictions of the two approaches.
    labels : array-like
        Gold labels handed to ``metric`` unchanged.
    metric : callable
        ``metric(samples, labels)`` returning a number.
    n : int
        Number of shuffles.

    Returns
    -------
    Fraction of shuffles whose metric difference is strictly smaller than the
    observed difference ``metric(sample_x) - metric(sample_y)``.
    """
    observed = metric(sample_x, labels) - metric(sample_y, labels)
    pooled = [*sample_x, *sample_y]
    split_at = len(pooled) // 2
    differences = []
    for _ in tqdm(range(n), position=3, leave=False):
        shuffled = np.random.permutation(pooled)
        first_half = shuffled[:split_at]
        second_half = shuffled[split_at:]
        differences.append(metric(first_half, labels) - metric(second_half, labels))
    differences = np.array(differences)
    return sum(observed > differences) / n

def significance_test(df, optimization_func, n=5000):
    """Pairwise resampling test between all approaches of one dataset.

    Reads the notebook-level ``test_causenet`` and ``test_causenet_medical``
    frames. ``df`` must carry a ``name`` attribute holding the dataset name,
    which is the case for the groups handed over by ``groupby(...).apply``.

    Parameters
    ----------
    df : pandas.DataFrame
        Selected approaches with the columns ``method``, ``operator`` and
        ``threshold``.
    optimization_func : callable
        Metric ``f(samples, labels)`` forwarded to ``bootstrap``.
    n : int
        Number of shuffles per pair.

    Returns
    -------
    pandas.DataFrame
        Square frame labelled by the first "-"-separated part of the method
        names. Entry ``(i, j)`` holds the value returned by ``bootstrap`` for
        the pair, entry ``(j, i)`` its complement; the diagonal is NaN. The
        frame is entirely NaN if the dataset is not part of ``test_causenet``.
    """
    method_classes = df.method.apply(lambda x: x.split("-")[0])
    percentiles = pd.DataFrame(np.nan, index=method_classes, columns=method_classes)
    name = df.name
    known_datasets = test_causenet.dataset.unique().tolist()
    if name not in known_datasets:
        return percentiles
    df = df.copy()
    # Rebuild the column name of each approach in `test_causenet_medical`.
    df["approach_name"] = (
        df.method + "-" + df.operator + "-" + df.threshold.astype(str) + "-medical"
    )
    value_bool = (test_causenet.dataset == name)
    outer_bar = tqdm(list(enumerate(df.iterrows())), position=1, leave=False)
    for first_idx, (_, first) in outer_bar:
        first_samples = test_causenet_medical.loc[value_bool, first.approach_name].values
        inner_bar = tqdm(list(enumerate(df.iterrows())), position=2, leave=False)
        for second_idx, (_, second) in inner_bar:
            # Every unordered pair is tested once; the mirrored cell is filled
            # with the complement.
            if first_idx < second_idx:
                second_samples = test_causenet_medical.loc[
                    value_bool, second.approach_name
                ].values
                gold = test_causenet.loc[value_bool, "evaluation"]
                percentile = bootstrap(
                    first_samples, second_samples, gold, optimization_func, n
                )
                percentiles.iloc[first_idx, second_idx] = percentile
                percentiles.iloc[second_idx, first_idx] = 1 - percentile
    return percentiles


# Number of shuffles per pair of approaches.
n = 100000
# Selection rule whose winners are compared; an empty threshold_score means
# "unconstrained", in which case `threshold` is not part of the file name.
threshold_score = ""
threshold = 0.9
optimization_score = "mcc"
# The CSV file has to be written by the selection cell with the same settings.
if threshold_score:
    significance_approaches = pd.read_csv(f"test_best_approaches_{optimization_score}_{threshold_score}_{threshold}.csv")
else:
    significance_approaches = pd.read_csv(f"test_best_approaches_{optimization_score}.csv")


# Pick the metric function that matches the optimized score.
if optimization_score == "precision":
    optimization_func = precision
elif optimization_score == "recall":
    optimization_func = recall
elif optimization_score == "mcc":
    optimization_func = mcc
else:
    # NOTE: significance_test would fail when calling None as the metric.
    optimization_func = None
    
# One pairwise matrix per dataset; rows of a "macro" pseudo-dataset are dropped.
significance = (
    significance_approaches.groupby("dataset")
    .progress_apply(lambda x: significance_test(x, optimization_func, n=n))
    .drop("macro", level=0, errors="ignore")
)

if threshold_score:
    significance.to_csv(f"significance_{optimization_score}_{threshold_score}_{threshold}.csv")
else:
    significance.to_csv(f"significance_{optimization_score}.csv")
significance

In [None]:
# Stored matrix for "maximize recall subject to precision >= 0.9".
# NOTE(review): "wikidata" is not among the datasets evaluated in the cells
# above -- confirm that the stored file still contains it.
precision_significance = pd.read_csv("significance_recall_precision_0.9.csv", index_col=[0, 1])
precision_significance.loc["wikidata", "discriminative_weight"]

In [None]:
# Stored matrix for "maximize precision subject to recall >= 0.9".
# NOTE(review): the keys "wikidata" and "pmbert" are not produced by the cells
# above -- confirm that the stored file still contains them.
recall_significance = pd.read_csv("significance_precision_recall_0.9.csv", index_col=[0, 1])
recall_significance.loc["wikidata", "pmbert"]

In [None]:
# Stored matrix for the unconstrained MCC selection.
# NOTE(review): the keys "wikidata" and "pmbert" are not produced by the cells
# above -- confirm that the stored file still contains them.
mcc_significance = pd.read_csv("significance_mcc.csv", index_col=[0, 1])
mcc_significance.loc["wikidata", "pmbert"]

In [None]:
def pprint(
    values,
    bold=False,
    bold_axis=0,
    bold_command=r"\highlight",
    bold_idcs=None,
    round_to=2,
    direction="maximize",
):
    """Render a 2-D table as LaTeX tabular rows, print it and return the string.

    Every cell that can be converted with ``float`` is rounded to ``round_to``
    decimals and printed with exactly that many decimals; all other cells are
    printed with ``str``. Cells of a row are joined with `` & `` and every row
    is terminated by a LaTeX line break.

    Parameters
    ----------
    values : 2-D array-like
        Table cells (numbers and/or labels). The input is copied and is never
        modified.
    bold : bool
        If true, wrap the best numeric value(s) of the selected columns/rows in
        ``bold_command{...}``.
    bold_axis : {0, 1}
        0: the best value is searched per column and ``bold_idcs`` selects
        columns. 1: the best value is searched per row and ``bold_idcs``
        selects rows.
    bold_command : str
        LaTeX command put in front of the braces of a highlighted cell.
    bold_idcs : list of int, optional
        Indices of the columns (axis 0) or rows (axis 1) to highlight; negative
        values count from the end. The list itself is left untouched.
    round_to : int
        Number of decimals used for rounding and for the printed output.
    direction : str
        ``"maximize"`` highlights the largest value, anything else the smallest.

    Returns
    -------
    str
        The rendered rows.

    Raises
    ------
    ValueError
        If ``bold`` is true and ``bold_axis`` is neither 0 nor 1.
    """
    # Object-dtype copy: keeps the caller's array (possibly a view on a
    # DataFrame) unchanged and lets a cell hold either a float or a string.
    values = np.array(values, dtype=object)
    n_rows, n_cols = values.shape
    decimals = max(round_to, 0)

    for row_idx in range(n_rows):
        for col_idx in range(n_cols):
            try:
                values[row_idx, col_idx] = round(float(values[row_idx, col_idx]), round_to)
            except (TypeError, ValueError):
                # Not a number (labels, None, ...): keep the cell as it is.
                pass

    if bold:
        if bold_axis not in (0, 1):
            raise ValueError(f"invalid axis value, expected 0, 1, got {bold_axis}")
        if values.size:
            # Resolve negative indices into a private set instead of rewriting
            # the caller's list.
            line_count = values.shape[1 - bold_axis]
            selected = {
                idx + line_count if idx < 0 else idx for idx in (bold_idcs or [])
            }
            maximize = direction == "maximize"
            # Non-numeric cells get the worst possible value so they never win.
            filler = float("-inf") if maximize else float("inf")
            numeric = np.array(
                [
                    [cell if isinstance(cell, (float, int)) else filler for cell in row]
                    for row in values
                ],
                dtype=float,
            )
            if maximize:
                best_values = numeric.max(axis=bold_axis)
            else:
                best_values = numeric.min(axis=bold_axis)
            for row_idx in range(n_rows):
                for col_idx in range(n_cols):
                    # The "line" is the column for axis 0 and the row for axis 1;
                    # both the best value and the selection are keyed by it.
                    line_idx = col_idx if bold_axis == 0 else row_idx
                    cell = values[row_idx, col_idx]
                    if (
                        line_idx in selected
                        and isinstance(cell, (float, int))
                        and cell == best_values[line_idx]
                    ):
                        values[row_idx, col_idx] = (
                            f"{bold_command}{{{cell:.{decimals}f}}}"
                        )

    # Fixed-point formatting pads to `decimals` digits regardless of sign or
    # the number of digits in front of the decimal point.
    rendered_rows = [
        " & ".join(
            f"{cell:.{decimals}f}" if isinstance(cell, float) else str(cell)
            for cell in row
        )
        for row in values
    ]

    out = " \\\\\n".join(rendered_rows)
    out += " \\\\"

    print(out)

    return out


def small(string):
    """Wrap ``string`` in a LaTeX ``small`` size command."""
    # The backslash is escaped explicitly: "\s" is not a valid escape sequence.
    return f"\\small{{{string}}}"


def scriptsize(string):
    """Wrap ``string`` in a LaTeX ``scriptsize`` size command."""
    # The backslash is escaped explicitly: "\s" is not a valid escape sequence.
    return f"\\scriptsize{{{string}}}"


def tiny(string):
    """Return ``string`` enclosed in a LaTeX ``tiny`` size command."""
    return "\\tiny{{{}}}".format(string)


def rename_method(method_name, parameters=True):
    """Turn an internal method identifier into a display name for LaTeX tables.

    Known substrings are replaced by their display form (in the order listed
    below), the result is split on "-", and one part becomes the name. With
    ``parameters`` true the remaining parts are appended, comma separated,
    inside a ``scriptsize`` command.
    """
    # Order matters: longer identifiers must be replaced before their substrings.
    replacements = (
        ("pubmedbert", "PubMedBERT"),
        ("scibert", "SciBERT"),
        ("bert", "BERT"),
        ("mesh", "MeSH"),
        ("quickumls", "QuickUMLS"),
        ("umls", "UMLS"),
        ("contrastive_weight", "CW"),
        ("term_domain_specificity", "TDS"),
        ("discriminative_weight", "DW"),
        ("encyclopedia", "ENC"),
        ("pubmed", "PM"),
        ("ctakes", "cTakes"),
        ("metamap", "MetaMap"),
        ("scispacy", "ScispaCy"),
        ("en_core_sci_", ""),
        ("rx_sno", "RS"),
        ("full", "Full"),
        ("noun_phrase", "NP"),
        ("sentence", "S"),
    )
    pretty = method_name
    for raw, display in replacements:
        pretty = pretty.replace(raw, display)

    parts = pretty.split("-")
    extras = []

    if any(tag in pretty for tag in ("CW", "DW", "TDS")):
        # Term-weighting methods: second-to-last part is the n-gram range tuple,
        # last part is rendered as the subscript of M.
        label = parts[0]
        if parameters:
            try:
                upper_n = parts[-2].strip("()").split(",")[1].strip()
                parts[-2] = "$n$=" + upper_n
                parts[-1] = f"$M_{{{parts[-1]}}}$"
                extras = parts[1:]
            except IndexError:
                # Identifier without the expected parts: show the bare name.
                pass
    elif "ScispaCy" in pretty:
        label = parts[0]
        if parameters:
            try:
                extras = [parts[1], f"\\textit{{{parts[2]}}}", parts[3]]
            except IndexError:
                pass
    elif "health_BERT" in pretty:
        # For these identifiers the second part is used as the name.
        label = parts[1]
        if parameters:
            extras = parts[2:]
    else:
        label = parts[0]
        if parameters:
            extras = parts[1:]

    if extras:
        label += " " + scriptsize(", ".join(extras))
    return label


def rename_operator(operator_name):
    """Render an operator name such as ``p=neg_inf_mean`` as a LaTeX ``$M_{...}$`` label.

    "inf" becomes the LaTeX infinity symbol and a "neg_" prefix becomes a minus
    sign, which is typeset as a typewriter double dash. The value is the text
    between "=" and the next "_".

    Raises
    ------
    IndexError
        If ``operator_name`` contains no "=".
    """
    # Raw strings: "\i" is not a valid escape sequence in a normal literal.
    operator_name = operator_name.replace("inf", r"\infty").replace("neg_", "-")
    value = operator_name.split("=")[1].split("_")[0]
    # startswith also copes with an empty value, where value[0] would fail.
    if value.startswith("-"):
        value = r"\texttt{--}\!" + value[1:]
    return f"$M_{{{value}}}$"

In [None]:
# Dataset whose rows are printed; the alternatives are kept as toggles.
dataset = "random_full"
# dataset = "random_support"
# dataset = "sentence"
# dataset = "wikidata"
# dataset = "practitioner_full"
# dataset = "practitioner_sure"
# dataset = "practitioner_unsure"

# Row order of the printed table; methods matching none of these are dropped.
approaches = [
    "ctakes",
    "metamap",
    "quickumls",
    "scispacy",
    "health_bert",
    "contrastive",
    "specificity",
    "discriminative",
]


def key(series):
    """Rank each method name by the last pattern in ``approaches`` it contains (-1 if none)."""
    ranks = pd.Series(-1, index=series.index)
    for position, pattern in enumerate(approaches):
        matches = series.str.contains(pattern)
        ranks.loc[matches] = position
    return ranks


# Print the stored best approaches for `dataset` as LaTeX table rows.
# An empty threshold_score selects the unconstrained result file.
threshold_score = "precision"
threshold = 0.9
optimization_score = "recall"
if threshold_score:
    pretty_print_approaches = pd.read_csv(f"test_best_approaches_{optimization_score}_{threshold_score}_{threshold}.csv")
else:
    pretty_print_approaches = pd.read_csv(f"test_best_approaches_{optimization_score}.csv")
# Display forms of the method and operator identifiers.
pretty_print_approaches["pretty_method"] = pretty_print_approaches.method.map(
    rename_method
)
pretty_print_approaches["pretty_operator"] = pretty_print_approaches.operator.map(
    rename_operator
)

# Keep the selected dataset, drop methods not listed in `approaches` (key == -1)
# and order the rest by their position in that list.
pretty_print_approaches = pretty_print_approaches.loc[
    pretty_print_approaches.dataset == dataset
]
pretty_print_approaches = pretty_print_approaches.loc[
    key(pretty_print_approaches.method) != -1
]
pretty_print_approaches = pretty_print_approaches.sort_values(by="method", key=key)

# bold_idcs covers all four columns; the two label columns hold no numbers.
pprint(
    pretty_print_approaches.loc[
        :, [
            "pretty_method", 
            "pretty_operator", 
            "precision", 
            "recall", 
#             "f1", 
#             "mcc"
        ]
    ].values,
    bold=True,
    bold_idcs=[-1, -2, -3, -4],
)

In [None]:
# Majority voting over the first four stored approaches of the selected dataset:
# a relation counts as medical when at least `min_votes` approaches predict it.
precision_threshold = 0.0
dataset = "random_full"
linker_approaches = pd.read_csv(
    f"best_approaches_{precision_threshold}.csv", index_col=0
)
linker_approaches = linker_approaches.loc[linker_approaches.dataset == dataset].iloc[:4]

# Computed once and reused for every lookup below.
dataset_mask = test_causenet.dataset == dataset
gold_labels = test_causenet.evaluation.loc[dataset_mask]

# Loop variables are prefixed so they neither shadow the imported `operator`
# module nor overwrite module-level `method` / `threshold`.
linker_predictions = pd.concat(
    [
        test_causenet_medical.loc[
            dataset_mask,
            "-".join([linker_method, linker_operator, str(linker_threshold), "medical"]),
        ]
        for _, linker_method, linker_operator, linker_threshold, *_ in linker_approaches.values
    ],
    axis=1,
)
linker_votes = linker_predictions.sum(axis=1)

for min_votes in range(1, 5):
    combined_predictions = linker_votes >= min_votes
    tp = (combined_predictions & gold_labels).sum()
    tn = (~combined_predictions & ~gold_labels).sum()
    fp = (combined_predictions & ~gold_labels).sum()
    fn = (~combined_predictions & gold_labels).sum()
    # Results use vote_* names: `precision` and `recall` are used as metric
    # callables by the significance cell and must not be rebound to numbers.
    vote_precision = tp / (tp + fp)
    vote_recall = tp / (tp + fn)
    vote_f1 = 2 * (vote_precision * vote_recall) / (vote_precision + vote_recall)
    vote_accuracy = (tp + tn) / (tp + tn + fp + fn)
    print(
        min_votes,
        round(vote_precision, 2),
        round(vote_recall, 2),
        round(vote_f1, 2),
        round(vote_accuracy, 2),
    )
# Distribution of the number of positive votes per relation.
linker_votes.value_counts().sort_index()

In [None]:
# ablation tests
# Configuration of the ablation: optimise `optimization_score` subject to
# `threshold_score` >= `threshold` (an empty threshold_score disables the constraint).
threshold_score = "recall"
threshold = 0.9
optimization_score = "precision"
# NOTE(review): `datasets` is not read anywhere in this cell.
datasets = [
    "random_full",
    "random_support",
    "sentence",
#     "support",
#     "wikidata",
#     "practitioner_full",
#     "practitioner_sure",
#     "practitioner_unsure",
]
use_test_metrics = True
full_train = True
# ablation_set picks the prefix of the best-approaches file read below.
# NOTE(review): ablation_test_metrics is assigned here, but the later
# get_ablation_df calls in this cell pass test_causenet_metrics directly.
if use_test_metrics:
    ablation_test_metrics = test_causenet_metrics
    ablation_set = "test"
else:
    ablation_test_metrics = None
    ablation_set = "validation"
# Unique (dataset, method, operator) combinations; the method class is the
# part of the method name before the first "-".
val_causenet = validation_causenet_metrics.loc[:, ["dataset", "method", "operator"]].drop_duplicates()
val_causenet["method_class"] = val_causenet["method"].map(lambda x: x.split("-")[0])


if threshold_score:
    ablation_best_approaches = pd.read_csv(f"{ablation_set}_best_approaches_{optimization_score}_{threshold_score}_{threshold}.csv", index_col=0)
else:
    ablation_best_approaches = pd.read_csv(f"{ablation_set}_best_approaches_{optimization_score}.csv", index_col=0)

# Only the three term-weighting method classes take part in the ablation.
ablation_best_approaches = ablation_best_approaches.loc[
    ablation_best_approaches.method_class.isin(["contrastive_weight", "discriminative_weight", "term_domain_specificity"]),
    ["dataset", "method", "operator", "method_class", optimization_score]
]
ablation_best_approaches = ablation_best_approaches.reset_index(drop=True)
# ablation_methods keeps the identifying columns; ablation_best_approaches keeps
# the reference score per (dataset, method_class).
ablation_methods = ablation_best_approaches.copy().drop(optimization_score, axis=1)
ablation_best_approaches = ablation_best_approaches.loc[:, ["dataset", "method_class", optimization_score]]

def get_ablation_df(ablation_df, validation_metrics, test_metrics=None):
    """Pick the best candidate per (dataset, method_class) and return its score.

    The candidates in ``ablation_df`` are joined with the NaN-free rows of
    ``validation_metrics``. If the module-level ``threshold_score`` is set,
    rows below ``threshold`` on that score are removed. Per group the row with
    the highest ``optimization_score`` is kept. When ``test_metrics`` is given,
    the metric columns of the kept rows are replaced by the test metrics of
    the same (dataset, method, operator, threshold).

    Returns a frame with the columns dataset, method_class and
    ``optimization_score``.
    """

    def _best_row(group):
        # Last row after an ascending sort, i.e. the highest score.
        return group.sort_values(optimization_score).iloc[-1]

    candidates = ablation_df.merge(validation_metrics.dropna())
    if threshold_score:
        meets_threshold = candidates[threshold_score] >= threshold
        candidates = candidates.loc[meets_threshold]

    selected = (
        candidates.groupby(["dataset", "method_class"])
        .apply(_best_row)
        .reset_index(drop=True)
    )

    if test_metrics is not None:
        # Overlapping validation columns get the "_to_drop" suffix and are removed.
        selected = selected.merge(
            test_metrics,
            how="left",
            on=["dataset", "method", "operator", "threshold"],
            suffixes=("_to_drop", ""),
        )
        selected = selected.drop(selected.filter(like="_to_drop"), axis=1)

    return selected.loc[:, ["dataset", "method_class", optimization_score]]

# Variant 1: n-gram range fixed to "(1, 1)".
# With full_train the best configuration is re-selected among all matching methods;
# otherwise the method name of the stored best approach is rewritten.
# NOTE(review): the result is merged with the suffix "-arithmetic_mean", while the
# else-branch rewrites the last name part to "1" instead — confirm which restriction is intended.
if full_train:
    pattern = r".*-\(1, 1\)-.*"
    ablation_arithmetic_mean = get_ablation_df(
        val_causenet.loc[val_causenet.method.str.contains(pattern)],
        validation_causenet_metrics,
        test_causenet_metrics,
    )
else:
    ablation_arithmetic_mean = ablation_methods.copy()
    ablation_arithmetic_mean["method"] = ablation_arithmetic_mean["method"].map(lambda x: "-".join(x.split("-")[:-1]) + "-1")
    ablation_arithmetic_mean = get_ablation_df(ablation_arithmetic_mean, validation_causenet_metrics, test_causenet_metrics)
ablation_best_approaches = ablation_best_approaches.merge(ablation_arithmetic_mean, on=["dataset", "method_class"], suffixes=["", "-arithmetic_mean"], how="left")

# Variant 2: last name part fixed to "1", any single-character upper n-gram bound.
# NOTE(review): str.contains does not anchor the pattern, so "-1" would also match a
# name ending in e.g. "-10" if such methods exist — confirm.
if full_train:
    pattern = r".*-\(1, .\)-1"
    ablation_n_gram = get_ablation_df(
        val_causenet.loc[val_causenet.method.str.contains(pattern)],
        validation_causenet_metrics,
        test_causenet_metrics,
    )
else:
    ablation_n_gram = ablation_methods.copy()
    ablation_n_gram["method"] = ablation_n_gram["method"].map(lambda x: x.split(")")[0][:-1] + "1)" + x.split(")")[1])
    ablation_n_gram = get_ablation_df(ablation_n_gram, validation_causenet_metrics, test_causenet_metrics)
ablation_best_approaches = ablation_best_approaches.merge(ablation_n_gram, on=["dataset", "method_class"], suffixes=["", "-n_gram"], how="left")

# if full_train:
#     pattern = r".*-\(1, 1\)-1"
#     ablation_arithmetic_mean_n_gram = get_ablation_df(
#         val_causenet.loc[val_causenet.method.str.contains(pattern)],
#         validation_causenet_metrics,
#         test_causenet_metrics,
#     )
# else:
#     ablation_arithmetic_mean_n_gram = ablation_methods.copy()
#     ablation_arithmetic_mean_n_gram["method"] = ablation_arithmetic_mean_n_gram["method"].map(lambda x: x.split(")")[0][:-1] + "1)" + x.split(")")[1])
#     ablation_arithmetic_mean_n_gram["method"] = ablation_arithmetic_mean_n_gram["method"].map(lambda x: "-".join(x.split("-")[:-1]) + "-1")
#     ablation_arithmetic_mean_n_gram = get_ablation_df(ablation_arithmetic_mean_n_gram, validation_causenet_metrics, test_causenet_metrics)
# ablation_best_approaches = ablation_best_approaches.merge(ablation_arithmetic_mean_n_gram, on=["dataset", "method_class"], suffixes=["", "-arithmetic_mean_n_gram"], how="left")

# Difference of every ablated score column ("<score>-...") to the reference score,
# joined back on the row index with the suffix "-reduction".
ablation_best_approaches = ablation_best_approaches.merge(
    ablation_best_approaches.filter(like=f"{optimization_score}-").subtract(ablation_best_approaches[optimization_score].values, axis=0), 
    suffixes=["", "-reduction"],
    left_index=True,
    right_index=True,
)
ablation_best_approaches

In [None]:
# pprint prints the table itself and returns it, so the rows appear a second time
# in the output of the surrounding print.
print("\n", pprint(ablation_best_approaches.filter(like="-reduction").values))

In [None]:
# ablation tests
# Ablation on the unconstrained MCC optimisation: re-run best_approach with the
# candidate methods restricted by a regular expression on the method name.
threshold_score = ""
threshold = 0.0
optimization_score = "mcc"
datasets = [
    "random_full",
    "random_support",
    "sentence",
#     "support",
#     "wikidata",
    #     "practitioner_full",
    #     "practitioner_sure",
    #     "practitioner_unsure",
]
eval_ops = ["p=0_mean"] + [f"p={p}_mean" for p in (1, 2, 5, 10, "inf")] + [f"p=neg_{p}_mean" for p in (1, 2, 5, 10, "inf")]

# Patterns are raw strings so that "\(" reaches the regex engine without relying
# on Python keeping an invalid escape sequence unchanged.
# n-gram range fixed to "(1, 1)", any last name part.
patterns = [
    r"contrastive_weight-.*-\(1, 1\)-.*",
    r"term_domain_specificity-.*-\(1, 1\)-.*",
    r"discriminative_weight-.*-\(1, 1\)-.*",
]
best_approaches_n_grams = best_approach(
    validation_causenet_metrics,
    test_causenet_metrics,
    patterns,
    datasets,
    threshold_score,
    threshold,
    optimization_score,
    eval_ops,
    False,
)

# Last name part fixed to "1", any single-character upper n-gram bound.
# NOTE(review): if best_approach matches these patterns unanchored, "-1" would also
# match a name ending in e.g. "-10" — confirm against best_approach.
patterns = [
    r"contrastive_weight-.*-\(1, .\)-1",
    r"term_domain_specificity-.*-\(1, .\)-1",
    r"discriminative_weight-.*-\(1, .\)-1",
]
best_approaches_arithmetic_mean = best_approach(
    validation_causenet_metrics,
    test_causenet_metrics,
    patterns,
    datasets,
    threshold_score,
    threshold,
    optimization_score,
    eval_ops,
    False,
)

# patterns = [
#     "contrastive_weight-.*-\(1, .\)-neg_inf",
#     "term_domain_specificity-.*-\(1, .\)-neg_inf",
#     "discriminative_weight-.*-\(1, .\)-neg_inf",
# ]
# best_approaches_and = best_approach(
#     validation_causenet_metrics,
#     test_causenet_metrics,
#     patterns,
#     datasets,
#     threshold_score,
#     threshold,
#     optimization_score,
#     eval_ops,
#     False,
# )

# patterns = [
#     "contrastive_weight-.*-\(1, .\)-inf",
#     "term_domain_specificity-.*-\(1, .\)-inf",
#     "discriminative_weight-.*-\(1, .\)-inf",
# ]
# best_approaches_or = best_approach(
#     validation_causenet_metrics,
#     test_causenet_metrics,
#     patterns,
#     datasets,
#     threshold_score,
#     threshold,
#     optimization_score,
#     eval_ops,
#     False,
# )

# Both restrictions at once: n-gram range "(1, 1)" and last name part "1".
# Raw strings keep "\(" intact for the regex engine without an invalid escape sequence.
patterns = [
    r"contrastive_weight-.*-\(1, 1\)-1",
    r"term_domain_specificity-.*-\(1, 1\)-1",
    r"discriminative_weight-.*-\(1, 1\)-1",
]
best_approaches_n_grams_arithmetic_mean = best_approach(
    validation_causenet_metrics,
    test_causenet_metrics,
    patterns,
    datasets,
    threshold_score,
    threshold,
    optimization_score,
    eval_ops,
    False,
)

# patterns = [
# #     "metamap",
# #     "ctakes",
# #     "quickumls",
# #     "scispacy",
# #     "-bert",
# #     "scibert",
# #     "pubmedbert",
#     "contrastive_weight-.*-\(1, .\)-.*",
#     "term_domain_specificity-.*-\(1, .\)-.*",
#     "discriminative_weight-.*-\(1, .\)-.*",
# ]
# eval_ops = ["p=neg_inf_mean"]
# best_approaches_and_operator = best_approach(
#     validation_causenet_metrics,
#     test_causenet_metrics,
#     patterns,
#     [dataset for dataset in datasets if dataset != "sentence"],
#     threshold_score,
#     threshold,
#     optimization_score,
#     eval_ops,
#     False,
# )

# patterns = [
# #     "metamap",
# #     "ctakes",
# #     "quickumls",
# #     "scispacy",
# #     "-bert",
# #     "scibert",
# #     "pubmedbert",
#     "contrastive_weight-.*-\(1, .\)-.*",
#     "term_domain_specificity-.*-\(1, .\)-.*",
#     "discriminative_weight-.*-\(1, .\)-.*",
# ]
# eval_ops = ["p=inf_mean"]
# best_approaches_or_operator = best_approach(
#     validation_causenet_metrics,
#     test_causenet_metrics,
#     patterns,
#     [dataset for dataset in datasets if dataset != "sentence"],
#     threshold_score,
#     threshold,
#     optimization_score,
#     eval_ops,
#     False,
# )

# All term-weighting configurations, but evaluated with the single operator
# "p=1_mean" and without the "sentence" dataset.
# Raw strings keep "\(" intact for the regex engine without an invalid escape sequence.
patterns = [
#     "metamap",
#     "ctakes",
#     "quickumls",
#     "scispacy",
#     "-bert",
#     "scibert",
#     "pubmedbert",
    r"contrastive_weight-.*-\(1, .\)-.*",
    r"term_domain_specificity-.*-\(1, .\)-.*",
    r"discriminative_weight-.*-\(1, .\)-.*",
]
eval_ops = ["p=1_mean"]
best_approaches_arithmetic_mean_operator = best_approach(
    validation_causenet_metrics,
    test_causenet_metrics,
    patterns,
    [dataset for dataset in datasets if dataset != "sentence"],
    threshold_score,
    threshold,
    optimization_score,
    eval_ops,
    False,
)

# patterns = [
#     "health_bert-.*-sentence",
# ]
# eval_ops = ["and"] + [f"p={p}_mean" for p in (1, 2, 5, 10, "inf")]
# best_approaches_sentence = best_approach(
#     test_causenet_metrics,
#     patterns,
#     datasets,
#     threshold_score,
#     threshold,
#     optimization_score,
#     eval_ops,
#     False,
# )

# Reference scores: the stored best approaches for the current optimisation setting.
if threshold_score:
    best_approaches_comparison = pd.read_csv(f"test_best_approaches_{optimization_score}_{threshold_score}_{threshold}.csv")
else:
    best_approaches_comparison = pd.read_csv(f"test_best_approaches_{optimization_score}.csv")

# Attach the score of every ablation run as "<score>-<variant>".
# The first merge is an inner join, so only (method_class, dataset) pairs present
# in best_approaches_n_grams survive; the later merges are outer joins.
best_approaches_comparison = best_approaches_comparison.loc[:, ["method_class", "dataset", optimization_score]]
best_approaches_comparison = best_approaches_comparison.merge(best_approaches_n_grams.loc[:, ["method_class", "dataset", optimization_score]], on=["method_class", "dataset"], suffixes=("", "-n_grams"), how="inner")
# best_approaches_comparison = best_approaches_comparison.merge(best_approaches_and.loc[:, ["method_class", "dataset", optimization_score]], on=["method_class", "dataset"], suffixes=("", "-and"), how="outer")
# best_approaches_comparison = best_approaches_comparison.merge(best_approaches_or.loc[:, ["method_class", "dataset", optimization_score]], on=["method_class", "dataset"], suffixes=("", "-or"), how="outer")
# best_approaches_comparison = best_approaches_comparison.merge(best_approaches_and_operator.loc[:, ["method_class", "dataset", optimization_score]], on=["method_class", "dataset"], suffixes=("", "-and_operator"), how="outer")
# best_approaches_comparison = best_approaches_comparison.merge(best_approaches_or_operator.loc[:, ["method_class", "dataset", optimization_score]], on=["method_class", "dataset"], suffixes=("", "-or_operator"), how="outer")
best_approaches_comparison = best_approaches_comparison.merge(best_approaches_arithmetic_mean.loc[:, ["method_class", "dataset", optimization_score]], on=["method_class", "dataset"], suffixes=("", "-arithmetic_mean"), how="outer")
best_approaches_comparison = best_approaches_comparison.merge(best_approaches_n_grams_arithmetic_mean.loc[:, ["method_class", "dataset", optimization_score]], on=["method_class", "dataset"], suffixes=("", "-n_grams_arithmetic_mean"), how="outer")
best_approaches_comparison = best_approaches_comparison.merge(best_approaches_arithmetic_mean_operator.loc[:, ["method_class", "dataset", optimization_score]], on=["method_class", "dataset"], suffixes=("", "-arithmetic_mean_operator"), how="outer")
# best_approaches_comparison = best_approaches_comparison.merge(best_approaches_sentence.loc[:, ["method_class", "dataset", optimization_score]], on=["method_class", "dataset"], suffixes=("", "-sentence"), how="outer")
# Reduction = ablated score minus reference score, one "-reduction" column per variant.
other_scores = best_approaches_comparison.filter(regex=f"{optimization_score}-", axis=1)
reduction = other_scores.subtract(best_approaches_comparison[optimization_score].values, axis=0)
best_approaches_comparison = pd.concat([best_approaches_comparison, reduction.add_suffix("-reduction")], axis=1)
best_approaches_comparison.filter(regex=f"method_class|dataset|{optimization_score}-.*reduction")

In [None]:
# Validation metrics on the "sentence" dataset for methods whose name contains both
# "term" and "enc", NaN rows removed, sorted ascending by MCC.
validation_causenet_metrics.loc[(validation_causenet_metrics.dataset == "sentence") & (validation_causenet_metrics.method.str.contains("term")) & (validation_causenet_metrics.method.str.contains("enc"))].dropna().sort_values("mcc")

In [None]:
# Same selection on the test metrics: "sentence" dataset, method name containing
# "term" and "enc", NaN rows removed, sorted ascending by MCC.
test_causenet_metrics.loc[(test_causenet_metrics.dataset == "sentence") & (test_causenet_metrics.method.str.contains("term")) & (test_causenet_metrics.method.str.contains("enc"))].dropna().sort_values("mcc")

In [None]:
# LaTeX rows of the full comparison table (no highlighting).
pprint(best_approaches_comparison.values)

In [None]:
# Display the comparison table itself.
best_approaches_comparison

In [None]:
# Mean MCC reduction over the method classes containing "bert".
# numeric_only=True keeps the string columns (method_class, dataset) out of the
# aggregation; without it recent pandas versions raise a TypeError.
best_approaches_comparison.loc[best_approaches_comparison.method_class.str.contains("bert")].filter(regex="method_class|dataset|mcc-.*reduction").mean(numeric_only=True)

In [None]:
# Mean MCC reduction per method class.
# numeric_only=True excludes the string column "dataset" from the aggregation;
# without it recent pandas versions raise a TypeError.
best_approaches_comparison.filter(regex="method_class|dataset|mcc-.*reduction").groupby("method_class").mean(numeric_only=True)

In [None]:
# Mean MCC reduction over the method classes containing "bert"
# (same expression as two cells above).
# numeric_only=True keeps the string columns (method_class, dataset) out of the
# aggregation; without it recent pandas versions raise a TypeError.
best_approaches_comparison.loc[best_approaches_comparison.method_class.str.contains("bert")].filter(regex="method_class|dataset|mcc-.*reduction").mean(numeric_only=True)

In [None]:
# different classifications between methods

# method_1_name = "discriminative_weight_pubmed_(1, 1)_1"
# method_2_name = "discriminative_weight_pubmed_(1, 2)_1"
# op_1 = "and"
# op_2 = "and"
# threshold_1 = "66"
# threshold_2 = "66"

# List the relations of `dataset` on which the best configurations of two method
# classes disagree, together with the gold label and the underlying scores.
# NOTE(review): `sort` and `ascending` are only referenced by the commented-out block below.
sort = False
ascending=False
dataset = "random_full"
method_1 = "term_domain_specificity"
method_2 = "pubmedbert"
# Best configuration (method, operator, threshold) of each class, taken from the
# module-level `best_approaches` frame, which several cells assign.
method_1_data = best_approaches.loc[best_approaches.dataset == dataset].set_index("method_class").loc[method_1]
method_2_data = best_approaches.loc[best_approaches.dataset == dataset].set_index("method_class").loc[method_2]
method_1_name = method_1_data.method
method_2_name = method_2_data.method
op_1 = method_1_data.operator
op_2 = method_2_data.operator
threshold_1 = method_1_data.threshold
threshold_2 = method_2_data.threshold

# Column names of the binary prediction, the combined score and the per-side scores.
# NOTE(review): str.join requires threshold_1 / threshold_2 to be strings — confirm
# the dtype of best_approaches.threshold.
medical_label_1 = "-".join([method_1_name, op_1, threshold_1, "medical"])
medical_label_2 = "-".join([method_2_name, op_2, threshold_2, "medical"])
combined_label_1 = "-".join([method_1_name, op_1])
combined_label_2 = "-".join([method_2_name, op_2])
score_label_1 = "-".join(["medical_score", "{}", method_1_name])
score_label_2 = "-".join(["medical_score", "{}", method_2_name])

# Final column order of the exported table.
labels = [
    "cause",
    "effect",
    "evaluation",
    medical_label_1,
    medical_label_2,
    combined_label_1,
    combined_label_2,
    score_label_1.format("cause"),
    score_label_1.format("effect"),
    score_label_2.format("cause"),
    score_label_2.format("effect"),
]

# Rows of the dataset where the two binary predictions differ.
df_filter = pd.Series(True, index=test_causenet_medical.index)
df_filter = df_filter & (test_causenet.dataset == dataset)
df_filter = df_filter & (
    test_causenet_medical[medical_label_1] != test_causenet_medical[medical_label_2]
)

health_causenet_errors = pd.concat(
    [
        test_causenet_medical.loc[df_filter, [medical_label_1, medical_label_2]],
        test_causenet.loc[
            df_filter, 
            [
                "cause", 
                "effect", 
                "evaluation",
                score_label_1.format("cause"),
                score_label_2.format("cause"),
                score_label_1.format("effect"),
                score_label_2.format("effect"),
            ]
        ],
        test_causenet_combined.loc[df_filter, [combined_label_1, combined_label_2]]
    ],
    axis=1,
)
health_causenet_errors = health_causenet_errors.loc[:, labels]
# if sort:
#     sort_index = (
#         (health_causenet_errors.iloc[:, -2] + health_causenet_errors.iloc[:, -1])
#         .sort_values(ascending=ascending)
#         .index
#     )
#     health_causenet_errors = health_causenet_errors.loc[sort_index]
health_causenet_errors = health_causenet_errors.sort_values(["evaluation", medical_label_1, medical_label_2])
# pprint(health_causenet_errors.head(40).values)
health_causenet_errors.to_csv(f"errors-{method_1_name}_vs_{method_2_name}.csv")
health_causenet_errors

In [None]:
# Combined scores of all methods for the rows selected by the current `df_filter`.
test_causenet_combined.loc[df_filter]

In [None]:
# List the relations of one dataset that fall into the selected confusion-matrix
# cells (tp / fp / tn / fn) for a single method configuration.
ascending = False
sort = False
method = "discriminative_weight-encyclopedia-(1, 1)-2"
op = "arithmetic_mean"
threshold = "60"
medical_label = "-".join([method, op, threshold, "medical"])
labels = [
    "cause",
    "effect",
    "cause_origin",
    "effect_origin",
    f"medical_score-cause-{method}",
    f"medical_score-effect-{method}",
]
dataset = "wikidata"
# Confusion-matrix cells to show; an empty list disables the error filter.
errors = [
    #     "tp",
    "fp",
    #     "tn",
    #     "fn",
]
value_filter = pd.Series(True, index=test_causenet_medical.index)
if dataset:
    value_filter = value_filter & ((test_causenet.dataset == dataset).values)
# NOTE(review): `evaluation` is not assigned in this cell; it has to exist from an
# earlier cell and is compared positionally (.values) with the predictions.
if errors:
    error_filter = pd.Series(False, index=test_causenet_medical.index)
    # "&" binds tighter than "|": each line ORs one (prediction AND label) condition.
    if "tp" in errors:
        error_filter = error_filter | (test_causenet_medical[medical_label] == 1) & (
            evaluation.values == 1
        )
    if "fp" in errors:
        error_filter = error_filter | (test_causenet_medical[medical_label] == 1) & (
            evaluation.values == 0
        )
    if "tn" in errors:
        error_filter = error_filter | (test_causenet_medical[medical_label] == 0) & (
            evaluation.values == 0
        )
    if "fn" in errors:
        error_filter = error_filter | (test_causenet_medical[medical_label] == 0) & (
            evaluation.values == 1
        )
    value_filter = value_filter & error_filter
health_causenet_errors = pd.concat(
    [
        test_causenet.loc[value_filter.values, labels],
        test_causenet_medical.loc[value_filter.values, [medical_label]],
    ],
    axis=1,
)
health_causenet_errors["evaluation"] = evaluation.loc[value_filter.values].values
# Fixed column order; later cells address the prediction column by position.
health_causenet_errors = health_causenet_errors.loc[
    :,
    [
        "cause",
        "effect",
        "cause_origin",
        "effect_origin",
        "evaluation",
        medical_label,
        f"medical_score-cause-{method}",
        f"medical_score-effect-{method}",
    ],
]
# Optional ordering by the sum of the cause and effect scores (last two columns).
if sort:
    sort_index = (
        (health_causenet_errors.iloc[:, -2] + health_causenet_errors.iloc[:, -1])
        .sort_values(ascending=ascending)
        .index
    )
    health_causenet_errors = health_causenet_errors.loc[sort_index]
# else:
#     health_causenet_errors = health_causenet_errors.sample(health_causenet_errors.shape[0])
health_causenet_errors

# Sentence Analysis

In [None]:
# Best approach per pattern on the sentence-level metrics.
patterns = [
    "quickumls",
    "scispacy",
    "ctakes",
    "metamap",
    "-bert",
    "scibert",
    "pubmedbert",
    "contrastive",
    "specificity",
    "discriminative",
]

precision_threshold = 0.0
datasets = [
    "random_full",
    #     "random_support",
    #     "support",
]
macro = False
eval_ops = ["p=inf_mean"]
optimization_score = "mcc"

# NOTE(review): this call passes seven positional arguments, whereas the other
# best_approach calls in this notebook pass nine (validation metrics, test metrics,
# patterns, datasets, threshold_score, threshold, optimization_score, eval_ops, flag)
# — confirm it still matches the current signature.
# This assignment also replaces the module-level `best_approaches` read by other cells.
best_approaches = best_approach(
    sentence_test_causenet_metrics,
    patterns,
    datasets,
    precision_threshold,
    optimization_score,
    eval_ops,
    macro,
)
best_approaches.to_csv(f"sentence_best_approaches_{optimization_score}_{precision_threshold}.csv")
best_approaches

In [None]:
# dataset = "wikidata"
# Print the current `best_approaches` for one dataset as LaTeX table rows.
dataset = "random_full"
# dataset = "random_support"
# dataset = "practitioner_full"
# dataset = "practitioner_sure"
# dataset = "practitioner_unsure"

# Row order of the printed table; methods matching none of these are dropped.
approaches = [
    "metamap",
    "ctakes",
    "quickumls",
    "scispacy",
    "-bert",
    "scibert",
    "pubmedbert",
    "contrastive",
    "specificity",
    "discriminative",
]


# Same definition as in the earlier pretty-print cell; it reads the module-level
# `approaches` list at call time.
def key(series):
    # Rank of each method name: index of the last pattern it contains, -1 if none.
    order = pd.Series(-1, index=series.index)
    for idx, approach in enumerate(approaches):
        order[series.str.contains(approach)] = idx
    return order


pretty_print_approaches = best_approaches.copy()
# Display names without the parameter suffix.
pretty_print_approaches["pretty_method"] = pretty_print_approaches.method.map(
    lambda x: rename_method(x, False)
)
# NOTE(review): pretty_operator is computed but not among the printed columns.
pretty_print_approaches["pretty_operator"] = pretty_print_approaches.operator.map(
    rename_operator
)

pretty_print_approaches = pretty_print_approaches.loc[
    pretty_print_approaches.dataset == dataset
]
pretty_print_approaches = pretty_print_approaches.loc[
    key(pretty_print_approaches.method) != -1
]
pretty_print_approaches = pretty_print_approaches.sort_values(by="method", key=key)

# bold_idcs selects the last three columns (recall, f1, mcc); precision is not highlighted.
pprint(
    pretty_print_approaches.loc[
        :, ["pretty_method", "precision", "recall", "f1", "mcc"]
    ].values,
    bold=True,
    bold_idcs=[-1, -2, -3],
)

In [None]:
# False negatives of one method configuration on the sentence data: rows of the
# dataset that are labelled positive but not predicted as medical.
dataset = "random_full"
# method = "health_bert-encyclopedia-noun_phrase"
method = "discriminative_weight-encyclopedia-(1, 1)-1"
operator = "or"
# threshold = "0.64"
threshold = "35"
# NOTE(review): assigning `operator` here rebinds the name of the imported
# `operator` module for the rest of the session.
label_name = "-".join((method, operator, threshold)) + "-medical"
score_name = "medical_score-cause-" + method
filter_bool = (
    (sentence_test_causenet.dataset == dataset)
    & ~sentence_test_causenet_medical[label_name]
    & sentence_test_causenet.evaluation
)
sentence_test_causenet.loc[filter_bool, ["cause", score_name, "evaluation"]].rename(
    {score_name: "medical_score"}, axis=1
)

In [None]:
# False negatives on the sentence data for the second of two method configurations.
dataset = "random_full"
method_1 = "health_bert-encyclopedia-noun_phrase"
method_2 = "discriminative_weight-encyclopedia-(1, 1)-1"
threshold_1 = "0.64"
threshold_2 = "35"
# Each label is built from the threshold of its own method (both previously used
# the `threshold` variable left over from another cell).
label_name_1 = "-".join((method_1, "or", threshold_1)) + "-medical"
label_name_2 = "-".join((method_2, "or", threshold_2)) + "-medical"
# The cell used to read `method` and `label_name` from the preceding cell, whose
# values are exactly method_2 / label_name_2; referencing them directly gives the
# same result without depending on that cell having run.
# NOTE(review): label_name_1 is prepared but not used yet.
score_name = "medical_score-cause-" + method_2
filter_bool = (
    (sentence_test_causenet.dataset == dataset)
    & ~sentence_test_causenet_medical[label_name_2]
    & sentence_test_causenet.evaluation
)
sentence_test_causenet.loc[filter_bool, ["cause", score_name, "evaluation"]].rename(
    {score_name: "medical_score"}, axis=1
)

# Sentence vs Phrase Analysis

In [None]:
# Prediction columns belonging to the DW / encyclopedia / (1, 3) configuration.
[
    column
    for column in test_causenet_medical
    if "discriminative_weight-encyclopedia-(1, 3)" in column
]

In [None]:
# Compare phrase-level and sentence-level predictions of one approach and export
# the relations on which they differ.
precision_threshold = 0.0
dataset = "random_full"
approach = "health_bert"
phrase_approaches = pd.read_csv(
    f"best_approaches_{precision_threshold}.csv", index_col=0
)
phrase_approaches = phrase_approaches.loc[phrase_approaches.dataset == "random_full"]
# NOTE(review): the sentence cell above writes
# "sentence_best_approaches_<optimization_score>_<precision_threshold>.csv";
# the name read here has no score part — confirm the file exists.
sentence_approaches = pd.read_csv(
    f"sentence_best_approaches_{precision_threshold}.csv", index_col=0
)

phrase_predictions = []
sentence_predictions = []
# The prediction column is looked up with the threshold as stored; on a KeyError the
# lookup is retried with the threshold converted to int.
# NOTE(review): the loop variables rebind the module-level names `method`,
# `operator` (also the imported module) and `threshold`.
for _, method, operator, threshold, *_ in phrase_approaches.values:
    try:
        label = "-".join([method, operator, str(threshold), "medical"])
        phrase_predictions.append(
            test_causenet_medical.loc[test_causenet.dataset == dataset, label]
        )
    except KeyError:
        threshold = int(threshold)
        label = "-".join([method, operator, str(threshold), "medical"])
        phrase_predictions.append(
            test_causenet_medical.loc[test_causenet.dataset == dataset, label]
        )
# NOTE(review): the sentence predictions are selected with a mask built from
# test_causenet, not sentence_test_causenet — confirm both frames share the same index.
for _, method, operator, threshold, *_ in sentence_approaches.values:
    try:
        label = "-".join([method, operator, str(threshold), "medical"])
        sentence_predictions.append(
            sentence_test_causenet_medical.loc[test_causenet.dataset == dataset, label]
        )
    except KeyError:
        threshold = int(threshold)
        label = "-".join([method, operator, str(threshold), "medical"])
        sentence_predictions.append(
            sentence_test_causenet_medical.loc[test_causenet.dataset == dataset, label]
        )
phrase_predictions = pd.concat(phrase_predictions, axis=1)
sentence_predictions = pd.concat(sentence_predictions, axis=1)
# Columns are renamed to the part of the label before the first "-".
phrase_predictions.columns = [
    column.split("-")[0] for column in phrase_predictions.columns
]
sentence_predictions.columns = [
    column.split("-")[0] for column in sentence_predictions.columns
]
different_predictions = phrase_predictions != sentence_predictions
# Rows where `approach` differs between the two levels, joined (on the index) with
# both predictions and both labels.
different_predictions = sentence_test_causenet.loc[sentence_test_causenet.dataset == dataset].loc[
    different_predictions[approach].values, ["cause", "effect", "sentence"]
].join(phrase_predictions[approach].rename("phrase_prediction")).join(
    sentence_predictions[approach].rename("sentence_prediction")
).join(
    test_causenet.evaluation.rename("phrase_label")
).join(
    sentence_test_causenet.manual_evaluation.rename("sentence_label")
).loc[
    :,
    [
        "cause",
        "effect",
        "sentence",
        "phrase_label",
        "sentence_label",
        "phrase_prediction",
        "sentence_prediction",
    ],
]
different_predictions.to_csv(
    f"sentence_phrase_differences_{approach}.csv"
)
print("number of health related predictions:")
print(different_predictions[["phrase_prediction", "sentence_prediction"]].sum())
print()
print("number of incorrect health related predictions:")
print(pd.Series([(different_predictions.phrase_prediction != different_predictions.phrase_label).sum(), 
     (different_predictions.sentence_prediction != different_predictions.sentence_label).sum()], index=["phrase_prediction", "sentence_prediction"]))
print()
different_predictions

# Wikidata Error Analysis

In [None]:
# Listed rows whose cause is "influenza virus".
health_causenet_errors.loc[health_causenet_errors.cause == "influenza virus"]

In [None]:
# LaTeX rows for the first ten listed relations: first column, second-to-last column,
# second column, last column.
pprint(health_causenet_errors.iloc[:, [0, -2, 1, -1]].values[:10])

In [None]:
# All test relations whose cause or effect origin equals "wd:Q87075524".
test_causenet.loc[
    (test_causenet.cause_origin == "wd:Q87075524")
    | (test_causenet.effect_origin == "wd:Q87075524"),
    ["cause", "effect", "cause_origin", "effect_origin"],
]

In [None]:
# which origins are exclusive
# First 40 listed relations whose cause or effect origin equals "wd:Q87075524".
health_causenet_errors.loc[
    (health_causenet_errors.cause_origin == "wd:Q87075524")
    | (health_causenet_errors.effect_origin == "wd:Q87075524")
].head(40)

In [None]:
# Per Wikidata origin: how often it occurs among correctly found ("tp") and missed
# ("fn") relations, and the resulting share of "tp".
# Column 5 of health_causenet_errors is read as the binary prediction.
# NOTE(review): confirm health_causenet_errors still has the column order of the
# error-listing cell when this cell runs.
tp = health_causenet_errors.iloc[:, 5]
fn = ~health_causenet_errors.iloc[:, 5]
error_origin = (
    test_wikidata.set_index(["cause", "effect"])
    .loc[
        health_causenet_errors.set_index(["cause", "effect"]).index,
        ["cause_origin", "effect_origin"],
    ]
    .reset_index()
)
error_origin["label"] = "tp"
error_origin.loc[fn.values, "label"] = "fn"
# Origins are "|"-separated; split them into one row per (label, term, origin).
error_cause_origin = (
    error_origin.set_index(["label", "cause"])
    .cause_origin.str.split("|", expand=True)
    .stack()
    .reset_index()
    .drop("level_2", axis=1)
    .drop_duplicates()
    .rename({0: "origin"}, axis=1)
)
# The effect side reads effect_origin (it previously split cause_origin again, so
# effect origins were never counted).
error_effect_origin = (
    error_origin.set_index(["label", "effect"])
    .effect_origin.str.split("|", expand=True)
    .stack()
    .reset_index()
    .drop("level_2", axis=1)
    .drop_duplicates()
    .rename({0: "origin"}, axis=1)
)
# fill_value=0 keeps origins that occur on only one side; a plain "+" turns their
# counts into NaN. Missing label/origin combinations count as 0 after unstacking.
origin_value_counts = (
    error_cause_origin.groupby("label")
    .origin.value_counts()
    .add(error_effect_origin.groupby("label").origin.value_counts(), fill_value=0)
    .unstack(0, fill_value=0)
)
origin_value_counts["perc"] = origin_value_counts.tp / origin_value_counts.sum(axis=1)
origin_value_counts.sort_values("perc")