# SBIC corpus evaluations


## Loading & Prep


In [None]:
import pandas as pd
import json
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
)
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from tabulate import tabulate
from typing import List

In [None]:
# Seeds identifying the individual runs of each approach; every seed has its
# own prediction file ("...-seed{seed}.csv") and its own prediction columns.
SEEDS = [23, 42, 271, 314, 1337]

In [None]:
# Seed NumPy's global RNG so that everything drawn from np.random in this
# notebook is reproducible.
np.random.seed(42)

# Binary SBIC label columns that are evaluated.
SBIC_CATEGORICAL_COLUMNS = [
    "groupYN",
    "intentYN",
    "lewdYN",
    "offensiveYN",
    "ingroupYN",
]

# The two baselines, followed by every encoder / training-setup combination
# (grouped by encoder).
approaches = ["majority", "random"] + [
    f"{encoder}-{setup}"
    for encoder in ("deberta-v3-base", "roberta-base", "bert-base-uncased")
    for setup in ("finetune", "two-task-mtl", "joint-mtl", "joint-mtl-no-aae")
]

In [None]:
# Load the AAE-annotated SBIC data, one CSV per split.
_sbic_split_files = {
    "train": "aae-classification/output/sbic-train_aae-annotated-deberta-v3-base-aee-classifier.csv",
    "val": "aae-classification/output/sbic-val_aae-annotated-deberta-v3-base-aee-classifier.csv",
    "test": "aae-classification/output/sbic-test_aae-annotated-deberta-v3-base-aee-classifier.csv",
}
_sbic_splits = {
    split_name: pd.read_csv(csv_path)
    for split_name, csv_path in _sbic_split_files.items()
}

sbic_train = _sbic_splits["train"]
sbic_val = _sbic_splits["val"]
sbic_test = _sbic_splits["test"]

In [None]:
# Load results for STL models.
# For every encoder, the per-seed prediction files are left-joined on
# "post_id" into a single frame, starting from the post_id column of the
# first seed's file.
_stl_path_templates = [
    "finetuning/output/sbic-test_predictions-deberta-v3-base-finetune-seed{seed}.csv",
    "finetuning/output/sbic-test_predictions-roberta-base-finetune-seed{seed}.csv",
    "finetuning/output/sbic-test_predictions-bert-base-uncased-finetune-seed{seed}.csv",
]

_stl_frames = []
for _path_template in _stl_path_templates:
    _merged = pd.DataFrame()
    for seed in SEEDS:
        results_seed = pd.read_csv(_path_template.format(seed=seed))
        if "post_id" not in _merged.columns:
            _merged["post_id"] = results_seed["post_id"]
        _merged = _merged.merge(results_seed, on="post_id", how="left")
    _stl_frames.append(_merged)

(
    finetuned_test,
    finetuned_test_roberta_base,
    finetuned_test_bert_base_uncased,
) = _stl_frames

In [None]:
# Load results for two-task models.
# For every encoder, the per-seed prediction files are left-joined on
# "post_id" into a single frame, starting from the post_id column of the
# first seed's file.
_two_task_path_templates = [
    "joint-multitask-learning/output/sbic-test_predictions-deberta-v3-base-two-task-mtl-seed{seed}.csv",
    "joint-multitask-learning/output/sbic-test_predictions-roberta-base-two-task-mtl-seed{seed}.csv",
    "joint-multitask-learning/output/sbic-test_predictions-bert-base-uncased-two-task-mtl-seed{seed}.csv",
]

_two_task_frames = []
for _path_template in _two_task_path_templates:
    _merged = pd.DataFrame()
    for seed in SEEDS:
        results_seed = pd.read_csv(_path_template.format(seed=seed))
        if "post_id" not in _merged.columns:
            _merged["post_id"] = results_seed["post_id"]
        _merged = _merged.merge(results_seed, on="post_id", how="left")
    _two_task_frames.append(_merged)

(
    two_task_mtl_test,
    two_task_mtl_test_roberta_base,
    two_task_mtl_test_bert_base_uncased,
) = _two_task_frames

In [None]:
# Load results for MTL+AAE models.
# For every encoder, the per-seed prediction files are left-joined on
# "post_id" into a single frame, starting from the post_id column of the
# first seed's file.
_jmtl_path_templates = [
    "joint-multitask-learning/output/sbic-test_predictions-deberta-v3-base-joint-mtl-seed{seed}.csv",
    "joint-multitask-learning/output/sbic-test_predictions-roberta-base-joint-mtl-seed{seed}.csv",
    "joint-multitask-learning/output/sbic-test_predictions-bert-base-uncased-joint-mtl-seed{seed}.csv",
]

_jmtl_frames = []
for _path_template in _jmtl_path_templates:
    _merged = pd.DataFrame()
    for seed in SEEDS:
        results_seed = pd.read_csv(_path_template.format(seed=seed))
        if "post_id" not in _merged.columns:
            _merged["post_id"] = results_seed["post_id"]
        _merged = _merged.merge(results_seed, on="post_id", how="left")
    _jmtl_frames.append(_merged)

(
    jmtl_test,
    jmtl_test_roberta_base,
    jmtl_test_bert_base_uncased,
) = _jmtl_frames

In [None]:
# Load results for MTL-no-AAE models.
# For every encoder, the per-seed prediction files are left-joined on
# "post_id" into a single frame, starting from the post_id column of the
# first seed's file.
_jmtl_no_aae_path_templates = [
    "joint-multitask-learning/output/sbic-test_predictions-deberta-v3-base-joint-mtl-no-aae-seed{seed}.csv",
    "joint-multitask-learning/output/sbic-test_predictions-roberta-base-joint-mtl-no-aae-seed{seed}.csv",
    "joint-multitask-learning/output/sbic-test_predictions-bert-base-uncased-joint-mtl-no-aae-seed{seed}.csv",
]

_jmtl_no_aae_frames = []
for _path_template in _jmtl_no_aae_path_templates:
    _merged = pd.DataFrame()
    for seed in SEEDS:
        results_seed = pd.read_csv(_path_template.format(seed=seed))
        if "post_id" not in _merged.columns:
            _merged["post_id"] = results_seed["post_id"]
        _merged = _merged.merge(results_seed, on="post_id", how="left")
    _jmtl_no_aae_frames.append(_merged)

(
    jmtl_no_aae_test,
    jmtl_no_aae_test_roberta_base,
    jmtl_no_aae_test_bert_base_uncased,
) = _jmtl_no_aae_frames

In [None]:
n_test = len(sbic_test)

# Majority baselines: the most frequent value of each label column, repeated
# once per test row.
# NOTE(review): the majority label is taken from the test split itself, not
# from sbic_train — confirm that this is intended.
groupYN_majority_label = sbic_test["groupYN"].value_counts().index[0]
intentYN_majority_label = sbic_test["intentYN"].value_counts().index[0]
lewdYN_majority_label = sbic_test["lewdYN"].value_counts().index[0]
offensiveYN_majority_label = sbic_test["offensiveYN"].value_counts().index[0]
ingroupYN_majority_label = sbic_test["ingroupYN"].value_counts().index[0]

groupYN_majority_predictions = np.full(shape=n_test, fill_value=groupYN_majority_label)
intentYN_majority_predictions = np.full(shape=n_test, fill_value=intentYN_majority_label)
lewdYN_majority_predictions = np.full(shape=n_test, fill_value=lewdYN_majority_label)
offensiveYN_majority_predictions = np.full(
    shape=n_test, fill_value=offensiveYN_majority_label
)
ingroupYN_majority_predictions = np.full(
    shape=n_test, fill_value=ingroupYN_majority_label
)

# Pseudo-random baselines: an independent draw from {0, 1} per test row.
# The draw order (group, intent, lewd, offensive, ingroup) is kept so the
# values produced by the global RNG stay the same.
groupYN_random_predictions = np.random.randint(0, 2, size=n_test)
intentYN_random_predictions = np.random.randint(0, 2, size=n_test)
lewdYN_random_predictions = np.random.randint(0, 2, size=n_test)
offensiveYN_random_predictions = np.random.randint(0, 2, size=n_test)
ingroupYN_random_predictions = np.random.randint(0, 2, size=n_test)

In [None]:
# Collect the test data and every approach's predictions in one frame.
predictions_df = sbic_test.copy()

# Baseline predictions (random first, then majority), one column per label.
_baseline_columns = {
    "prediction_groupYN_random": groupYN_random_predictions,
    "prediction_intentYN_random": intentYN_random_predictions,
    "prediction_lewdYN_random": lewdYN_random_predictions,
    "prediction_offensiveYN_random": offensiveYN_random_predictions,
    "prediction_ingroupYN_random": ingroupYN_random_predictions,
    "prediction_groupYN_majority": groupYN_majority_predictions,
    "prediction_intentYN_majority": intentYN_majority_predictions,
    "prediction_lewdYN_majority": lewdYN_majority_predictions,
    "prediction_offensiveYN_majority": offensiveYN_majority_predictions,
    "prediction_ingroupYN_majority": ingroupYN_majority_predictions,
}
for _column_name, _column_values in _baseline_columns.items():
    predictions_df[_column_name] = _column_values

# Model predictions, left-joined on post_id in this order.
# NOTE(review): the metrics computed later compare these columns row by row
# with sbic_test, which presumably relies on post_id being unique in every
# joined frame — confirm.
_model_prediction_frames = [
    finetuned_test,  # Singletask
    finetuned_test_roberta_base,  # Singletask (roberta-base)
    finetuned_test_bert_base_uncased,  # Singletask (bert-base-uncased)
    two_task_mtl_test,  # Two-task
    two_task_mtl_test_roberta_base,  # Two-task (roberta-base)
    two_task_mtl_test_bert_base_uncased,  # Two-task (bert-base-uncased)
    jmtl_test,  # MTL
    jmtl_test_roberta_base,  # MTL (roberta-base)
    jmtl_test_bert_base_uncased,  # MTL (bert-base-uncased)
    jmtl_no_aae_test,  # MTL (No AAE)
    jmtl_no_aae_test_roberta_base,  # MTL (No AAE) (roberta-base)
    jmtl_no_aae_test_bert_base_uncased,  # MTL (No AAE) (bert-base-uncased)
]
for _model_frame in _model_prediction_frames:
    predictions_df = pd.merge(predictions_df, _model_frame, on="post_id", how="left")


def _belongs_to_approach(column: str, approach: str) -> bool:
    """Return True if ``column`` holds values of exactly ``approach``.

    Prediction columns are named ``prediction_<label>_<approach>`` for the
    baselines and ``prediction_<label>_<approach>-seed<seed>`` for the models.
    A plain substring test is not enough: ``...-joint-mtl`` is a prefix of
    ``...-joint-mtl-no-aae`` and would also pick up those columns.
    """
    return column.endswith(f"_{approach}") or f"_{approach}-seed" in column


# One list of column names per entry of ``approaches`` (same order).
prediction_columns_by_approach = [
    [
        column
        for column in predictions_df.columns
        if _belongs_to_approach(column, approach)
    ]
    for approach in approaches
]

## Dataset


In [None]:
# Report how many test rows carry label value 0 and 1 for every label column,
# as absolute count and as a share of all test rows.
print("Class distributions (test set)")
n_test_rows = len(sbic_test)
for label in SBIC_CATEGORICAL_COLUMNS:
    print(f"==={label}")
    label_values = sbic_test[label]
    num_neg = int((label_values == 0).sum())
    num_pos = int((label_values == 1).sum())
    neg_share = num_neg / n_test_rows
    pos_share = num_pos / n_test_rows
    print(f"Negative: {num_neg} ({neg_share:.2f})")
    print(f"Positive: {num_pos} ({pos_share:.2f})")

## Classification scores


### Overall


In [None]:
def _overall_scores(y_true, y_pred, pos_label=None):
    """Return ``[accuracy, precision, recall, f1]`` for one prediction column.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        pos_label: ``0`` or ``1`` for binary scores of that class, ``None``
            for macro-averaged scores.
    """
    if pos_label is None:
        averaging = {"average": "macro"}
    else:
        averaging = {"average": "binary", "pos_label": pos_label}
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, **averaging
    )
    # Accuracy does not depend on the averaging mode; it is reported in every
    # table (the macro table previously showed a constant 0 placeholder).
    return [accuracy_score(y_true=y_true, y_pred=y_pred), precision, recall, f1]


# For every label: score each approach on the full test set, per seed and
# averaged over seeds, then print the three tables and spreadsheet-style rows.
for label in SBIC_CATEGORICAL_COLUMNS:
    print("=" * 40)
    print(label)
    print("=" * 40)

    table_0_headers = ["Accuracy (0)", "Precision (0)", "Recall (0)", "F1 (0)"]
    table_0 = []
    table_1_headers = ["Accuracy (1)", "Precision (1)", "Recall (1)", "F1 (1)"]
    table_1 = []
    table_macro_headers = [
        "Accuracy",
        "Precision (macro)",
        "Recall (macro)",
        "F1 (macro)",
    ]
    table_macro = []

    # Each table paired with the pos_label its scores are computed for
    # (None = macro average).
    score_tables = [(table_0, 0), (table_1, 1), (table_macro, None)]

    for approach_name in approaches:
        is_baseline = approach_name in ("majority", "random")
        # One list of per-seed score rows per table, used for the averages.
        per_seed_scores = [[] for _ in score_tables]

        for seed in SEEDS:
            # Baselines have one seed-independent column, so their per-seed
            # rows are identical.
            if is_baseline:
                prediction = predictions_df[f"prediction_{label}_{approach_name}"]
            else:
                prediction = predictions_df[
                    f"prediction_{label}_{approach_name}-seed{seed}"
                ]

            for (table, pos_label), collected in zip(score_tables, per_seed_scores):
                scores = _overall_scores(sbic_test[label], prediction, pos_label)
                collected.append(scores)
                table.append(
                    [
                        f"{approach_name}-seed{seed}".capitalize(),
                        *np.round(scores, decimals=3),
                    ]
                )

        # Average over seeds, column by column.
        for (table, _), collected in zip(score_tables, per_seed_scores):
            table.append(
                [
                    f"{approach_name}-average".capitalize(),
                    *np.round(np.mean(collected, axis=0), decimals=3),
                ]
            )

    print("Negative class binary scores")
    print(tabulate(table_0, headers=table_0_headers, tablefmt="rounded_grid"))
    for appr in table_0:
        print(f"{appr[0]}: {appr[1]},,,{appr[2]},,,,{appr[3]},,,,{appr[4]}")

    print("Positive class binary scores")
    print(tabulate(table_1, headers=table_1_headers, tablefmt="rounded_grid"))
    for appr in table_1:
        print(f"{appr[0]}: ,,{appr[2]},,,,{appr[3]},,,,{appr[4]}")

    print("Macro averaged scores")
    print(tabulate(table_macro, headers=table_macro_headers, tablefmt="rounded_grid"))
    for appr in table_macro:
        print(f"{appr[0]}: ,,,,{appr[2]},,,,{appr[3]},,,,{appr[4]}")

    # Print full approach table row
    print("Full table rows")
    for row_0, row_1, row_macro in zip(table_0, table_1, table_macro):
        print(
            f"{row_0[0]}: "  # Approach name
            f"{row_0[1]},,"  # Accuracy
            f"{row_1[2]},{row_0[2]},{row_macro[2]},,"  # Precision
            f"{row_1[3]},{row_0[3]},{row_macro[3]},,"  # Recall
            f"{row_1[4]},{row_0[4]},{row_macro[4]}"  # F1
        )

### Per dialect


In [None]:
# Print number of samples per class per dialect, for every data split.
_named_splits = {"train": sbic_train, "val": sbic_val, "test": sbic_test}
for split_name, split in _named_splits.items():
    print("=" * 60)
    print(f"Split: {split_name}")
    dialect_flags = split["aae_dialect"]
    aae_samples = split[dialect_flags == 1]
    non_aae_samples = split[dialect_flags == 0]
    n_aae = len(aae_samples)
    n_non_aae = len(non_aae_samples)
    print(f"Total AAE:\t{n_aae}")
    print(f"Total Non-AAE:\t{n_non_aae}")

    # Number of samples per class
    for label in SBIC_CATEGORICAL_COLUMNS:
        separator = "-" * 40
        print(separator)
        print(label)
        print(separator)

        aae_pos = int((aae_samples[label] == 1).sum())
        aae_neg = int((aae_samples[label] == 0).sum())
        non_aae_pos = int((non_aae_samples[label] == 1).sum())
        non_aae_neg = int((non_aae_samples[label] == 0).sum())

        print(f"AAE, positive:\t\t{aae_pos}")
        print(f"AAE, negative:\t\t{aae_neg}")
        print(f"Non-AAE, positive:\t{non_aae_pos}")
        print(f"Non-AAE, negative:\t{non_aae_neg}")

In [None]:
def get_instance_counts(y_true: List[int], y_pred: List[int], pos_label: int = 1):
    """Count the confusion-matrix cells of a binary classification.

    Only works for binary labels 0/1: whichever of the two is not
    ``pos_label`` is treated as the negative class.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        pos_label: Label regarded as the positive class.

    Returns:
        The tuple ``(tp, tn, fp, fn)``.
    """
    gold = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    neg_label = 1 if pos_label != 1 else 0

    gold_pos, gold_neg = gold == pos_label, gold == neg_label
    predicted_pos, predicted_neg = predicted == pos_label, predicted == neg_label

    tp = np.sum(predicted_pos & gold_pos)
    tn = np.sum(predicted_neg & gold_neg)
    fp = np.sum(predicted_pos & gold_neg)
    fn = np.sum(predicted_neg & gold_pos)

    return (tp, tn, fp, fn)


def true_positive_rate(
    y_true: List[int], y_pred: List[int], average: str = "binary", pos_label: int = 1
) -> float:
    """Return the true positive rate, tp / (tp + fn).

    This is the same quantity as recall, so the computation is delegated to
    scikit-learn's ``recall_score`` with the given ``average`` and
    ``pos_label``.
    """
    scoring_options = {"average": average, "pos_label": pos_label}
    return recall_score(y_true=y_true, y_pred=y_pred, **scoring_options)


def false_positive_rate(
    y_true: List[int], y_pred: List[int], average: str = "binary", pos_label: int = 1
) -> float:
    """Return the false positive rate, fp / (fp + tn).

    Args:
        y_true: Gold labels (binary, 0/1).
        y_pred: Predicted labels, aligned with ``y_true``.
        average: ``"binary"`` for the rate of ``pos_label`` only, ``"macro"``
            for the unweighted mean of the rates of class 0 and class 1.
        pos_label: Class treated as positive; only used for ``"binary"``.

    Returns:
        The rate, or NaN when there are no negative instances (fp + tn == 0).

    Raises:
        ValueError: If ``average`` is neither ``"binary"`` nor ``"macro"``
            (previously this silently returned ``None``).
    """

    def _rate_for(label: int) -> float:
        _, tn, fp, _ = get_instance_counts(y_true, y_pred, pos_label=label)
        negatives = fp + tn
        # Without negative instances the rate is undefined; return NaN
        # explicitly instead of relying on a 0/0 runtime warning.
        return fp / negatives if negatives else float("nan")

    if average == "binary":
        return _rate_for(pos_label)
    if average == "macro":
        return (_rate_for(0) + _rate_for(1)) / 2
    raise ValueError(
        f"Unsupported average {average!r}; expected 'binary' or 'macro'."
    )


def positive_predicted_value(
    y_true: List[int], y_pred: List[int], average: str = "binary", pos_label: int = 1
) -> float:
    """Return the positive predictive value, tp / (tp + fp).

    This is the same quantity as precision, so the computation is delegated
    to scikit-learn's ``precision_score`` with the given ``average`` and
    ``pos_label``.
    """
    scoring_options = {"average": average, "pos_label": pos_label}
    return precision_score(y_true=y_true, y_pred=y_pred, **scoring_options)

In [None]:
def _dialect_scores(y_true, y_pred, pos_label=None):
    """Return ``[accuracy, precision, recall, f1, tpr, fpr, ppv]``.

    Args:
        y_true: Gold labels of the rows under evaluation.
        y_pred: Predicted labels, aligned with ``y_true``.
        pos_label: ``0`` or ``1`` for binary scores of that class, ``None``
            for macro-averaged scores.
    """
    if pos_label is None:
        averaging = {"average": "macro"}
    else:
        averaging = {"average": "binary", "pos_label": pos_label}
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, **averaging
    )
    return [
        # Accuracy does not depend on the averaging mode (the macro rows
        # previously stored a constant 0 placeholder here).
        accuracy_score(y_true=y_true, y_pred=y_pred),
        precision,
        recall,
        f1,
        true_positive_rate(y_true=y_true, y_pred=y_pred, **averaging),
        false_positive_rate(y_true=y_true, y_pred=y_pred, **averaging),
        positive_predicted_value(y_true=y_true, y_pred=y_pred, **averaging),
    ]


# Same evaluation as the overall scores, but restricted to the rows of one
# dialect at a time (aae_dialect == 0, then aae_dialect == 1).
for dialect in [0, 1]:
    print("-" * 80)
    print("AAE" if dialect == 1 else "Non AAE")
    print("-" * 80)

    # Rows of the current dialect; filtered once instead of once per seed.
    dialect_gold = sbic_test[sbic_test["aae_dialect"] == dialect]
    dialect_predictions = predictions_df[predictions_df["aae_dialect"] == dialect]

    for label in SBIC_CATEGORICAL_COLUMNS:
        print("=" * 40)
        print(label)
        print("=" * 40)

        table_0_headers = [
            "Accuracy (0)",
            "Precision (0)",
            "Recall (0)",
            "F1 (0)",
            "TPR (0)",
            "FPR (0)",
            "PPV (0)",
        ]
        table_0 = []
        table_1_headers = [
            "Accuracy (1)",
            "Precision (1)",
            "Recall (1)",
            "F1 (1)",
            "TPR (1)",
            "FPR (1)",
            "PPV (1)",
        ]
        table_1 = []
        table_macro_headers = [
            "Accuracy",
            "Precision (macro)",
            "Recall (macro)",
            "F1 (macro)",
            "TPR (macro)",
            "FPR (macro)",
            "PPV (macro)",
        ]
        table_macro = []

        # Each table paired with the pos_label its scores are computed for
        # (None = macro average).
        score_tables = [(table_0, 0), (table_1, 1), (table_macro, None)]

        y_true = dialect_gold[label]
        all_approach_result_string = []

        for approach_name in approaches:
            is_baseline = approach_name in ("majority", "random")
            # One list of per-seed score rows per table, used for the averages.
            per_seed_scores = [[] for _ in score_tables]

            for seed in SEEDS:
                # Baselines have one seed-independent column, so their
                # per-seed rows are identical.
                if is_baseline:
                    prediction = dialect_predictions[
                        f"prediction_{label}_{approach_name}"
                    ]
                else:
                    prediction = dialect_predictions[
                        f"prediction_{label}_{approach_name}-seed{seed}"
                    ]

                for (table, pos_label), collected in zip(
                    score_tables, per_seed_scores
                ):
                    scores = _dialect_scores(y_true, prediction, pos_label)
                    collected.append(scores)
                    table.append(
                        [
                            f"{approach_name}-seed{seed}".capitalize(),
                            *np.round(scores, decimals=3),
                        ]
                    )

            # Average over seeds, column by column.
            for (table, _), collected in zip(score_tables, per_seed_scores):
                table.append(
                    [
                        f"{approach_name}-average".capitalize(),
                        *np.round(np.mean(collected, axis=0), decimals=3),
                    ]
                )

        all_approach_result_string.append("Full results tables")
        all_approach_result_string.append(label)
        all_approach_result_string.append("no AAE" if dialect == 0 else "AAE")

        # The AAE rows carry one extra leading comma after the approach name;
        # apart from that both dialects use the same row layout.
        name_suffix = ": " if dialect == 0 else ": ,"
        for row_0, row_1, row_macro in zip(table_0, table_1, table_macro):
            approach_label = row_0[0]
            # Only the seed-averaged rows of the RoBERTa approaches are
            # exported (names are capitalised, hence "Roberta").
            if "average" not in approach_label or "Roberta" not in approach_label:
                continue
            all_approach_result_string.append(
                f"{approach_label}{name_suffix}"  # Approach name
                f"{row_0[1]},,,"  # Accuracy
                f"{row_1[2]},,{row_0[2]},,{row_macro[2]},,,"  # Precision
                f"{row_1[3]},,{row_0[3]},,{row_macro[3]},,,"  # Recall
                f"{row_1[4]},,{row_0[4]},,{row_macro[4]},,,"  # F1
                f"{row_1[5]},,,,"  # TPR
                f"{row_1[6]},,,,"  # FPR
                f"{row_1[7]},,,,"  # PPV
            )

        for result_line in all_approach_result_string:
            print(result_line)

## Significance tests


### $t$-test


In [None]:
from scipy.stats import shapiro, ttest_rel, ttest_1samp, wilcoxon, ttest_ind

Test whether the performance of approach A is significantly higher than the performance of approach B.


In [None]:
# Significance level that the p-values of the tests below are compared against.
alpha = 0.05
# Number of chunks the test-set predictions are divided into for the split-based tests.
n_splits = 10
# NOTE(review): not referenced by the significance cells visible in this section —
# confirm whether it is still needed.
approach_to_test = ""

#### Prepare data


In [None]:
# Divide the test-set predictions row-wise into `n_splits` consecutive chunks of
# (near-)equal size. The chunk boundaries are computed on positional row indices and
# applied through `.iloc` instead of handing the DataFrame itself to `np.array_split`:
# the latter goes through `DataFrame.swapaxes`, which is deprecated in recent pandas
# releases. The resulting chunks contain the same rows in the same order.
sbic_test_splits = [
    predictions_df.iloc[row_positions]
    for row_positions in np.array_split(np.arange(len(predictions_df)), n_splits)
]

#### Significance tests for selected approaches (with seeds)


In [None]:
# Alternative setting (disabled): compare each approach to its respective ablations
# approaches = ["deberta-v3-base-joint-mtl-no-aae", "deberta-v3-base-joint-mtl"]
# baselines = ["deberta-v3-base-two-task-mtl", "deberta-v3-base-finetune"]
# ttest_function = ttest_rel

# Active setting: compare the AAE approaches to their non-AAE variants
approaches = ["deberta-v3-base-two-task-mtl", "deberta-v3-base-joint-mtl"]
baselines = ["deberta-v3-base-finetune", "deberta-v3-base-joint-mtl-no-aae"]
ttest_function = ttest_rel

# Every approach is tested against every baseline.
approach_pairs = [
    (candidate, reference) for candidate in approaches for reference in baselines
]

# Value of the "aae_dialect" column selected per setting; -1 means "use all rows".
_SEED_PRF_DIALECTS = {"aae": 1, "no-aae": 0, "overall": -1}
# Position in this tuple == index into the precision_recall_fscore_support result.
_SEED_PRF_SCORE_TITLES = ("Precision", "Recall", "F1 score")
# pos_label -> printed heading; -1 selects macro averaging instead of a single class.
_SEED_PRF_POS_LABEL_TITLES = {
    0: "Negative label",
    1: "Positive label",
    -1: "Macro averaged",
}


def _seed_prf_scores(frame, label, approach, score_index, pos_label):
    """Return one score per seed in SEEDS for `approach` on `label`.

    `frame` supplies the gold column `label` and the prediction columns
    `prediction_{label}_{approach}-seed{seed}`. `score_index` picks precision (0),
    recall (1) or F1 (2); `pos_label` of -1 requests the macro average, any other
    value the binary score for that class.
    """
    if pos_label == -1:
        averaging = {"average": "macro"}
    else:
        averaging = {"average": "binary", "pos_label": pos_label}
    return [
        precision_recall_fscore_support(
            y_pred=frame[f"prediction_{label}_{approach}-seed{seed}"],
            y_true=frame[label],
            **averaging,
        )[score_index]
        for seed in SEEDS
    ]


results = {}
for dialect, dialect_value in _SEED_PRF_DIALECTS.items():
    results[dialect] = {}

    # Restrict the predictions to the rows of the current dialect setting.
    if dialect_value == -1:
        dialect_frame = predictions_df
    else:
        dialect_frame = predictions_df[predictions_df["aae_dialect"] == dialect_value]

    for label in SBIC_CATEGORICAL_COLUMNS:
        banner = "=" * 30
        print(banner)
        print(dialect, label)
        print(banner)
        results[dialect][label] = {}

        for score_index, score_title in enumerate(_SEED_PRF_SCORE_TITLES):
            print("\n\n\n")
            print(score_title)
            results[dialect][label][score_index] = {}

            for pos_label, pos_label_title in _SEED_PRF_POS_LABEL_TITLES.items():
                print(pos_label_title)
                pair_results = {}
                results[dialect][label][score_index][pos_label] = pair_results

                for approach_A, approach_B in approach_pairs:
                    scores_A = _seed_prf_scores(
                        dialect_frame, label, approach_A, score_index, pos_label
                    )
                    scores_B = _seed_prf_scores(
                        dialect_frame, label, approach_B, score_index, pos_label
                    )

                    # Only test pairs in which A is better than B on average.
                    if not np.mean(scores_A) > np.mean(scores_B):
                        continue

                    # The baseline scores go first; halving the two-sided p-value
                    # gives the one-sided test of "A is better than B".
                    p_value = ttest_function(scores_B, scores_A)[1] / 2

                    if p_value <= alpha:
                        message = f"{approach_A} is significantly better than {approach_B} with p-value {p_value:.4f} (t-test)."
                        outcome = f"Significant with {p_value:.4f} (t-test)"
                    else:
                        message = f"{approach_A} is NOT significantly better than {approach_B} with p-value {p_value:.4f} (t-test)."
                        outcome = f"Not significant with {p_value:.4f} (t-test)"
                    print(message)
                    pair_results[f"{approach_A}___{approach_B}"] = outcome
                print("-" * 50)

#### Significance tests for selected approaches


In [None]:
# Alternative setting (disabled): compare each approach to its respective ablations
# approaches = ["deberta-v3-base-joint-mtl-no-aae", "deberta-v3-base-joint-mtl"]
# baselines = ["deberta-v3-base-two-task-mtl", "deberta-v3-base-finetuned"]
# ttest_function = ttest_rel

# Active setting: compare the AAE approaches to their non-AAE variants
# NOTE(review): this cell spells the baseline "deberta-v3-base-finetuned", while the
# seed-based cells use "deberta-v3-base-finetune" — confirm which name matches the
# seed-less prediction columns (`prediction_{label}_{approach}`) of the split frames.
approaches = ["deberta-v3-base-two-task-mtl", "deberta-v3-base-joint-mtl"]
baselines = ["deberta-v3-base-finetuned", "deberta-v3-base-joint-mtl-no-aae"]
ttest_function = ttest_rel

# Every approach is tested against every baseline.
approach_pairs = [(x, y) for x in approaches for y in baselines]

# Value of the "aae_dialect" column selected per setting; -1 means "use all rows".
_SPLIT_PRF_DIALECTS = {"aae": 1, "no-aae": 0, "overall": -1}
# Position in this tuple == index into the precision_recall_fscore_support result.
_SPLIT_PRF_SCORE_TITLES = ("Precision", "Recall", "F1 score")
# pos_label -> printed heading; -1 selects macro averaging instead of a single class.
_SPLIT_PRF_POS_LABEL_TITLES = {
    0: "Negative label",
    1: "Positive label",
    -1: "Macro averaged",
}


def _split_prf_scores(label, approach, score_index, pos_label, dialect_value):
    """Return one score per chunk in `sbic_test_splits` for `approach` on `label`.

    Each chunk is first restricted to rows whose "aae_dialect" equals
    `dialect_value` (no restriction for -1). `score_index` picks precision (0),
    recall (1) or F1 (2); `pos_label` of -1 requests the macro average, any other
    value the binary score for that class.
    """
    if pos_label == -1:
        averaging = {"average": "macro"}
    else:
        averaging = {"average": "binary", "pos_label": pos_label}

    scores = []
    for split in sbic_test_splits:
        if dialect_value != -1:
            split = split[split["aae_dialect"] == dialect_value]
        scores.append(
            precision_recall_fscore_support(
                y_pred=split[f"prediction_{label}_{approach}"],
                y_true=split[label],
                **averaging,
            )[score_index]
        )
    return scores


results = {}
for dialect, dialect_value in _SPLIT_PRF_DIALECTS.items():
    results[dialect] = {}

    for label in SBIC_CATEGORICAL_COLUMNS:
        print("=" * 30)
        print(dialect, label)
        print("=" * 30)
        results[dialect][label] = {}

        for score_index, score_title in enumerate(_SPLIT_PRF_SCORE_TITLES):
            print("\n\n\n")
            print(score_title)
            results[dialect][label][score_index] = {}

            for pos_label, pos_label_title in _SPLIT_PRF_POS_LABEL_TITLES.items():
                print(pos_label_title)
                pair_results = {}
                results[dialect][label][score_index][pos_label] = pair_results

                for approach_A, approach_B in approach_pairs:
                    scores_A = _split_prf_scores(
                        label, approach_A, score_index, pos_label, dialect_value
                    )
                    scores_B = _split_prf_scores(
                        label, approach_B, score_index, pos_label, dialect_value
                    )

                    # Only test pairs in which A is better than B on average.
                    if not np.mean(scores_A) > np.mean(scores_B):
                        continue

                    # Shapiro-Wilk on the paired differences: a p-value above alpha
                    # means normality can NOT be rejected, so the t-test is applicable.
                    score_differences = [a - b for a, b in zip(scores_A, scores_B)]
                    normality = shapiro(score_differences)[1]

                    if normality > alpha:
                        test_name = "t-test"
                        # Baseline scores go first; halving the two-sided p-value
                        # gives the one-sided test of "A is better than B".
                        p_value = ttest_function(scores_B, scores_A)[1] / 2
                    else:
                        test_name = "wilcoxon"
                        # The Wilcoxon signed-rank test is non-parametric and makes no
                        # normality assumption. It is run one-sided (B - A below zero)
                        # so that it tests the same hypothesis as the t-test branch.
                        p_value = wilcoxon(scores_B, scores_A, alternative="less")[1]

                    pair_key = f"{approach_A}___{approach_B}"
                    if p_value <= alpha:
                        print(
                            f"{approach_A} is significantly better than {approach_B} with p-value {p_value:.4f} ({test_name})."
                        )
                        pair_results[pair_key] = (
                            f"Significant with {p_value:.4f} ({test_name})"
                        )
                    else:
                        print(
                            f"{approach_A} is NOT significantly better than {approach_B} with p-value {p_value:.4f} ({test_name})."
                        )
                        pair_results[pair_key] = (
                            f"Not significant with {p_value:.4f} ({test_name})"
                        )
                print("-" * 50)

#### Significance test for fairness metrics


In [None]:
# Alternative setting (disabled): compare each approach to its respective ablations
# approaches = ["deberta-v3-base-joint-mtl-no-aae", "deberta-v3-base-joint-mtl"]
# baselines = ["deberta-v3-base-two-task-mtl", "deberta-v3-base-finetune"]
# ttest_function = ttest_rel

# Active setting: compare the AAE approaches to their non-AAE variants
approaches = ["deberta-v3-base-two-task-mtl", "deberta-v3-base-joint-mtl"]
baselines = ["deberta-v3-base-finetune", "deberta-v3-base-joint-mtl-no-aae"]
ttest_function = ttest_rel

# Every approach is tested against every baseline.
approach_pairs = [(x, y) for x in approaches for y in baselines]

# Value of the "aae_dialect" column selected per setting; -1 means "use all rows".
_FAIRNESS_DIALECTS = {"aae": 1, "no-aae": 0, "overall": -1}
# (printed heading, metric function, lower_is_better); the position in this tuple is
# the `score_index` key used in `results`.
_FAIRNESS_METRICS = (
    ("True Positive Rate", true_positive_rate, False),
    ("False Positive Rate", false_positive_rate, True),
    ("Positive Predictive Value", positive_predicted_value, False),
)
# pos_label -> printed heading; -1 selects macro averaging instead of a single class.
_FAIRNESS_POS_LABEL_TITLES = {
    0: "Negative label",
    1: "Positive label",
    -1: "Macro averaged",
}


def _seed_fairness_scores(metric_function, frame, label, approach, pos_label):
    """Return one `metric_function` value per seed in SEEDS for `approach` on `label`.

    `frame` supplies the gold column `label` and the prediction columns
    `prediction_{label}_{approach}-seed{seed}`. A `pos_label` of -1 calls the metric
    with `average="macro"`, any other value with `pos_label=pos_label`.
    """
    if pos_label == -1:
        selector = {"average": "macro"}
    else:
        selector = {"pos_label": pos_label}
    return [
        metric_function(
            y_true=frame[label],
            y_pred=frame[f"prediction_{label}_{approach}-seed{seed}"],
            **selector,
        )
        for seed in SEEDS
    ]


results = {}
for dialect, dialect_value in _FAIRNESS_DIALECTS.items():
    results[dialect] = {}

    # Restrict the predictions to the rows of the current dialect setting.
    if dialect_value == -1:
        dialect_frame = predictions_df
    else:
        dialect_frame = predictions_df[predictions_df["aae_dialect"] == dialect_value]

    for label in SBIC_CATEGORICAL_COLUMNS:
        print("=" * 30)
        print(dialect, label)
        print("=" * 30)
        results[dialect][label] = {}

        for score_index, (metric_title, metric_function, lower_is_better) in enumerate(
            _FAIRNESS_METRICS
        ):
            print("\n\n\n")
            print(metric_title)
            results[dialect][label][score_index] = {}

            for pos_label, pos_label_title in _FAIRNESS_POS_LABEL_TITLES.items():
                print(pos_label_title)
                pair_results = {}
                results[dialect][label][score_index][pos_label] = pair_results

                for approach_A, approach_B in approach_pairs:
                    scores_A = _seed_fairness_scores(
                        metric_function, dialect_frame, label, approach_A, pos_label
                    )
                    scores_B = _seed_fairness_scores(
                        metric_function, dialect_frame, label, approach_B, pos_label
                    )

                    # Only test pairs in which A is better than B on average; for the
                    # false positive rate "better" means lower.
                    if lower_is_better:
                        a_is_better = np.mean(scores_A) < np.mean(scores_B)
                    else:
                        a_is_better = np.mean(scores_A) > np.mean(scores_B)
                    if not a_is_better:
                        continue

                    # No normality check is applied here: the p-value always comes from
                    # `ttest_function`. Baseline scores go first; halving the two-sided
                    # p-value gives the one-sided test in the direction checked above.
                    p_value = ttest_function(scores_B, scores_A)[1] / 2

                    pair_key = f"{approach_A}___{approach_B}"
                    if p_value <= alpha:
                        print(
                            f"{approach_A} is significantly better than {approach_B} with p-value {p_value:.4f} (t-test)."
                        )
                        pair_results[pair_key] = (
                            f"Significant with {p_value:.4f} (t-test)"
                        )
                    else:
                        print(
                            f"{approach_A} is NOT significantly better than {approach_B} with p-value {p_value:.4f} (t-test)."
                        )
                        pair_results[pair_key] = (
                            f"Not significant with {p_value:.4f} (t-test)"
                        )
                print("-" * 50)

#### Significance tests for no-code baselines

(one-sample $t$-test)


In [None]:
approaches = [
    "deberta-v3-base-joint-mtl-no-aae",
    "deberta-v3-base-joint-mtl",
    "deberta-v3-base-two-task-mtl",
    "deberta-v3-base-finetune",
]
# One fixed reference score per label for baselines without predictions in
# `predictions_df`. They are compared against the positive-class F1 scores computed
# below — presumably reported F1 values; confirm their source.
no_code_baselines = {
    "GPT-2": {
        "offensiveYN": 0.788,
        "intentYN": 0.786,
        "lewdYN": 0.807,
        "groupYN": 0.699,
        "ingroupYN": 0.000,
    },
    "Few-shot learning": {
        "offensiveYN": 0.822,
        "intentYN": 0.798,
        "lewdYN": 0.411,
        "groupYN": 0.737,
        "ingroupYN": 0.000,
    },
}
# Every approach is tested against every no-code baseline.
approach_no_code_baseline_pairs = [
    (a, b) for a in approaches for b in no_code_baselines
]

results = {}
for label in SBIC_CATEGORICAL_COLUMNS:
    results[label] = {}

    print("=" * 30)
    print(f"For label '{label}'")
    print("=" * 30)
    print("-" * 50)

    # Positive-class F1 of every approach, one value per seed.
    seed_f1_by_approach = {}
    for approach in approaches:
        seed_f1_by_approach[approach] = [
            precision_recall_fscore_support(
                y_pred=predictions_df[f"prediction_{label}_{approach}-seed{seed}"],
                y_true=predictions_df[label],
                average="binary",
                pos_label=1,
            )[2]
            for seed in SEEDS
        ]

    # Test for significance
    for approach_A, approach_B in approach_no_code_baseline_pairs:
        scores_A = seed_f1_by_approach[approach_A]
        score_B = no_code_baselines[approach_B][label]
        pair_key = f"{approach_A}___{approach_B}"

        # Skip the test when A scores zero on average or does not beat the baseline.
        if (np.mean(scores_A) == 0.0) or (not np.mean(scores_A) > score_B):
            results[label][pair_key] = "Not better than second approach"
            continue

        # No normality check is performed before the one-sample t-test; see the
        # "Assumptions" section of
        # https://stats.libretexts.org/Bookshelves/Applied_Statistics/Biological_Statistics_(McDonald)/04%3A_Tests_for_One_Measurement_Variable/4.01%3A_One-Sample_t-Test ; visited on 2024-02-01
        # NOTE(review): that rationale relies on a large sample size, but the sample
        # here is one score per seed (len(SEEDS) values) — confirm this is acceptable.

        # Two-sided one-sample t-test of the seed scores against the baseline score ...
        t_results = ttest_1samp(scores_A, score_B)
        # ... halved to obtain the one-sided p-value, according to Hitchhiker's guide
        p_value = t_results[1] / 2

        if p_value <= alpha:
            print(
                f"{approach_A} is significantly better than {approach_B} with p-value {p_value:.4f}."
            )
            results[label][pair_key] = f"Significant with {p_value:.4f}"
        else:
            print(
                f"{approach_A} is NOT significantly better than {approach_B} with p-value {p_value:.4f}."
            )
            results[label][pair_key] = f"Not significant with {p_value:.4f}"

    print("-" * 50)

In [None]:
# Pretty-print the `results` dictionary filled by the most recently executed
# significance cell as indented JSON.
serialized_results = json.dumps(results, indent=4)
print(serialized_results)