# Imports


In [None]:
import pandas as pd
import hashlib
from os import listdir, path
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import json

from src.bias_detection.config.settings import RANDOM_SEED
from src.bias_detection.data_handler import DataHandler
import matplotlib.pyplot as plt
from collections import Counter

# Commonsense QA corpus


In [None]:
# Names of the prompted models under evaluation. They are matched as substrings
# of run-directory and prediction-file names when results are loaded below.
MODELS = [
    "mistral-7b-instruct-v2",
    "command-r-v01",
    "llama3-70b-instruct",
]

# Dataset(s) the composition predictor was trained on. Used to select the run
# directories and as the second half of the "<model>__<dataset>" result keys.
TRAINING_DATASETS = ["commonsense_qa"]

## Preparation


In [None]:
# Seed NumPy's global RNG so that the random draws further down (random
# baseline, ensemble tie-breaking) are reproducible across notebook runs.
np.random.seed(23)

In [None]:
# Load the dataset once and reduce every split to the two columns used in this
# notebook: the instance hash and the gold label (renamed to "true_label").
data_handler = DataHandler(datasets_to_load=["common_qa"])

commonsense_qa_train_df, commonsense_qa_val_df, commonsense_qa_test_df = (
    data_handler.common_qa[split_name].rename(columns={"label": "true_label"})[
        ["md5_hash", "true_label"]
    ]
    for split_name in ("train", "dev", "test")
)

In [None]:
commonsense_qa_composition_predictions_val_per_model = {}
commonsense_qa_composition_predictions_test_per_model = {}

output_dir = "outputs/composition-predictions"
# The directory content does not change while loading, so list it only once.
composition_run_dirs = sorted(listdir(output_dir))


def _load_composition_predictions(
    base_dir, run_dirs, model, training_dataset, seed, results_file
):
    """Load the composition predictions of one run and add a ``post_id`` column.

    Args:
        base_dir: Directory that contains one sub-directory per predictor run.
        run_dirs: Sorted names of the sub-directories of ``base_dir``.
        model: Name of the prompted model the predictor was trained for.
        training_dataset: Name of the dataset the predictor was trained on.
        seed: Random seed of the predictor run.
        results_file: Name of the parquet file inside the run directory.

    Returns:
        The loaded frame with ``post_id`` (MD5 hex digest of ``input``) added and
        ``pred_best_composition`` renamed to ``pred_best_composition_seed{seed}``.

    Raises:
        FileNotFoundError: If no run directory matches model, dataset and seed.
    """
    run_name = (
        f"deberta-v3-large_composition-prediction-for-{model}-on-{training_dataset}"
    )
    matching_dirs = [
        run_dir
        for run_dir in run_dirs
        if run_name in run_dir and f"seed{seed}" in run_dir
    ]
    if not matching_dirs:
        raise FileNotFoundError(
            f"No run directory containing '{run_name}' and 'seed{seed}' "
            f"found in '{base_dir}'"
        )

    # Several runs may match; the first one in sorted order is used.
    df = pd.read_parquet(path.join(base_dir, matching_dirs[0], results_file))
    df["post_id"] = df.input.apply(lambda x: hashlib.md5(x.encode()).hexdigest())
    return df.rename(
        columns={"pred_best_composition": f"pred_best_composition_seed{seed}"}
    )


for model in MODELS:
    for training_dataset in TRAINING_DATASETS:
        # Validation set: one merged frame with a prediction column per seed
        composition_predictions_val = pd.DataFrame()
        for seed in RANDOM_SEED:
            df = _load_composition_predictions(
                output_dir,
                composition_run_dirs,
                model,
                training_dataset,
                seed,
                "commonsense_qa_val_results.parquet",
            )

            # The first seed provides the instance columns all others join onto
            if "input" not in composition_predictions_val.columns:
                composition_predictions_val["input"] = df["input"]
                composition_predictions_val["post_id"] = df["post_id"]

            composition_predictions_val = pd.merge(
                composition_predictions_val,
                df.loc[
                    :,
                    [
                        i
                        for i in df.columns
                        if i not in ["input", "index", "pred_probabilities"]
                    ],
                ],
                on="post_id",
                how="left",
            )

        commonsense_qa_composition_predictions_val_per_model[
            f"{model}__{training_dataset}"
        ] = composition_predictions_val

        # Test set
        composition_predictions_test = pd.DataFrame()
        for seed in RANDOM_SEED:
            df = _load_composition_predictions(
                output_dir,
                composition_run_dirs,
                model,
                training_dataset,
                seed,
                "commonsense_qa_test_results.parquet",
            )

            if "input" not in composition_predictions_test.columns:
                composition_predictions_test["input"] = df["input"]
                composition_predictions_test["post_id"] = df["post_id"]

            # The test set contains duplicates, so a key-based merge is not an
            # option. Instead, both frames are brought into the same post_id
            # order (stable sort) and the predictions are copied by position.
            # ``to_numpy()`` is essential: assigning the Series itself would be
            # re-aligned on the row index by pandas, which undoes the sorting.
            df_sorted = df.sort_values(by="post_id", kind="mergesort")
            composition_predictions_test = composition_predictions_test.sort_values(
                by="post_id", kind="mergesort"
            )

            composition_predictions_test[f"pred_best_composition_seed{seed}"] = (
                df_sorted[f"pred_best_composition_seed{seed}"].to_numpy()
            )

        commonsense_qa_composition_predictions_test_per_model[
            f"{model}__{training_dataset}"
        ] = composition_predictions_test

In [None]:
# Load composition-specific prediction files


def reformat_input(text: str) -> str:
    """Convert a prompt input into the ``[Q] <question> [A] <answer>`` form.

    The first line of ``text`` is taken as the question. The answer is the text
    between the first two single quotes on the second line; any further lines
    are ignored.

    Raises:
        IndexError: If ``text`` does not contain a line break.
    """
    lines = text.split("\n")
    question, answer_line = lines[0], lines[1]

    # NOTE(review): the answer ends at the *next* single quote, so an answer that
    # itself contains an apostrophe would presumably be cut short -- confirm
    # against the prompt format.
    opening_quote = answer_line.find("'")
    closing_quote = answer_line.find("'", opening_quote + 1)

    return "[Q] " + question + " [A] " + answer_line[opening_quote + 1 : closing_quote]


commonsense_qa_output_dir = "outputs/prompt-predictions/commonsense_qa"
commonsense_qa_predictions_per_composition_val_per_model = {}
commonsense_qa_predictions_per_composition_test_per_model = {}


def _hash_reformatted_input(raw_input):
    """MD5 hex digest of the input after converting it with ``reformat_input``."""
    return hashlib.md5(reformat_input(raw_input).encode()).hexdigest()


for model in MODELS:
    # Validation set: every file whose name contains both "dev" and the model
    composition_files_val = []
    for f in sorted(listdir(commonsense_qa_output_dir)):
        if model in f and "dev" in f:
            composition_files_val.append(f)

    predictions_per_composition_val = {}
    for f in composition_files_val:
        # The composition name is the file name without its fixed prefix and
        # extension; files with "cot" in the name keep a "cot_" marker.
        if "cot" not in f:
            name_with_extension = f.replace(f"commonsense_qa-greedy-dev_{model}_", "")
        else:
            name_with_extension = f.replace(
                f"commonsense_qa-cot-greedy-dev_{model}_", "cot_"
            )
        composition_name = name_with_extension.replace(".parquet", "")

        df = pd.read_parquet(path.join(commonsense_qa_output_dir, f))
        df["post_id"] = df.input.apply(_hash_reformatted_input)
        predictions_per_composition_val[composition_name] = df

    commonsense_qa_predictions_per_composition_val_per_model[model] = (
        predictions_per_composition_val
    )

    # Test set: same procedure for the files containing "test"
    composition_files_test = []
    for f in sorted(listdir(commonsense_qa_output_dir)):
        if model in f and "test" in f:
            composition_files_test.append(f)

    predictions_per_composition_test = {}
    for f in composition_files_test:
        if "cot" not in f:
            name_with_extension = f.replace(f"commonsense_qa-greedy-test_{model}_", "")
        else:
            name_with_extension = f.replace(
                f"commonsense_qa-cot-greedy-test_{model}_", "cot_"
            )
        composition_name = name_with_extension.replace(".parquet", "")

        df = pd.read_parquet(path.join(commonsense_qa_output_dir, f))
        df["post_id"] = df.input.apply(_hash_reformatted_input)
        predictions_per_composition_test[composition_name] = df

    commonsense_qa_predictions_per_composition_test_per_model[model] = (
        predictions_per_composition_test
    )

In [None]:
def _count_label(split_df, label):
    """Return the number of rows of ``split_df`` whose ``true_label`` is ``label``."""
    return len(split_df[split_df.true_label == label])


def _print_label_shares(positives, negatives, total):
    """Print the absolute count and the rounded share of each label."""
    print(f"Positive label: {positives} ({np.round(positives / total, decimals=3)})")
    print(f"Negative label: {negatives} ({np.round(negatives / total, decimals=3)})")


print("## Training split")
positive_instances_train = _count_label(commonsense_qa_train_df, 1)
negative_instances_train = _count_label(commonsense_qa_train_df, 0)
_print_label_shares(
    positive_instances_train, negative_instances_train, len(commonsense_qa_train_df)
)

print("## Validation split")
positive_instances_val = _count_label(commonsense_qa_val_df, 1)
negative_instances_val = _count_label(commonsense_qa_val_df, 0)
_print_label_shares(
    positive_instances_val, negative_instances_val, len(commonsense_qa_val_df)
)

print("## Test split")
positive_instances_test = _count_label(commonsense_qa_test_df, 1)
negative_instances_test = _count_label(commonsense_qa_test_df, 0)
_print_label_shares(
    positive_instances_test, negative_instances_test, len(commonsense_qa_test_df)
)

## Adaptive prompting evaluation


In [None]:
commonsense_qa_composition_prediction_scores_per_model = {}


def _first_row_per_post_id(predictions):
    """Index a prediction frame by ``post_id``, keeping the first row of each id."""
    return predictions.drop_duplicates(subset="post_id", keep="first").set_index(
        "post_id"
    )


def _score_predicted_compositions(
    composition_predictions, lookup_per_composition, seed, split_name
):
    """Score the prompt outputs selected by the per-instance composition prediction.

    Args:
        composition_predictions: Frame with a ``post_id`` column and a
            ``pred_best_composition_seed{seed}`` column.
        lookup_per_composition: Maps a composition name to its prompt
            predictions, indexed by ``post_id`` (first row per id).
        seed: Seed whose predicted compositions and outputs are evaluated.
        split_name: Name of the split; only used in the warning message.

    Returns:
        The result of ``precision_recall_fscore_support`` for this seed.
    """
    predicted_column = f"pred_best_composition_seed{seed}"
    y_true_seed = []
    y_pred_seed = []
    skipped = 0

    for _, row in composition_predictions.iterrows():
        preds = lookup_per_composition[row[predicted_column]]

        # Instances without prompt predictions cannot be scored. They are left
        # out (as before), but are now counted and reported.
        if row.post_id not in preds.index:
            skipped += 1
            continue

        # If we don't have predictions for this seed, use the primary seed
        if f"output_{seed}" in preds.columns:
            output_column = f"output_{seed}"
        else:
            output_column = f"output_{RANDOM_SEED[0]}"

        y_pred_seed.append(preds.at[row.post_id, output_column])
        y_true_seed.append(preds.at[row.post_id, "true_label"])

    if skipped:
        print(
            f"Warning: skipped {skipped} of {len(composition_predictions)} "
            f"{split_name} instances for seed {seed} "
            "(post_id not found in the prompt predictions)."
        )

    return precision_recall_fscore_support(
        y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
    )


def _macro_per_seed(all_seed_scores, metric_position):
    """Per-seed macro average (mean of the two per-class values) of one metric."""
    return [
        ((seed_scores[metric_position][0] + seed_scores[metric_position][1]) / 2)
        for seed_scores in all_seed_scores
    ]


for model in commonsense_qa_composition_predictions_val_per_model.keys():
    model_name_without_data = model[: model.find("__")]

    # Build the post_id lookups once per model instead of scanning the whole
    # prediction frame for every single instance.
    lookup_val = {
        name: _first_row_per_post_id(frame)
        for name, frame in commonsense_qa_predictions_per_composition_val_per_model[
            model_name_without_data
        ].items()
    }
    lookup_test = {
        name: _first_row_per_post_id(frame)
        for name, frame in commonsense_qa_predictions_per_composition_test_per_model[
            model_name_without_data
        ].items()
    }

    # Validation split
    all_seed_scores_val = [
        _score_predicted_compositions(
            commonsense_qa_composition_predictions_val_per_model[model],
            lookup_val,
            seed,
            "validation",
        )
        for seed in RANDOM_SEED
    ]

    # Test split
    all_seed_scores_test = [
        _score_predicted_compositions(
            commonsense_qa_composition_predictions_test_per_model[model],
            lookup_test,
            seed,
            "test",
        )
        for seed in RANDOM_SEED
    ]

    test_precision_seed_scores = _macro_per_seed(all_seed_scores_test, 0)
    test_recall_seed_scores = _macro_per_seed(all_seed_scores_test, 1)
    test_f1_seed_scores = _macro_per_seed(all_seed_scores_test, 2)

    commonsense_qa_composition_prediction_scores_per_model[model] = {
        "test_macro_precision": np.mean(test_precision_seed_scores),
        "test_macro_precision_seed_scores": test_precision_seed_scores,
        "test_macro_recall": np.mean(test_recall_seed_scores),
        "test_macro_recall_seed_scores": test_recall_seed_scores,
        "test_macro_f1": np.mean(test_f1_seed_scores),
        "test_macro_f1_seed_scores": test_f1_seed_scores,
    }

    print(model, "=" * 50)
    print("Averaged scores")

    print(
        "Precision (macro) (over all seeds):",
        commonsense_qa_composition_prediction_scores_per_model[model][
            "test_macro_precision"
        ],
    )
    print(
        "Recall (macro) (over all seeds):",
        commonsense_qa_composition_prediction_scores_per_model[model][
            "test_macro_recall"
        ],
    )
    print(
        "F1 (macro) (over all seeds):",
        commonsense_qa_composition_prediction_scores_per_model[model]["test_macro_f1"],
    )

## Baselines


In [None]:
commonsense_qa_all_scores_val_per_model = {}
commonsense_qa_all_scores_test_per_model = {}


def _scores_for_all_seeds(predictions):
    """Compute precision/recall/F1/support of one composition for every seed.

    In some cases we don't have outputs for more than one seed. A seed without
    its own ``output_{seed}`` column falls back to the primary seed
    (``RANDOM_SEED[0]``). The fallback is applied per seed, so seeds that do
    have their own column keep using it -- consistent with the fallback used
    for the adaptive, oracle and ensemble evaluations.

    Args:
        predictions: Frame with ``true_label`` and ``output_{seed}`` columns.

    Returns:
        One ``precision_recall_fscore_support`` result per entry of RANDOM_SEED.
    """
    primary_column = f"output_{RANDOM_SEED[0]}"
    scores = []
    for seed in RANDOM_SEED:
        seed_column = f"output_{seed}"
        if seed_column not in predictions.columns:
            seed_column = primary_column
        scores.append(
            precision_recall_fscore_support(
                y_true=predictions["true_label"],
                y_pred=predictions[seed_column],
                pos_label=1,
            )
        )
    return scores


for model in MODELS:
    # Validation split
    all_scores_val = {}
    for name, predictions in commonsense_qa_predictions_per_composition_val_per_model[
        model
    ].items():
        all_scores_val[name] = _scores_for_all_seeds(predictions)

    commonsense_qa_all_scores_val_per_model[model] = all_scores_val

    # Test split
    all_scores_test = {}
    for name, predictions in commonsense_qa_predictions_per_composition_test_per_model[
        model
    ].items():
        all_scores_test[name] = _scores_for_all_seeds(predictions)

    commonsense_qa_all_scores_test_per_model[model] = all_scores_test

# Macro F1 on the test split per composition: once per seed and once averaged
# over the seeds. Both lists follow the composition order of the score dict.
commonsense_qa_all_f1_scores_test_per_model = {}
commonsense_qa_all_f1_scores_test_per_model_seed_scores = {}
for model in MODELS:
    f1_seed_scores_per_composition = [
        [((seed_scores[2][0] + seed_scores[2][1]) / 2) for seed_scores in v]
        for v in commonsense_qa_all_scores_test_per_model[model].values()
    ]

    commonsense_qa_all_f1_scores_test_per_model[model] = [
        np.mean(f1_seed_scores) for f1_seed_scores in f1_seed_scores_per_composition
    ]
    commonsense_qa_all_f1_scores_test_per_model_seed_scores[model] = (
        f1_seed_scores_per_composition
    )

### Trivial baselines


In [None]:
# Majority baseline: predict the most frequent test label for every instance.
majority_label = commonsense_qa_test_df.true_label.mode()[0]
majority_baseline_pred = [majority_label] * len(commonsense_qa_test_df)

commonsense_qa_maj_baseline_scores = precision_recall_fscore_support(
    y_true=commonsense_qa_test_df["true_label"],
    y_pred=majority_baseline_pred,
    pos_label=1,
)

# The first three entries of the result hold the per-class precision, recall and
# F1 values; the macro average is the mean of the first two class entries.
(
    commonsense_qa_maj_baseline_precision_macro_averaged_test,
    commonsense_qa_maj_baseline_recall_macro_averaged_test,
    commonsense_qa_maj_baseline_f1_macro_averaged_test,
) = (
    (per_class[0] + per_class[1]) / 2
    for per_class in commonsense_qa_maj_baseline_scores[:3]
)

print(
    "Precision (macro) (over all seeds):",
    commonsense_qa_maj_baseline_precision_macro_averaged_test,
)
print(
    "Recall (macro) (over all seeds):",
    commonsense_qa_maj_baseline_recall_macro_averaged_test,
)
print(
    "F1 (macro) (over all seeds):", commonsense_qa_maj_baseline_f1_macro_averaged_test
)

In [None]:
# Random baseline: draw 0 or 1 uniformly for every test instance.
random_baseline_pred = np.random.randint(2, size=len(commonsense_qa_test_df))

commonsense_qa_random_baseline_scores = precision_recall_fscore_support(
    y_true=commonsense_qa_test_df["true_label"],
    y_pred=random_baseline_pred,
    pos_label=1,
)

# The first three entries of the result hold the per-class precision, recall and
# F1 values; the macro average is the mean of the first two class entries.
(
    commonsense_qa_random_baseline_precision_macro_averaged,
    commonsense_qa_random_baseline_recall_macro_averaged,
    commonsense_qa_random_baseline_f1_macro_averaged,
) = (
    (per_class[0] + per_class[1]) / 2
    for per_class in commonsense_qa_random_baseline_scores[:3]
)

print(
    "Precision (macro) (over all seeds):",
    commonsense_qa_random_baseline_precision_macro_averaged,
)
print(
    "Recall (macro) (over all seeds):",
    commonsense_qa_random_baseline_recall_macro_averaged,
)
print("F1 (macro) (over all seeds):", commonsense_qa_random_baseline_f1_macro_averaged)

### Oracle

_Always chooses the correct label if it was predicted by at least one of the compositions; chooses a wrong label only if no composition predicted the correct one._


In [None]:
# Oracle scores per model: macro precision/recall/F1 on the test split, averaged
# over all seeds.
commonsense_qa_oracle_scores_per_model = {}

for model in MODELS:
    # Compiling oracle predictions
    oracle_predictions_test = pd.DataFrame()
    for seed in RANDOM_SEED:
        # One column per composition holding that composition's output for this
        # seed, preceded by the "input" and "true_label" columns.
        # NOTE(review): the columns of the different composition frames are
        # combined via the row index, so this presumably relies on all files
        # listing the instances in the same order -- confirm.
        all_seed_predictions = pd.DataFrame()
        for (
            composition,
            df,
        ) in commonsense_qa_predictions_per_composition_test_per_model[model].items():
            if "input" not in all_seed_predictions.columns:
                all_seed_predictions["input"] = df["input"]
            if "true_label" not in all_seed_predictions.columns:
                all_seed_predictions["true_label"] = df["true_label"]

            # Fall back to the primary seed if this seed has no output column
            try:
                all_seed_predictions[f"{composition}_{seed}"] = df[f"output_{seed}"]
            except KeyError:
                all_seed_predictions[f"{composition}_{seed}"] = df[
                    f"output_{RANDOM_SEED[0]}"
                ]

        if "input" not in oracle_predictions_test.columns:
            oracle_predictions_test["input"] = all_seed_predictions["input"]
        if "true_label" not in oracle_predictions_test.columns:
            oracle_predictions_test["true_label"] = all_seed_predictions["true_label"]

        # Oracle rule per sample: after dropping "input", the remaining columns
        # are true_label followed by one prediction column per composition
        # (row.values[1:]). If any composition predicted the true label, the
        # oracle outputs the true label. Otherwise it outputs the prediction of
        # the first composition (row.values[1]), which in that case differs from
        # the true label.
        oracle_predictions_test[f"output_{seed}"] = all_seed_predictions.loc[
            :, [i for i in all_seed_predictions.columns if i not in ["input"]]
        ].apply(
            lambda row: (
                row["true_label"]
                if row["true_label"] in row.values[1:]
                else row.values[1]
            ),
            axis=1,
        )

    # Calculating scores
    oracle_seed_scores_test = [
        precision_recall_fscore_support(
            y_true=oracle_predictions_test["true_label"],
            y_pred=oracle_predictions_test[f"output_{seed}"],
            pos_label=1,
        )
        for seed in RANDOM_SEED
    ]

    # Macro average per seed = mean of the first two per-class values; the
    # stored score is the mean of these macro averages over all seeds.
    commonsense_qa_oracle_scores_per_model[model] = {
        "test_macro_precision": np.mean(
            [
                ((seed_scores[0][0] + seed_scores[0][1]) / 2)
                for seed_scores in oracle_seed_scores_test
            ]
        ),
        "test_macro_recall": np.mean(
            [
                ((seed_scores[1][0] + seed_scores[1][1]) / 2)
                for seed_scores in oracle_seed_scores_test
            ]
        ),
        "test_macro_f1": np.mean(
            [
                ((seed_scores[2][0] + seed_scores[2][1]) / 2)
                for seed_scores in oracle_seed_scores_test
            ]
        ),
    }

    print(model, "=" * 50)
    print(
        "Oracle Precision (macro):",
        commonsense_qa_oracle_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "Oracle Recall (macro):",
        commonsense_qa_oracle_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "Oracle F1 (macro):",
        commonsense_qa_oracle_scores_per_model[model]["test_macro_f1"],
    )

### No technique

_Task description and input text only_


In [None]:
commonsense_qa_no_technique_scores_per_model = {}

for model in MODELS:
    no_technique_i = list(commonsense_qa_all_scores_test_per_model[model]).index(
        "task-description-only"
    )
    no_technique_seed_scores = commonsense_qa_all_scores_test_per_model[model][
        "task-description-only"
    ]

    # For each metric, store the mean over the seeds followed by the per-seed
    # macro averages (mean of the first two per-class values).
    no_technique_summary = {}
    for metric_position, metric_name in enumerate(("precision", "recall", "f1")):
        macro_per_seed = [
            ((seed_scores[metric_position][0] + seed_scores[metric_position][1]) / 2)
            for seed_scores in no_technique_seed_scores
        ]
        no_technique_summary[f"test_macro_{metric_name}"] = np.mean(macro_per_seed)
        no_technique_summary[f"test_macro_{metric_name}_seed_scores"] = macro_per_seed

    commonsense_qa_no_technique_scores_per_model[model] = no_technique_summary

    print(model, "=" * 50)
    print(
        "No technique Precision (macro):",
        commonsense_qa_no_technique_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "No technique Recall (macro):",
        commonsense_qa_no_technique_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "No technique F1 (macro):",
        commonsense_qa_no_technique_scores_per_model[model]["test_macro_f1"],
    )

### Optimal composition

_Best composition on the validation set in terms of f1 macro score, evaluated on the test set for precision, recall and f1_


In [None]:
commonsense_qa_optimal_composition_scores_per_model = {}


def _macro_values_per_seed(seed_scores_list, metric_position):
    """Per-seed macro average (mean of the two per-class values) of one metric."""
    return [
        ((seed_scores[metric_position][0] + seed_scores[metric_position][1]) / 2)
        for seed_scores in seed_scores_list
    ]


def _mean_macro_per_composition(scores_per_composition, metric_position):
    """Seed-averaged macro value of one metric for every composition, in dict order."""
    return [
        np.mean(_macro_values_per_seed(seed_scores_list, metric_position))
        for seed_scores_list in scores_per_composition.values()
    ]


for model in MODELS:
    val_scores_per_composition = commonsense_qa_all_scores_val_per_model[model]
    test_scores_per_composition = commonsense_qa_all_scores_test_per_model[model]

    # Validation split
    commonsense_qa_optimal_precision_macro_averaged_scores_val = (
        _mean_macro_per_composition(val_scores_per_composition, 0)
    )
    commonsense_qa_optimal_recall_macro_averaged_scores_val = (
        _mean_macro_per_composition(val_scores_per_composition, 1)
    )
    commonsense_qa_optimal_f1_macro_averaged_scores_val = (
        _mean_macro_per_composition(val_scores_per_composition, 2)
    )
    # Test split
    commonsense_qa_optimal_precision_macro_averaged_scores_test = (
        _mean_macro_per_composition(test_scores_per_composition, 0)
    )
    commonsense_qa_optimal_recall_macro_averaged_scores_test = (
        _mean_macro_per_composition(test_scores_per_composition, 1)
    )
    commonsense_qa_optimal_f1_macro_averaged_scores_test = (
        _mean_macro_per_composition(test_scores_per_composition, 2)
    )

    # Select the composition with the best macro F1 on the validation split
    commonsense_qa_optimal_composition_val_f1_macro_i = np.argmax(
        commonsense_qa_optimal_f1_macro_averaged_scores_val
    )
    commonsense_qa_optimal_composition_name = list(val_scores_per_composition)[
        commonsense_qa_optimal_composition_val_f1_macro_i
    ]

    # Look the test scores up by composition *name*. Reusing the validation
    # list index would silently report another composition's averages whenever
    # the validation and test dictionaries are not in the same order.
    optimal_precision_seed_scores = _macro_values_per_seed(
        test_scores_per_composition[commonsense_qa_optimal_composition_name], 0
    )
    optimal_recall_seed_scores = _macro_values_per_seed(
        test_scores_per_composition[commonsense_qa_optimal_composition_name], 1
    )
    optimal_f1_seed_scores = _macro_values_per_seed(
        test_scores_per_composition[commonsense_qa_optimal_composition_name], 2
    )

    commonsense_qa_optimal_composition_scores_per_model[model] = {
        "composition_name": commonsense_qa_optimal_composition_name,
        "test_macro_precision": np.mean(optimal_precision_seed_scores),
        "test_macro_precision_seed_scores": optimal_precision_seed_scores,
        "test_macro_recall": np.mean(optimal_recall_seed_scores),
        "test_macro_recall_seed_scores": optimal_recall_seed_scores,
        "test_macro_f1": np.mean(optimal_f1_seed_scores),
        "test_macro_f1_seed_scores": optimal_f1_seed_scores,
    }

    print(model, "=" * 50)
    print(
        "Optimal validation composition:",
        commonsense_qa_optimal_composition_scores_per_model[model]["composition_name"],
    )
    print(
        "Optimal composition Precision (macro):",
        commonsense_qa_optimal_composition_scores_per_model[model][
            "test_macro_precision"
        ],
    )
    print(
        "Optimal composition Recall (macro):",
        commonsense_qa_optimal_composition_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "Optimal composition F1 (macro):",
        commonsense_qa_optimal_composition_scores_per_model[model]["test_macro_f1"],
    )

### Component ensemble


In [None]:
# Ensemble scores per model: majority vote over the outputs of all compositions,
# evaluated on the test split and averaged over all seeds.
commonsense_qa_ensemble_scores_per_model = {}

for model in MODELS:
    all_seed_scores = []

    for seed in RANDOM_SEED:
        seed_df = pd.DataFrame()

        for (
            composition_name,
            comp_preds,
        ) in commonsense_qa_predictions_per_composition_test_per_model[model].items():
            # The first composition provides the instance columns
            if "input" not in seed_df.columns:
                seed_df["input"] = comp_preds["input"]
                seed_df["post_id"] = comp_preds["post_id"]
                seed_df["true_label"] = comp_preds["true_label"]

            # Fall back to the primary seed if this seed has no output column
            output_column = f"output_{seed}"
            if output_column not in comp_preds.columns:
                output_column = f"output_{RANDOM_SEED[0]}"

            composition_votes = comp_preds[["post_id", output_column]].rename(
                columns={output_column: f"{composition_name}_{seed}"}
            )
            # Keep one vote per post_id: with duplicated ids on both sides, the
            # merge would otherwise multiply the rows of seed_df with every
            # composition that is added.
            composition_votes = composition_votes.drop_duplicates(
                subset="post_id", keep="first"
            )

            seed_df = pd.merge(
                seed_df,
                composition_votes,
                on="post_id",
                how="left",
            )

        mode = seed_df.loc[
            :,
            [c for c in seed_df.columns if c not in ["input", "post_id", "true_label"]],
        ].mode(axis=1)

        if 1 in mode.columns:
            # A second mode column exists, i.e. at least one row is tied. Tied
            # rows get a random label that is drawn independently per row; a
            # single scalar draw would give all tied rows the same label.
            tie_breakers = np.random.randint(2, size=len(seed_df))
            seed_df["majority"] = np.where(mode[1].isna(), mode[0], tie_breakers)
        else:
            # No row is tied, so the only mode column is the majority vote
            seed_df["majority"] = mode[0]

        y_true_seed = seed_df["true_label"]
        y_pred_seed = seed_df["majority"]

        scores = precision_recall_fscore_support(
            y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
        )
        all_seed_scores.append(scores)

    # Macro average per seed = mean of the first two per-class values; the
    # stored score is the mean of these macro averages over all seeds.
    commonsense_qa_ensemble_scores_per_model[model] = {
        "test_macro_precision": np.mean(
            [
                ((seed_scores[0][0] + seed_scores[0][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
        "test_macro_recall": np.mean(
            [
                ((seed_scores[1][0] + seed_scores[1][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
        "test_macro_f1": np.mean(
            [
                ((seed_scores[2][0] + seed_scores[2][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
    }

    print(model, "=" * 50)
    print("Averaged scores")
    print(
        "Precision (macro) (over all seeds):",
        commonsense_qa_ensemble_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "Recall (macro) (over all seeds):",
        commonsense_qa_ensemble_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "F1 (macro) (over all seeds):",
        commonsense_qa_ensemble_scores_per_model[model]["test_macro_f1"],
    )

### Finetuning baselines


In [None]:
output_dir = "results"

# Score the test predictions of the fine-tuned classifier, one run per seed.
all_seed_scores = []
for seed in RANDOM_SEED:
    model_name = f"deberta-v3-large-finetune_20241010181021_commonsense_qa-seed{seed}"
    predictions_file = f"commonsense_qa-test_predictions-{model_name}.parquet"
    seed_df = pd.read_parquet(path.join(output_dir, predictions_file))

    # Attach the predictions to the gold labels via the instance hash
    seed_df = commonsense_qa_test_df.merge(seed_df, on="md5_hash", how="left")

    scores = precision_recall_fscore_support(
        y_true=seed_df["true_label"],
        y_pred=seed_df[f"prediction_{model_name}"],
        pos_label=1,
    )
    all_seed_scores.append(scores)

# For each metric, store the mean over the seeds followed by the per-seed macro
# averages (mean of the first two per-class values).
commonsense_qa_finetune_scores = {}
for metric_position, metric_name in enumerate(("precision", "recall", "f1")):
    macro_per_seed = [
        ((seed_scores[metric_position][0] + seed_scores[metric_position][1]) / 2)
        for seed_scores in all_seed_scores
    ]
    commonsense_qa_finetune_scores[f"test_macro_{metric_name}"] = np.mean(
        macro_per_seed
    )
    commonsense_qa_finetune_scores[f"test_macro_{metric_name}_seed_scores"] = (
        macro_per_seed
    )

print("=" * 50)
print("Averaged scores")
print(
    "Precision (macro) (over all seeds):",
    commonsense_qa_finetune_scores["test_macro_precision"],
)
print(
    "Recall (macro) (over all seeds):",
    commonsense_qa_finetune_scores["test_macro_recall"],
)
print("F1 (macro) (over all seeds):", commonsense_qa_finetune_scores["test_macro_f1"])

## Significance tests


In [None]:
from scipy.stats import ttest_rel

In [None]:
# Prepare best-on-test data: per model, keep the per-seed F1 scores of the
# composition whose entry in the test F1 list is the largest.
commonsense_qa_best_on_test_scores = {}
for model in MODELS:
    f1_per_composition = commonsense_qa_all_f1_scores_test_per_model[model]
    seed_scores_per_composition = (
        commonsense_qa_all_f1_scores_test_per_model_seed_scores[model]
    )

    # Position of the best composition; both lists are indexed the same way.
    commonsense_qa_best_composition = np.argmax(f1_per_composition)
    commonsense_qa_best_on_test_scores[model] = {
        "test_macro_f1_seed_scores": seed_scores_per_composition[
            commonsense_qa_best_composition
        ]
    }

In [None]:
# Significance level the (halved) p-value is compared against in the loop below.
alpha = 0.05
# Paired t-test from scipy.stats; it is called with the per-seed score lists
# of a baseline and of the approach.
ttest_function = ttest_rel

# Dataset suffix used to build the "<model>__<dataset>" key of the
# composition-prediction scores.
target_dataset_to_evaluate = "commonsense_qa"

# (display name, score container) pairs to compare the approach against.
# All containers are indexed by model first, except "Finetune", whose dict
# maps metric names directly to values.
baselines = [
    ("BaseComposition", commonsense_qa_no_technique_scores_per_model),
    ("BestOnVal", commonsense_qa_optimal_composition_scores_per_model),
    ("BestOnTest", commonsense_qa_best_on_test_scores),
    ("Finetune", commonsense_qa_finetune_scores),
]

In [None]:
# Compare the composition-prediction approach against every baseline, per model.
for model in MODELS:
    print(f"\n\n{model}")
    print("=" * 25)

    approach_name = "CompositionPrediction"
    approach_key = f"{model}__{target_dataset_to_evaluate}"
    approach_scores = commonsense_qa_composition_prediction_scores_per_model[
        approach_key
    ]["test_macro_f1_seed_scores"]
    approach_mean = np.mean(approach_scores)

    for baseline_name, baseline_scores_dict in baselines:
        # The finetune container is not keyed by model; all others are.
        baseline_entry = (
            baseline_scores_dict
            if baseline_name == "Finetune"
            else baseline_scores_dict[model]
        )
        baseline_scores = baseline_entry["test_macro_f1_seed_scores"]
        baseline_mean = np.mean(baseline_scores)

        # Only test when the approach is better on average; the negated ">" is
        # kept on purpose so that NaN means are skipped as well.
        if not approach_mean > baseline_mean:
            print(
                f"Skipped {approach_name} vs. {baseline_name} ({approach_mean} vs. {baseline_mean})"
            )
            continue

        t_results = ttest_function(baseline_scores, approach_scores)
        # correct for one sided test, according to Hitchhiker's guide:
        # element 1 of the result is the two-sided p-value, which is halved.
        p_value = t_results[1] / 2

        verdict = (
            "is significantly better than"
            if p_value <= alpha
            else "is NOT significantly better than"
        )
        print(
            f"{approach_name} {verdict} {baseline_name} with p-value {p_value:.4f} (t-test)."
        )

## Graphs


### Scores


In [None]:
# When True, the score plot below additionally draws the oracle marker per
# model and the majority-label and random baseline lines.
SHOW_ALL_BASELINES = True

In [None]:
def _mark_score(x_position, score, tag, description, color, marker=None, text_align="left"):
    """Draw one tagged score marker on the current axes and print its key.

    Args:
        x_position: x coordinate of the model's box.
        score: y value to mark (plotted unrounded).
        tag: short label drawn next to the marker (e.g. "A").
        description: human-readable meaning of the tag, used in the printout.
        color: colour of both the marker and the tag text.
        marker: matplotlib marker style; the matplotlib default is used if None.
        text_align: horizontal alignment of the tag relative to the marker.

    Returns:
        The handle returned by ``plt.scatter``.
    """
    scatter_style = {"alpha": 0.6, "color": color, "zorder": 3}
    if marker is not None:
        scatter_style["marker"] = marker
    handle = plt.scatter(x_position, score, **scatter_style)
    plt.text(
        x_position,
        score,
        tag,
        horizontalalignment=text_align,
        verticalalignment="bottom",
        color=color,
        zorder=3,
    )
    # The figure only shows the tag; the console output maps it to its meaning.
    print(f"{tag} ->", f"{description} ({np.round(score, decimals=3)})")
    return handle


def _mark_reference_line(score, description):
    """Draw a dashed horizontal line at ``score`` with an inline caption."""
    plt.axhline(score, color="black", linestyle="dashed", alpha=0.4, zorder=3)
    plt.text(
        0,
        score,
        f"{description} ({np.round(score, decimals=3)})",
        horizontalalignment="left",
        verticalalignment="top",
        color="black",
        zorder=3,
    )


fig, ax = plt.subplots(figsize=(15, 15))
# Index the scores by MODELS (instead of relying on dict iteration order) so
# every box is guaranteed to sit above its own tick label.
box = ax.boxplot(
    [commonsense_qa_all_f1_scores_test_per_model[model] for model in MODELS],
    labels=MODELS,
)
legend_entities_handlers_per_model = {}

for i, model in enumerate(MODELS):
    x_position = i + 1  # boxplot positions are 1-based
    print("")
    print(model)

    composition_f1_scores = commonsense_qa_all_f1_scores_test_per_model[model]
    composition_names = list(commonsense_qa_all_scores_test_per_model[model].keys())
    optimal_composition_scores = commonsense_qa_optimal_composition_scores_per_model[
        model
    ]

    print("Median:", np.median(composition_f1_scores))

    best_composition_i = np.argmax(composition_f1_scores)
    worst_composition_i = np.argmin(composition_f1_scores)

    # A: best score on the test split
    best_composition_handle = _mark_score(
        x_position,
        composition_f1_scores[best_composition_i],
        "A",
        "Best-on-test",
        "red",
        text_align="right",
    )

    # B: worst score on the test split
    worst_composition_handle = _mark_score(
        x_position,
        composition_f1_scores[worst_composition_i],
        "B",
        "Worst-on-test",
        "blue",
        text_align="right",
    )

    # C: no-technique (base) composition
    _mark_score(
        x_position,
        commonsense_qa_no_technique_scores_per_model[model]["test_macro_f1"],
        "C",
        "Base composition",
        "black",
        marker="x",
    )

    # D: composition that was optimal on the validation split
    optimal_composition_handle = _mark_score(
        x_position,
        optimal_composition_scores["test_macro_f1"],
        "D",
        "Best-on-validation",
        "olive",
        marker="x",
    )

    # E: naive majority ensemble
    _mark_score(
        x_position,
        commonsense_qa_ensemble_scores_per_model[model]["test_macro_f1"],
        "E",
        "Majority ensemble",
        "black",
        marker="x",
    )

    # F<k>: composition prediction, one marker per training dataset
    for k, training_dataset in enumerate(TRAINING_DATASETS):
        _mark_score(
            x_position,
            commonsense_qa_composition_prediction_scores_per_model[
                f"{model}__{training_dataset}"
            ]["test_macro_f1"],
            f"F{k}",
            f"Composition prediction ({training_dataset})",
            "green",
            marker="*",
        )

    if SHOW_ALL_BASELINES:
        # G: oracle
        _mark_score(
            x_position,
            commonsense_qa_oracle_scores_per_model[model]["test_macro_f1"],
            "G",
            "Oracle",
            "black",
            marker="x",
        )

    # Always in the following order: best-on-test, worst-on-test, best-on-validation
    legend_entities_handlers_per_model[model] = {
        f"bot: {composition_names[best_composition_i]}": best_composition_handle,
        f"wot: {composition_names[worst_composition_i]}": worst_composition_handle,
        f"bov: {optimal_composition_scores['composition_name']}": optimal_composition_handle,
    }


# Add finetune model as horizontal reference line
_mark_reference_line(
    commonsense_qa_finetune_scores["test_macro_f1"], "DeBERTa-v3-large (finetuned)"
)

# Conditionally show outlier baselines
if SHOW_ALL_BASELINES:
    _mark_reference_line(
        commonsense_qa_maj_baseline_f1_macro_averaged_test, "Majority label baseline"
    )
    _mark_reference_line(
        commonsense_qa_random_baseline_f1_macro_averaged, "Random baseline"
    )

handlers = []
labels = []
for model in MODELS:
    # Add handlers, with first being dummy handler that carries the model name.
    # NOTE(review): the invisible dummy point sits at (0, 0) and may pull the
    # automatic y-limits down to 0 — confirm this is intended.
    handlers.append(
        plt.scatter([0], [0], marker="None", linestyle="None", label=f"dummy-{model}")
    )
    handlers.extend(legend_entities_handlers_per_model[model].values())

    # Add labels, with first being model label
    labels.append(model)
    labels.extend(legend_entities_handlers_per_model[model].keys())

# One legend column per model: each model contributes four consecutive entries.
legend = fig.legend(handlers, labels, ncol=len(MODELS), loc="outside lower center")

# Leave one free slot left and right of the boxes, whatever the number of models.
ax.set_xlim(0, len(MODELS) + 1)
plt.title("commonsense_qa data")
plt.xlabel("Model")
plt.ylabel("F1 (macro) (over all seeds)")

plt.savefig("outputs/figures/commonsense_qa__performance-box-plot.pdf")
plt.savefig("outputs/figures/commonsense_qa__performance-box-plot.svg")

plt.show()

### Composition frequency

_How often was each composition chosen (bar chart with box plot), and how often was each technique and each combination of techniques chosen (heatmap)?_


In [None]:
# For every "<model>__<dataset>" entry: count how often each composition was
# predicted per seed, then plot the mean counts as a horizontal bar chart.
commonsense_qa_composition_counts_per_seed_per_model = {}

for model in commonsense_qa_composition_predictions_test_per_model.keys():
    model_name_without_data = model[: model.find("__")]
    known_compositions = commonsense_qa_predictions_per_composition_test_per_model[
        model_name_without_data
    ]

    counts_per_composition = {}
    commonsense_qa_composition_counts_per_seed_per_model[model] = counts_per_composition
    for seed in RANDOM_SEED:
        comp_count = Counter(
            commonsense_qa_composition_predictions_test_per_model[model][
                f"pred_best_composition_seed{seed}"
            ]
        )
        for composition in known_compositions:
            # A Counter returns 0 for compositions that were never predicted,
            # so every composition receives exactly one entry per seed.
            counts_per_composition.setdefault(composition, []).append(
                comp_count[composition]
            )

    # Calculate bar heights (mean) and the distance from the mean to min / max
    compositions = list(counts_per_composition.keys())
    values = [np.mean(counts_per_composition[comp]) for comp in compositions]
    lower_errors = [
        np.mean(counts_per_composition[comp]) - np.min(counts_per_composition[comp])
        for comp in compositions
    ]
    upper_errors = [
        np.max(counts_per_composition[comp]) - np.mean(counts_per_composition[comp])
        for comp in compositions
    ]

    # Order everything by ascending mean count
    sorted_data = sorted(
        zip(values, lower_errors, upper_errors, compositions),
        key=lambda entry: entry[0],
    )
    values, lower_errors, upper_errors, compositions = zip(*sorted_data)

    # Asymmetric error arrays (computed, but currently not drawn on the bars)
    asymmetric_errors = [lower_errors, upper_errors]

    # Bar chart positions
    x_pos = np.arange(len(compositions))

    # Plot horizontal bars
    plt.figure(figsize=(10, 20))
    bars = plt.barh(x_pos, values, align="center", alpha=0.7)

    # Add labels and title
    plt.yticks(x_pos, compositions, ha="right")
    plt.ylabel("Compositions")
    plt.xlabel("Count")
    plt.title(f"{model}: Composition counts (over five random seeds) on commonsense_qa")

    # Store the figure as PDF and PNG
    for file_extension in ("pdf", "png"):
        plt.savefig(
            f"outputs/figures/commonsense_qa__{model}__composition-frequency.{file_extension}",
            bbox_inches="tight",
        )
    # Show the plot
    plt.show()

In [None]:
# Create the composition frequency table (written to disk as CSV)
target_dataset = "commonsense_qa"
target_dataset_models = [
    m
    for m in commonsense_qa_composition_counts_per_seed_per_model.keys()
    if m.endswith(target_dataset)
]

# composition name -> {model key -> mean count over all seeds}
composition_counts_mean_per_target_dataset_models = {}

for model in target_dataset_models:
    per_composition_counts = commonsense_qa_composition_counts_per_seed_per_model[model]
    for composition_name, composition_counts in per_composition_counts.items():
        composition_counts_mean_per_target_dataset_models.setdefault(
            composition_name, {}
        )[model] = np.mean(composition_counts)


# Make composition names nicer for final table: every "_"-separated technique
# id is replaced by its display name.
composition_name_reformat_rules = {
    "cot": "Reasoning steps",
    "category-few-shot": "In-context (category)",
    "random-few-shot": "In-context (random)",
    "similar-few-shot": "In-context (similar)",
    "definitions": "Definitions",
    "directional-stimulus": "Dir. stimulus",
    "system-prompts": "Persona",
    "task-description-only": "Base composition",
}

counts_with_updated_composition_names = {}
for (
    composition_name,
    composition_counts,
) in composition_counts_mean_per_target_dataset_models.items():
    display_names = [
        composition_name_reformat_rules[comp].capitalize()
        for comp in composition_name.split("_")
    ]
    composition_name_reformat = ", ".join(display_names)

    counts_with_updated_composition_names[composition_name_reformat] = (
        composition_counts
    )

composition_frequency_output_file = (
    f"outputs/tables/composition-frequencies-{target_dataset}.csv"
)
# One row per composition (sorted by name), one column per model key.
composition_frequency_table = (
    pd.DataFrame(data=counts_with_updated_composition_names).transpose().sort_index()
)
composition_frequency_table.to_csv(composition_frequency_output_file)

In [None]:
# Techniques whose pairwise co-occurrence in predicted compositions is plotted.
# NOTE: "category-few-shot" is not included in this analysis.
techniques = [
    "cot",
    "definitions",
    "directional-stimulus",
    "random-few-shot",
    "similar-few-shot",
    "system-prompts",
    "task-description-only",
]
technique_positions = np.arange(len(techniques))

for model in commonsense_qa_composition_predictions_test_per_model.keys():
    # all_cooccurrences[outer technique][seed] -> count array over inner techniques
    all_cooccurrences = []
    for t_outer in techniques:
        t_cooccurrences = []
        for seed in RANDOM_SEED:
            seed_cooccurrences = np.zeros(len(techniques))
            predicted_compositions = commonsense_qa_composition_predictions_test_per_model[
                model
            ][f"pred_best_composition_seed{seed}"]
            for pred in predicted_compositions:
                # Membership is a substring test on the predicted composition name.
                if t_outer not in pred:
                    continue
                for inner_index, t_inner in enumerate(techniques):
                    if t_inner in pred and t_inner != t_outer:
                        seed_cooccurrences[inner_index] += 1
            t_cooccurrences.append(seed_cooccurrences)
        all_cooccurrences.append(t_cooccurrences)

    # Average every (outer, inner) count over the seeds
    average_cooccurrences = np.array(
        [
            [np.mean(counts_over_seeds) for counts_over_seeds in zip(*per_seed_occurrences)]
            for per_seed_occurrences in all_cooccurrences
        ]
    )

    fig, ax = plt.subplots(figsize=(10, 10))
    im = ax.imshow(average_cooccurrences, cmap="plasma")

    # Show all ticks and label them with the respective list entries
    ax.set_xticks(technique_positions, labels=techniques)
    ax.set_yticks(technique_positions, labels=techniques)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Write the averaged count into every cell (row = outer, column = inner).
    for (row_index, column_index), cell_value in np.ndenumerate(average_cooccurrences):
        ax.text(
            column_index, row_index, cell_value, ha="center", va="center", color="w"
        )

    ax.set_title(
        f"{model}: Average (over all seeds) cooccurrence for predicted compositions on commonsense_qa"
    )
    plt.savefig(f"outputs/figures/commonsense_qa__{model}__technique-cooccurrences.pdf")
    plt.show()

# ESNLI corpus


In [None]:
# Model names analysed in this section; they are matched as substrings against
# result directory and file names further below.
MODELS = [
    "mistral-7b-instruct-v2",
    "command-r-v01",
    "llama3-70b-instruct",
]

# Dataset name(s) used to build the "<model>__<dataset>" keys and to select
# the composition-prediction result directories.
TRAINING_DATASETS = ["esnli"]

## Preparation


In [None]:
# Seed NumPy's global random number generator for this section.
np.random.seed(23)

In [None]:
data_handler = DataHandler(datasets_to_load=["esnli"])


def _esnli_gold_labels(split_name):
    """Return the ``md5_hash`` / ``true_label`` columns of one e-SNLI split.

    The split's ``label`` column is renamed to ``true_label`` first.
    """
    split_frame = data_handler.esnli_data[split_name]
    renamed = split_frame.rename(columns={"label": "true_label"})
    return renamed[["md5_hash", "true_label"]]


# Gold labels per split; the validation frame is read from the "dev" split.
esnli_train_df = _esnli_gold_labels("train")
esnli_val_df = _esnli_gold_labels("dev")
esnli_test_df = _esnli_gold_labels("test")

In [None]:
# Load, per "<model>__<training dataset>" key, the composition predicted as
# best for every instance. Each seed contributes one column named
# "pred_best_composition_seed<seed>" to a validation and a test frame.
esnli_composition_predictions_val_per_model = {}
esnli_composition_predictions_test_per_model = {}

for model in MODELS:
    for training_dataset in TRAINING_DATASETS:
        # Validation set
        composition_predictions_val = pd.DataFrame()
        for seed in RANDOM_SEED:
            output_dir = path.join("outputs/composition-predictions")
            # First directory (in sorted order) whose name contains both the
            # run prefix and the seed tag; raises IndexError if none matches.
            seed_dir = list(
                filter(
                    lambda x: f"deberta-v3-large_composition-prediction-for-{model}-on-{training_dataset}"
                    in x
                    and f"seed{seed}" in x,
                    sorted(listdir(output_dir)),
                )
            )[0]
            df = pd.read_parquet(
                path.join(output_dir, seed_dir, "esnli_val_results.parquet")
            )
            # Instance id: MD5 hex digest of the input text.
            df["post_id"] = df.input.apply(
                lambda x: hashlib.md5(x.encode()).hexdigest()
            )
            df = df.rename(
                columns={"pred_best_composition": f"pred_best_composition_seed{seed}"}
            )

            # The first seed that is read provides the input text and ids.
            if "input" not in composition_predictions_val.columns:
                composition_predictions_val["input"] = df["input"]
                composition_predictions_val["post_id"] = df["post_id"]

            # Attach this seed's columns (everything except "input", "index"
            # and "pred_probabilities") by a left join on the instance id.
            # NOTE(review): duplicated post_ids would multiply rows in this
            # join — confirm the validation split has no duplicate inputs.
            composition_predictions_val = pd.merge(
                composition_predictions_val,
                df.loc[
                    :,
                    [
                        i
                        for i in df.columns
                        if i not in ["input", "index", "pred_probabilities"]
                    ],
                ],
                on="post_id",
                how="left",
            )

        esnli_composition_predictions_val_per_model[f"{model}__{training_dataset}"] = (
            composition_predictions_val
        )

        # Test set
        composition_predictions_test = pd.DataFrame()
        for seed in RANDOM_SEED:
            output_dir = path.join("outputs/composition-predictions")
            # Same directory lookup as for the validation set.
            seed_dir = list(
                filter(
                    lambda x: f"deberta-v3-large_composition-prediction-for-{model}-on-{training_dataset}"
                    in x
                    and f"seed{seed}" in x,
                    sorted(listdir(output_dir)),
                )
            )[0]
            df = pd.read_parquet(
                path.join(output_dir, seed_dir, "esnli_test_results.parquet")
            )
            # Instance id: MD5 hex digest of the input text.
            df["post_id"] = df.input.apply(
                lambda x: hashlib.md5(x.encode()).hexdigest()
            )
            df = df.rename(
                columns={"pred_best_composition": f"pred_best_composition_seed{seed}"}
            )

            # The first seed that is read provides the input text and ids.
            if "input" not in composition_predictions_test.columns:
                composition_predictions_test["input"] = df["input"]
                composition_predictions_test["post_id"] = df["post_id"]

            # Since we have duplicates, we need to sort them first, then merge them (cannot use
            # df.merge properly)
            # NOTE(review): assigning a Series to a DataFrame column aligns on
            # the index labels, not on row position, so the two sorts below
            # only change the row order of the frames. The seeds are therefore
            # matched by original row index — confirm that all seed files
            # share the same row order.
            df_sorted = df.sort_values(by="post_id")
            composition_predictions_test = composition_predictions_test.sort_values(
                by="post_id"
            )

            composition_predictions_test[f"pred_best_composition_seed{seed}"] = (
                df_sorted[f"pred_best_composition_seed{seed}"]
            )

        esnli_composition_predictions_test_per_model[f"{model}__{training_dataset}"] = (
            composition_predictions_test
        )

In [None]:
# Load composition-specific prediction files
esnli_output_dir = "outputs/prompt-predictions/esnli"


def _load_esnli_composition_predictions(model_name, split):
    """Read all prediction files of one model and split ("dev" or "test").

    A file is selected when its name contains both ``split`` and
    ``model_name``. The composition name is the file name without the
    ``.parquet`` suffix and without the run prefix; for chain-of-thought files
    the prefix is replaced by ``cot_``.

    Returns a dict that maps composition name to its prediction frame, each
    extended by a ``post_id`` column (MD5 hex digest of ``input``).
    """
    predictions_per_composition = {}
    for file_name in sorted(listdir(esnli_output_dir)):
        if not (split in file_name and model_name in file_name):
            continue

        if "cot" in file_name:
            run_prefix, replacement = f"esnli-cot-greedy-{split}_{model_name}_", "cot_"
        else:
            run_prefix, replacement = f"esnli-greedy-{split}_{model_name}_", ""
        composition_name = file_name.replace(run_prefix, replacement).replace(
            ".parquet", ""
        )

        frame = pd.read_parquet(path.join(esnli_output_dir, file_name))
        frame["post_id"] = frame.input.apply(
            lambda text: hashlib.md5(text.encode()).hexdigest()
        )
        predictions_per_composition[composition_name] = frame
    return predictions_per_composition


esnli_predictions_per_composition_val_per_model = {}
esnli_predictions_per_composition_test_per_model = {}

for model in MODELS:
    # Validation set
    predictions_per_composition_val = _load_esnli_composition_predictions(model, "dev")
    esnli_predictions_per_composition_val_per_model[model] = (
        predictions_per_composition_val
    )

    # Test set
    predictions_per_composition_test = _load_esnli_composition_predictions(
        model, "test"
    )
    esnli_predictions_per_composition_test_per_model[model] = (
        predictions_per_composition_test
    )

In [None]:
def _esnli_label_counts(split_df):
    """Return the number of rows with ``true_label`` 1 and with ``true_label`` 0."""
    return (
        len(split_df[split_df.true_label == 1]),
        len(split_df[split_df.true_label == 0]),
    )


def _print_esnli_label_share(label_name, count, total):
    """Print the absolute count of a label and its share of ``total`` rows."""
    print(f"{label_name} label: {count} ({np.round(count / total, decimals=3)})")


print("## Training split")
positive_instances_train, negative_instances_train = _esnli_label_counts(
    esnli_train_df
)
_print_esnli_label_share("Positive", positive_instances_train, len(esnli_train_df))
_print_esnli_label_share("Negative", negative_instances_train, len(esnli_train_df))

print("## Validation split")
positive_instances_val, negative_instances_val = _esnli_label_counts(esnli_val_df)
_print_esnli_label_share("Positive", positive_instances_val, len(esnli_val_df))
_print_esnli_label_share("Negative", negative_instances_val, len(esnli_val_df))

print("## Test split")
positive_instances_test, negative_instances_test = _esnli_label_counts(esnli_test_df)
_print_esnli_label_share("Positive", positive_instances_test, len(esnli_test_df))
_print_esnli_label_share("Negative", negative_instances_test, len(esnli_test_df))

## Adaptive prediction evaluation


In [None]:
def _esnli_collect_seed_predictions(
    selected_compositions, predictions_per_composition, seed, skip_missing=False
):
    """Collect gold labels and predictions of the compositions chosen for one seed.

    For every row of ``selected_compositions`` the prediction frame of the
    composition named in column ``pred_best_composition_seed<seed>`` is looked
    up and the first row with the same ``post_id`` is used.

    Args:
        selected_compositions: frame with ``post_id`` and the per-seed
            ``pred_best_composition_seed<seed>`` columns.
        predictions_per_composition: dict that maps a composition name to its
            prediction frame.
        seed: seed whose composition choice and output column are read.
        skip_missing: if True, rows without a matching ``post_id`` are counted
            and skipped; if False they raise.

    Returns:
        ``(y_true, y_pred, n_missing)``; ``n_missing`` is the number of
        skipped rows.

    Raises:
        IndexError: a row has no matching ``post_id`` and ``skip_missing`` is
            False.
    """
    composition_column = f"pred_best_composition_seed{seed}"
    y_true, y_pred, n_missing = [], [], 0
    for _, row in selected_compositions.iterrows():
        preds = predictions_per_composition[row[composition_column]]
        # Filter once per row; the match is needed for prediction and label.
        matches = preds[preds.post_id == row.post_id]
        if matches.empty:
            if not skip_missing:
                raise IndexError(
                    f"No post found for id {row.post_id} in predictions."
                )
            n_missing += 1
            continue
        first_match = matches.iloc[0]

        # If we don't have predictions for other seeds, use the primary seed.
        # NOTE(review): the primary seed is hard-coded as 23 here, while the
        # baseline cell uses RANDOM_SEED[0] — confirm they are the same.
        output_column = (
            f"output_{seed}" if f"output_{seed}" in preds.columns else "output_23"
        )
        y_pred.append(first_match[output_column])
        y_true.append(first_match["true_label"])
    return y_true, y_pred, n_missing


def _esnli_macro_per_seed(per_seed_scores, metric_index):
    """Return, per seed, the mean of the first two per-class values of a metric."""
    return [
        ((seed_scores[metric_index][0] + seed_scores[metric_index][1]) / 2)
        for seed_scores in per_seed_scores
    ]


esnli_composition_prediction_scores_per_model = {}

for model in esnli_composition_predictions_val_per_model.keys():
    model_name_without_data = model[: model.find("__")]

    # Validation split (a missing post raises here). The scores are collected
    # but not part of the summary stored and printed below.
    all_seed_scores_val = []
    for seed in RANDOM_SEED:
        y_true_seed, y_pred_seed, _ = _esnli_collect_seed_predictions(
            esnli_composition_predictions_val_per_model[model],
            esnli_predictions_per_composition_val_per_model[model_name_without_data],
            seed,
        )
        scores = precision_recall_fscore_support(
            y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
        )
        all_seed_scores_val.append(scores)

    # Test split: rows without a matching post are skipped, but counted so the
    # omission is visible in the output instead of being silently ignored.
    all_seed_scores_test = []
    skipped_rows_per_seed = {}
    for seed in RANDOM_SEED:
        y_true_seed, y_pred_seed, n_missing = _esnli_collect_seed_predictions(
            esnli_composition_predictions_test_per_model[model],
            esnli_predictions_per_composition_test_per_model[model_name_without_data],
            seed,
            skip_missing=True,
        )
        skipped_rows_per_seed[seed] = n_missing
        scores = precision_recall_fscore_support(
            y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
        )
        all_seed_scores_test.append(scores)

    precision_per_seed = _esnli_macro_per_seed(all_seed_scores_test, 0)
    recall_per_seed = _esnli_macro_per_seed(all_seed_scores_test, 1)
    f1_per_seed = _esnli_macro_per_seed(all_seed_scores_test, 2)

    esnli_composition_prediction_scores_per_model[model] = {
        "test_macro_precision": np.mean(precision_per_seed),
        "test_macro_precision_seed_scores": precision_per_seed,
        "test_macro_recall": np.mean(recall_per_seed),
        "test_macro_recall_seed_scores": recall_per_seed,
        "test_macro_f1": np.mean(f1_per_seed),
        "test_macro_f1_seed_scores": f1_per_seed,
    }

    print(model, "=" * 50)
    print("Averaged scores")

    print(
        "Precision (macro) (over all seeds):",
        esnli_composition_prediction_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "Recall (macro) (over all seeds):",
        esnli_composition_prediction_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "F1 (macro) (over all seeds):",
        esnli_composition_prediction_scores_per_model[model]["test_macro_f1"],
    )
    if any(skipped_rows_per_seed.values()):
        print(
            "Test rows skipped (no matching post_id), per seed:",
            skipped_rows_per_seed,
        )

## Baselines


In [None]:
def _esnli_scores_for_all_seeds(predictions):
    """Score one composition's predictions once per entry of ``RANDOM_SEED``.

    Every entry is the ``precision_recall_fscore_support`` result of the
    ``output_<seed>`` column against ``true_label``. If a column lookup fails
    with ``KeyError``, the output of the primary seed (``RANDOM_SEED[0]``) is
    scored for every seed instead.
    """
    try:
        return [
            precision_recall_fscore_support(
                y_true=predictions["true_label"],
                y_pred=predictions[f"output_{seed}"],
                pos_label=1,
            )
            for seed in RANDOM_SEED
        ]
    # In some cases, we don't have more than one seed, so we fall back to the primary seed
    except KeyError:
        return [
            precision_recall_fscore_support(
                y_true=predictions["true_label"],
                y_pred=predictions[f"output_{RANDOM_SEED[0]}"],
                pos_label=1,
            )
            for _ in RANDOM_SEED
        ]


# model -> composition name -> list of per-seed score tuples
esnli_all_scores_val_per_model = {}
esnli_all_scores_test_per_model = {}

for model in MODELS:
    # Validation split
    all_scores_val = {
        name: _esnli_scores_for_all_seeds(predictions)
        for name, predictions in esnli_predictions_per_composition_val_per_model[
            model
        ].items()
    }
    esnli_all_scores_val_per_model[model] = all_scores_val

    # Test split
    all_scores_test = {
        name: _esnli_scores_for_all_seeds(predictions)
        for name, predictions in esnli_predictions_per_composition_test_per_model[
            model
        ].items()
    }
    esnli_all_scores_test_per_model[model] = all_scores_test

esnli_all_f1_scores_test_per_model = {}
esnli_all_f1_scores_test_per_model_seed_scores = {}
for model in MODELS:
    # One inner list per composition holding the macro F1 of every seed.
    # Index 2 of a score tuple is F1; the per-class values at index 0 and 1
    # are averaged.
    f1_per_composition_and_seed = [
        [(result[2][0] + result[2][1]) / 2 for result in seed_results]
        for seed_results in esnli_all_scores_test_per_model[model].values()
    ]
    esnli_all_f1_scores_test_per_model_seed_scores[model] = (
        f1_per_composition_and_seed
    )
    # Collapse the seed dimension to a single mean per composition.
    esnli_all_f1_scores_test_per_model[model] = [
        np.mean(seed_f1) for seed_f1 in f1_per_composition_and_seed
    ]

### Trivial baselines


In [None]:
# Majority baseline: predict the most frequent test label for every sample.
majority_label = esnli_test_df.true_label.mode()[0]
majority_baseline_pred = [majority_label] * len(esnli_test_df)

esnli_maj_baseline_scores = precision_recall_fscore_support(
    y_true=esnli_test_df["true_label"],
    y_pred=majority_baseline_pred,
    pos_label=1,
)

# The first three entries of the score tuple are precision, recall and F1;
# each is averaged over the per-class values at index 0 and 1.
(
    esnli_maj_baseline_precision_macro_averaged_test,
    esnli_maj_baseline_recall_macro_averaged_test,
    esnli_maj_baseline_f1_macro_averaged_test,
) = (
    (per_class[0] + per_class[1]) / 2 for per_class in esnli_maj_baseline_scores[:3]
)

print(
    "Precision (macro) (over all seeds):",
    esnli_maj_baseline_precision_macro_averaged_test,
)
print(
    "Recall (macro) (over all seeds):",
    esnli_maj_baseline_recall_macro_averaged_test,
)
print(
    "F1 (macro) (over all seeds):",
    esnli_maj_baseline_f1_macro_averaged_test,
)

In [None]:
# Random baseline: draw a label from {0, 1} uniformly for every test sample.
random_baseline_pred = np.random.randint(2, size=len(esnli_test_df))

esnli_random_baseline_scores = precision_recall_fscore_support(
    y_true=esnli_test_df["true_label"],
    y_pred=random_baseline_pred,
    pos_label=1,
)

# The first three entries of the score tuple are precision, recall and F1;
# each is averaged over the per-class values at index 0 and 1.
(
    esnli_random_baseline_precision_macro_averaged,
    esnli_random_baseline_recall_macro_averaged,
    esnli_random_baseline_f1_macro_averaged,
) = (
    (per_class[0] + per_class[1]) / 2
    for per_class in esnli_random_baseline_scores[:3]
)

print(
    "Precision (macro) (over all seeds):",
    esnli_random_baseline_precision_macro_averaged,
)
print(
    "Recall (macro) (over all seeds):",
    esnli_random_baseline_recall_macro_averaged,
)
print(
    "F1 (macro) (over all seeds):",
    esnli_random_baseline_f1_macro_averaged,
)

### Oracle

_Always chooses the correct label if it was predicted by any of the compositions; only chooses the wrong label if no composition predicted the correct label_


In [None]:
def _esnli_oracle_choice(row):
    """Pick the oracle label for one sample.

    ``row`` holds the true label first, followed by one prediction per
    composition. The true label is returned when any prediction matches it;
    otherwise the first prediction (a wrong label) is returned.
    """
    composition_predictions = row.values[1:]
    if row["true_label"] in composition_predictions:
        return row["true_label"]
    return row.values[1]


esnli_oracle_scores_per_model = {}

for model in MODELS:
    # Compiling oracle predictions
    oracle_predictions_test = pd.DataFrame()
    for seed in RANDOM_SEED:
        # One column per composition with that composition's prediction for
        # the current seed, next to the shared input and true label.
        all_seed_predictions = pd.DataFrame()
        for composition, df in esnli_predictions_per_composition_test_per_model[
            model
        ].items():
            for shared_column in ("input", "true_label"):
                if shared_column not in all_seed_predictions.columns:
                    all_seed_predictions[shared_column] = df[shared_column]

            # Compositions without this seed fall back to the primary seed.
            try:
                seed_output = df[f"output_{seed}"]
            except KeyError:
                seed_output = df[f"output_{RANDOM_SEED[0]}"]
            all_seed_predictions[f"{composition}_{seed}"] = seed_output

        for shared_column in ("input", "true_label"):
            if shared_column not in oracle_predictions_test.columns:
                oracle_predictions_test[shared_column] = all_seed_predictions[
                    shared_column
                ]

        # Drop the input text so every row starts with the true label and is
        # followed only by predictions, as _esnli_oracle_choice expects.
        label_and_predictions = all_seed_predictions.loc[
            :, [column for column in all_seed_predictions.columns if column != "input"]
        ]
        oracle_predictions_test[f"output_{seed}"] = label_and_predictions.apply(
            _esnli_oracle_choice, axis=1
        )

    # Calculating scores
    oracle_seed_scores_test = []
    for seed in RANDOM_SEED:
        oracle_seed_scores_test.append(
            precision_recall_fscore_support(
                y_true=oracle_predictions_test["true_label"],
                y_pred=oracle_predictions_test[f"output_{seed}"],
                pos_label=1,
            )
        )

    # Macro value per seed (mean of the per-class values at index 0 and 1),
    # then averaged over all seeds.
    esnli_oracle_scores_per_model[model] = {
        score_key: np.mean(
            [
                (result[metric_i][0] + result[metric_i][1]) / 2
                for result in oracle_seed_scores_test
            ]
        )
        for score_key, metric_i in (
            ("test_macro_precision", 0),
            ("test_macro_recall", 1),
            ("test_macro_f1", 2),
        )
    }

    oracle_scores = esnli_oracle_scores_per_model[model]
    print(model, "=" * 50)
    print("Oracle Precision (macro):", oracle_scores["test_macro_precision"])
    print("Oracle Recall (macro):", oracle_scores["test_macro_recall"])
    print("Oracle F1 (macro):", oracle_scores["test_macro_f1"])

### No technique

_Task description and input text only_


In [None]:
esnli_no_technique_scores_per_model = {}

for model in MODELS:
    # Position of the base composition among the test-score entries.
    no_technique_i = list(esnli_all_scores_test_per_model[model]).index(
        "task-description-only"
    )

    base_composition_results = esnli_all_scores_test_per_model[model][
        "task-description-only"
    ]
    # Per-seed macro precision / recall / F1: mean of the per-class values at
    # index 0 and 1 of the corresponding score-tuple entry.
    precision_by_seed, recall_by_seed, f1_by_seed = (
        [(result[metric_i][0] + result[metric_i][1]) / 2 for result in base_composition_results]
        for metric_i in range(3)
    )

    esnli_no_technique_scores_per_model[model] = {
        "test_macro_precision": np.mean(precision_by_seed),
        "test_macro_precision_seed_scores": precision_by_seed,
        "test_macro_recall": np.mean(recall_by_seed),
        "test_macro_recall_seed_scores": recall_by_seed,
        "test_macro_f1": np.mean(f1_by_seed),
        "test_macro_f1_seed_scores": f1_by_seed,
    }

    no_technique_scores = esnli_no_technique_scores_per_model[model]
    print(model, "=" * 50)
    print(
        "No technique Precision (macro):",
        no_technique_scores["test_macro_precision"],
    )
    print("No technique Recall (macro):", no_technique_scores["test_macro_recall"])
    print("No technique F1 (macro):", no_technique_scores["test_macro_f1"])

### Optimal composition

_Best composition on the validation set in terms of f1 macro score, evaluated on the test set for precision, recall and f1_


In [None]:
esnli_optimal_composition_scores_per_model = {}

for model in MODELS:
    val_results_per_composition = esnli_all_scores_val_per_model[model]
    test_results_per_composition = esnli_all_scores_test_per_model[model]

    # Validation split: seed-averaged macro precision / recall / F1, one list
    # entry per composition (in dictionary order).
    (
        esnli_optimal_precision_macro_averaged_scores_val,
        esnli_optimal_recall_macro_averaged_scores_val,
        esnli_optimal_f1_macro_averaged_scores_val,
    ) = (
        [
            np.mean([(result[metric_i][0] + result[metric_i][1]) / 2 for result in seed_results])
            for seed_results in val_results_per_composition.values()
        ]
        for metric_i in range(3)
    )
    # Test split: same layout as the validation lists.
    (
        esnli_optimal_precision_macro_averaged_scores_test,
        esnli_optimal_recall_macro_averaged_scores_test,
        esnli_optimal_f1_macro_averaged_scores_test,
    ) = (
        [
            np.mean([(result[metric_i][0] + result[metric_i][1]) / 2 for result in seed_results])
            for seed_results in test_results_per_composition.values()
        ]
        for metric_i in range(3)
    )

    # Pick the composition with the highest validation macro F1. The same
    # position is then used to index the test lists.
    # NOTE(review): this relies on the validation and test dictionaries
    # listing the compositions in the same order; confirm where they are built.
    esnli_optimal_composition_val_f1_macro_i = np.argmax(
        esnli_optimal_f1_macro_averaged_scores_val
    )
    esnli_optimal_composition_name = list(val_results_per_composition)[
        esnli_optimal_composition_val_f1_macro_i
    ]

    # Per-seed test scores of the selected composition.
    optimal_test_results = test_results_per_composition[esnli_optimal_composition_name]
    optimal_precision_by_seed, optimal_recall_by_seed, optimal_f1_by_seed = (
        [(result[metric_i][0] + result[metric_i][1]) / 2 for result in optimal_test_results]
        for metric_i in range(3)
    )

    esnli_optimal_composition_scores_per_model[model] = {
        "composition_name": esnli_optimal_composition_name,
        "test_macro_precision": esnli_optimal_precision_macro_averaged_scores_test[
            esnli_optimal_composition_val_f1_macro_i
        ],
        "test_macro_precision_seed_scores": optimal_precision_by_seed,
        "test_macro_recall": esnli_optimal_recall_macro_averaged_scores_test[
            esnli_optimal_composition_val_f1_macro_i
        ],
        "test_macro_recall_seed_scores": optimal_recall_by_seed,
        "test_macro_f1": esnli_optimal_f1_macro_averaged_scores_test[
            esnli_optimal_composition_val_f1_macro_i
        ],
        "test_macro_f1_seed_scores": optimal_f1_by_seed,
    }

    optimal_scores = esnli_optimal_composition_scores_per_model[model]
    print(model, "=" * 50)
    print("Optimal validation composition:", optimal_scores["composition_name"])
    print(
        "Optimal composition Precision (macro):",
        optimal_scores["test_macro_precision"],
    )
    print(
        "Optimal composition Recall (macro):",
        optimal_scores["test_macro_recall"],
    )
    print("Optimal composition F1 (macro):", optimal_scores["test_macro_f1"])

### Component ensemble


In [None]:
esnli_ensemble_scores_per_model = {}

for model in MODELS:
    all_seed_scores = []

    for seed in RANDOM_SEED:
        # One row per test sample, one column per composition holding that
        # composition's prediction for the current seed.
        seed_df = pd.DataFrame()

        for (
            composition_name,
            comp_preds,
        ) in esnli_predictions_per_composition_test_per_model[model].items():
            if "input" not in seed_df.columns:
                seed_df["input"] = comp_preds["input"]
                seed_df["post_id"] = comp_preds["post_id"]
                seed_df["true_label"] = comp_preds["true_label"]

            # We need to work on sorted frames, due to duplicates...
            comp_preds_sorted = comp_preds.sort_values(by="post_id")
            seed_df = seed_df.sort_values(by="post_id")

            # Compositions without this seed fall back to the primary seed.
            try:
                seed_df[f"{composition_name}_{seed}"] = comp_preds_sorted[
                    f"output_{seed}"
                ]
            except KeyError:
                seed_df[f"{composition_name}_{seed}"] = comp_preds_sorted[
                    f"output_{RANDOM_SEED[0]}"
                ]

        # Row-wise mode over the composition columns. If no row has a tie,
        # the result has a single column and mode[1] does not exist;
        # otherwise mode[1] is NaN for every row without a tie.
        mode = seed_df.loc[
            :,
            [c for c in seed_df.columns if c not in ["input", "post_id", "true_label"]],
        ].mode(axis=1)
        if len(mode.columns) == 1:
            # No tie exists
            seed_df["majority"] = mode[0]
        else:
            # Tied rows get a random label from {0, 1}. One value is drawn per
            # row; a single scalar draw would hand the same label to every
            # tied row of this seed.
            seed_df["majority"] = np.where(
                mode[1].isna(), mode[0], np.random.randint(2, size=len(mode))
            )

        y_true_seed = seed_df["true_label"]
        y_pred_seed = seed_df["majority"]

        scores = precision_recall_fscore_support(
            y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
        )
        all_seed_scores.append(scores)

    # Macro value per seed (mean of the per-class values at index 0 and 1),
    # then averaged over all seeds.
    esnli_ensemble_scores_per_model[model] = {
        "test_macro_precision": np.mean(
            [
                ((seed_scores[0][0] + seed_scores[0][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
        "test_macro_recall": np.mean(
            [
                ((seed_scores[1][0] + seed_scores[1][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
        "test_macro_f1": np.mean(
            [
                ((seed_scores[2][0] + seed_scores[2][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
    }

    print(model, "=" * 50)
    print("Averaged scores")
    print(
        "Precision (macro) (over all seeds):",
        esnli_ensemble_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "Recall (macro) (over all seeds):",
        esnli_ensemble_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "F1 (macro) (over all seeds):",
        esnli_ensemble_scores_per_model[model]["test_macro_f1"],
    )

### Finetuning baselines


In [None]:
output_dir = "results"

# Score the stored test predictions of the finetuned model, once per seed.
all_seed_scores = []
for seed in RANDOM_SEED:
    model_name = f"deberta-v3-large-finetune_20241009160411_esnli-seed{seed}"
    predictions_file = path.join(
        output_dir, f"esnli-test_predictions-{model_name}.parquet"
    )

    # Attach the predictions to the reference test set via the sample hash.
    seed_df = pd.merge(
        esnli_test_df,
        pd.read_parquet(predictions_file),
        on="md5_hash",
        how="left",
    )

    scores = precision_recall_fscore_support(
        y_true=seed_df["true_label"],
        y_pred=seed_df[f"prediction_{model_name}"],
        pos_label=1,
    )
    all_seed_scores.append(scores)

# Per-seed macro precision / recall / F1: mean of the per-class values at
# index 0 and 1 of the corresponding score-tuple entry.
finetune_precision_by_seed, finetune_recall_by_seed, finetune_f1_by_seed = (
    [(result[metric_i][0] + result[metric_i][1]) / 2 for result in all_seed_scores]
    for metric_i in range(3)
)

esnli_finetune_scores = {
    "test_macro_precision": np.mean(finetune_precision_by_seed),
    "test_macro_precision_seed_scores": finetune_precision_by_seed,
    "test_macro_recall": np.mean(finetune_recall_by_seed),
    "test_macro_recall_seed_scores": finetune_recall_by_seed,
    "test_macro_f1": np.mean(finetune_f1_by_seed),
    "test_macro_f1_seed_scores": finetune_f1_by_seed,
}

print("=" * 50)
print("Averaged scores")
print(
    "Precision (macro) (over all seeds):",
    esnli_finetune_scores["test_macro_precision"],
)
print(
    "Recall (macro) (over all seeds):",
    esnli_finetune_scores["test_macro_recall"],
)
print("F1 (macro) (over all seeds):", esnli_finetune_scores["test_macro_f1"])

## Significance tests


In [None]:
from scipy.stats import ttest_rel

In [None]:
# Best-on-test baseline: for every model, keep the per-seed macro F1 scores of
# the composition with the highest seed-averaged macro F1 on the test set.
esnli_best_on_test_scores = {}
for model in MODELS:
    esnli_best_composition = np.argmax(esnli_all_f1_scores_test_per_model[model])
    seed_scores_per_composition = esnli_all_f1_scores_test_per_model_seed_scores[
        model
    ]
    esnli_best_on_test_scores[model] = {
        "test_macro_f1_seed_scores": seed_scores_per_composition[
            esnli_best_composition
        ]
    }

In [None]:
# Significance level that p-values are compared against.
alpha = 0.05
# Test used for the comparison: scipy's t-test for two related samples.
ttest_function = ttest_rel

# Dataset suffix of the composition-prediction entry that is evaluated
# (entries are keyed as "<model>__<dataset>").
target_dataset_to_evaluate = "esnli"

# (display name, scores) pairs to compare the approach against. All score
# dictionaries are keyed by model except "Finetune", which is a single flat
# dictionary.
baselines = [
    ("BaseComposition", esnli_no_technique_scores_per_model),
    ("BestOnVal", esnli_optimal_composition_scores_per_model),
    ("BestOnTest", esnli_best_on_test_scores),
    ("Finetune", esnli_finetune_scores),
]

In [None]:
for model in MODELS:
    print(f"\n\n{model}")
    print("=" * 25)

    approach_name = "CompositionPrediction"
    approach_scores = esnli_composition_prediction_scores_per_model[
        f"{model}__{target_dataset_to_evaluate}"
    ]["test_macro_f1_seed_scores"]
    approach_mean = np.mean(approach_scores)

    for baseline_name, baseline_scores_dict in baselines:
        # The finetune baseline is model-independent; all others are stored
        # per model.
        scores_for_model = (
            baseline_scores_dict
            if baseline_name == "Finetune"
            else baseline_scores_dict[model]
        )
        baseline_scores = scores_for_model["test_macro_f1_seed_scores"]
        baseline_mean = np.mean(baseline_scores)

        # Only test when the approach is better on average; otherwise the
        # one-sided question "is it significantly better?" is moot.
        if not approach_mean > baseline_mean:
            print(
                f"Skipped {approach_name} vs. {baseline_name} ({approach_mean} vs. {baseline_mean})"
            )
            continue

        t_results = ttest_function(baseline_scores, approach_scores)
        # correct for one sided test, according to Hitchhiker's guide
        p_value = t_results[1] / 2

        verdict = "significantly" if p_value <= alpha else "NOT significantly"
        print(
            f"{approach_name} is {verdict} better than {baseline_name} with p-value {p_value:.4f} (t-test)."
        )

## Graphs


### Scores


In [None]:
# When True, the score plot also shows the oracle markers and the majority /
# random baseline lines in addition to the always-shown entries.
SHOW_ALL_BASELINES = True

In [None]:
# One box per model over the seed-averaged macro F1 of all its compositions.
fig, ax = plt.subplots(figsize=(15, 15))
f1_scores_per_box = list(esnli_all_f1_scores_test_per_model.values())
box = ax.boxplot(f1_scores_per_box, labels=MODELS)

# Filled per model by the marker loop: legend label -> scatter handle.
legend_entities_handlers_per_model = {}

def _esnli_mark_score(x, y, tag, color, align, marker=None):
    """Draw one score as a scatter point with a short text tag at the same spot.

    ``marker`` is only passed to ``plt.scatter`` when given, so markers left
    unset use matplotlib's default. Returns the scatter handle.
    """
    scatter_kwargs = {"alpha": 0.6, "color": color, "zorder": 3}
    if marker is not None:
        scatter_kwargs["marker"] = marker
    handle = plt.scatter(x, y, **scatter_kwargs)
    plt.text(
        x,
        y,
        tag,
        horizontalalignment=align,
        verticalalignment="bottom",
        color=color,
        zorder=3,
    )
    return handle


for i, model in enumerate(MODELS):
    print("")
    print(model)

    # Box plots sit at x = 1, 2, ...; markers of a model share its x position.
    x_position = i + 1
    model_f1_scores = esnli_all_f1_scores_test_per_model[model]
    composition_names = list(esnli_all_scores_test_per_model[model].keys())

    print("Median:", np.median(model_f1_scores))

    best_composition_i = np.argmax(model_f1_scores)
    worst_composition_i = np.argmin(model_f1_scores)

    # A: best composition on the test set
    best_f1 = model_f1_scores[best_composition_i]
    best_composition_handle = _esnli_mark_score(
        x_position, best_f1, "A", "red", "right"
    )
    print("A ->", f"Best-on-test ({np.round(best_f1, decimals=3)})")

    # B: worst composition on the test set
    worst_f1 = model_f1_scores[worst_composition_i]
    worst_composition_handle = _esnli_mark_score(
        x_position, worst_f1, "B", "blue", "right"
    )
    print("B ->", f"Worst-on-test ({np.round(worst_f1, decimals=3)})")

    # C: base composition (no technique)
    no_technique_f1 = esnli_no_technique_scores_per_model[model]["test_macro_f1"]
    _esnli_mark_score(x_position, no_technique_f1, "C", "black", "left", marker="x")
    print("C ->", f"Base composition ({np.round(no_technique_f1, decimals=3)})")

    # D: composition that was best on the validation set
    optimal_f1 = esnli_optimal_composition_scores_per_model[model]["test_macro_f1"]
    optimal_composition_handle = _esnli_mark_score(
        x_position, optimal_f1, "D", "olive", "left", marker="x"
    )
    print("D ->", f"Best-on-validation ({np.round(optimal_f1, decimals=3)})")

    # E: naive majority ensemble
    ensemble_f1 = esnli_ensemble_scores_per_model[model]["test_macro_f1"]
    _esnli_mark_score(x_position, ensemble_f1, "E", "black", "left", marker="x")
    print("E -> ", f"Majority ensemble ({np.round(ensemble_f1, decimals=3)})")

    # F<k>: composition prediction, one marker per training dataset
    for k, training_dataset in enumerate(TRAINING_DATASETS):
        prediction_f1 = esnli_composition_prediction_scores_per_model[
            f"{model}__{training_dataset}"
        ]["test_macro_f1"]
        composition_prediction_score = np.round(prediction_f1, decimals=3)
        _esnli_mark_score(
            x_position, prediction_f1, f"F{k}", "green", "left", marker="*"
        )
        print(
            f"F{k} ->",
            f"Composition prediction ({training_dataset}) ({composition_prediction_score})",
        )

    if SHOW_ALL_BASELINES:
        # G: oracle
        oracle_f1 = esnli_oracle_scores_per_model[model]["test_macro_f1"]
        _esnli_mark_score(x_position, oracle_f1, "G", "black", "left", marker="x")
        print("G ->", f"Oracle ({np.round(oracle_f1, decimals=3)})")

    # Always in the following order: best-on-test, worst-on-test, best-on-validation
    legend_entities_handlers_per_model[model] = {
        f"bot: {composition_names[best_composition_i]}": best_composition_handle,
        f"wot: {composition_names[worst_composition_i]}": worst_composition_handle,
        f"bov: {esnli_optimal_composition_scores_per_model[model]['composition_name']}": optimal_composition_handle,
    }


# Model-independent scores are drawn as dashed horizontal lines with a text
# label at the left edge. The finetuned model is always shown.
reference_lines = [
    (esnli_finetune_scores["test_macro_f1"], "DeBERTa-v3-large (finetuned)"),
]

# Conditionally show outlier baselines
if SHOW_ALL_BASELINES:
    reference_lines.append(
        (esnli_maj_baseline_f1_macro_averaged_test, "Majority label baseline")
    )
    reference_lines.append(
        (esnli_random_baseline_f1_macro_averaged, "Random baseline")
    )

for reference_f1, reference_name in reference_lines:
    plt.axhline(
        reference_f1,
        color="black",
        linestyle="dashed",
        alpha=0.4,
        zorder=3,
    )
    plt.text(
        0,
        reference_f1,
        f"{reference_name} ({np.round(reference_f1, decimals=3)})",
        horizontalalignment="left",
        verticalalignment="top",
        color="black",
        zorder=3,
    )

# Build the legend column by column: each model contributes an invisible dummy
# handle labelled with the model name, followed by its own entries.
handlers = []
labels = []
for model in MODELS:
    model_legend_entries = legend_entities_handlers_per_model[model]
    dummy_handle = plt.scatter(
        [0], [0], marker="None", linestyle="None", label=f"dummy-{model}"
    )

    handlers += [dummy_handle, *model_legend_entries.values()]
    labels += [model, *model_legend_entries.keys()]

legend = fig.legend(handlers, labels, ncol=len(MODELS), loc="outside lower center")

# Boxes sit at x = 1 .. len(MODELS). Leave one unit of room on each side so
# the reference-line labels at x = 0 stay visible for any number of models.
ax.set_xlim(0, len(MODELS) + 1)
plt.title("ESNLI data")
plt.xlabel("Model")
plt.ylabel("F1 (macro) (over all seeds)")
# Save before plt.show(), which may clear the figure depending on the backend.
plt.savefig("outputs/figures/esnli__performance-box-plot.pdf")
plt.savefig("outputs/figures/esnli__performance-box-plot.svg")
plt.show()

### Composition frequency

_How often was each composition chosen (bar chart with box plot), how often was each technique and combination of techniques chosen (heatmap)_


In [None]:
esnli_composition_counts_per_seed_per_model = {}

for model in esnli_composition_predictions_test_per_model.keys():
    # Keys have the form "<model>__<training dataset>". partition() returns
    # the whole key when the separator is missing, whereas slicing with
    # find() would silently drop the last character (find() returns -1).
    model_name_without_data = model.partition("__")[0]

    # composition name -> one count per seed (in RANDOM_SEED order)
    counts_per_composition = {}
    esnli_composition_counts_per_seed_per_model[model] = counts_per_composition
    for seed in RANDOM_SEED:
        comp_count = Counter(
            esnli_composition_predictions_test_per_model[model][
                f"pred_best_composition_seed{seed}"
            ]
        )
        for composition in esnli_predictions_per_composition_test_per_model[
            model_name_without_data
        ]:
            # A Counter yields 0 for compositions that were never predicted.
            counts_per_composition.setdefault(composition, []).append(
                comp_count[composition]
            )

    # Calculate bar heights (mean) and error bars (distance to min / max)
    compositions = list(counts_per_composition.keys())
    values = [np.mean(counts_per_composition[comp]) for comp in compositions]
    lower_errors = [
        np.mean(counts_per_composition[comp]) - np.min(counts_per_composition[comp])
        for comp in compositions
    ]
    upper_errors = [
        np.max(counts_per_composition[comp]) - np.mean(counts_per_composition[comp])
        for comp in compositions
    ]

    # Combine the data into a list of tuples and sort by values (mean)
    sorted_data = sorted(
        zip(values, lower_errors, upper_errors, compositions),
        key=lambda x: x[0],
        reverse=False,
    )

    # Unpack the sorted data
    values, lower_errors, upper_errors, compositions = zip(*sorted_data)

    # Asymmetric error arrays; currently not drawn, because the horizontal bar
    # chart below is plotted without error bars.
    asymmetric_errors = [lower_errors, upper_errors]

    # Bar chart positions
    x_pos = np.arange(len(compositions))

    # Plot bars
    plt.figure(figsize=(10, 20))
    bars = plt.barh(x_pos, values, align="center", alpha=0.7)

    # Add labels and title
    plt.yticks(x_pos, compositions, ha="right")
    plt.ylabel("Compositions")
    plt.xlabel("Count")
    # NOTE(review): the title hard-codes "five" seeds; confirm it matches
    # len(RANDOM_SEED).
    plt.title(f"{model}: Composition counts (over five random seeds) on esnli")

    plt.savefig(
        f"outputs/figures/esnli__{model}__composition-frequency.pdf",
        bbox_inches="tight",
    )
    plt.savefig(
        f"outputs/figures/esnli__{model}__composition-frequency.png",
        bbox_inches="tight",
    )
    # Show the plot
    plt.show()

In [None]:
# Create the composition frequency table (written as CSV): one row per
# composition, one column per model, cells hold the mean count over seeds.
target_dataset = "esnli"
target_dataset_models = [
    model_key
    for model_key in esnli_composition_counts_per_seed_per_model
    if model_key.endswith(target_dataset)
]

# composition name -> {model key -> mean count over seeds}
composition_counts_mean_per_target_dataset_models = {}
for model in target_dataset_models:
    per_composition_counts = esnli_composition_counts_per_seed_per_model[model]
    for composition_name, composition_counts in per_composition_counts.items():
        composition_counts_mean_per_target_dataset_models.setdefault(
            composition_name, {}
        )[model] = np.mean(composition_counts)


# Make composition names nicer for final table: every "_"-separated technique
# id is replaced by its display name and the parts are joined with ", "
composition_name_reformat_rules = {
    "cot": "Reasoning steps",
    "category-few-shot": "In-context (category)",
    "random-few-shot": "In-context (random)",
    "similar-few-shot": "In-context (similar)",
    "definitions": "Definitions",
    "directional-stimulus": "Dir. stimulus",
    "system-prompts": "Persona",
    "task-description-only": "Base composition",
}

counts_with_updated_composition_names = {}
for raw_name, mean_counts_per_model in (
    composition_counts_mean_per_target_dataset_models.items()
):
    display_parts = []
    for technique_id in raw_name.split("_"):
        display_parts.append(
            composition_name_reformat_rules[technique_id].capitalize()
        )
    counts_with_updated_composition_names[", ".join(display_parts)] = (
        mean_counts_per_model
    )

composition_frequency_output_file = path.join(
    f"outputs/tables/composition-frequencies-{target_dataset}.csv"
)
frequency_table = pd.DataFrame(data=counts_with_updated_composition_names)
frequency_table = frequency_table.transpose().sort_index()
frequency_table.to_csv(composition_frequency_output_file)

In [None]:
# Technique identifiers as they occur inside the predicted composition names.
# NOTE: the first entry used to be misspelled ("cateogory-few-shot"), so it never
# matched any prediction and its heatmap row/column stayed at zero.
techniques = [
    "category-few-shot",
    "cot",
    "definitions",
    "directional-stimulus",
    "random-few-shot",
    "similar-few-shot",
    "system-prompts",
    "task-description-only",
]

# For every model: count, per seed, how often two different techniques appear in
# the same predicted composition, average the counts over the seeds and plot the
# resulting technique x technique matrix as an annotated heatmap.
for model in esnli_composition_predictions_test_per_model.keys():
    per_seed_cooccurrences = []
    for seed in RANDOM_SEED:
        seed_cooccurrences = np.zeros((len(techniques), len(techniques)))
        for pred in esnli_composition_predictions_test_per_model[model][
            f"pred_best_composition_seed{seed}"
        ]:
            # Techniques are detected by substring match on the composition name
            present = [i for i, t in enumerate(techniques) if t in pred]
            for outer in present:
                for inner in present:
                    # A technique does not co-occur with itself (diagonal stays 0)
                    if inner != outer:
                        seed_cooccurrences[outer, inner] += 1
        per_seed_cooccurrences.append(seed_cooccurrences)

    # Element-wise mean over the per-seed matrices
    average_cooccurrences = np.mean(per_seed_cooccurrences, axis=0)

    fig, ax = plt.subplots(figsize=(10, 10))
    im = ax.imshow(average_cooccurrences, cmap="plasma")

    # Show all ticks and label them with the respective list entries
    ax.set_xticks(np.arange(len(techniques)), labels=techniques)
    ax.set_yticks(np.arange(len(techniques)), labels=techniques)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Write every averaged count into its cell (x = column index, y = row index)
    for (i, j), cell_value in np.ndenumerate(average_cooccurrences):
        ax.text(j, i, cell_value, ha="center", va="center", color="w")

    ax.set_title(
        f"{model}: Average (over all seeds) cooccurrence for predicted compositions on ESNLI"
    )
    plt.savefig(f"outputs/figures/esnli__{model}__technique-cooccurrences.pdf")
    plt.show()

# Semeval corpus


In [None]:
# Training dataset name(s); used below to build "<model>__<dataset>" keys and to
# locate the matching composition-prediction run directories
TRAINING_DATASETS = ["semeval"]

# Model identifiers; used below to select prediction files by name
MODELS = ["mistral-7b-instruct-v2", "command-r-v01", "llama3-70b-instruct"]

## Preparation


In [None]:
# Fix NumPy's global random state so the np.random draws further down
# (random baseline, tie breaking) are repeatable
np.random.seed(seed=23)

In [None]:
# Load the SemEval data and keep, for each split, only the post hash and the
# gold label (the "label" column is renamed to "true_label")
data_handler = DataHandler(datasets_to_load=["semeval"])
semeval_train_df, semeval_val_df, semeval_test_df = (
    data_handler.semeval_data[split_name].rename(columns={"label": "true_label"})[
        ["md5_hash", "true_label"]
    ]
    for split_name in ("train", "dev", "test")
)

In [None]:
# Per "<model>__<training dataset>" key: one dataframe for the validation and
# one for the test split, holding a "pred_best_composition_seed<seed>" column
# for every seed.
semeval_composition_predictions_val_per_model = {}
semeval_composition_predictions_test_per_model = {}

output_dir = path.join("outputs/composition-predictions")


def _load_semeval_seed_predictions(
    output_dir, model, training_dataset, seed, results_file
):
    """Read one seed's result file of the composition predictor.

    Picks the first (alphabetically sorted) directory below `output_dir` whose
    name contains both the run name for `model`/`training_dataset` and
    "seed<seed>", reads `results_file` from it, adds a "post_id" column (MD5 hex
    digest of the "input" text) and renames "pred_best_composition" to
    "pred_best_composition_seed<seed>".

    Raises:
        IndexError: if no directory matches.
    """
    run_name = f"deberta-v3-large_composition-prediction-for-{model}-on-{training_dataset}"
    seed_tag = f"seed{seed}"
    seed_dir = [
        entry
        for entry in sorted(listdir(output_dir))
        if run_name in entry and seed_tag in entry
    ][0]

    seed_df = pd.read_parquet(path.join(output_dir, seed_dir, results_file))
    seed_df["post_id"] = seed_df.input.apply(
        lambda text: hashlib.md5(text.encode()).hexdigest()
    )
    return seed_df.rename(
        columns={"pred_best_composition": f"pred_best_composition_seed{seed}"}
    )


for model in MODELS:
    for training_dataset in TRAINING_DATASETS:
        model_key = f"{model}__{training_dataset}"

        # Validation set: seed columns are attached with a left merge on post_id
        composition_predictions_val = pd.DataFrame()
        for seed in RANDOM_SEED:
            df = _load_semeval_seed_predictions(
                output_dir, model, training_dataset, seed, "semeval_val_results.parquet"
            )

            # The first seed provides the input texts and their ids
            if "input" not in composition_predictions_val.columns:
                composition_predictions_val["input"] = df["input"]
                composition_predictions_val["post_id"] = df["post_id"]

            composition_predictions_val = composition_predictions_val.merge(
                df.drop(
                    columns=["input", "index", "pred_probabilities"], errors="ignore"
                ),
                on="post_id",
                how="left",
            )

        semeval_composition_predictions_val_per_model[model_key] = (
            composition_predictions_val
        )

        # Test set
        composition_predictions_test = pd.DataFrame()
        for seed in RANDOM_SEED:
            df = _load_semeval_seed_predictions(
                output_dir, model, training_dataset, seed, "semeval_test_results.parquet"
            )

            if "input" not in composition_predictions_test.columns:
                composition_predictions_test["input"] = df["input"]
                composition_predictions_test["post_id"] = df["post_id"]

            # Since we have duplicates, we need to sort them first, then merge them
            # (cannot use df.merge properly)
            # NOTE(review): assigning a Series as a column aligns on the index
            # labels, so values are matched by original row label, not by sorted
            # position - confirm that all seed files share the same row order.
            seed_column = f"pred_best_composition_seed{seed}"
            df_sorted = df.sort_values(by="post_id")
            composition_predictions_test = composition_predictions_test.sort_values(
                by="post_id"
            )
            composition_predictions_test[seed_column] = df_sorted[seed_column]

        semeval_composition_predictions_test_per_model[model_key] = (
            composition_predictions_test
        )

In [None]:
# Load composition-specific prediction files:
# model -> composition name -> dataframe, separately for validation and test
semeval_output_dir = "outputs/prompt-predictions/semeval"
semeval_predictions_per_composition_val_per_model = {}
semeval_predictions_per_composition_test_per_model = {}

for model in MODELS:
    # Files containing "dev" feed the validation dictionary, files containing
    # "test" the test dictionary
    for split_tag, per_model_target in (
        ("dev", semeval_predictions_per_composition_val_per_model),
        ("test", semeval_predictions_per_composition_test_per_model),
    ):
        split_predictions = {}

        for f in sorted(listdir(semeval_output_dir)):
            if split_tag not in f or model not in f:
                continue

            # Strip the file-name prefix and extension to get the composition name;
            # chain-of-thought files keep a leading "cot_" marker
            if "cot" in f:
                prefix = f"semeval-cot-greedy-{split_tag}_{model}_"
                replacement = "cot_"
            else:
                prefix = f"semeval-greedy-{split_tag}_{model}_"
                replacement = ""
            composition_name = f.replace(prefix, replacement).replace(".parquet", "")

            df = pd.read_parquet(path.join(semeval_output_dir, f))
            # MD5 hex digest of the input text serves as the post id
            df["post_id"] = df.input.apply(
                lambda text: hashlib.md5(text.encode()).hexdigest()
            )
            split_predictions[composition_name] = df

        per_model_target[model] = split_predictions

In [None]:
def _print_semeval_label_balance(split_title, split_df):
    """Print how many rows of `split_df` carry true_label 1 and 0.

    Each count is followed by its share of all rows of the split, rounded to
    three decimals. Returns the pair (positive count, negative count).
    """
    print(f"## {split_title} split")
    n_rows = len(split_df)
    n_positive = len(split_df[split_df.true_label == 1])
    n_negative = len(split_df[split_df.true_label == 0])
    print(
        f"Positive label: {n_positive} ({np.round(n_positive / n_rows, decimals=3)})"
    )
    print(
        f"Negative label: {n_negative} ({np.round(n_negative / n_rows, decimals=3)})"
    )
    return n_positive, n_negative


positive_instances_train, negative_instances_train = _print_semeval_label_balance(
    "Training", semeval_train_df
)

positive_instances_val, negative_instances_val = _print_semeval_label_balance(
    "Validation", semeval_val_df
)

positive_instances_test, negative_instances_test = _print_semeval_label_balance(
    "Test", semeval_test_df
)

## Adaptive prompting evaluation


In [None]:
# Adaptive prompting: for every post, take the label produced by the composition
# that the composition predictor selected for that post, then score the result
# per seed and average the per-seed macro scores.
semeval_composition_prediction_scores_per_model = {}


def _semeval_adaptive_labels(
    selection_df, predictions_per_composition, seed, skip_missing
):
    """Collect gold and predicted labels for one seed.

    Args:
        selection_df: dataframe with a "post_id" column and a
            "pred_best_composition_seed<seed>" column naming the composition
            selected for each row.
        predictions_per_composition: mapping composition name -> dataframe with
            "post_id", "true_label" and "output_<seed>" columns.
        seed: seed whose selection and output columns are read.
        skip_missing: if True, rows whose post id is absent from the selected
            composition's dataframe are skipped and counted; if False they raise.

    Returns:
        Tuple (y_true, y_pred, n_skipped).

    Raises:
        IndexError: if a post id has no match and `skip_missing` is False.
    """
    y_true, y_pred, n_skipped = [], [], 0
    selection_column = f"pred_best_composition_seed{seed}"
    for _, row in selection_df.iterrows():
        preds = predictions_per_composition[row[selection_column]]

        # Filter once per row instead of once per accessed field
        matching = preds[preds.post_id == row.post_id]
        if skip_missing and len(matching) == 0:
            n_skipped += 1
            continue
        first_match = matching.iloc[0]

        # If we don't have predictions for other seeds, use the primary seed
        if f"output_{seed}" in preds.columns:
            output_column = f"output_{seed}"
        else:
            output_column = "output_23"

        y_pred.append(first_match[output_column])
        y_true.append(first_match["true_label"])
    return y_true, y_pred, n_skipped


for model in semeval_composition_predictions_val_per_model.keys():
    model_name_without_data = model[: model.find("__")]

    # Validation split (scores are collected but not reported below)
    all_seed_scores_val = []
    for seed in RANDOM_SEED:
        y_true_seed, y_pred_seed, _ = _semeval_adaptive_labels(
            semeval_composition_predictions_val_per_model[model],
            semeval_predictions_per_composition_val_per_model[
                model_name_without_data
            ],
            seed,
            skip_missing=False,
        )
        scores = precision_recall_fscore_support(
            y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
        )
        all_seed_scores_val.append(scores)

    # Test split
    all_seed_scores_test = []
    for seed in RANDOM_SEED:
        # TODO: REMOVE; ONLY TEMPORARY FIX FOR BROKEN DATA
        # (skipping posts without predictions has no impact on complete data)
        y_true_seed, y_pred_seed, skipped_posts = _semeval_adaptive_labels(
            semeval_composition_predictions_test_per_model[model],
            semeval_predictions_per_composition_test_per_model[
                model_name_without_data
            ],
            seed,
            skip_missing=True,
        )
        if skipped_posts:
            # Make dropped rows visible instead of ignoring them silently
            print(
                f"Warning: {model}, seed {seed}: skipped {skipped_posts} test posts without predictions"
            )

        scores = precision_recall_fscore_support(
            y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
        )
        all_seed_scores_test.append(scores)

    # Macro score per seed = mean of the per-class values at positions 0 and 1;
    # stored both as list of per-seed values and as their mean
    model_scores = {}
    for metric_idx, metric_name in enumerate(("precision", "recall", "f1")):
        per_seed_macro = [
            ((seed_scores[metric_idx][0] + seed_scores[metric_idx][1]) / 2)
            for seed_scores in all_seed_scores_test
        ]
        model_scores[f"test_macro_{metric_name}"] = np.mean(per_seed_macro)
        model_scores[f"test_macro_{metric_name}_seed_scores"] = per_seed_macro
    semeval_composition_prediction_scores_per_model[model] = model_scores

    print(model, "=" * 50)
    print("Averaged scores")

    print(
        "Precision (macro) (over all seeds):",
        semeval_composition_prediction_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "Recall (macro) (over all seeds):",
        semeval_composition_prediction_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "F1 (macro) (over all seeds):",
        semeval_composition_prediction_scores_per_model[model]["test_macro_f1"],
    )

## Baselines


In [None]:
# Scores of every single composition:
# model -> composition name -> list with one sklearn score tuple per seed
semeval_all_scores_val_per_model = {}
semeval_all_scores_test_per_model = {}


def _semeval_scores_for_all_seeds(predictions):
    """Score one composition's predictions once per entry of RANDOM_SEED.

    Uses the "output_<seed>" column of each seed. If a column lookup raises
    KeyError, the first seed's column is scored once per seed instead.
    """

    def _score(output_column):
        return precision_recall_fscore_support(
            y_true=predictions["true_label"],
            y_pred=predictions[output_column],
            pos_label=1,
        )

    try:
        return [_score(f"output_{seed}") for seed in RANDOM_SEED]
    # In some cases, we don't have more than one seed, so we fall back to the primary seed
    except KeyError:
        return [_score(f"output_{RANDOM_SEED[0]}") for _ in RANDOM_SEED]


for model in MODELS:
    # Validation split
    semeval_all_scores_val_per_model[model] = {
        name: _semeval_scores_for_all_seeds(predictions)
        for name, predictions in semeval_predictions_per_composition_val_per_model[
            model
        ].items()
    }

    # Test split
    semeval_all_scores_test_per_model[model] = {
        name: _semeval_scores_for_all_seeds(predictions)
        for name, predictions in semeval_predictions_per_composition_test_per_model[
            model
        ].items()
    }

# Macro F1 on the test split for every composition: per-seed values and their mean
semeval_all_f1_scores_test_per_model = {}
semeval_all_f1_scores_test_per_model_seed_scores = {}
for model in MODELS:
    f1_per_composition_and_seed = [
        [((seed_scores[2][0] + seed_scores[2][1]) / 2) for seed_scores in per_seed]
        for per_seed in semeval_all_scores_test_per_model[model].values()
    ]
    semeval_all_f1_scores_test_per_model[model] = [
        np.mean(seed_values) for seed_values in f1_per_composition_and_seed
    ]
    semeval_all_f1_scores_test_per_model_seed_scores[model] = (
        f1_per_composition_and_seed
    )

### Trivial baselines


In [None]:
# Majority baseline: predict the most frequent test label for every test row
majority_label = semeval_test_df.true_label.mode()[0]
majority_baseline_pred = [majority_label] * len(semeval_test_df)

semeval_maj_baseline_scores = precision_recall_fscore_support(
    y_true=semeval_test_df["true_label"], y_pred=majority_baseline_pred, pos_label=1
)

# Macro value = mean of the entries at positions 0 and 1 of the precision,
# recall and F1 arrays (the first three items of the score tuple)
(
    semeval_maj_baseline_precision_macro_averaged_test,
    semeval_maj_baseline_recall_macro_averaged_test,
    semeval_maj_baseline_f1_macro_averaged_test,
) = (
    (per_class[0] + per_class[1]) / 2 for per_class in semeval_maj_baseline_scores[:3]
)

print(
    "Precision (macro) (over all seeds):",
    semeval_maj_baseline_precision_macro_averaged_test,
)
print(
    "Recall (macro) (over all seeds):", semeval_maj_baseline_recall_macro_averaged_test
)
print("F1 (macro) (over all seeds):", semeval_maj_baseline_f1_macro_averaged_test)

In [None]:
# Random baseline: draw 0 or 1 for every test row from NumPy's global random state
random_baseline_pred = np.random.randint(0, 2, size=len(semeval_test_df))

semeval_random_baseline_scores = precision_recall_fscore_support(
    y_true=semeval_test_df["true_label"], y_pred=random_baseline_pred, pos_label=1
)

# Macro value = mean of the entries at positions 0 and 1 of the precision,
# recall and F1 arrays (the first three items of the score tuple)
(
    semeval_random_baseline_precision_macro_averaged,
    semeval_random_baseline_recall_macro_averaged,
    semeval_random_baseline_f1_macro_averaged,
) = (
    (per_class[0] + per_class[1]) / 2
    for per_class in semeval_random_baseline_scores[:3]
)

print(
    "Precision (macro) (over all seeds):",
    semeval_random_baseline_precision_macro_averaged,
)
print("Recall (macro) (over all seeds):", semeval_random_baseline_recall_macro_averaged)
print("F1 (macro) (over all seeds):", semeval_random_baseline_f1_macro_averaged)

### Oracle

_Always chooses the correct label if it was predicted by at least one of the compositions; only chooses the wrong label if no composition predicted the correct label_


In [None]:
semeval_oracle_scores_per_model = {}


def _semeval_oracle_label(row):
    """Pick the oracle label for one row.

    The row holds the true label first, followed by one prediction per
    composition. The true label is returned if any prediction equals it;
    otherwise the first composition's prediction is returned.
    """
    composition_outputs = row.values[1:]
    if row["true_label"] in composition_outputs:
        return row["true_label"]
    return row.values[1]


for model in MODELS:
    # Compiling oracle predictions: one "output_<seed>" column per seed
    oracle_predictions_test = pd.DataFrame()
    for seed in RANDOM_SEED:
        # Gather the outputs of all compositions for this seed side by side
        all_seed_predictions = pd.DataFrame()
        for composition, df in semeval_predictions_per_composition_test_per_model[
            model
        ].items():
            for shared_column in ("input", "true_label"):
                if shared_column not in all_seed_predictions.columns:
                    all_seed_predictions[shared_column] = df[shared_column]

            # Compositions without a column for this seed contribute the
            # first seed's output instead
            if f"output_{seed}" in df.columns:
                source_column = f"output_{seed}"
            else:
                source_column = f"output_{RANDOM_SEED[0]}"
            all_seed_predictions[f"{composition}_{seed}"] = df[source_column]

        for shared_column in ("input", "true_label"):
            if shared_column not in oracle_predictions_test.columns:
                oracle_predictions_test[shared_column] = all_seed_predictions[
                    shared_column
                ]

        # Without "input", the true label is the first value of every row and
        # the remaining values are the composition outputs
        labels_and_outputs = all_seed_predictions.drop(
            columns=["input"], errors="ignore"
        )
        oracle_predictions_test[f"output_{seed}"] = labels_and_outputs.apply(
            _semeval_oracle_label, axis=1
        )

    # Calculating scores
    oracle_seed_scores_test = []
    for seed in RANDOM_SEED:
        oracle_seed_scores_test.append(
            precision_recall_fscore_support(
                y_true=oracle_predictions_test["true_label"],
                y_pred=oracle_predictions_test[f"output_{seed}"],
                pos_label=1,
            )
        )

    # Mean over seeds of the per-seed macro value (mean of positions 0 and 1)
    semeval_oracle_scores_per_model[model] = {
        f"test_macro_{metric_name}": np.mean(
            [
                ((seed_scores[metric_idx][0] + seed_scores[metric_idx][1]) / 2)
                for seed_scores in oracle_seed_scores_test
            ]
        )
        for metric_idx, metric_name in enumerate(("precision", "recall", "f1"))
    }

    print(model, "=" * 50)
    print(
        "Oracle Precision (macro):",
        semeval_oracle_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "Oracle Recall (macro):",
        semeval_oracle_scores_per_model[model]["test_macro_recall"],
    )
    print("Oracle F1 (macro):", semeval_oracle_scores_per_model[model]["test_macro_f1"])

### No technique

_Task description and input text only_


In [None]:
# Test scores of the "task-description-only" composition for every model
semeval_no_technique_scores_per_model = {}

for model in MODELS:
    # Position of the composition among the scored ones (not used further here)
    no_technique_i = list(semeval_all_scores_test_per_model[model].keys()).index(
        "task-description-only"
    )

    no_technique_seed_scores = semeval_all_scores_test_per_model[model][
        "task-description-only"
    ]

    # For each metric: per-seed macro values (mean of positions 0 and 1) and
    # their mean over the seeds
    model_scores = {}
    for metric_idx, metric_name in enumerate(("precision", "recall", "f1")):
        per_seed_macro = [
            ((seed_scores[metric_idx][0] + seed_scores[metric_idx][1]) / 2)
            for seed_scores in no_technique_seed_scores
        ]
        model_scores[f"test_macro_{metric_name}"] = np.mean(per_seed_macro)
        model_scores[f"test_macro_{metric_name}_seed_scores"] = per_seed_macro
    semeval_no_technique_scores_per_model[model] = model_scores

    print(model, "=" * 50)
    print(
        "No technique Precision (macro):",
        semeval_no_technique_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "No technique Recall (macro):",
        semeval_no_technique_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "No technique F1 (macro):",
        semeval_no_technique_scores_per_model[model]["test_macro_f1"],
    )

### Optimal composition

_Best composition on the validation set in terms of f1 macro score, evaluated on the test set for precision, recall and f1_


In [None]:
# Optimal composition: select the composition with the highest mean macro F1 on
# the validation split and report its test scores.
semeval_optimal_composition_scores_per_model = {}

for model in MODELS:
    val_scores_per_composition = semeval_all_scores_val_per_model[model]
    test_scores_per_composition = semeval_all_scores_test_per_model[model]

    # Validation split: mean (over seeds) macro score of every composition
    semeval_optimal_precision_macro_averaged_scores_val = [
        np.mean([((seed_scores[0][0] + seed_scores[0][1]) / 2) for seed_scores in v])
        for v in val_scores_per_composition.values()
    ]
    semeval_optimal_recall_macro_averaged_scores_val = [
        np.mean([((seed_scores[1][0] + seed_scores[1][1]) / 2) for seed_scores in v])
        for v in val_scores_per_composition.values()
    ]
    semeval_optimal_f1_macro_averaged_scores_val = [
        np.mean([((seed_scores[2][0] + seed_scores[2][1]) / 2) for seed_scores in v])
        for v in val_scores_per_composition.values()
    ]
    # Test split: same aggregation
    semeval_optimal_precision_macro_averaged_scores_test = [
        np.mean([((seed_scores[0][0] + seed_scores[0][1]) / 2) for seed_scores in v])
        for v in test_scores_per_composition.values()
    ]
    semeval_optimal_recall_macro_averaged_scores_test = [
        np.mean([((seed_scores[1][0] + seed_scores[1][1]) / 2) for seed_scores in v])
        for v in test_scores_per_composition.values()
    ]
    semeval_optimal_f1_macro_averaged_scores_test = [
        np.mean([((seed_scores[2][0] + seed_scores[2][1]) / 2) for seed_scores in v])
        for v in test_scores_per_composition.values()
    ]

    # Composition with the best validation macro F1
    semeval_optimal_composition_val_f1_macro_i = np.argmax(
        semeval_optimal_f1_macro_averaged_scores_val
    )
    semeval_optimal_composition_name = list(val_scores_per_composition.keys())[
        semeval_optimal_composition_val_f1_macro_i
    ]

    # Look the test scores up by composition NAME. Indexing the test lists with
    # the validation position is only correct if both dictionaries list the same
    # compositions in the same order, and would otherwise disagree with the
    # per-seed scores (which were already selected by name).
    optimal_test_seed_scores = test_scores_per_composition[
        semeval_optimal_composition_name
    ]

    optimal_scores = {"composition_name": semeval_optimal_composition_name}
    for metric_idx, metric_name in enumerate(("precision", "recall", "f1")):
        per_seed_macro = [
            ((seed_scores[metric_idx][0] + seed_scores[metric_idx][1]) / 2)
            for seed_scores in optimal_test_seed_scores
        ]
        optimal_scores[f"test_macro_{metric_name}"] = np.mean(per_seed_macro)
        optimal_scores[f"test_macro_{metric_name}_seed_scores"] = per_seed_macro
    semeval_optimal_composition_scores_per_model[model] = optimal_scores

    print(model, "=" * 50)
    print(
        "Optimal validation composition:",
        semeval_optimal_composition_scores_per_model[model]["composition_name"],
    )
    print(
        "Optimal composition Precision (macro):",
        semeval_optimal_composition_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "Optimal composition Recall (macro):",
        semeval_optimal_composition_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "Optimal composition F1 (macro):",
        semeval_optimal_composition_scores_per_model[model]["test_macro_f1"],
    )

### Component ensemble


In [None]:
# Majority-vote ensemble over all prompt compositions.
# For every model and seed, each composition contributes one vote per test
# instance; the most frequent label becomes the ensemble prediction, which is
# then scored against the true labels. Scores are averaged over seeds.
semeval_ensemble_scores_per_model = {}

# Columns of the per-seed frame that hold metadata instead of votes.
_ensemble_metadata_columns = ["input", "post_id", "true_label"]

for model in MODELS:
    all_seed_scores = []

    for seed in RANDOM_SEED:
        seed_df = pd.DataFrame()

        for (
            composition_name,
            comp_preds,
        ) in semeval_predictions_per_composition_test_per_model[model].items():
            # The first composition provides the shared metadata columns.
            if "input" not in seed_df.columns:
                seed_df["input"] = comp_preds["input"]
                seed_df["post_id"] = comp_preds["post_id"]
                seed_df["true_label"] = comp_preds["true_label"]

            # We need to work on sorted frames, due to duplicates...
            # NOTE(review): the column assignment below aligns on the index,
            # not on position, so these sorts only change the row order --
            # confirm all composition frames share the same index/row layout.
            comp_preds_sorted = comp_preds.sort_values(by="post_id")
            seed_df = seed_df.sort_values(by="post_id")

            # Frames without an `output_{seed}` column fall back to the
            # outputs of the first seed.
            try:
                seed_df[f"{composition_name}_{seed}"] = comp_preds_sorted[
                    f"output_{seed}"
                ]
            except KeyError:
                seed_df[f"{composition_name}_{seed}"] = comp_preds_sorted[
                    f"output_{RANDOM_SEED[0]}"
                ]

        # Row-wise mode over the vote columns. When at least one row has a tie,
        # a second column exists; it is NaN for rows without a tie.
        mode = seed_df.loc[
            :,
            [c for c in seed_df.columns if c not in _ensemble_metadata_columns],
        ].mode(axis=1)
        if len(mode.columns) == 1:
            # No tie exists in any row
            seed_df["majority"] = mode[0]
        else:
            # Break every tie with its own random label (0 or 1). Drawing one
            # value per row avoids assigning the same random label to all tied
            # rows of a seed, which a single scalar draw would do.
            seed_df["majority"] = np.where(
                mode[1].isna(),
                mode[0],
                np.random.randint(2, size=len(mode)),
            )

        y_true_seed = seed_df["true_label"]
        y_pred_seed = seed_df["majority"]

        # Without `average`, the result holds per-class arrays:
        # (precision, recall, f1, support).
        scores = precision_recall_fscore_support(
            y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
        )
        all_seed_scores.append(scores)

    # Macro score per seed = mean of the two class-wise scores; the stored
    # value is the average of these macro scores over all seeds.
    semeval_ensemble_scores_per_model[model] = {
        "test_macro_precision": np.mean(
            [
                ((seed_scores[0][0] + seed_scores[0][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
        "test_macro_recall": np.mean(
            [
                ((seed_scores[1][0] + seed_scores[1][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
        "test_macro_f1": np.mean(
            [
                ((seed_scores[2][0] + seed_scores[2][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
    }

    print(model, "=" * 50)
    print("Averaged scores")
    print(
        "Precision (macro) (over all seeds):",
        semeval_ensemble_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "Recall (macro) (over all seeds):",
        semeval_ensemble_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "F1 (macro) (over all seeds):",
        semeval_ensemble_scores_per_model[model]["test_macro_f1"],
    )

### Finetuning baselines


In [None]:
output_dir = "results"

# Score the fine-tuned DeBERTa baseline once per random seed.
all_seed_scores = []
for seed in RANDOM_SEED:
    model_name = f"deberta-v3-large-finetune_20240925121914_semeval-seed{seed}"
    _predictions_file = f"semeval-test_predictions-{model_name}.parquet"
    seed_df = pd.read_parquet(path.join(output_dir, _predictions_file))

    # Attach the predictions to the reference test split via the text hash.
    seed_df = pd.merge(semeval_test_df, seed_df, on="md5_hash", how="left")

    scores = precision_recall_fscore_support(
        y_true=seed_df["true_label"],
        y_pred=seed_df[f"prediction_{model_name}"],
        pos_label=1,
    )
    all_seed_scores.append(scores)


# `scores` holds per-class arrays in the order precision, recall, f1; the macro
# value of a seed is the mean over the two classes. For every metric both the
# seed-averaged value and the individual seed values are stored.
semeval_finetune_scores = {}
for _metric_i, _metric_name in enumerate(["precision", "recall", "f1"]):
    _macro_per_seed = [
        ((seed_scores[_metric_i][0] + seed_scores[_metric_i][1]) / 2)
        for seed_scores in all_seed_scores
    ]
    semeval_finetune_scores[f"test_macro_{_metric_name}"] = np.mean(_macro_per_seed)
    semeval_finetune_scores[f"test_macro_{_metric_name}_seed_scores"] = (
        _macro_per_seed
    )

print("=" * 50)
print("Averaged scores")
for _report_label, _score_key in [
    ("Precision (macro) (over all seeds):", "test_macro_precision"),
    ("Recall (macro) (over all seeds):", "test_macro_recall"),
    ("F1 (macro) (over all seeds):", "test_macro_f1"),
]:
    print(_report_label, semeval_finetune_scores[_score_key])

## Significance tests


In [None]:
from scipy.stats import ttest_rel

In [None]:
# Prepare best-on-test data: for every model, keep the per-seed F1 scores of
# the composition with the highest entry in its test F1 list.
semeval_best_on_test_scores = {}
for model in MODELS:
    semeval_best_composition = np.argmax(semeval_all_f1_scores_test_per_model[model])
    _seed_scores_per_composition = semeval_all_f1_scores_test_per_model_seed_scores[
        model
    ]
    semeval_best_on_test_scores[model] = dict(
        test_macro_f1_seed_scores=_seed_scores_per_composition[
            semeval_best_composition
        ]
    )

In [None]:
# Dataset suffix used to look up the composition-prediction scores.
target_dataset_to_evaluate = "semeval"

# Paired t-test and the significance level it is compared against.
ttest_function = ttest_rel
alpha = 0.05

# (display name, score container) pairs the approach is compared against.
# All containers are indexed by model first, except "Finetune", which directly
# holds the score entries.
baselines = [
    ("BaseComposition", semeval_no_technique_scores_per_model),
    ("BestOnVal", semeval_optimal_composition_scores_per_model),
    ("BestOnTest", semeval_best_on_test_scores),
    ("Finetune", semeval_finetune_scores),
]

In [None]:
# For every model, test whether composition prediction beats each baseline
# on the per-seed macro F1 scores.
for model in MODELS:
    print(f"\n\n{model}")
    print("=" * 25)

    approach_name = "CompositionPrediction"
    _approach_key = f"{model}__{target_dataset_to_evaluate}"
    approach_scores = semeval_composition_prediction_scores_per_model[_approach_key][
        "test_macro_f1_seed_scores"
    ]
    _approach_mean = np.mean(approach_scores)

    for baseline_name, baseline_scores_dict in baselines:
        # The fine-tuning baseline is not keyed by model.
        _score_entry = (
            baseline_scores_dict
            if baseline_name == "Finetune"
            else baseline_scores_dict[model]
        )
        baseline_scores = _score_entry["test_macro_f1_seed_scores"]
        _baseline_mean = np.mean(baseline_scores)

        # Only test pairs where the approach is better on average.
        if not _approach_mean > _baseline_mean:
            print(
                f"Skipped {approach_name} vs. {baseline_name} ({_approach_mean} vs. {_baseline_mean})"
            )
            continue

        t_results = ttest_function(baseline_scores, approach_scores)
        # correct for one sided test, according to Hitchhiker's guide
        p_value = t_results[1] / 2

        _verdict = "significantly" if p_value <= alpha else "NOT significantly"
        print(
            f"{approach_name} is {_verdict} better than {baseline_name} with p-value {p_value:.4f} (t-test)."
        )

## Graphs


### Scores


In [None]:
# Toggle for the score plot: when True, the oracle marker and the
# majority-label / random baseline lines are drawn as well.
SHOW_ALL_BASELINES = True

In [None]:
def _mark_score(
    x_pos,
    score,
    tag,
    description,
    color,
    marker=None,
    text_align="left",
    key_prefix=None,
):
    """Draw one tagged score marker on the current axes and print its key.

    Args:
        x_pos: x position of the marker (the box the score belongs to).
        score: y value of the marker.
        tag: Short label drawn next to the marker.
        description: Name printed for the tag, followed by the rounded score.
        color: Colour of both marker and tag.
        marker: Marker style; matplotlib's default is used when omitted.
        text_align: Horizontal alignment of the tag.
        key_prefix: First item of the printed key line; defaults to "<tag> ->".

    Returns:
        The handle returned by `plt.scatter`.
    """
    scatter_kwargs = {"alpha": 0.6, "color": color, "zorder": 3}
    if marker is not None:
        scatter_kwargs["marker"] = marker
    handle = plt.scatter(x_pos, score, **scatter_kwargs)
    plt.text(
        x_pos,
        score,
        tag,
        horizontalalignment=text_align,
        verticalalignment="bottom",
        color=color,
        zorder=3,
    )
    print(
        f"{tag} ->" if key_prefix is None else key_prefix,
        f"{description} ({np.round(score, decimals=3)})",
    )
    return handle


def _mark_reference_line(score, description):
    """Draw a dashed horizontal line at `score`, labelled with the rounded value."""
    plt.axhline(
        score,
        color="black",
        linestyle="dashed",
        alpha=0.4,
        zorder=3,
    )
    plt.text(
        0,
        score,
        f"{description} ({np.round(score, decimals=3)})",
        horizontalalignment="left",
        verticalalignment="top",
        color="black",
        zorder=3,
    )


fig, ax = plt.subplots(figsize=(15, 15))
# Build the box data in MODELS order: tick labels and the scatter positions
# below (i + 1) follow MODELS, so relying on the dict's own iteration order
# could attach a box to the wrong model.
box = ax.boxplot(
    [semeval_all_f1_scores_test_per_model[model] for model in MODELS],
    labels=MODELS,
)
legend_entities_handlers_per_model = {}

for i, model in enumerate(MODELS):
    model_f1_scores = semeval_all_f1_scores_test_per_model[model]

    print("")
    print(model)

    print("Median:", np.median(model_f1_scores))

    best_composition_i = np.argmax(model_f1_scores)
    worst_composition_i = np.argmin(model_f1_scores)

    # A: best composition on the test set
    best_composition_handle = _mark_score(
        i + 1,
        model_f1_scores[best_composition_i],
        "A",
        "Best-on-test",
        "red",
        text_align="right",
    )

    # B: worst composition on the test set
    worst_composition_handle = _mark_score(
        i + 1,
        model_f1_scores[worst_composition_i],
        "B",
        "Worst-on-test",
        "blue",
        text_align="right",
    )

    # C: no-technique (base) composition
    _mark_score(
        i + 1,
        semeval_no_technique_scores_per_model[model]["test_macro_f1"],
        "C",
        "Base composition",
        "black",
        marker="x",
    )

    # D: composition that was optimal on the validation set
    optimal_composition_handle = _mark_score(
        i + 1,
        semeval_optimal_composition_scores_per_model[model]["test_macro_f1"],
        "D",
        "Best-on-validation",
        "olive",
        marker="x",
    )

    # E: naive majority ensemble (key prefix kept exactly as printed before)
    _mark_score(
        i + 1,
        semeval_ensemble_scores_per_model[model]["test_macro_f1"],
        "E",
        "Majority ensemble",
        "black",
        marker="x",
        key_prefix="E -> ",
    )

    # F<k>: composition prediction, one marker per training dataset
    for k, training_dataset in enumerate(TRAINING_DATASETS):
        _mark_score(
            i + 1,
            semeval_composition_prediction_scores_per_model[
                f"{model}__{training_dataset}"
            ]["test_macro_f1"],
            f"F{k}",
            f"Composition prediction ({training_dataset})",
            "green",
            marker="*",
        )

    if SHOW_ALL_BASELINES:
        # G: oracle
        _mark_score(
            i + 1,
            semeval_oracle_scores_per_model[model]["test_macro_f1"],
            "G",
            "Oracle",
            "black",
            marker="x",
        )

    # Always in the following order: best-on-test, worst-on-test, best-on-validation
    composition_names = list(semeval_all_scores_test_per_model[model].keys())
    legend_entities_handlers_per_model[model] = dict(
        zip(
            [
                f"bot: {composition_names[best_composition_i]}",
                f"wot: {composition_names[worst_composition_i]}",
                f"bov: {semeval_optimal_composition_scores_per_model[model]['composition_name']}",
            ],
            [
                best_composition_handle,
                worst_composition_handle,
                optimal_composition_handle,
            ],
        )
    )


# Fine-tuned model as a horizontal reference line
_mark_reference_line(
    semeval_finetune_scores["test_macro_f1"], "DeBERTa-v3-large (finetuned)"
)

# Conditionally show outlier baselines
if SHOW_ALL_BASELINES:
    _mark_reference_line(
        semeval_maj_baseline_f1_macro_averaged_test, "Majority label baseline"
    )
    _mark_reference_line(semeval_random_baseline_f1_macro_averaged, "Random baseline")

handlers = []
labels = []
for model in MODELS:
    # Add handlers, with first being dummy handler
    handlers.append(
        plt.scatter([0], [0], marker="None", linestyle="None", label=f"dummy-{model}")
    )
    handlers.extend(list(legend_entities_handlers_per_model[model].values()))

    # Add labels, with first being model label
    labels.append(model)
    labels.extend(list(legend_entities_handlers_per_model[model].keys()))

legend = fig.legend(handlers, labels, ncol=len(MODELS), loc="outside lower center")

# Boxes sit at x = 1..len(MODELS); leave one unit of padding on each side.
ax.set_xlim(0, len(MODELS) + 1)
plt.title("SemEval data")
plt.xlabel("Model")
plt.ylabel("F1 (macro) (over all seeds)")
plt.savefig("outputs/figures/semeval__performance-box-plot.pdf")
plt.savefig("outputs/figures/semeval__performance-box-plot.svg")
plt.show()

### Composition frequency

_How often was each composition chosen (bar chart with box plot), and how often was each technique and combination of techniques chosen (heatmap)?_


In [None]:
# For every composition-prediction model, count per seed how often each
# composition was predicted, then plot the mean counts as horizontal bars.
semeval_composition_counts_per_seed_per_model = {}

for model in semeval_composition_predictions_test_per_model.keys():
    # Keys look like "<model>__<training dataset>"; strip the dataset suffix.
    model_name_without_data = model[: model.find("__")]

    counts_per_composition = {}
    semeval_composition_counts_per_seed_per_model[model] = counts_per_composition
    for seed in RANDOM_SEED:
        comp_count = Counter(
            semeval_composition_predictions_test_per_model[model][
                f"pred_best_composition_seed{seed}"
            ]
        )
        for composition in semeval_predictions_per_composition_test_per_model[
            model_name_without_data
        ]:
            # A Counter yields 0 for compositions that were never predicted,
            # so every composition receives exactly one entry per seed.
            counts_per_composition.setdefault(composition, []).append(
                comp_count[composition]
            )

    # Calculate bar heights (mean) and the distance to the min/max count
    compositions = list(counts_per_composition.keys())
    per_seed_counts = [counts_per_composition[comp] for comp in compositions]
    values = [np.mean(counts) for counts in per_seed_counts]
    lower_errors = [np.mean(counts) - np.min(counts) for counts in per_seed_counts]
    upper_errors = [np.max(counts) - np.mean(counts) for counts in per_seed_counts]

    # Sort all four sequences together by ascending mean count
    sorted_data = sorted(
        zip(values, lower_errors, upper_errors, compositions),
        key=lambda entry: entry[0],
    )
    values, lower_errors, upper_errors, compositions = zip(*sorted_data)

    # Asymmetric error arrays (currently not drawn, see the plain barh call)
    asymmetric_errors = [lower_errors, upper_errors]

    # Bar chart positions
    x_pos = np.arange(len(compositions))

    # Plot bars
    plt.figure(figsize=(10, 20))
    bars = plt.barh(x_pos, values, align="center", alpha=0.7)

    # Add labels and title
    plt.yticks(x_pos, compositions, ha="right")
    plt.ylabel("Compositions")
    plt.xlabel("Count")
    plt.title(f"{model}: Composition counts (over five random seeds) on SemEval")

    # Store the figure in both formats, then show it
    for figure_path in (
        f"outputs/figures/semeval__{model}__composition-frequency.pdf",
        f"outputs/figures/semeval__{model}__composition-frequency.png",
    ):
        plt.savefig(figure_path, bbox_inches="tight")
    plt.show()

In [None]:
# Create the composition frequency table (written as CSV)
target_dataset = "semeval"
target_dataset_models = [
    m for m in semeval_composition_counts_per_seed_per_model if m.endswith(target_dataset)
]

# composition name -> {model: mean count over seeds}
composition_counts_mean_per_target_dataset_models = {}

for model in target_dataset_models:
    for (
        composition_name,
        composition_counts,
    ) in semeval_composition_counts_per_seed_per_model[model].items():
        mean_counts = np.mean(composition_counts)
        composition_counts_mean_per_target_dataset_models.setdefault(
            composition_name, {}
        )[model] = mean_counts


# Make composition names nicer for final table: every "_"-separated technique
# of a composition name is replaced by its display name.
composition_name_reformat_rules = {
    "cot": "Reasoning steps",
    "category-few-shot": "In-context (category)",
    "random-few-shot": "In-context (random)",
    "similar-few-shot": "In-context (similar)",
    "definitions": "Definitions",
    "directional-stimulus": "Dir. stimulus",
    "system-prompts": "Persona",
    "task-description-only": "Base composition",
}

counts_with_updated_composition_names = {}
for (
    composition_name,
    composition_counts,
) in composition_counts_mean_per_target_dataset_models.items():
    display_parts = [
        composition_name_reformat_rules[comp].capitalize()
        for comp in composition_name.split("_")
    ]
    composition_name_reformat = ", ".join(display_parts)
    counts_with_updated_composition_names[composition_name_reformat] = (
        composition_counts
    )

composition_frequency_output_file = path.join(
    f"outputs/tables/composition-frequencies-{target_dataset}.csv"
)
# One row per composition (sorted by name), one column per model
frequency_table = (
    pd.DataFrame(data=counts_with_updated_composition_names).transpose().sort_index()
)
frequency_table.to_csv(composition_frequency_output_file)

In [None]:
# Techniques whose pairwise co-occurrence in the predicted compositions is
# plotted. The names must match the technique names used inside the predicted
# composition strings (e.g. "category-few-shot"); a misspelt entry would never
# match and leave its row and column at zero.
techniques = [
    "category-few-shot",
    "cot",
    "definitions",
    "directional-stimulus",
    "random-few-shot",
    "similar-few-shot",
    "system-prompts",
    "task-description-only",
]
for model in semeval_composition_predictions_test_per_model.keys():
    # all_cooccurrences[outer][seed][inner]: number of predictions of that seed
    # that contain both the outer and the inner technique.
    all_cooccurrences = []
    for t_outer in techniques:
        t_cooccurrences = []
        for seed in RANDOM_SEED:
            seed_cooccurrences = np.zeros(len(techniques))
            for pred in semeval_composition_predictions_test_per_model[model][
                f"pred_best_composition_seed{seed}"
            ]:
                if t_outer not in pred:
                    continue
                for t_inner_i, t_inner in enumerate(techniques):
                    # The diagonal stays zero: a technique is not counted as
                    # co-occurring with itself.
                    if t_inner != t_outer and t_inner in pred:
                        seed_cooccurrences[t_inner_i] += 1
            t_cooccurrences.append(seed_cooccurrences)
        all_cooccurrences.append(t_cooccurrences)

    # Average over the seed axis -> (technique, technique) matrix
    average_cooccurrences = np.array(
        [
            [np.mean(coocc) for coocc in list(zip(*per_seed_occurrences))]
            for per_seed_occurrences in all_cooccurrences
        ]
    )

    fig, ax = plt.subplots(figsize=(10, 10))
    im = ax.imshow(average_cooccurrences, cmap="plasma")

    # Show all ticks and label them with the respective list entries
    ax.set_xticks(np.arange(len(techniques)), labels=techniques)
    ax.set_yticks(np.arange(len(techniques)), labels=techniques)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    for i in range(len(techniques)):
        for j in range(len(techniques)):
            text = ax.text(
                j, i, average_cooccurrences[i, j], ha="center", va="center", color="w"
            )

    ax.set_title(
        f"{model}: Average (over all seeds) cooccurrence for predicted compositions on SemEval"
    )
    plt.savefig(f"outputs/figures/semeval__{model}__technique-cooccurrences.pdf")
    plt.show()

# Cobra frames corpus


In [None]:
# LLMs evaluated in the Cobra frames section.
MODELS = ["mistral-7b-instruct-v2", "command-r-v01", "llama3-70b-instruct"]

# Training datasets of the composition-prediction runs loaded in this section;
# each one is part of the run directory name that is searched for below.
TRAINING_DATASETS = ["sbic", "stereoset", "cobra_frames"]

## Preparation


In [None]:
# Seed NumPy's global random number generator for this section.
np.random.seed(23)

In [None]:
# Load the Cobra frames splits and reduce each one to the text hash and the
# gold label, using the column names shared by the rest of the notebook.
data_handler = DataHandler(datasets_to_load=["cobra_frames"])

_cobra_frames_column_renames = {
    "hasBiasedImplication": "true_label",
    "text_hash": "md5_hash",
}
_cobra_frames_kept_columns = ["md5_hash", "true_label"]

cobra_frames_train_df = data_handler.cobra_frames["train"].rename(
    columns=_cobra_frames_column_renames
)[_cobra_frames_kept_columns]
cobra_frames_val_df = data_handler.cobra_frames["dev"].rename(
    columns=_cobra_frames_column_renames
)[_cobra_frames_kept_columns]
cobra_frames_test_df = data_handler.cobra_frames["test"].rename(
    columns=_cobra_frames_column_renames
)[_cobra_frames_kept_columns]

In [None]:
# Collect, per (model, training dataset) pair, the composition predicted for
# every validation/test instance by each seed's composition-prediction run.
cobra_frames_composition_predictions_val_per_model = {}
cobra_frames_composition_predictions_test_per_model = {}

for model in MODELS:
    for training_dataset in TRAINING_DATASETS:
        run_name = f"deberta-v3-large_composition-prediction-for-{model}-on-{training_dataset}"
        result_key = f"{model}__{training_dataset}"

        # Validation set
        composition_predictions_val = pd.DataFrame()
        for seed in RANDOM_SEED:
            output_dir = path.join("outputs/composition-predictions")
            # First (alphabetically) run directory matching run name and seed
            seed_dir = [
                dir_name
                for dir_name in sorted(listdir(output_dir))
                if run_name in dir_name and f"seed{seed}" in dir_name
            ][0]
            seed_column = f"pred_best_composition_seed{seed}"

            df = pd.read_parquet(
                path.join(output_dir, seed_dir, "cobra_frames_val_results.parquet")
            )
            # Identify each instance by the MD5 hash of its input text
            df["post_id"] = df["input"].apply(
                lambda text: hashlib.md5(text.encode()).hexdigest()
            )
            df = df.rename(columns={"pred_best_composition": seed_column})

            # The first seed provides the shared input/post_id columns
            if "input" not in composition_predictions_val.columns:
                composition_predictions_val["input"] = df["input"]
                composition_predictions_val["post_id"] = df["post_id"]

            columns_to_merge = [
                column
                for column in df.columns
                if column not in ("input", "index", "pred_probabilities")
            ]
            composition_predictions_val = composition_predictions_val.merge(
                df.loc[:, columns_to_merge],
                on="post_id",
                how="left",
            )

        cobra_frames_composition_predictions_val_per_model[result_key] = (
            composition_predictions_val
        )

        # Test set
        composition_predictions_test = pd.DataFrame()
        for seed in RANDOM_SEED:
            output_dir = path.join("outputs/composition-predictions")
            seed_dir = [
                dir_name
                for dir_name in sorted(listdir(output_dir))
                if run_name in dir_name and f"seed{seed}" in dir_name
            ][0]
            seed_column = f"pred_best_composition_seed{seed}"

            df = pd.read_parquet(
                path.join(output_dir, seed_dir, "cobra_frames_test_results.parquet")
            )
            df["post_id"] = df["input"].apply(
                lambda text: hashlib.md5(text.encode()).hexdigest()
            )
            df = df.rename(columns={"pred_best_composition": seed_column})

            if "input" not in composition_predictions_test.columns:
                composition_predictions_test["input"] = df["input"]
                composition_predictions_test["post_id"] = df["post_id"]

            # Since we have duplicates, we need to sort them first, then merge them (cannot use
            # df.merge properly)
            # NOTE(review): the assignment below aligns on the index, so the
            # sorts only affect row order -- confirm that the index is
            # identical across the per-seed result files.
            df_sorted = df.sort_values(by="post_id")
            composition_predictions_test = composition_predictions_test.sort_values(
                by="post_id"
            )

            composition_predictions_test[seed_column] = df_sorted[seed_column]

        cobra_frames_composition_predictions_test_per_model[result_key] = (
            composition_predictions_test
        )

In [None]:
# Load composition-specific prediction files: one parquet file per model,
# split and prompt composition; the composition name is derived from the
# file name.
cobra_frames_output_dir = "outputs/prompt-predictions/cobra_frames"
cobra_frames_predictions_per_composition_val_per_model = {}
cobra_frames_predictions_per_composition_test_per_model = {}

for model in MODELS:
    # Validation set
    composition_files_val = [
        file_name
        for file_name in sorted(listdir(cobra_frames_output_dir))
        if "dev" in file_name and model in file_name
    ]
    predictions_per_composition_val = {}

    for file_name in composition_files_val:
        # Chain-of-thought files use their own prefix; their composition name
        # keeps a leading "cot_".
        if "cot" in file_name:
            file_prefix, name_prefix = f"cobra_frames-cot-greedy-dev_{model}_", "cot_"
        else:
            file_prefix, name_prefix = f"cobra_frames-greedy-dev_{model}_", ""
        composition_name = file_name.replace(file_prefix, name_prefix).replace(
            ".parquet", ""
        )

        df = pd.read_parquet(path.join(cobra_frames_output_dir, file_name))
        df["post_id"] = df["input"].apply(
            lambda text: hashlib.md5(text.encode()).hexdigest()
        )
        predictions_per_composition_val[composition_name] = df

    cobra_frames_predictions_per_composition_val_per_model[model] = (
        predictions_per_composition_val
    )

    # Test set
    composition_files_test = [
        file_name
        for file_name in sorted(listdir(cobra_frames_output_dir))
        if "test" in file_name and model in file_name
    ]
    predictions_per_composition_test = {}

    for file_name in composition_files_test:
        if "cot" in file_name:
            file_prefix, name_prefix = f"cobra_frames-cot-greedy-test_{model}_", "cot_"
        else:
            file_prefix, name_prefix = f"cobra_frames-greedy-test_{model}_", ""
        composition_name = file_name.replace(file_prefix, name_prefix).replace(
            ".parquet", ""
        )

        df = pd.read_parquet(path.join(cobra_frames_output_dir, file_name))
        df["post_id"] = df["input"].apply(
            lambda text: hashlib.md5(text.encode()).hexdigest()
        )
        predictions_per_composition_test[composition_name] = df

    cobra_frames_predictions_per_composition_test_per_model[model] = (
        predictions_per_composition_test
    )

In [None]:
# Report absolute and relative label frequencies of every split.
print("## Training split")
_train_total = len(cobra_frames_train_df)
positive_instances_train = int((cobra_frames_train_df["true_label"] == 1).sum())
negative_instances_train = int((cobra_frames_train_df["true_label"] == 0).sum())
_positive_share_train = np.round(positive_instances_train / _train_total, decimals=3)
_negative_share_train = np.round(negative_instances_train / _train_total, decimals=3)
print(f"Positive label: {positive_instances_train} ({_positive_share_train})")
print(f"Negative label: {negative_instances_train} ({_negative_share_train})")

print("## Validation split")
_val_total = len(cobra_frames_val_df)
positive_instances_val = int((cobra_frames_val_df["true_label"] == 1).sum())
negative_instances_val = int((cobra_frames_val_df["true_label"] == 0).sum())
_positive_share_val = np.round(positive_instances_val / _val_total, decimals=3)
_negative_share_val = np.round(negative_instances_val / _val_total, decimals=3)
print(f"Positive label: {positive_instances_val} ({_positive_share_val})")
print(f"Negative label: {negative_instances_val} ({_negative_share_val})")

print("## Test split")
_test_total = len(cobra_frames_test_df)
positive_instances_test = int((cobra_frames_test_df["true_label"] == 1).sum())
negative_instances_test = int((cobra_frames_test_df["true_label"] == 0).sum())
_positive_share_test = np.round(positive_instances_test / _test_total, decimals=3)
_negative_share_test = np.round(negative_instances_test / _test_total, decimals=3)
print(f"Positive label: {positive_instances_test} ({_positive_share_test})")
print(f"Negative label: {negative_instances_test} ({_negative_share_test})")

## Predicting compositions performance evaluation

_I.e., how often does the composition predicted by the encoder model yield the correct label?_


In [None]:
# For every model and seed: share of test instances for which the composition
# chosen by the composition-prediction model produced the true label.
target_data = "cobra_frames"

for model in MODELS:
    print(f"\n\n{model}")
    print("=" * 25)

    output_dir = path.join("outputs/composition-predictions")
    predictions_per_composition = (
        cobra_frames_predictions_per_composition_test_per_model[model]
    )
    prompt_compositions = predictions_per_composition.keys()
    run_name = f"deberta-v3-large_composition-prediction-for-{model}-on-{target_data}"

    scores_per_seed = {}
    for seed in RANDOM_SEED:
        # Load the composition predicted for each text instance by this seed's run
        seed_dir = [
            dir_name
            for dir_name in sorted(listdir(output_dir))
            if run_name in dir_name and f"seed{seed}" in dir_name
        ][0]
        df = pd.read_parquet(
            path.join(output_dir, seed_dir, f"{target_data}_test_results.parquet")
        )
        df["post_id"] = df["input"].apply(
            lambda text: hashlib.md5(text.encode()).hexdigest()
        )

        # COBRAFRAMES-specific fix
        # Remove posts that are not present in filtered cobraframe dataset above
        # (due to a bug in cluster) (should only be five posts)
        known_post_ids = predictions_per_composition[list(prompt_compositions)[0]][
            "post_id"
        ]
        df = df[df["post_id"].isin(known_post_ids)]

        # Load the id2component map of this run and invert it.
        # NOTE(review): neither map is used further in this cell.
        with open(path.join(output_dir, seed_dir, "id2component_map.json"), "r") as f:
            id2component_map = json.load(f)
        component2id_map = {
            component: int(component_id)
            for component_id, component in id2component_map.items()
        }

        # Look up, for every instance, the output of the predicted composition
        # and record whether it equals the true label (1) or not (0).
        adaptive_prompting_correctness_per_sample = []
        for i, row in df.iterrows():
            comp_df = predictions_per_composition[row["pred_best_composition"]]
            row_comp_df = comp_df[comp_df["post_id"] == row["post_id"]].iloc[0]
            # Fall back to the seed-23 output if this seed has no output column
            try:
                predicted_label = row_comp_df[f"output_{seed}"]
            except KeyError:
                predicted_label = row_comp_df["output_23"]
            adaptive_prompting_correctness_per_sample.append(
                (predicted_label == row_comp_df["true_label"]).astype(int)
            )
        scores_per_seed[seed] = sum(adaptive_prompting_correctness_per_sample) / len(
            adaptive_prompting_correctness_per_sample
        )

    print(scores_per_seed)
    print(np.mean(list(scores_per_seed.values())))

## Adaptive prompting evaluation


In [None]:
def _macro_average_per_seed(seed_scores_list, metric_index):
    """Average one metric over labels 0 and 1, separately for every seed.

    ``metric_index`` selects the entry of each per-seed score tuple
    (0 = precision, 1 = recall, 2 = F1, matching the dictionary keys below).
    """
    return [
        (seed_scores[metric_index][0] + seed_scores[metric_index][1]) / 2
        for seed_scores in seed_scores_list
    ]


def _gather_labels_for_seed(
    selection_df, predictions_per_composition, seed, skip_missing=False
):
    """Collect true and predicted labels for the composition selected per post.

    For every row of ``selection_df`` the composition named in column
    ``pred_best_composition_seed{seed}`` is looked up in
    ``predictions_per_composition``; the first entry matching the row's
    ``post_id`` supplies both the prediction and the true label.

    Returns ``(y_true, y_pred, n_missing)``. With ``skip_missing=True`` posts
    without a matching entry are left out and counted in ``n_missing``;
    otherwise such a post raises ``IndexError`` from the positional lookup.
    """
    y_true, y_pred, n_missing = [], [], 0
    for _, row in selection_df.iterrows():
        preds = predictions_per_composition[row[f"pred_best_composition_seed{seed}"]]
        # If we don't have predictions for other seeds, use the primary seed
        if f"output_{seed}" in preds.columns:
            output_column = f"output_{seed}"
        else:
            output_column = "output_23"

        # Filter once per row instead of rebuilding the same mask for every access
        matches = preds[preds.post_id == row.post_id]
        if skip_missing and matches.empty:
            n_missing += 1
            continue
        first_match = matches.iloc[0]
        y_pred.append(first_match[output_column])
        y_true.append(first_match["true_label"])
    return y_true, y_pred, n_missing


cobra_frames_composition_prediction_scores_per_model = {}

for model in cobra_frames_composition_predictions_val_per_model.keys():
    # Keys have the form "<model>__<dataset>"; keep only the part before "__"
    model_name_without_data = model[: model.find("__")]

    # Validation split
    all_seed_scores_val = []
    for seed in RANDOM_SEED:
        y_true_seed, y_pred_seed, _ = _gather_labels_for_seed(
            cobra_frames_composition_predictions_val_per_model[model],
            cobra_frames_predictions_per_composition_val_per_model[
                model_name_without_data
            ],
            seed,
        )
        scores = precision_recall_fscore_support(
            y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
        )
        all_seed_scores_val.append(scores)

    # Test split
    all_seed_scores_test = []
    for seed in RANDOM_SEED:
        # TODO: REMOVE; ONLY TEMPORARY FIX FOR BROKEN DATA
        # (doesn't have any impact on complete data, though)
        # Posts without predictions are skipped, but reported instead of being
        # dropped silently, so incomplete data stays visible in the output.
        y_true_seed, y_pred_seed, skipped_posts = _gather_labels_for_seed(
            cobra_frames_composition_predictions_test_per_model[model],
            cobra_frames_predictions_per_composition_test_per_model[
                model_name_without_data
            ],
            seed,
            skip_missing=True,
        )
        if skipped_posts:
            print(
                f"Warning: {model} (seed {seed}): skipped {skipped_posts} test posts without predictions."
            )

        scores = precision_recall_fscore_support(
            y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
        )
        all_seed_scores_test.append(scores)

    test_precision_per_seed = _macro_average_per_seed(all_seed_scores_test, 0)
    test_recall_per_seed = _macro_average_per_seed(all_seed_scores_test, 1)
    test_f1_per_seed = _macro_average_per_seed(all_seed_scores_test, 2)

    cobra_frames_composition_prediction_scores_per_model[model] = {
        "test_macro_precision": np.mean(test_precision_per_seed),
        "test_macro_precision_seed_scores": test_precision_per_seed,
        "test_macro_recall": np.mean(test_recall_per_seed),
        "test_macro_recall_seed_scores": test_recall_per_seed,
        "test_macro_f1": np.mean(test_f1_per_seed),
        "test_macro_f1_seed_scores": test_f1_per_seed,
    }

    print(model, "=" * 50)
    print("Averaged scores")

    print(
        "Precision (macro) (over all seeds):",
        cobra_frames_composition_prediction_scores_per_model[model][
            "test_macro_precision"
        ],
    )
    print(
        "Recall (macro) (over all seeds):",
        cobra_frames_composition_prediction_scores_per_model[model][
            "test_macro_recall"
        ],
    )
    print(
        "F1 (macro) (over all seeds):",
        cobra_frames_composition_prediction_scores_per_model[model]["test_macro_f1"],
    )

## Baselines


In [None]:
def _baseline_scores_per_seed(predictions):
    """Score one composition's predictions against the true labels, once per seed.

    Returns a list with one ``precision_recall_fscore_support`` result per
    entry of ``RANDOM_SEED``.
    """
    try:
        return [
            precision_recall_fscore_support(
                y_true=predictions["true_label"],
                y_pred=predictions[f"output_{seed}"],
                pos_label=1,
            )
            for seed in RANDOM_SEED
        ]
    # In some cases, we don't have more than one seed, so we fall back to the primary seed
    except KeyError:
        return [
            precision_recall_fscore_support(
                y_true=predictions["true_label"],
                y_pred=predictions[f"output_{RANDOM_SEED[0]}"],
                pos_label=1,
            )
            for _ in RANDOM_SEED
        ]


# Per model: composition name -> list of per-seed scores, for both splits
cobra_frames_all_scores_val_per_model = {}
cobra_frames_all_scores_test_per_model = {}

for model in MODELS:
    # Validation split
    all_scores_val = {
        composition_name: _baseline_scores_per_seed(composition_predictions)
        for composition_name, composition_predictions in cobra_frames_predictions_per_composition_val_per_model[
            model
        ].items()
    }
    cobra_frames_all_scores_val_per_model[model] = all_scores_val

    # Test split
    all_scores_test = {
        composition_name: _baseline_scores_per_seed(composition_predictions)
        for composition_name, composition_predictions in cobra_frames_predictions_per_composition_test_per_model[
            model
        ].items()
    }
    cobra_frames_all_scores_test_per_model[model] = all_scores_test

# Macro F1 on the test split for every composition: per seed and averaged over seeds
cobra_frames_all_f1_scores_test_per_model = {}
cobra_frames_all_f1_scores_test_per_model_seed_scores = {}
for model in MODELS:
    f1_per_composition_and_seed = [
        [(seed_scores[2][0] + seed_scores[2][1]) / 2 for seed_scores in per_seed]
        for per_seed in cobra_frames_all_scores_test_per_model[model].values()
    ]
    cobra_frames_all_f1_scores_test_per_model[model] = [
        np.mean(seed_f1_scores) for seed_f1_scores in f1_per_composition_and_seed
    ]
    cobra_frames_all_f1_scores_test_per_model_seed_scores[model] = (
        f1_per_composition_and_seed
    )

### Trivial baselines


In [None]:
# Majority baseline: predict the most frequent test label for every instance
majority_label = cobra_frames_test_df.true_label.mode()[0]
majority_baseline_pred = [majority_label] * len(cobra_frames_test_df)

cobra_frames_maj_baseline_scores = precision_recall_fscore_support(
    y_true=cobra_frames_test_df["true_label"],
    y_pred=majority_baseline_pred,
    pos_label=1,
)

# Macro average: mean of the per-label values for labels 0 and 1
_maj_precision_per_label = cobra_frames_maj_baseline_scores[0]
_maj_recall_per_label = cobra_frames_maj_baseline_scores[1]
_maj_f1_per_label = cobra_frames_maj_baseline_scores[2]

cobra_frames_maj_baseline_precision_macro_averaged_test = (
    _maj_precision_per_label[0] + _maj_precision_per_label[1]
) / 2
cobra_frames_maj_baseline_recall_macro_averaged_test = (
    _maj_recall_per_label[0] + _maj_recall_per_label[1]
) / 2
cobra_frames_maj_baseline_f1_macro_averaged_test = (
    _maj_f1_per_label[0] + _maj_f1_per_label[1]
) / 2

for _label, _value in (
    (
        "Precision (macro) (over all seeds):",
        cobra_frames_maj_baseline_precision_macro_averaged_test,
    ),
    (
        "Recall (macro) (over all seeds):",
        cobra_frames_maj_baseline_recall_macro_averaged_test,
    ),
    ("F1 (macro) (over all seeds):", cobra_frames_maj_baseline_f1_macro_averaged_test),
):
    print(_label, _value)

In [None]:
# Random baseline: draw label 0 or 1 independently for every test instance
random_baseline_pred = np.random.randint(2, size=len(cobra_frames_test_df))

cobra_frames_random_baseline_scores = precision_recall_fscore_support(
    y_true=cobra_frames_test_df["true_label"], y_pred=random_baseline_pred, pos_label=1
)

# Macro average: mean of the per-label values for labels 0 and 1
_rand_precision_per_label = cobra_frames_random_baseline_scores[0]
_rand_recall_per_label = cobra_frames_random_baseline_scores[1]
_rand_f1_per_label = cobra_frames_random_baseline_scores[2]

cobra_frames_random_baseline_precision_macro_averaged = (
    _rand_precision_per_label[0] + _rand_precision_per_label[1]
) / 2
cobra_frames_random_baseline_recall_macro_averaged = (
    _rand_recall_per_label[0] + _rand_recall_per_label[1]
) / 2
cobra_frames_random_baseline_f1_macro_averaged = (
    _rand_f1_per_label[0] + _rand_f1_per_label[1]
) / 2

for _label, _value in (
    (
        "Precision (macro) (over all seeds):",
        cobra_frames_random_baseline_precision_macro_averaged,
    ),
    (
        "Recall (macro) (over all seeds):",
        cobra_frames_random_baseline_recall_macro_averaged,
    ),
    ("F1 (macro) (over all seeds):", cobra_frames_random_baseline_f1_macro_averaged),
):
    print(_label, _value)

### Oracle

_Always chooses the correct label if it was predicted by any of the compositions; only chooses the wrong label if no composition predicted the correct label_


In [None]:
def _oracle_choice(row):
    """Pick the oracle label for one row of ``[true_label, prediction, ...]``.

    The true label is the first value of the row. If it occurs among the
    remaining values it is returned; otherwise the first prediction is
    returned (which is then a wrong label, like all other predictions).
    """
    if row["true_label"] in row.values[1:]:
        return row["true_label"]
    return row.values[1]


def _oracle_macro_mean(seed_scores_list, metric_index):
    """Mean over seeds of the label-0/label-1 average of one metric."""
    return np.mean(
        [
            (seed_scores[metric_index][0] + seed_scores[metric_index][1]) / 2
            for seed_scores in seed_scores_list
        ]
    )


cobra_frames_oracle_scores_per_model = {}

for model in MODELS:
    # Compiling oracle predictions
    oracle_predictions_test = pd.DataFrame()
    for seed in RANDOM_SEED:
        # One column per composition holding that composition's prediction for this seed
        all_seed_predictions = pd.DataFrame()
        for composition, df in cobra_frames_predictions_per_composition_test_per_model[
            model
        ].items():
            for shared_column in ("input", "true_label"):
                if shared_column not in all_seed_predictions.columns:
                    all_seed_predictions[shared_column] = df[shared_column]

            # Fall back to the first seed's output if this seed has no predictions
            seed_column = f"output_{seed}"
            if seed_column not in df.columns:
                seed_column = f"output_{RANDOM_SEED[0]}"
            all_seed_predictions[f"{composition}_{seed}"] = df[seed_column]

        for shared_column in ("input", "true_label"):
            if shared_column not in oracle_predictions_test.columns:
                oracle_predictions_test[shared_column] = all_seed_predictions[
                    shared_column
                ]

        # For each sample, choose the true_label if at least one prediction is the true label.
        # After dropping "input", the true label is the first value of every row.
        label_and_prediction_columns = [
            column for column in all_seed_predictions.columns if column != "input"
        ]
        oracle_predictions_test[f"output_{seed}"] = all_seed_predictions.loc[
            :, label_and_prediction_columns
        ].apply(_oracle_choice, axis=1)

    # Calculating scores
    oracle_seed_scores_test = []
    for seed in RANDOM_SEED:
        oracle_seed_scores_test.append(
            precision_recall_fscore_support(
                y_true=oracle_predictions_test["true_label"],
                y_pred=oracle_predictions_test[f"output_{seed}"],
                pos_label=1,
            )
        )

    cobra_frames_oracle_scores_per_model[model] = {
        "test_macro_precision": _oracle_macro_mean(oracle_seed_scores_test, 0),
        "test_macro_recall": _oracle_macro_mean(oracle_seed_scores_test, 1),
        "test_macro_f1": _oracle_macro_mean(oracle_seed_scores_test, 2),
    }

    oracle_result = cobra_frames_oracle_scores_per_model[model]
    print(model, "=" * 50)
    print("Oracle Precision (macro):", oracle_result["test_macro_precision"])
    print("Oracle Recall (macro):", oracle_result["test_macro_recall"])
    print("Oracle F1 (macro):", oracle_result["test_macro_f1"])

### No technique

_Task description and input text only_


In [None]:
def _no_technique_macro_per_seed(seed_scores_list, metric_index):
    """Average one metric over labels 0 and 1, separately for every seed."""
    return [
        (seed_scores[metric_index][0] + seed_scores[metric_index][1]) / 2
        for seed_scores in seed_scores_list
    ]


# Scores of the "task-description-only" composition on the test split
cobra_frames_no_technique_scores_per_model = {}

for model in MODELS:
    no_technique_i = list(cobra_frames_all_scores_test_per_model[model].keys()).index(
        "task-description-only"
    )

    no_technique_seed_scores = cobra_frames_all_scores_test_per_model[model][
        "task-description-only"
    ]
    no_technique_precision = _no_technique_macro_per_seed(no_technique_seed_scores, 0)
    no_technique_recall = _no_technique_macro_per_seed(no_technique_seed_scores, 1)
    no_technique_f1 = _no_technique_macro_per_seed(no_technique_seed_scores, 2)

    cobra_frames_no_technique_scores_per_model[model] = {
        "test_macro_precision": np.mean(no_technique_precision),
        "test_macro_precision_seed_scores": no_technique_precision,
        "test_macro_recall": np.mean(no_technique_recall),
        "test_macro_recall_seed_scores": no_technique_recall,
        "test_macro_f1": np.mean(no_technique_f1),
        "test_macro_f1_seed_scores": no_technique_f1,
    }

    no_technique_result = cobra_frames_no_technique_scores_per_model[model]
    print(model, "=" * 50)
    print(
        "No technique Precision (macro):",
        no_technique_result["test_macro_precision"],
    )
    print("No technique Recall (macro):", no_technique_result["test_macro_recall"])
    print("No technique F1 (macro):", no_technique_result["test_macro_f1"])

### Optimal composition

_Best composition on the validation set in terms of f1 macro score, evaluated on the test set for precision, recall and f1_


In [None]:
def _optimal_macro_per_seed(seed_scores_list, metric_index):
    """Average one metric over labels 0 and 1, separately for every seed."""
    return [
        (seed_scores[metric_index][0] + seed_scores[metric_index][1]) / 2
        for seed_scores in seed_scores_list
    ]


def _optimal_mean_macro_per_composition(scores_per_composition, metric_index):
    """Seed-averaged macro value of one metric for every composition, in dict order."""
    return [
        np.mean(_optimal_macro_per_seed(seed_scores_list, metric_index))
        for seed_scores_list in scores_per_composition.values()
    ]


cobra_frames_optimal_composition_scores_per_model = {}

for model in MODELS:
    val_scores_per_composition = cobra_frames_all_scores_val_per_model[model]
    test_scores_per_composition = cobra_frames_all_scores_test_per_model[model]

    # Validation split
    cobra_frames_optimal_precision_macro_averaged_scores_val = (
        _optimal_mean_macro_per_composition(val_scores_per_composition, 0)
    )
    cobra_frames_optimal_recall_macro_averaged_scores_val = (
        _optimal_mean_macro_per_composition(val_scores_per_composition, 1)
    )
    cobra_frames_optimal_f1_macro_averaged_scores_val = (
        _optimal_mean_macro_per_composition(val_scores_per_composition, 2)
    )
    # Test split
    cobra_frames_optimal_precision_macro_averaged_scores_test = (
        _optimal_mean_macro_per_composition(test_scores_per_composition, 0)
    )
    cobra_frames_optimal_recall_macro_averaged_scores_test = (
        _optimal_mean_macro_per_composition(test_scores_per_composition, 1)
    )
    cobra_frames_optimal_f1_macro_averaged_scores_test = (
        _optimal_mean_macro_per_composition(test_scores_per_composition, 2)
    )

    # Select the composition with the highest macro F1 on the validation split;
    # its position is then used to read the test-split scores
    cobra_frames_optimal_composition_val_f1_macro_i = np.argmax(
        cobra_frames_optimal_f1_macro_averaged_scores_val
    )
    cobra_frames_optimal_composition_name = list(val_scores_per_composition.keys())[
        cobra_frames_optimal_composition_val_f1_macro_i
    ]
    optimal_test_seed_scores = test_scores_per_composition[
        cobra_frames_optimal_composition_name
    ]

    cobra_frames_optimal_composition_scores_per_model[model] = {
        "composition_name": list(val_scores_per_composition.keys())[
            cobra_frames_optimal_composition_val_f1_macro_i
        ],
        "test_macro_precision": cobra_frames_optimal_precision_macro_averaged_scores_test[
            cobra_frames_optimal_composition_val_f1_macro_i
        ],
        "test_macro_precision_seed_scores": _optimal_macro_per_seed(
            optimal_test_seed_scores, 0
        ),
        "test_macro_recall": cobra_frames_optimal_recall_macro_averaged_scores_test[
            cobra_frames_optimal_composition_val_f1_macro_i
        ],
        "test_macro_recall_seed_scores": _optimal_macro_per_seed(
            optimal_test_seed_scores, 1
        ),
        "test_macro_f1": cobra_frames_optimal_f1_macro_averaged_scores_test[
            cobra_frames_optimal_composition_val_f1_macro_i
        ],
        "test_macro_f1_seed_scores": _optimal_macro_per_seed(
            optimal_test_seed_scores, 2
        ),
    }

    optimal_result = cobra_frames_optimal_composition_scores_per_model[model]
    print(model, "=" * 50)
    print("Optimal validation composition:", optimal_result["composition_name"])
    print(
        "Optimal composition Precision (macro):",
        optimal_result["test_macro_precision"],
    )
    print("Optimal composition Recall (macro):", optimal_result["test_macro_recall"])
    print("Optimal composition F1 (macro):", optimal_result["test_macro_f1"])

### Component ensemble


In [None]:
# Component ensemble: for every seed, take the majority vote over the
# predictions of all compositions for each test post, score it, and report the
# macro-averaged metrics averaged over seeds.
cobra_frames_ensemble_scores_per_model = {}

for model in MODELS:
    all_seed_scores = []

    for seed in RANDOM_SEED:
        seed_df = pd.DataFrame()

        for (
            composition_name,
            comp_preds,
        ) in cobra_frames_predictions_per_composition_test_per_model[model].items():
            # The first composition provides the shared input / id / label columns
            if "input" not in seed_df.columns:
                seed_df["input"] = comp_preds["input"]
                seed_df["post_id"] = comp_preds["post_id"]
                seed_df["true_label"] = comp_preds["true_label"]

            # We need to work on sorted frames, due to duplicates...
            # NOTE(review): pandas aligns the assigned column on the index, so
            # confirm whether this sort is still required.
            comp_preds_sorted = comp_preds.sort_values(by="post_id")
            seed_df = seed_df.sort_values(by="post_id")

            try:
                seed_df[f"{composition_name}_{seed}"] = comp_preds_sorted[
                    f"output_{seed}"
                ]
            except KeyError:
                # No predictions for this seed: fall back to the first seed's output
                seed_df[f"{composition_name}_{seed}"] = comp_preds_sorted[
                    f"output_{RANDOM_SEED[0]}"
                ]

        vote_columns = [
            c for c in seed_df.columns if c not in ["input", "post_id", "true_label"]
        ]
        mode = seed_df.loc[:, vote_columns].mode(axis=1)

        # ``mode`` has a second column only if at least one row has a tie; rows
        # without a tie hold NaN there. Accessing ``mode[1]`` unconditionally
        # raises KeyError when no row is tied.
        if 1 in mode.columns:
            # If there is a tie, use a random label (0 or 1), drawn independently
            # per row. A single scalar draw would give every tied row the same label.
            tie_breakers = np.random.randint(2, size=len(seed_df))
            seed_df["majority"] = np.where(mode[1].isna(), mode[0], tie_breakers)
        else:
            seed_df["majority"] = mode[0].to_numpy()

        y_true_seed = seed_df["true_label"]
        y_pred_seed = seed_df["majority"]

        scores = precision_recall_fscore_support(
            y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
        )
        all_seed_scores.append(scores)

    # Macro average per seed (mean over labels 0 and 1), then mean over seeds
    cobra_frames_ensemble_scores_per_model[model] = {
        "test_macro_precision": np.mean(
            [
                ((seed_scores[0][0] + seed_scores[0][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
        "test_macro_recall": np.mean(
            [
                ((seed_scores[1][0] + seed_scores[1][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
        "test_macro_f1": np.mean(
            [
                ((seed_scores[2][0] + seed_scores[2][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
    }

    print(model, "=" * 50)
    print("Averaged scores")
    print(
        "Precision (macro) (over all seeds):",
        cobra_frames_ensemble_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "Recall (macro) (over all seeds):",
        cobra_frames_ensemble_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "F1 (macro) (over all seeds):",
        cobra_frames_ensemble_scores_per_model[model]["test_macro_f1"],
    )

### Finetuning baselines


In [None]:
def _finetune_macro_per_seed(seed_scores_list, metric_index):
    """Average one metric over labels 0 and 1, separately for every seed."""
    return [
        (seed_scores[metric_index][0] + seed_scores[metric_index][1]) / 2
        for seed_scores in seed_scores_list
    ]


# Score the fine-tuned model's stored test predictions, one file per seed
output_dir = "results"

all_seed_scores = []
for seed in RANDOM_SEED:
    model_name = f"deberta-v3-large-finetune_20240730162242_cobra_frames-seed{seed}"
    predictions_file = path.join(
        output_dir,
        f"cobra_frames-test_predictions-{model_name}.parquet",
    )
    seed_df = pd.read_parquet(predictions_file)

    # Attach the predictions to the test posts via their hash (all test posts are kept)
    seed_df = cobra_frames_test_df.merge(seed_df, on="md5_hash", how="left")

    scores = precision_recall_fscore_support(
        y_true=seed_df["true_label"],
        y_pred=seed_df[f"prediction_{model_name}"],
        pos_label=1,
    )
    all_seed_scores.append(scores)

_finetune_precision = _finetune_macro_per_seed(all_seed_scores, 0)
_finetune_recall = _finetune_macro_per_seed(all_seed_scores, 1)
_finetune_f1 = _finetune_macro_per_seed(all_seed_scores, 2)

cobra_frames_finetune_scores = {
    "test_macro_precision": np.mean(_finetune_precision),
    "test_macro_precision_seed_scores": _finetune_precision,
    "test_macro_recall": np.mean(_finetune_recall),
    "test_macro_recall_seed_scores": _finetune_recall,
    "test_macro_f1": np.mean(_finetune_f1),
    "test_macro_f1_seed_scores": _finetune_f1,
}

print("=" * 50)
print("Averaged scores")
for _label, _key in (
    ("Precision (macro) (over all seeds):", "test_macro_precision"),
    ("Recall (macro) (over all seeds):", "test_macro_recall"),
    ("F1 (macro) (over all seeds):", "test_macro_f1"),
):
    print(_label, cobra_frames_finetune_scores[_key])

### Self-diagnosis baseline


In [None]:
# Self-diagnosis baseline: score the stored seed-23 predictions on the test split
output_dir = "results"
model_name = "self-diagnosis"

sd_results_file = path.join(
    output_dir, model_name, "baseline_self_diagnosis_cobra_frames_test.parquet"
)
sd_results_df = pd.read_parquet(sd_results_file)

sd_scores = precision_recall_fscore_support(
    y_true=sd_results_df["true_label"], y_pred=sd_results_df["output_23"], pos_label=1
)

# Macro average: mean of the per-label values for labels 0 and 1
_sd_precision_per_label = sd_scores[0]
_sd_recall_per_label = sd_scores[1]
_sd_f1_per_label = sd_scores[2]

cobra_frames_self_diagnosis_precision_macro_averaged_test = (
    _sd_precision_per_label[0] + _sd_precision_per_label[1]
) / 2
cobra_frames_self_diagnosis_recall_macro_averaged_test = (
    _sd_recall_per_label[0] + _sd_recall_per_label[1]
) / 2
cobra_frames_self_diagnosis_f1_macro_averaged_test = (
    _sd_f1_per_label[0] + _sd_f1_per_label[1]
) / 2

print("=" * 50)
print("Averaged scores")
for _label, _value in (
    (
        "Precision (macro) (over all seeds):",
        cobra_frames_self_diagnosis_precision_macro_averaged_test,
    ),
    (
        "Recall (macro) (over all seeds):",
        cobra_frames_self_diagnosis_recall_macro_averaged_test,
    ),
    ("F1 (macro) (over all seeds):", cobra_frames_self_diagnosis_f1_macro_averaged_test),
):
    print(_label, _value)

## Significance tests


In [None]:
from scipy.stats import ttest_rel

In [None]:
# Best-on-test baseline: for each model, take the composition with the highest
# entry in the test F1 list and keep that composition's per-seed F1 scores.
cobra_frames_best_on_test_scores = {}
for model in MODELS:
    model_f1_scores = cobra_frames_all_f1_scores_test_per_model[model]
    cobra_frames_best_composition = np.argmax(model_f1_scores)

    seed_scores_per_composition = (
        cobra_frames_all_f1_scores_test_per_model_seed_scores[model]
    )
    best_seed_scores = seed_scores_per_composition[cobra_frames_best_composition]

    cobra_frames_best_on_test_scores[model] = {
        "test_macro_f1_seed_scores": best_seed_scores
    }

In [None]:
# Significance level: p-values at or below this count as significant.
alpha = 0.05
# Test function applied to the two per-seed score lists (scipy's t-test for
# related samples).
ttest_function = ttest_rel

# Suffix of the "<model>__<dataset>" key used to look up the approach scores.
target_dataset_to_evaluate = "cobra_frames"

# (display name, scores) pairs the approach is compared against. All entries
# are indexed by model name except "Finetune", whose dict is used directly.
baselines = [
    ("BaseComposition", cobra_frames_no_technique_scores_per_model),
    ("BestOnVal", cobra_frames_optimal_composition_scores_per_model),
    ("BestOnTest", cobra_frames_best_on_test_scores),
    ("Finetune", cobra_frames_finetune_scores),
]

In [None]:
# Compare the composition-prediction approach against every baseline, per
# model, on the per-seed macro F1 scores.
approach_name = "CompositionPrediction"
seed_scores_key = "test_macro_f1_seed_scores"

for model in MODELS:
    print(f"\n\n{model}")
    print("=" * 25)

    approach_scores = cobra_frames_composition_prediction_scores_per_model[
        f"{model}__{target_dataset_to_evaluate}"
    ][seed_scores_key]
    approach_mean = np.mean(approach_scores)

    for baseline_name, baseline_scores_dict in baselines:
        # The finetune scores are not keyed by model; all others are.
        scores_holder = (
            baseline_scores_dict
            if baseline_name == "Finetune"
            else baseline_scores_dict[model]
        )
        baseline_scores = scores_holder[seed_scores_key]
        baseline_mean = np.mean(baseline_scores)

        # Only test when the approach has the higher mean; otherwise "better
        # than" cannot hold and the comparison is skipped.
        if not approach_mean > baseline_mean:
            print(
                f"Skipped {approach_name} vs. {baseline_name} ({approach_mean} vs. {baseline_mean})"
            )
            continue

        t_results = ttest_function(baseline_scores, approach_scores)
        # Halve the returned p-value to get a one-sided test (correction
        # according to the Hitchhiker's guide); the direction was already
        # established by the mean comparison above.
        p_value = t_results[1] / 2

        verdict = (
            "is significantly better than"
            if p_value <= alpha
            else "is NOT significantly better than"
        )
        print(
            f"{approach_name} {verdict} {baseline_name} with p-value {p_value:.4f} (t-test)."
        )

## Graphs


### Scores


In [None]:
# Toggle for the score plot below: when True, the oracle marker and the
# majority-label, random and self-diagnosis reference lines are drawn as well.
SHOW_ALL_BASELINES = True

In [None]:
# Box plot of the per-composition test F1 scores, one box per model.
# The boxes are built in MODELS order (not dict iteration order) so that box
# k is guaranteed to match tick label MODELS[k] and the markers that the loop
# over enumerate(MODELS) places at x = k + 1.
fig, ax = plt.subplots(figsize=(15, 15))
box = ax.boxplot(
    [cobra_frames_all_f1_scores_test_per_model[model] for model in MODELS],
    labels=MODELS,
)
legend_entities_handlers_per_model = {}


def _mark_score(box_x, score, tag, color, align, marker=None):
    """Draw one score as a tagged marker and return the scatter handle.

    The marker is placed at (box_x, score) and the short ``tag`` is written
    next to it in the same colour. ``marker=None`` keeps matplotlib's default
    marker style.
    """
    handle = plt.scatter(
        box_x, score, alpha=0.6, color=color, marker=marker, zorder=3
    )
    plt.text(
        box_x,
        score,
        tag,
        horizontalalignment=align,
        verticalalignment="bottom",
        color=color,
        zorder=3,
    )
    return handle


# Markers are tagged with single letters in the figure; the letter -> meaning
# mapping (with the rounded score) is printed per model instead.
for i, model in enumerate(MODELS):
    box_x = i + 1  # x position of this model's box
    model_f1_scores = cobra_frames_all_f1_scores_test_per_model[model]

    print("")
    print(model)

    print("Median:", np.median(model_f1_scores))

    best_composition_i = np.argmax(model_f1_scores)
    worst_composition_i = np.argmin(model_f1_scores)

    # A: best composition on the test split
    best_f1 = model_f1_scores[best_composition_i]
    best_composition_handle = _mark_score(box_x, best_f1, "A", "red", "right")
    print("A ->", f"Best-on-test ({np.round(best_f1, decimals=3)})")

    # B: worst composition on the test split
    worst_f1 = model_f1_scores[worst_composition_i]
    worst_composition_handle = _mark_score(box_x, worst_f1, "B", "blue", "right")
    print("B ->", f"Worst-on-test ({np.round(worst_f1, decimals=3)})")

    # C: no-technique (base) composition
    base_f1 = cobra_frames_no_technique_scores_per_model[model]["test_macro_f1"]
    _mark_score(box_x, base_f1, "C", "black", "left", marker="x")
    print("C ->", f"Base composition ({np.round(base_f1, decimals=3)})")

    # D: optimal (best-on-validation) composition
    optimal_f1 = cobra_frames_optimal_composition_scores_per_model[model][
        "test_macro_f1"
    ]
    optimal_composition_handle = _mark_score(
        box_x, optimal_f1, "D", "olive", "left", marker="x"
    )
    print("D ->", f"Best-on-validation ({np.round(optimal_f1, decimals=3)})")

    # E: naive (majority) ensemble
    ensemble_f1 = cobra_frames_ensemble_scores_per_model[model]["test_macro_f1"]
    _mark_score(box_x, ensemble_f1, "E", "black", "left", marker="x")
    print("E -> ", f"Majority ensemble ({np.round(ensemble_f1, decimals=3)})")

    # F<k>: composition prediction, one marker per training dataset
    for k, training_dataset in enumerate(TRAINING_DATASETS):
        prediction_f1 = cobra_frames_composition_prediction_scores_per_model[
            f"{model}__{training_dataset}"
        ]["test_macro_f1"]
        composition_prediction_score = np.round(prediction_f1, decimals=3)
        _mark_score(box_x, prediction_f1, f"F{k}", "green", "left", marker="*")
        print(
            f"F{k} ->",
            f"Composition prediction ({training_dataset}) ({composition_prediction_score})",
        )

    if SHOW_ALL_BASELINES:
        # G: oracle
        oracle_f1 = cobra_frames_oracle_scores_per_model[model]["test_macro_f1"]
        _mark_score(box_x, oracle_f1, "G", "black", "left", marker="x")
        print("G ->", f"Oracle ({np.round(oracle_f1, decimals=3)})")

    # Legend entries, always in the following order: best-on-test,
    # worst-on-test, best-on-validation
    composition_names = list(cobra_frames_all_scores_test_per_model[model].keys())
    legend_entities_handlers_per_model[model] = {
        f"bot: {composition_names[best_composition_i]}": best_composition_handle,
        f"wot: {composition_names[worst_composition_i]}": worst_composition_handle,
        f"bov: {cobra_frames_optimal_composition_scores_per_model[model]['composition_name']}": optimal_composition_handle,
    }


def _draw_reference_line(score, name):
    """Draw a dashed horizontal line at ``score`` with a label at x = 0.

    The label reads "<name> (<score rounded to three decimals>)".
    """
    plt.axhline(
        score,
        color="black",
        linestyle="dashed",
        alpha=0.4,
        zorder=3,
    )
    plt.text(
        0,
        score,
        f"{name} ({np.round(score, decimals=3)})",
        horizontalalignment="left",
        verticalalignment="top",
        color="black",
        zorder=3,
    )


# Finetuned model as horizontal reference line (always shown)
_draw_reference_line(
    cobra_frames_finetune_scores["test_macro_f1"], "DeBERTa-v3-large (finetuned)"
)

# Conditionally show outlier baselines, each as a horizontal reference line
if SHOW_ALL_BASELINES:
    _draw_reference_line(
        cobra_frames_maj_baseline_f1_macro_averaged_test, "Majority label baseline"
    )
    _draw_reference_line(
        cobra_frames_random_baseline_f1_macro_averaged, "Random baseline"
    )
    _draw_reference_line(
        cobra_frames_self_diagnosis_f1_macro_averaged_test, "Self-diagnosis baseline"
    )

# Build the figure legend: per model, one invisible placeholder entry labelled
# with the model name, followed by that model's composition entries.
handlers, labels = [], []
for model in MODELS:
    # Placeholder handle without marker or line, so only its label shows
    model_heading_handle = plt.scatter(
        [0], [0], marker="None", linestyle="None", label=f"dummy-{model}"
    )
    model_legend_entries = legend_entities_handlers_per_model[model]

    handlers += [model_heading_handle, *model_legend_entries.values()]
    labels += [model, *model_legend_entries.keys()]

legend = fig.legend(handlers, labels, ncol=len(MODELS), loc="outside lower center")

# Boxes sit at x = 1..len(MODELS); keep one free slot on each side (the slot
# at x = 0 carries the reference-line labels). Derived from MODELS instead of
# the previous hard-coded 4 so the limits stay right for any number of models.
ax.set_xlim(0, len(MODELS) + 1)
plt.title("Cobra Frames data")
plt.xlabel("Model")
plt.ylabel("F1 (macro) (over all seeds)")
# Save before show(), in both vector formats
plt.savefig("outputs/figures/cobraframes__performance-box-plot.pdf")
plt.savefig("outputs/figures/cobraframes__performance-box-plot.svg")
plt.show()

### Composition frequency

_How often was each composition chosen (bar chart with box plot), how often was each technique and combination of techniques chosen (heatmap)_


In [None]:
# For every "<model>__<dataset>" entry: count, per seed, how often each
# composition was predicted on the test split, then plot the mean count per
# composition as a horizontal bar chart.
cobra_frames_composition_counts_per_seed_per_model = {}

for model in cobra_frames_composition_predictions_test_per_model.keys():
    # Strip the "__<dataset>" suffix. partition() returns the whole key when
    # the separator is missing, whereas slicing with find() (-1 on a miss)
    # would silently drop the last character.
    model_name_without_data = model.partition("__")[0]

    counts_per_composition = {}
    cobra_frames_composition_counts_per_seed_per_model[model] = counts_per_composition
    for seed in RANDOM_SEED:
        comp_count = Counter(
            cobra_frames_composition_predictions_test_per_model[model][
                f"pred_best_composition_seed{seed}"
            ]
        )
        for composition in cobra_frames_predictions_per_composition_test_per_model[
            model_name_without_data
        ]:
            # A Counter yields 0 for compositions that were never predicted,
            # so every composition gets exactly one entry per seed.
            counts_per_composition.setdefault(composition, []).append(
                comp_count[composition]
            )

    # Bar heights: mean count over seeds
    compositions = list(counts_per_composition)
    values = [np.mean(counts_per_composition[comp]) for comp in compositions]
    # Distance from the mean down to the per-seed minimum / up to the per-seed
    # maximum (a range, not a standard deviation). The bars below are drawn
    # without error bars, so these are computed but not plotted.
    lower_errors = [
        mean_count - np.min(counts_per_composition[comp])
        for mean_count, comp in zip(values, compositions)
    ]
    upper_errors = [
        np.max(counts_per_composition[comp]) - mean_count
        for mean_count, comp in zip(values, compositions)
    ]

    # Sort all four sequences together by ascending mean count
    sorted_data = sorted(
        zip(values, lower_errors, upper_errors, compositions),
        key=lambda x: x[0],
        reverse=False,
    )
    values, lower_errors, upper_errors, compositions = zip(*sorted_data)

    # Asymmetric error arrays in matplotlib's [lower, upper] layout
    asymmetric_errors = [lower_errors, upper_errors]

    # Bar positions
    x_pos = np.arange(len(compositions))

    # Plot bars
    plt.figure(figsize=(10, 20))
    bars = plt.barh(x_pos, values, align="center", alpha=0.7)

    # Add labels and title
    plt.yticks(x_pos, compositions, ha="right")
    plt.ylabel("Compositions")
    plt.xlabel("Count")
    plt.title(f"{model}: Composition counts (over five random seeds) on CobraFrames")

    plt.savefig(
        f"outputs/figures/cobraframes__{model}__composition-frequency.pdf",
        bbox_inches="tight",
    )
    plt.savefig(
        f"outputs/figures/cobraframes__{model}__composition-frequency.png",
        bbox_inches="tight",
    )
    # Show the plot
    plt.show()

In [None]:
# Count the number of times each composition produces a correct prediction for each split
target_dataset = "cobra_frames"


def _correct_predictions_per_seed(predictions_df):
    """Return one correct-prediction count per seed in RANDOM_SEED.

    A row counts as correct when its ``output_<seed>`` value equals
    ``true_label``. Frames without a column for a given seed fall back to
    ``output_23``.
    """
    counts = []
    for seed in RANDOM_SEED:
        try:
            predicted = predictions_df[f"output_{seed}"]
        except KeyError:
            predicted = predictions_df["output_23"]
        counts.append(
            len(predictions_df[predictions_df["true_label"] == predicted])
        )
    return counts


def _load_train_predictions(dataset, model, composition):
    """Read the train-split prediction file of one model/composition pair.

    Compositions containing "cot" live in "...-cot-greedy-train_..." files
    whose name omits the "cot_" part of the composition name.
    """
    if "cot" in composition:
        comp_no_cot = composition.replace("cot_", "")
        return pd.read_parquet(
            f"outputs/prompt-predictions/{dataset}/{dataset}-cot-greedy-train_{model}_{comp_no_cot}.parquet"
        )
    return pd.read_parquet(
        f"outputs/prompt-predictions/{dataset}/{dataset}-greedy-train_{model}_{composition}.parquet"
    )


def _correct_counts_per_model(predictions_per_composition_per_model):
    """Map "<model>__<target_dataset>" -> {composition: per-seed counts}."""
    return {
        f"{model}__{target_dataset}": {
            composition: _correct_predictions_per_seed(composition_df)
            for composition, composition_df in composition_predictions.items()
        }
        for model, composition_predictions in predictions_per_composition_per_model.items()
    }


# Train dataset (needs slightly different loading)
# We use the composition names from the test data loaded further above, but then load the
# predictions on the train data from file
cobra_frames_train_correct_prediction_counts_per_seed_per_model = {}
for (
    model,
    composition_predictions,
) in cobra_frames_predictions_per_composition_test_per_model.items():
    train_counts = (
        cobra_frames_train_correct_prediction_counts_per_seed_per_model.setdefault(
            f"{model}__{target_dataset}", {}
        )
    )
    for composition in composition_predictions.keys():
        train_counts[composition] = _correct_predictions_per_seed(
            _load_train_predictions(target_dataset, model, composition)
        )

# Val dataset
cobra_frames_val_correct_prediction_counts_per_seed_per_model = (
    _correct_counts_per_model(cobra_frames_predictions_per_composition_val_per_model)
)

# Test dataset
cobra_frames_test_correct_prediction_counts_per_seed_per_model = (
    _correct_counts_per_model(cobra_frames_predictions_per_composition_test_per_model)
)

In [None]:
# Create composition frequency tables
target_dataset = "cobra_frames"
target_dataset_models = [
    m
    for m in cobra_frames_composition_counts_per_seed_per_model.keys()
    if m.endswith(target_dataset)
]

# composition name -> {"<model>__<statistic>": "mean (+- std)"}
composition_counts_mean_per_target_dataset_models = {}

for model in target_dataset_models:
    model_counts = cobra_frames_composition_counts_per_seed_per_model[model]
    for composition_name, composition_counts in model_counts.items():
        # Mean and standard deviation of the test-split frequencies over seeds
        mean_frequency_counts = np.mean(composition_counts)
        stddev_frequency_counts = np.std(composition_counts)

        # Mean and standard deviation, over seeds, of how often the
        # composition gives the correct prediction on the train split
        train_correct_counts = (
            cobra_frames_train_correct_prediction_counts_per_seed_per_model[model][
                composition_name
            ]
        )
        mean_correct_prediction = np.mean(train_correct_counts)
        stddev_correct_prediction = np.std(train_correct_counts)

        table_row = composition_counts_mean_per_target_dataset_models.setdefault(
            composition_name, {}
        )
        table_row[f"{model}__mean_frequencies_test"] = (
            f"{mean_frequency_counts} (+- {np.round(stddev_frequency_counts, decimals=2):0.2f})"
        )
        table_row[f"{model}__mean_correct_prediction_train"] = (
            f"{mean_correct_prediction} (+- {np.round(stddev_correct_prediction, decimals=2):0.2f})"
        )


# Make composition names nicer for final table: every "_"-separated technique
# id is replaced by its display name
composition_name_reformat_rules = {
    "cot": "Reasoning steps",
    "category-few-shot": "In-context (category)",
    "random-few-shot": "In-context (random)",
    "similar-few-shot": "In-context (similar)",
    "definitions": "Definitions",
    "directional-stimulus": "Dir. stimulus",
    "system-prompts": "Persona",
    "task-description-only": "Base composition",
}
counts_with_updated_composition_names = {}
for (
    composition_name,
    composition_counts,
) in composition_counts_mean_per_target_dataset_models.items():
    display_parts = []
    for comp in composition_name.split("_"):
        display_parts.append(composition_name_reformat_rules[comp].capitalize())
    composition_name_reformat = ", ".join(display_parts)

    counts_with_updated_composition_names[composition_name_reformat] = (
        composition_counts
    )

composition_frequency_output_file = path.join(
    f"outputs/tables/composition-frequencies-{target_dataset}.csv"
)
# One row per composition (sorted by display name), one column per statistic
frequency_table = pd.DataFrame(data=counts_with_updated_composition_names)
frequency_table = frequency_table.transpose().sort_index()
frequency_table.to_csv(composition_frequency_output_file)

In [None]:
# Technique ids as they appear inside the predicted composition names.
# ("category-few-shot" was previously misspelled, so it never matched any
# prediction and its heatmap row and column stayed at zero.)
techniques = [
    "category-few-shot",
    "cot",
    "definitions",
    "directional-stimulus",
    "random-few-shot",
    "similar-few-shot",
    "system-prompts",
    "task-description-only",
]
for model in cobra_frames_composition_predictions_test_per_model.keys():
    # all_cooccurrences[outer][seed][inner]: number of predicted compositions
    # for that seed that contain both the outer and the inner technique
    all_cooccurrences = []
    for t_outer in techniques:
        t_cooccurrences = []
        for seed in RANDOM_SEED:
            seed_cooccurrences = np.zeros(len(techniques))
            for pred in cobra_frames_composition_predictions_test_per_model[model][
                f"pred_best_composition_seed{seed}"
            ]:
                if t_outer not in pred:
                    continue
                for t_index, t_inner in enumerate(techniques):
                    # The diagonal stays zero: a technique is not counted
                    # as co-occurring with itself.
                    if t_inner != t_outer and t_inner in pred:
                        seed_cooccurrences[t_index] += 1
            t_cooccurrences.append(seed_cooccurrences)
        all_cooccurrences.append(t_cooccurrences)

    # Average over seeds -> one (outer technique x inner technique) matrix
    average_cooccurrences = np.array(
        [
            [np.mean(coocc) for coocc in zip(*per_seed_occurrences)]
            for per_seed_occurrences in all_cooccurrences
        ]
    )

    fig, ax = plt.subplots(figsize=(10, 10))
    im = ax.imshow(average_cooccurrences, cmap="plasma")

    # Show all ticks and label them with the respective list entries
    ax.set_xticks(np.arange(len(techniques)), labels=techniques)
    ax.set_yticks(np.arange(len(techniques)), labels=techniques)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    for i in range(len(techniques)):
        for j in range(len(techniques)):
            text = ax.text(
                j, i, average_cooccurrences[i, j], ha="center", va="center", color="w"
            )

    ax.set_title(
        f"{model}: Average (over all seeds) cooccurrence for predicted compositions on CobraFrames"
    )
    plt.savefig(f"outputs/figures/cobraframes__{model}__technique-cooccurrences.pdf")
    plt.show()

# Stereoset corpus


In [None]:
# Model names iterated over in the Stereoset cells below; they also appear in
# the run-directory and prediction-file names that get loaded.
MODELS = ["mistral-7b-instruct-v2", "command-r-v01", "llama3-70b-instruct"]

# Dataset names used in the "...-on-<dataset>" part of the composition-
# prediction run directories and as suffix of the "<model>__<dataset>" keys.
TRAINING_DATASETS = ["sbic", "stereoset", "cobra_frames"]

## Preparation


In [None]:
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(23)

In [None]:
data_handler = DataHandler(datasets_to_load=["stereoset"])

# Per split: rename the label and hash columns to the names used for the
# other corpora ("true_label", "md5_hash") and keep only those two columns.
_stereoset_column_names = {
    "hasBiasedImplication": "true_label",
    "text_hash": "md5_hash",
}
stereoset_train_df, stereoset_val_df, stereoset_test_df = (
    data_handler.stereoset_data[split_name].rename(columns=_stereoset_column_names)[
        ["md5_hash", "true_label"]
    ]
    for split_name in ("train", "dev", "test")
)

In [None]:
stereoset_composition_predictions_val_per_model = {}
stereoset_composition_predictions_test_per_model = {}

output_dir = path.join("outputs/composition-predictions")


def _find_composition_run_dir(run_dirs, model, training_dataset, seed):
    """Return the first entry of ``run_dirs`` for the given model/dataset/seed.

    Raises:
        FileNotFoundError: if no directory name contains both the run prefix
            and the seed tag (previously an opaque IndexError).
    """
    run_prefix = f"deberta-v3-large_composition-prediction-for-{model}-on-{training_dataset}"
    seed_tag = f"seed{seed}"
    for run_dir in run_dirs:
        if run_prefix in run_dir and seed_tag in run_dir:
            return run_dir
    raise FileNotFoundError(
        f"No composition-prediction run found for '{run_prefix}' with '{seed_tag}'"
    )


def _load_stereoset_composition_predictions(runs_dir, model, training_dataset, split):
    """Merge the per-seed Stereoset result files of one split into one frame.

    ``split`` is the "val"/"test" part of "stereoset_<split>_results.parquet".
    The result has the columns "input" and "post_id" (md5 of the input text)
    plus, for every seed, the remaining columns of that seed's file, with
    "pred_best_composition" renamed to "pred_best_composition_seed<seed>".
    """
    # The directory listing does not depend on the seed; read it once.
    run_dirs = sorted(listdir(runs_dir))
    merged_predictions = pd.DataFrame()

    for seed in RANDOM_SEED:
        seed_dir = _find_composition_run_dir(run_dirs, model, training_dataset, seed)
        df = pd.read_parquet(
            path.join(runs_dir, seed_dir, f"stereoset_{split}_results.parquet")
        )
        df["post_id"] = df.input.apply(lambda x: hashlib.md5(x.encode()).hexdigest())
        df = df.rename(
            columns={"pred_best_composition": f"pred_best_composition_seed{seed}"}
        )

        # The first seed provides the text and the join key
        if "input" not in merged_predictions.columns:
            merged_predictions["input"] = df["input"]
            merged_predictions["post_id"] = df["post_id"]

        seed_columns = [
            column
            for column in df.columns
            if column not in ["input", "index", "pred_probabilities"]
        ]
        merged_predictions = pd.merge(
            merged_predictions,
            df.loc[:, seed_columns],
            on="post_id",
            how="left",
        )

    return merged_predictions


for model in MODELS:
    for training_dataset in TRAINING_DATASETS:
        predictions_key = f"{model}__{training_dataset}"

        # Validation set
        stereoset_composition_predictions_val_per_model[predictions_key] = (
            _load_stereoset_composition_predictions(
                output_dir, model, training_dataset, "val"
            )
        )

        # Test set
        stereoset_composition_predictions_test_per_model[predictions_key] = (
            _load_stereoset_composition_predictions(
                output_dir, model, training_dataset, "test"
            )
        )

In [None]:
# Load composition-specific prediction files
stereoset_output_dir = "outputs/prompt-predictions/stereoset"
stereoset_predictions_per_composition_val_per_model = {}
stereoset_predictions_per_composition_test_per_model = {}


def _read_stereoset_predictions(model, split):
    """Return {composition name: prediction frame} for one model and split.

    Every file in ``stereoset_output_dir`` whose name contains both ``split``
    and ``model`` is read. The composition name is the file name without the
    "stereoset-[cot-]greedy-<split>_<model>_" prefix and the ".parquet"
    suffix; files containing "cot" get a "cot_" prefix instead. Each frame
    gains a "post_id" column holding the md5 hash of its "input" text.
    """
    frames_per_composition = {}
    for file_name in sorted(listdir(stereoset_output_dir)):
        if not (split in file_name and model in file_name):
            continue

        if "cot" in file_name:
            stem = file_name.replace(f"stereoset-cot-greedy-{split}_{model}_", "cot_")
        else:
            stem = file_name.replace(f"stereoset-greedy-{split}_{model}_", "")
        composition_name = stem.replace(".parquet", "")

        frame = pd.read_parquet(path.join(stereoset_output_dir, file_name))
        frame["post_id"] = frame.input.apply(
            lambda x: hashlib.md5(x.encode()).hexdigest()
        )
        frames_per_composition[composition_name] = frame
    return frames_per_composition


for model in MODELS:
    # Validation set (files are tagged "dev")
    stereoset_predictions_per_composition_val_per_model[model] = (
        _read_stereoset_predictions(model, "dev")
    )

    # Test set
    stereoset_predictions_per_composition_test_per_model[model] = (
        _read_stereoset_predictions(model, "test")
    )

In [None]:
# Print the class balance (absolute count and share, rounded to 3 decimals)
# of the training, validation, and test split.
stereoset_label_counts = {}
for split_title, split_df in (
    ("Training", stereoset_train_df),
    ("Validation", stereoset_val_df),
    ("Test", stereoset_test_df),
):
    print(f"## {split_title} split")

    split_size = len(split_df)
    n_positive = int((split_df.true_label == 1).sum())
    n_negative = int((split_df.true_label == 0).sum())
    stereoset_label_counts[split_title] = (n_positive, n_negative)

    positive_share = np.round(n_positive / split_size, decimals=3)
    print(f"Positive label: {n_positive} ({positive_share})")
    negative_share = np.round(n_negative / split_size, decimals=3)
    print(f"Negative label: {n_negative} ({negative_share})")

    # Blank separator line between splits (none after the last one)
    if split_title != "Test":
        print()

# Keep the per-split counts available under their individual names
positive_instances_train, negative_instances_train = stereoset_label_counts["Training"]
positive_instances_val, negative_instances_val = stereoset_label_counts["Validation"]
positive_instances_test, negative_instances_test = stereoset_label_counts["Test"]

## Predicting compositions performance evaluation

_I.e., how well the encoder model can predict a composition that produces the correct label_


In [None]:
# For every model and seed: share of test instances for which the composition
# chosen by the encoder ("pred_best_composition") yields the true label.
target_data = "stereoset"

for model in MODELS:
    print(f"\n\n{model}")
    print("=" * 25)

    output_dir = path.join("outputs/composition-predictions")
    test_predictions_by_composition = (
        stereoset_predictions_per_composition_test_per_model[model]
    )
    prompt_compositions = test_predictions_by_composition.keys()
    run_prefix = (
        f"deberta-v3-large_composition-prediction-for-{model}-on-{target_data}"
    )

    scores_per_seed = {}
    for seed in RANDOM_SEED:
        # First (alphabetically sorted) run directory matching model, dataset, and seed
        seed_dir = [
            run_name
            for run_name in sorted(listdir(output_dir))
            if run_prefix in run_name and f"seed{seed}" in run_name
        ][0]
        results_file = path.join(
            output_dir, seed_dir, f"{target_data}_test_results.parquet"
        )
        df = pd.read_parquet(results_file)
        df["post_id"] = df["input"].map(
            lambda text: hashlib.md5(text.encode()).hexdigest()
        )

        # Keep only instances that also occur in the prediction frame of the first
        # composition. NOTE(review): the original comment called this a
        # CobraFrames-specific fix; presumably carried over from that section.
        reference_composition = list(prompt_compositions)[0]
        known_post_ids = test_predictions_by_composition[reference_composition][
            "post_id"
        ]
        df = df[df["post_id"].isin(known_post_ids)]

        # Label map stored next to the run; neither map is used further in this cell
        with open(path.join(output_dir, seed_dir, "id2component_map.json"), "r") as f:
            id2component_map = json.load(f)
        component2id_map = {value: int(key) for key, value in id2component_map.items()}

        # 1 if the chosen composition predicted the true label for the instance, else 0
        adaptive_prompting_correctness_per_sample = []
        for _, row in df.iterrows():
            comp_df = test_predictions_by_composition[row["pred_best_composition"]]
            row_comp_df = comp_df[comp_df["post_id"] == row["post_id"]].iloc[0]
            try:
                predicted_label = row_comp_df[f"output_{seed}"]
            except KeyError:
                # No output for this seed: use the output of seed 23 instead
                predicted_label = row_comp_df["output_23"]
            adaptive_prompting_correctness_per_sample.append(
                (predicted_label == row_comp_df["true_label"]).astype(int)
            )

        n_samples = len(adaptive_prompting_correctness_per_sample)
        scores_per_seed[seed] = (
            sum(adaptive_prompting_correctness_per_sample) / n_samples
        )

    print(scores_per_seed)
    print(np.mean(list(scores_per_seed.values())))

## Adaptive prompting evaluation


In [None]:
def _stereoset_adaptive_seed_scores(chosen_compositions, predictions_by_composition):
    """Score the adaptive-prompting predictions once per seed.

    For every row of ``chosen_compositions`` the composition named in
    ``pred_best_composition_seed{seed}`` is looked up in
    ``predictions_by_composition`` and the output of that composition for the
    row's ``post_id`` is used as the prediction.

    Args:
        chosen_compositions: DataFrame with a ``post_id`` column and one
            ``pred_best_composition_seed{seed}`` column per seed.
        predictions_by_composition: mapping of composition name to a DataFrame
            with ``post_id``, ``true_label`` and ``output_{seed}`` columns.

    Returns:
        A list with one ``precision_recall_fscore_support`` result per seed,
        in the order of ``RANDOM_SEED``.
    """
    seed_scores = []
    for seed in RANDOM_SEED:
        y_true_seed = []
        y_pred_seed = []
        for _, row in chosen_compositions.iterrows():
            preds = predictions_by_composition[
                row[f"pred_best_composition_seed{seed}"]
            ]
            # One lookup per row; it used to be repeated for prediction and label
            instance = preds[preds.post_id == row.post_id].iloc[0]
            if f"output_{seed}" in preds.columns:
                output_column = f"output_{seed}"
            else:
                # If we don't have predictions for other seeds, use the primary seed.
                # NOTE(review): hard-coded 23, other cells use RANDOM_SEED[0] - confirm
                # that both refer to the same seed.
                output_column = "output_23"
            y_pred_seed.append(instance[output_column])
            y_true_seed.append(instance["true_label"])

        seed_scores.append(
            precision_recall_fscore_support(
                y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
            )
        )
    return seed_scores


stereoset_composition_prediction_scores_per_model = {}

for model in stereoset_composition_predictions_val_per_model.keys():
    # Keys look like "<model>__<training dataset>"; split() keeps the whole key
    # if the separator is missing (slicing with find() dropped the last character).
    model_name_without_data = model.split("__", 1)[0]

    # Validation split
    all_seed_scores_val = _stereoset_adaptive_seed_scores(
        stereoset_composition_predictions_val_per_model[model],
        stereoset_predictions_per_composition_val_per_model[model_name_without_data],
    )

    # Test split
    all_seed_scores_test = _stereoset_adaptive_seed_scores(
        stereoset_composition_predictions_test_per_model[model],
        stereoset_predictions_per_composition_test_per_model[model_name_without_data],
    )

    # Macro average per seed: unweighted mean over class 0 and class 1.
    # Score tuple layout: index 0 = precision, 1 = recall, 2 = F1.
    test_precision_per_seed = [
        (seed_scores[0][0] + seed_scores[0][1]) / 2
        for seed_scores in all_seed_scores_test
    ]
    test_recall_per_seed = [
        (seed_scores[1][0] + seed_scores[1][1]) / 2
        for seed_scores in all_seed_scores_test
    ]
    test_f1_per_seed = [
        (seed_scores[2][0] + seed_scores[2][1]) / 2
        for seed_scores in all_seed_scores_test
    ]

    stereoset_composition_prediction_scores_per_model[model] = {
        "test_macro_precision": np.mean(test_precision_per_seed),
        "test_macro_precision_seed_scores": test_precision_per_seed,
        "test_macro_recall": np.mean(test_recall_per_seed),
        "test_macro_recall_seed_scores": test_recall_per_seed,
        "test_macro_f1": np.mean(test_f1_per_seed),
        "test_macro_f1_seed_scores": test_f1_per_seed,
    }

    print(model, "=" * 50)
    print("Averaged scores")

    print(
        "Precision (macro) (over all seeds):",
        stereoset_composition_prediction_scores_per_model[model][
            "test_macro_precision"
        ],
    )
    print(
        "Recall (macro) (over all seeds):",
        stereoset_composition_prediction_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "F1 (macro) (over all seeds):",
        stereoset_composition_prediction_scores_per_model[model]["test_macro_f1"],
    )

## Baselines


In [None]:
def _stereoset_composition_seed_scores(predictions):
    """Score the predictions of one composition once per seed.

    Args:
        predictions: DataFrame with a ``true_label`` column and ``output_{seed}``
            columns.

    Returns:
        A list with one ``precision_recall_fscore_support`` result per entry of
        ``RANDOM_SEED``. If any seed column is missing, the score of the primary
        seed (``RANDOM_SEED[0]``) is used for every entry instead.
    """
    y_true = predictions["true_label"]
    try:
        return [
            precision_recall_fscore_support(
                y_true=y_true,
                y_pred=predictions[f"output_{seed}"],
                pos_label=1,
            )
            for seed in RANDOM_SEED
        ]
    # In some cases, we don't have more than one seed, so we fall back to the primary seed
    except KeyError:
        # Computed once and repeated; it used to be recomputed for every seed
        primary_seed_scores = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=predictions[f"output_{RANDOM_SEED[0]}"],
            pos_label=1,
        )
        return [primary_seed_scores for _ in RANDOM_SEED]


stereoset_all_scores_val_per_model = {}
stereoset_all_scores_test_per_model = {}

for model in MODELS:
    # Validation split: composition name -> list of per-seed scores
    all_scores_val = {
        name: _stereoset_composition_seed_scores(predictions)
        for name, predictions in stereoset_predictions_per_composition_val_per_model[
            model
        ].items()
    }
    stereoset_all_scores_val_per_model[model] = all_scores_val

    # Test split: composition name -> list of per-seed scores
    all_scores_test = {
        name: _stereoset_composition_seed_scores(predictions)
        for name, predictions in stereoset_predictions_per_composition_test_per_model[
            model
        ].items()
    }
    stereoset_all_scores_test_per_model[model] = all_scores_test

# Macro F1 on the test split, in the order of the compositions:
# per-seed values and their mean over seeds.
stereoset_all_f1_scores_test_per_model = {}
stereoset_all_f1_scores_test_per_model_seed_scores = {}
for model in MODELS:
    f1_per_composition_and_seed = [
        [((seed_scores[2][0] + seed_scores[2][1]) / 2) for seed_scores in v]
        for v in stereoset_all_scores_test_per_model[model].values()
    ]

    stereoset_all_f1_scores_test_per_model[model] = [
        np.mean(f1_per_seed) for f1_per_seed in f1_per_composition_and_seed
    ]
    stereoset_all_f1_scores_test_per_model_seed_scores[model] = (
        f1_per_composition_and_seed
    )

### Trivial baselines


In [None]:
# Majority baseline: predict the most frequent test label for every test instance.
majority_label = stereoset_test_df.true_label.mode()[0]
majority_baseline_pred = [majority_label] * len(stereoset_test_df)

stereoset_maj_baseline_scores = precision_recall_fscore_support(
    y_true=stereoset_test_df["true_label"], y_pred=majority_baseline_pred, pos_label=1
)

# Per-class arrays of the score tuple; the macro average is the unweighted mean
# over class index 0 and 1.
maj_precision, maj_recall, maj_f1 = stereoset_maj_baseline_scores[:3]
stereoset_maj_baseline_precision_macro_averaged_test = (
    maj_precision[0] + maj_precision[1]
) / 2
stereoset_maj_baseline_recall_macro_averaged_test = (
    maj_recall[0] + maj_recall[1]
) / 2
stereoset_maj_baseline_f1_macro_averaged_test = (maj_f1[0] + maj_f1[1]) / 2

for metric_caption, metric_value in (
    (
        "Precision (macro) (over all seeds):",
        stereoset_maj_baseline_precision_macro_averaged_test,
    ),
    (
        "Recall (macro) (over all seeds):",
        stereoset_maj_baseline_recall_macro_averaged_test,
    ),
    ("F1 (macro) (over all seeds):", stereoset_maj_baseline_f1_macro_averaged_test),
):
    print(metric_caption, metric_value)

In [None]:
# Random baseline: one random label from {0, 1} per test instance.
random_baseline_pred = np.random.randint(2, size=len(stereoset_test_df))

stereoset_random_baseline_scores = precision_recall_fscore_support(
    y_true=stereoset_test_df["true_label"], y_pred=random_baseline_pred, pos_label=1
)

# Per-class arrays of the score tuple; the macro average is the unweighted mean
# over class index 0 and 1.
random_precision, random_recall, random_f1 = stereoset_random_baseline_scores[:3]
stereoset_random_baseline_precision_macro_averaged = (
    random_precision[0] + random_precision[1]
) / 2
stereoset_random_baseline_recall_macro_averaged = (
    random_recall[0] + random_recall[1]
) / 2
stereoset_random_baseline_f1_macro_averaged = (random_f1[0] + random_f1[1]) / 2

for metric_caption, metric_value in (
    (
        "Precision (macro) (over all seeds):",
        stereoset_random_baseline_precision_macro_averaged,
    ),
    (
        "Recall (macro) (over all seeds):",
        stereoset_random_baseline_recall_macro_averaged,
    ),
    ("F1 (macro) (over all seeds):", stereoset_random_baseline_f1_macro_averaged),
):
    print(metric_caption, metric_value)

### Oracle

_Always chooses the correct label if it was predicted by at least one of the compositions; only chooses the wrong label if no composition predicted the correct label_


In [None]:
def _stereoset_oracle_pick(row):
    """Return the oracle label for one row.

    The row holds ``true_label`` in its first position and one composition
    prediction in every following position. If any prediction equals the true
    label, the true label is returned; otherwise the prediction of the first
    composition (position 1) is returned.
    """
    gold_label = row["true_label"]
    row_values = row.values
    if gold_label in row_values[1:]:
        return gold_label
    return row_values[1]


stereoset_oracle_scores_per_model = {}

for model in MODELS:
    # Compiling oracle predictions
    oracle_predictions_test = pd.DataFrame()
    for seed in RANDOM_SEED:
        # Columns: input, true_label, then one prediction column per composition
        all_seed_predictions = pd.DataFrame()
        for composition, df in stereoset_predictions_per_composition_test_per_model[
            model
        ].items():
            for shared_column in ("input", "true_label"):
                if shared_column not in all_seed_predictions.columns:
                    all_seed_predictions[shared_column] = df[shared_column]

            # Use the primary seed if this composition has no output for the seed
            source_column = f"output_{seed}"
            if source_column not in df.columns:
                source_column = f"output_{RANDOM_SEED[0]}"
            all_seed_predictions[f"{composition}_{seed}"] = df[source_column]

        for shared_column in ("input", "true_label"):
            if shared_column not in oracle_predictions_test.columns:
                oracle_predictions_test[shared_column] = all_seed_predictions[
                    shared_column
                ]

        # Everything except the input text: true_label first, predictions after it
        label_columns = [
            column for column in all_seed_predictions.columns if column != "input"
        ]
        oracle_predictions_test[f"output_{seed}"] = all_seed_predictions[
            label_columns
        ].apply(_stereoset_oracle_pick, axis=1)

    # Calculating scores
    oracle_seed_scores_test = []
    for seed in RANDOM_SEED:
        oracle_seed_scores_test.append(
            precision_recall_fscore_support(
                y_true=oracle_predictions_test["true_label"],
                y_pred=oracle_predictions_test[f"output_{seed}"],
                pos_label=1,
            )
        )

    # Macro average per seed (mean over class 0 and 1), then mean over seeds
    oracle_precision_per_seed = [
        (seed_scores[0][0] + seed_scores[0][1]) / 2
        for seed_scores in oracle_seed_scores_test
    ]
    oracle_recall_per_seed = [
        (seed_scores[1][0] + seed_scores[1][1]) / 2
        for seed_scores in oracle_seed_scores_test
    ]
    oracle_f1_per_seed = [
        (seed_scores[2][0] + seed_scores[2][1]) / 2
        for seed_scores in oracle_seed_scores_test
    ]
    stereoset_oracle_scores_per_model[model] = {
        "test_macro_precision": np.mean(oracle_precision_per_seed),
        "test_macro_recall": np.mean(oracle_recall_per_seed),
        "test_macro_f1": np.mean(oracle_f1_per_seed),
    }

    model_oracle_scores = stereoset_oracle_scores_per_model[model]
    print(model, "=" * 50)
    print("Oracle Precision (macro):", model_oracle_scores["test_macro_precision"])
    print("Oracle Recall (macro):", model_oracle_scores["test_macro_recall"])
    print("Oracle F1 (macro):", model_oracle_scores["test_macro_f1"])

### No technique

_Task description and input text only_


In [None]:
# Test scores of the composition that consists of the task description only.
stereoset_no_technique_scores_per_model = {}

for model in MODELS:
    # Position of the base composition among the scored compositions
    no_technique_i = list(stereoset_all_scores_test_per_model[model].keys()).index(
        "task-description-only"
    )

    base_seed_scores = stereoset_all_scores_test_per_model[model][
        "task-description-only"
    ]

    # Macro average per seed: unweighted mean over class 0 and class 1.
    # Score tuple layout: index 0 = precision, 1 = recall, 2 = F1.
    base_precision_per_seed = [
        (seed_scores[0][0] + seed_scores[0][1]) / 2 for seed_scores in base_seed_scores
    ]
    base_recall_per_seed = [
        (seed_scores[1][0] + seed_scores[1][1]) / 2 for seed_scores in base_seed_scores
    ]
    base_f1_per_seed = [
        (seed_scores[2][0] + seed_scores[2][1]) / 2 for seed_scores in base_seed_scores
    ]

    stereoset_no_technique_scores_per_model[model] = {
        "test_macro_precision": np.mean(base_precision_per_seed),
        "test_macro_precision_seed_scores": base_precision_per_seed,
        "test_macro_recall": np.mean(base_recall_per_seed),
        "test_macro_recall_seed_scores": base_recall_per_seed,
        "test_macro_f1": np.mean(base_f1_per_seed),
        "test_macro_f1_seed_scores": base_f1_per_seed,
    }

    model_base_scores = stereoset_no_technique_scores_per_model[model]
    print(model, "=" * 50)
    print("No technique Precision (macro):", model_base_scores["test_macro_precision"])
    print("No technique Recall (macro):", model_base_scores["test_macro_recall"])
    print("No technique F1 (macro):", model_base_scores["test_macro_f1"])

### Optimal composition

_Best composition on the validation set in terms of f1 macro score, evaluated on the test set for precision, recall and f1_


In [None]:
def _stereoset_macro_means_per_composition(scores_by_composition, metric_index):
    """Return, per composition, the seed-averaged macro score of one metric.

    ``metric_index`` selects the entry of the score tuple (0 = precision,
    1 = recall, 2 = F1); the macro score of a seed is the unweighted mean over
    class 0 and class 1. The result follows the order of the mapping.
    """
    return [
        np.mean(
            [
                (seed_scores[metric_index][0] + seed_scores[metric_index][1]) / 2
                for seed_scores in per_seed_scores
            ]
        )
        for per_seed_scores in scores_by_composition.values()
    ]


stereoset_optimal_composition_scores_per_model = {}

for model in MODELS:
    val_scores_by_composition = stereoset_all_scores_val_per_model[model]
    test_scores_by_composition = stereoset_all_scores_test_per_model[model]

    # Validation split
    stereoset_optimal_precision_macro_averaged_scores_val = (
        _stereoset_macro_means_per_composition(val_scores_by_composition, 0)
    )
    stereoset_optimal_recall_macro_averaged_scores_val = (
        _stereoset_macro_means_per_composition(val_scores_by_composition, 1)
    )
    stereoset_optimal_f1_macro_averaged_scores_val = (
        _stereoset_macro_means_per_composition(val_scores_by_composition, 2)
    )
    # Test split
    stereoset_optimal_precision_macro_averaged_scores_test = (
        _stereoset_macro_means_per_composition(test_scores_by_composition, 0)
    )
    stereoset_optimal_recall_macro_averaged_scores_test = (
        _stereoset_macro_means_per_composition(test_scores_by_composition, 1)
    )
    stereoset_optimal_f1_macro_averaged_scores_test = (
        _stereoset_macro_means_per_composition(test_scores_by_composition, 2)
    )

    # Composition with the highest macro F1 on the validation split; its position
    # is then used to index the test-split score lists.
    stereoset_optimal_composition_val_f1_macro_i = np.argmax(
        stereoset_optimal_f1_macro_averaged_scores_val
    )
    best_i = stereoset_optimal_composition_val_f1_macro_i
    stereoset_optimal_composition_name = list(val_scores_by_composition.keys())[best_i]
    best_test_seed_scores = test_scores_by_composition[
        stereoset_optimal_composition_name
    ]

    stereoset_optimal_composition_scores_per_model[model] = {
        "composition_name": stereoset_optimal_composition_name,
        "test_macro_precision": stereoset_optimal_precision_macro_averaged_scores_test[
            best_i
        ],
        "test_macro_precision_seed_scores": [
            (seed_scores[0][0] + seed_scores[0][1]) / 2
            for seed_scores in best_test_seed_scores
        ],
        "test_macro_recall": stereoset_optimal_recall_macro_averaged_scores_test[
            best_i
        ],
        "test_macro_recall_seed_scores": [
            (seed_scores[1][0] + seed_scores[1][1]) / 2
            for seed_scores in best_test_seed_scores
        ],
        "test_macro_f1": stereoset_optimal_f1_macro_averaged_scores_test[best_i],
        "test_macro_f1_seed_scores": [
            (seed_scores[2][0] + seed_scores[2][1]) / 2
            for seed_scores in best_test_seed_scores
        ],
    }

    model_optimal_scores = stereoset_optimal_composition_scores_per_model[model]
    print(model, "=" * 50)
    print("Optimal validation composition:", model_optimal_scores["composition_name"])
    print(
        "Optimal composition Precision (macro):",
        model_optimal_scores["test_macro_precision"],
    )
    print(
        "Optimal composition Recall (macro):",
        model_optimal_scores["test_macro_recall"],
    )
    print("Optimal composition F1 (macro):", model_optimal_scores["test_macro_f1"])

### Component ensemble


In [None]:
# Component ensemble: per test instance, the majority vote over the predictions
# of all compositions, scored once per seed.
stereoset_ensemble_scores_per_model = {}

for model in MODELS:
    all_seed_scores = []

    for seed in RANDOM_SEED:
        # One row per test instance, one vote column per composition
        seed_df = pd.DataFrame()

        for (
            composition_name,
            comp_preds,
        ) in stereoset_predictions_per_composition_test_per_model[model].items():
            if "input" not in seed_df.columns:
                seed_df["input"] = comp_preds["input"]
                seed_df["post_id"] = comp_preds["post_id"]
                seed_df["true_label"] = comp_preds["true_label"]

            # Use the primary seed if this composition has no output for the seed
            source_column = f"output_{seed}"
            if source_column not in comp_preds.columns:
                source_column = f"output_{RANDOM_SEED[0]}"

            seed_df = pd.merge(
                seed_df,
                comp_preds[["post_id", source_column]].rename(
                    columns={source_column: f"{composition_name}_{seed}"}
                ),
                on="post_id",
                how="left",
            )

        vote_columns = [
            c for c in seed_df.columns if c not in ["input", "post_id", "true_label"]
        ]
        # mode(axis=1) yields one column per tied value; column 1 only exists if at
        # least one row is tied and is NaN for every row with a unique mode.
        mode = seed_df.loc[:, vote_columns].mode(axis=1)
        if 1 in mode.columns:
            is_tie = mode[1].notna().to_numpy()
        else:
            # No tie in any row (indexing mode[1] would raise a KeyError here)
            is_tie = np.zeros(len(seed_df), dtype=bool)

        # If there is a tie, use a random label from {0, 1}, drawn independently for
        # every row. A single scalar draw would give all tied rows the same label.
        random_votes = np.random.randint(2, size=len(seed_df))
        seed_df["majority"] = np.where(is_tie, random_votes, mode[0])

        y_true_seed = seed_df["true_label"]
        y_pred_seed = seed_df["majority"]

        scores = precision_recall_fscore_support(
            y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
        )
        all_seed_scores.append(scores)

    # Macro average per seed (mean over class 0 and 1), then mean over seeds
    stereoset_ensemble_scores_per_model[model] = {
        "test_macro_precision": np.mean(
            [
                ((seed_scores[0][0] + seed_scores[0][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
        "test_macro_recall": np.mean(
            [
                ((seed_scores[1][0] + seed_scores[1][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
        "test_macro_f1": np.mean(
            [
                ((seed_scores[2][0] + seed_scores[2][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
    }

    print(model, "=" * 50)
    print("Averaged scores")
    print(
        "Precision (macro) (over all seeds):",
        stereoset_ensemble_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "Recall (macro) (over all seeds):",
        stereoset_ensemble_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "F1 (macro) (over all seeds):",
        stereoset_ensemble_scores_per_model[model]["test_macro_f1"],
    )

### Finetuning baselines


In [None]:
# Finetuning baseline: score the stored test predictions of the finetuned
# encoder, one prediction file per seed.
output_dir = "results"

all_seed_scores = []
for seed in RANDOM_SEED:
    model_name = f"deberta-v3-large-finetune_20240607232518_stereoset-seed{seed}"
    predictions_file = f"stereoset-test_predictions-{model_name}.parquet"

    # Attach the predictions to the test instances via their md5 hash
    seed_df = stereoset_test_df.merge(
        pd.read_parquet(path.join(output_dir, predictions_file)),
        on="md5_hash",
        how="left",
    )

    scores = precision_recall_fscore_support(
        y_true=seed_df["true_label"],
        y_pred=seed_df[f"prediction_{model_name}"],
        pos_label=1,
    )
    all_seed_scores.append(scores)

# Macro average per seed: unweighted mean over class 0 and class 1.
# Score tuple layout: index 0 = precision, 1 = recall, 2 = F1.
finetune_precision_per_seed = [
    (seed_scores[0][0] + seed_scores[0][1]) / 2 for seed_scores in all_seed_scores
]
finetune_recall_per_seed = [
    (seed_scores[1][0] + seed_scores[1][1]) / 2 for seed_scores in all_seed_scores
]
finetune_f1_per_seed = [
    (seed_scores[2][0] + seed_scores[2][1]) / 2 for seed_scores in all_seed_scores
]

stereoset_finetune_scores = {
    "test_macro_precision": np.mean(finetune_precision_per_seed),
    "test_macro_precision_seed_scores": finetune_precision_per_seed,
    "test_macro_recall": np.mean(finetune_recall_per_seed),
    "test_macro_recall_seed_scores": finetune_recall_per_seed,
    "test_macro_f1": np.mean(finetune_f1_per_seed),
    "test_macro_f1_seed_scores": finetune_f1_per_seed,
}

print("=" * 50)
print("Averaged scores")
for metric_caption, metric_key in (
    ("Precision (macro) (over all seeds):", "test_macro_precision"),
    ("Recall (macro) (over all seeds):", "test_macro_recall"),
    ("F1 (macro) (over all seeds):", "test_macro_f1"),
):
    print(metric_caption, stereoset_finetune_scores[metric_key])

### Self-diagnosis baseline


In [None]:
# Self-diagnosis baseline: score the stored test predictions (column "output_23").
output_dir = "results"
model_name = "self-diagnosis"

sd_results_file = path.join(
    output_dir, model_name, "baseline_self_diagnosis_stereoset_test.parquet"
)
sd_results_df = pd.read_parquet(sd_results_file)

sd_scores = precision_recall_fscore_support(
    y_true=sd_results_df["true_label"], y_pred=sd_results_df["output_23"], pos_label=1
)

# Per-class arrays of the score tuple; the macro average is the unweighted mean
# over class index 0 and 1.
sd_precision, sd_recall, sd_f1 = sd_scores[:3]
stereoset_self_diagnosis_precision_macro_averaged_test = (
    sd_precision[0] + sd_precision[1]
) / 2
stereoset_self_diagnosis_recall_macro_averaged_test = (
    sd_recall[0] + sd_recall[1]
) / 2
stereoset_self_diagnosis_f1_macro_averaged_test = (sd_f1[0] + sd_f1[1]) / 2

print("=" * 50)
print("Averaged scores")
for metric_caption, metric_value in (
    (
        "Precision (macro) (over all seeds):",
        stereoset_self_diagnosis_precision_macro_averaged_test,
    ),
    (
        "Recall (macro) (over all seeds):",
        stereoset_self_diagnosis_recall_macro_averaged_test,
    ),
    ("F1 (macro) (over all seeds):", stereoset_self_diagnosis_f1_macro_averaged_test),
):
    print(metric_caption, metric_value)

## Significance tests


In [None]:
from scipy.stats import ttest_rel

In [None]:
# Prepare best-on-test data: for every model pick the composition with the
# highest test F1 and keep that composition's per-seed F1 scores.
stereoset_best_on_test_scores = {}
for model in MODELS:
    stereoset_best_composition = np.argmax(
        stereoset_all_f1_scores_test_per_model[model]
    )
    seed_scores_per_composition = stereoset_all_f1_scores_test_per_model_seed_scores[
        model
    ]
    stereoset_best_on_test_scores[model] = {
        "test_macro_f1_seed_scores": seed_scores_per_composition[
            stereoset_best_composition
        ]
    }

In [None]:
# Settings for the significance tests below.
alpha = 0.05  # significance level the (one-sided) p-value is compared against
ttest_function = ttest_rel

target_dataset_to_evaluate = "stereoset"

# (display name, scores) pairs the composition-prediction approach is compared
# against; "Finetune" is a flat score dict, the others are keyed by model.
baselines = list(
    zip(
        ("BaseComposition", "BestOnVal", "BestOnTest", "Finetune"),
        (
            stereoset_no_technique_scores_per_model,
            stereoset_optimal_composition_scores_per_model,
            stereoset_best_on_test_scores,
            stereoset_finetune_scores,
        ),
    )
)

In [None]:
# For every model, test whether composition prediction beats each baseline on
# the per-seed macro F1 scores. Baselines that are not worse on average are
# skipped, since the one-sided test below would be meaningless for them.
for model in MODELS:
    print(f"\n\n{model}")
    print("=" * 25)

    approach_name = "CompositionPrediction"
    approach_key = f"{model}__{target_dataset_to_evaluate}"
    approach_scores = stereoset_composition_prediction_scores_per_model[approach_key][
        "test_macro_f1_seed_scores"
    ]

    for baseline_name, baseline_scores_dict in baselines:
        # The fine-tuned baseline is stored flat; all others are keyed by model.
        per_model_scores = (
            baseline_scores_dict
            if baseline_name == "Finetune"
            else baseline_scores_dict[model]
        )
        baseline_scores = per_model_scores["test_macro_f1_seed_scores"]

        approach_mean = np.mean(approach_scores)
        baseline_mean = np.mean(baseline_scores)
        if not approach_mean > baseline_mean:
            print(
                f"Skipped {approach_name} vs. {baseline_name} ({approach_mean} vs. {baseline_mean})"
            )
            continue

        t_results = ttest_function(baseline_scores, approach_scores)
        # correct for one sided test, according to Hitchhiker's guide
        p_value = t_results[1] / 2

        verdict = "significantly" if p_value <= alpha else "NOT significantly"
        print(
            f"{approach_name} is {verdict} better than {baseline_name} with p-value {p_value:.4f} (t-test)."
        )

## Graphs


### Scores


In [None]:
# Toggle for the score plot below: when True, the per-model oracle marker and
# the majority-label, random and self-diagnosis reference lines are drawn too.
SHOW_ALL_BASELINES = True

In [None]:
# Box plot of the test F1 of every composition, one box per model.
fig, ax = plt.subplots(figsize=(15, 15))
box = ax.boxplot(
    # Index by MODELS (instead of iterating the dict) so that box i is
    # guaranteed to belong to MODELS[i]: the tick labels and the markers
    # drawn per model below both rely on that ordering.
    [stereoset_all_f1_scores_test_per_model[model] for model in MODELS],
    labels=MODELS,
)
# Filled per model below with {legend label: scatter handle}.
legend_entities_handlers_per_model = {}

def _mark_score(
    x_pos, score, tag, description, color, marker=None, text_align="left"
):
    """Draw one tagged score marker on the current axes and log what it means.

    Scatters a point at ``(x_pos, score)``, writes the short ``tag`` next to it
    and prints ``"<tag> -> <description> (<score rounded to 3 decimals>)"`` so
    the tags in the figure can be resolved from the cell output.

    ``marker=None`` lets matplotlib use its default scatter marker. Returns the
    scatter handle so it can be reused in the legend.
    """
    handle = plt.scatter(
        x_pos, score, alpha=0.6, color=color, marker=marker, zorder=3
    )
    plt.text(
        x_pos,
        score,
        tag,
        horizontalalignment=text_align,
        verticalalignment="bottom",
        color=color,
        zorder=3,
    )
    print(f"{tag} ->", f"{description} ({np.round(score, decimals=3)})")
    return handle


for i, model in enumerate(MODELS):
    print("")
    print(model)

    model_f1_scores = stereoset_all_f1_scores_test_per_model[model]
    print("Median:", np.median(model_f1_scores))

    best_composition_i = np.argmax(model_f1_scores)
    worst_composition_i = np.argmin(model_f1_scores)

    # Boxes are drawn at x = 1, 2, ..., hence the i + 1 below.
    # A / B: best and worst single composition on the test split
    best_composition_handle = _mark_score(
        i + 1,
        model_f1_scores[best_composition_i],
        "A",
        "Best-on-test",
        "red",
        text_align="right",
    )
    worst_composition_handle = _mark_score(
        i + 1,
        model_f1_scores[worst_composition_i],
        "B",
        "Worst-on-test",
        "blue",
        text_align="right",
    )

    # C: base composition (no technique)
    _mark_score(
        i + 1,
        stereoset_no_technique_scores_per_model[model]["test_macro_f1"],
        "C",
        "Base composition",
        "black",
        marker="x",
    )

    # D: composition selected on the validation split
    optimal_composition_handle = _mark_score(
        i + 1,
        stereoset_optimal_composition_scores_per_model[model]["test_macro_f1"],
        "D",
        "Best-on-validation",
        "olive",
        marker="x",
    )

    # E: naive majority ensemble
    _mark_score(
        i + 1,
        stereoset_ensemble_scores_per_model[model]["test_macro_f1"],
        "E",
        "Majority ensemble",
        "black",
        marker="x",
    )

    # F<k>: composition prediction, one marker per training dataset
    for k, training_dataset in enumerate(TRAINING_DATASETS):
        _mark_score(
            i + 1,
            stereoset_composition_prediction_scores_per_model[
                f"{model}__{training_dataset}"
            ]["test_macro_f1"],
            f"F{k}",
            f"Composition prediction ({training_dataset})",
            "green",
            marker="*",
        )

    if SHOW_ALL_BASELINES:
        # G: oracle
        _mark_score(
            i + 1,
            stereoset_oracle_scores_per_model[model]["test_macro_f1"],
            "G",
            "Oracle",
            "black",
            marker="x",
        )

    # Always in the following order: best-on-test, worst-on-test, best-on-validation
    composition_names = list(stereoset_all_scores_test_per_model[model].keys())
    legend_entities_handlers_per_model[model] = {
        f"bot: {composition_names[best_composition_i]}": best_composition_handle,
        f"wot: {composition_names[worst_composition_i]}": worst_composition_handle,
        f"bov: {stereoset_optimal_composition_scores_per_model[model]['composition_name']}": optimal_composition_handle,
    }


# Add the fine-tuned model as a dashed horizontal reference line with a label
finetune_f1 = stereoset_finetune_scores["test_macro_f1"]
plt.axhline(finetune_f1, color="black", linestyle="dashed", alpha=0.4, zorder=3)
plt.text(
    0,
    finetune_f1,
    f"DeBERTa-v3-large (finetuned) ({np.round(finetune_f1, decimals=3)})",
    horizontalalignment="left",
    verticalalignment="top",
    color="black",
    zorder=3,
)

# Conditionally show outlier baselines
if SHOW_ALL_BASELINES:
    # Each baseline becomes a dashed horizontal line labelled at x = 0:
    # majority label, random, and self-diagnosis (in that order).
    for baseline_label, baseline_f1 in (
        ("Majority label baseline", stereoset_maj_baseline_f1_macro_averaged_test),
        ("Random baseline", stereoset_random_baseline_f1_macro_averaged),
        ("Self-diagnosis baseline", stereoset_self_diagnosis_f1_macro_averaged_test),
    ):
        plt.axhline(
            baseline_f1, color="black", linestyle="dashed", alpha=0.4, zorder=3
        )
        plt.text(
            0,
            baseline_f1,
            f"{baseline_label} ({np.round(baseline_f1, decimals=3)})",
            horizontalalignment="left",
            verticalalignment="top",
            color="black",
            zorder=3,
        )

# Build the figure legend: one group per model, each introduced by an
# invisible placeholder handle that carries the model name as its label.
handlers = []
labels = []
for model in MODELS:
    model_header_handle = plt.scatter(
        [0], [0], marker="None", linestyle="None", label=f"dummy-{model}"
    )
    model_legend_entries = legend_entities_handlers_per_model[model]

    handlers += [model_header_handle, *model_legend_entries.values()]
    labels += [model, *model_legend_entries.keys()]

legend = fig.legend(handlers, labels, ncol=len(MODELS), loc="outside lower center")

# Boxes sit at x = 1..len(MODELS); keep one free slot on either side so the
# reference-line labels at x = 0 stay visible for any number of models
# (previously hard-coded to 4, i.e. exactly three models).
ax.set_xlim(0, len(MODELS) + 1)
plt.title("Stereoset data")
plt.xlabel("Model")
plt.ylabel("F1 (macro) (over all seeds)")

plt.savefig("outputs/figures/stereoset__performance-box-plot.pdf")
plt.savefig("outputs/figures/stereoset__performance-box-plot.svg")

plt.show()

### Composition frequency

_How often was each composition chosen (bar chart with box plot), how often was each technique and combination of techniques chosen (heatmap)_


In [None]:
# For every "<model>__<training dataset>" predictor, count per seed how often
# each composition was predicted on the test split, then plot the mean count
# per composition as a horizontal bar chart.
stereoset_composition_counts_per_seed_per_model = {}

for model in stereoset_composition_predictions_test_per_model.keys():
    # Strip the "__<training dataset>" suffix. ``partition`` returns the whole
    # key when no "__" is present, whereas slicing with ``find`` (-1) would
    # silently drop the last character.
    model_name_without_data = model.partition("__")[0]

    counts_per_composition = {}
    stereoset_composition_counts_per_seed_per_model[model] = counts_per_composition
    for seed in RANDOM_SEED:
        # Calculate how often each composition is used
        comp_count = Counter(
            stereoset_composition_predictions_test_per_model[model][
                f"pred_best_composition_seed{seed}"
            ]
        )
        for composition in stereoset_predictions_per_composition_test_per_model[
            model_name_without_data
        ]:
            # A Counter yields 0 for compositions that were never predicted,
            # so every composition gets exactly one entry per seed.
            counts_per_composition.setdefault(composition, []).append(
                comp_count[composition]
            )

    # Calculate bar heights (mean count over seeds)
    compositions = list(counts_per_composition.keys())
    values = [np.mean(counts_per_composition[comp]) for comp in compositions]

    # Sort compositions by their mean count (ascending)
    sorted_data = sorted(zip(values, compositions), key=lambda x: x[0])

    # Unpack the sorted data
    values, compositions = zip(*sorted_data)

    # Bar chart positions
    x_pos = np.arange(len(compositions))

    # Plot bars
    plt.figure(figsize=(10, 20))
    bars = plt.barh(x_pos, values, align="center", alpha=0.7)

    # Add labels and title
    plt.yticks(x_pos, compositions, ha="right")
    plt.ylabel("Compositions")
    plt.xlabel("Count")
    plt.title(f"{model}: Composition counts (over five random seeds) on Stereoset")

    plt.savefig(
        f"outputs/figures/stereoset__{model}__composition-frequency.pdf",
        bbox_inches="tight",
    )
    plt.savefig(
        f"outputs/figures/stereoset__{model}__composition-frequency.png",
        bbox_inches="tight",
    )
    # Show the plot
    plt.show()

In [None]:
# Count the number of times each composition produces a correct prediction for each split
target_dataset = "stereoset"


def _correct_predictions_per_seed(predictions_df):
    """Return one correct-prediction count per seed in ``RANDOM_SEED``.

    A row counts as correct when its ``output_<seed>`` value equals
    ``true_label``. When a frame has no column for a seed, the ``output_23``
    column is used instead (same fallback for every split).
    """
    counts = []
    for seed in RANDOM_SEED:
        try:
            predicted = predictions_df[f"output_{seed}"]
        except KeyError:
            predicted = predictions_df["output_23"]
        counts.append(
            len(predictions_df[predictions_df["true_label"] == predicted])
        )
    return counts


# Train dataset (needs slightly different loading)
# We use the composition names from the test data loaded further above, but then load the
# predictions on the train data below from file
stereoset_train_correct_prediction_counts_per_seed_per_model = {}
for (
    model,
    composition_predictions,
) in stereoset_predictions_per_composition_test_per_model.items():
    train_counts_per_composition = {}

    for composition in composition_predictions.keys():
        # Load composition predictions for training dataset; chain-of-thought
        # runs live in separate "cot-greedy" files without the "cot_" prefix.
        if "cot" in composition:
            comp_no_cot = composition.replace("cot_", "")
            train_file = f"outputs/prompt-predictions/{target_dataset}/{target_dataset}-cot-greedy-train_{model}_{comp_no_cot}.parquet"
        else:
            train_file = f"outputs/prompt-predictions/{target_dataset}/{target_dataset}-greedy-train_{model}_{composition}.parquet"

        train_counts_per_composition[composition] = _correct_predictions_per_seed(
            pd.read_parquet(train_file)
        )

    stereoset_train_correct_prediction_counts_per_seed_per_model[
        f"{model}__{target_dataset}"
    ] = train_counts_per_composition

# Val dataset (prediction frames are already in memory)
stereoset_val_correct_prediction_counts_per_seed_per_model = {
    f"{model}__{target_dataset}": {
        composition: _correct_predictions_per_seed(model_composition_df)
        for composition, model_composition_df in composition_predictions.items()
    }
    for (
        model,
        composition_predictions,
    ) in stereoset_predictions_per_composition_val_per_model.items()
}

# Test dataset (prediction frames are already in memory)
stereoset_test_correct_prediction_counts_per_seed_per_model = {
    f"{model}__{target_dataset}": {
        composition: _correct_predictions_per_seed(model_composition_df)
        for composition, model_composition_df in composition_predictions.items()
    }
    for (
        model,
        composition_predictions,
    ) in stereoset_predictions_per_composition_test_per_model.items()
}

In [None]:
# Create composition frequency tables
target_dataset = "stereoset"
target_dataset_models = [
    m
    for m in stereoset_composition_counts_per_seed_per_model.keys()
    if m.endswith(target_dataset)
]

# composition name -> {column name -> "mean (+- stddev)" cell text}
composition_counts_mean_per_target_dataset_models = {}

for model in target_dataset_models:
    for (
        composition_name,
        composition_counts,
    ) in stereoset_composition_counts_per_seed_per_model[model].items():
        # Mean and standard deviation (over seeds) of how often the
        # composition was predicted on the test split
        mean_frequency_counts = np.mean(composition_counts)
        stddev_frequency_counts = np.std(composition_counts)

        # Mean and standard deviation (over seeds) of how often the
        # composition is correct on the train split
        train_correct_counts = (
            stereoset_train_correct_prediction_counts_per_seed_per_model[model][
                composition_name
            ]
        )
        mean_correct_prediction = np.mean(train_correct_counts)
        stddev_correct_prediction = np.std(train_correct_counts)

        table_row = composition_counts_mean_per_target_dataset_models.setdefault(
            composition_name, {}
        )
        table_row[f"{model}__mean_frequencies_test"] = (
            f"{mean_frequency_counts} (+- {np.round(stddev_frequency_counts, decimals=2):0.2f})"
        )
        table_row[f"{model}__mean_correct_prediction_train"] = (
            f"{mean_correct_prediction} (+- {np.round(stddev_correct_prediction, decimals=2):0.2f})"
        )


# Make composition names nicer for final table
composition_name_reformat_rules = {
    "cot": "Reasoning steps",
    "category-few-shot": "In-context (category)",
    "random-few-shot": "In-context (random)",
    "similar-few-shot": "In-context (similar)",
    "definitions": "Definitions",
    "directional-stimulus": "Dir. stimulus",
    "system-prompts": "Persona",
    "task-description-only": "Base composition",
}
counts_with_updated_composition_names = {}
for (
    composition_name,
    composition_counts,
) in composition_counts_mean_per_target_dataset_models.items():
    # Composition names are "_"-joined technique keys; map each part to its
    # display name and join them with commas.
    pretty_parts = []
    for comp in composition_name.split("_"):
        pretty_parts.append(composition_name_reformat_rules[comp].capitalize())
    composition_name_reformat = ", ".join(pretty_parts)

    counts_with_updated_composition_names[composition_name_reformat] = (
        composition_counts
    )

composition_frequency_output_file = path.join(
    f"outputs/tables/composition-frequencies-{target_dataset}.csv"
)
# One row per composition (sorted by display name), one column per statistic
frequency_table = (
    pd.DataFrame(data=counts_with_updated_composition_names).transpose().sort_index()
)
frequency_table.to_csv(composition_frequency_output_file)

In [None]:
# Heatmap of how often two techniques appear together in the compositions
# predicted on the test split, averaged over seeds, one figure per predictor.
techniques = [
    # Was misspelled "cateogory-few-shot", which never matched a predicted
    # composition and left this technique's row and column at zero.
    "category-few-shot",
    "cot",
    "definitions",
    "directional-stimulus",
    "random-few-shot",
    "similar-few-shot",
    "system-prompts",
    "task-description-only",
]
for model in stereoset_composition_predictions_test_per_model.keys():
    # all_cooccurrences[outer technique][seed] -> count vector over techniques
    all_cooccurrences = []
    for t_outer in techniques:
        t_cooccurrences = []
        for seed in RANDOM_SEED:
            seed_cooccurrences = np.zeros(len(techniques))
            for pred in stereoset_composition_predictions_test_per_model[model][
                f"pred_best_composition_seed{seed}"
            ]:
                if t_outer not in pred:
                    continue
                # Count every *other* technique contained in the same
                # predicted composition name; the diagonal stays zero.
                for inner_index, t_inner in enumerate(techniques):
                    if t_inner != t_outer and t_inner in pred:
                        seed_cooccurrences[inner_index] += 1
            t_cooccurrences.append(seed_cooccurrences)
        all_cooccurrences.append(t_cooccurrences)

    # Average the per-seed count vectors element-wise
    average_cooccurrences = np.array(
        [
            [np.mean(coocc) for coocc in zip(*per_seed_occurrences)]
            for per_seed_occurrences in all_cooccurrences
        ]
    )

    fig, ax = plt.subplots(figsize=(10, 10))
    im = ax.imshow(average_cooccurrences, cmap="plasma")

    # Show all ticks and label them with the respective list entries
    ax.set_xticks(np.arange(len(techniques)), labels=techniques)
    ax.set_yticks(np.arange(len(techniques)), labels=techniques)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    for i in range(len(techniques)):
        for j in range(len(techniques)):
            text = ax.text(
                j, i, average_cooccurrences[i, j], ha="center", va="center", color="w"
            )

    ax.set_title(
        f"{model}: Average (over all seeds) cooccurrence for predicted compositions on Stereoset"
    )
    plt.savefig(f"outputs/figures/stereoset__{model}__technique-cooccurrences.pdf")
    plt.show()

# SBIC corpus


In [None]:
# Models evaluated in the SBIC section
MODELS = [
    "mistral-7b-instruct-v2",
    "command-r-v01",
    "llama3-70b-instruct",
]

# Datasets the composition predictors were trained on; combined with a model
# name as "<model>__<training dataset>" to key the loaded predictions.
TRAINING_DATASETS = [
    "sbic",
    "stereoset",
    "cobra_frames",
]

## Preparation


In [None]:
# Seed NumPy's global random generator so any NumPy-based randomness in the
# cells of this section is repeatable across notebook runs.
np.random.seed(23)

In [None]:
# Load the SBIC splits and reduce each one to the instance hash plus its gold
# label, with the label column renamed to the notebook-wide "true_label".
data_handler = DataHandler(datasets_to_load=["sbic"])

sbic_train_df, sbic_val_df, sbic_test_df = (
    data_handler.sbic_data[split_name].rename(
        columns={"hasBiasedImplication": "true_label"}
    )[["md5_hash", "true_label"]]
    for split_name in ("train_sub_split_balanced", "dev", "test")
)

In [None]:
def _load_composition_predictions(model, training_dataset, results_file):
    """Merge the per-seed composition predictions of one predictor into one frame.

    For every seed in ``RANDOM_SEED`` the matching run directory below
    ``outputs/composition-predictions`` is located, ``results_file`` is read
    from it, and its columns (except ``input``, ``index`` and
    ``pred_probabilities``) are left-joined on ``post_id`` (the MD5 hash of
    the input text). ``pred_best_composition`` is suffixed with the seed.

    Raises:
        FileNotFoundError: if no run directory matches a seed.
    """
    output_dir = path.join("outputs/composition-predictions")
    run_name = (
        f"deberta-v3-large_composition-prediction-for-{model}-on-{training_dataset}"
    )
    # The directory listing does not depend on the seed, so read it only once.
    candidate_dirs = [d for d in sorted(listdir(output_dir)) if run_name in d]

    merged = pd.DataFrame()
    for seed in RANDOM_SEED:
        # NOTE(review): substring match - "seed2" would also match a "seed23"
        # directory; confirm no seed in RANDOM_SEED is a prefix of another.
        seed_dirs = [d for d in candidate_dirs if f"seed{seed}" in d]
        if not seed_dirs:
            raise FileNotFoundError(
                f"No run directory for '{run_name}' with seed {seed} in {output_dir}"
            )

        df = pd.read_parquet(path.join(output_dir, seed_dirs[0], results_file))
        df["post_id"] = df.input.apply(lambda x: hashlib.md5(x.encode()).hexdigest())
        df = df.rename(
            columns={"pred_best_composition": f"pred_best_composition_seed{seed}"}
        )

        # The first seed provides the base rows that later seeds are joined onto.
        if "input" not in merged.columns:
            merged["input"] = df["input"]
            merged["post_id"] = df["post_id"]

        merged = pd.merge(
            merged,
            df.loc[
                :,
                [
                    c
                    for c in df.columns
                    if c not in ["input", "index", "pred_probabilities"]
                ],
            ],
            on="post_id",
            how="left",
        )
    return merged


# "<model>__<training dataset>" -> merged per-seed predictions
composition_predictions_val_per_model = {}
composition_predictions_test_per_model = {}

for model in MODELS:
    for training_dataset in TRAINING_DATASETS:
        predictor_key = f"{model}__{training_dataset}"

        # Validation set
        composition_predictions_val_per_model[predictor_key] = (
            _load_composition_predictions(
                model, training_dataset, "sbic_val_results.parquet"
            )
        )

        # Test set
        composition_predictions_test_per_model[predictor_key] = (
            _load_composition_predictions(
                model, training_dataset, "sbic_test_results.parquet"
            )
        )

In [None]:
# Load composition-specific prediction files
output_dir = "outputs/prompt-predictions/sbic"
predictions_per_composition_val_per_model = {}
predictions_per_composition_test_per_model = {}

for model in MODELS:
    # Validation ("dev") files first, then test files, exactly one result
    # dict per split: composition name -> prediction frame.
    for split_tag, per_model_store in (
        ("dev", predictions_per_composition_val_per_model),
        ("test", predictions_per_composition_test_per_model),
    ):
        cot_prefix = f"sbic-cot-greedy-{split_tag}_{model}_"
        plain_prefix = f"sbic-greedy-{split_tag}_{model}_"
        predictions_per_composition = {}

        for f in sorted(listdir(output_dir)):
            if not (split_tag in f and model in f):
                continue

            # The composition name is the file name without prefix and
            # extension; chain-of-thought files get a "cot_" prefix instead.
            if "cot" in f:
                composition_name = f.replace(cot_prefix, "cot_")
            else:
                composition_name = f.replace(plain_prefix, "")
            composition_name = composition_name.replace(".parquet", "")

            df = pd.read_parquet(path.join(output_dir, f))
            df["post_id"] = df.input.apply(
                lambda x: hashlib.md5(x.encode()).hexdigest()
            )
            predictions_per_composition[composition_name] = df

        per_model_store[model] = predictions_per_composition

In [None]:
def _print_label_balance(heading, split_df):
    """Print the absolute and relative label counts of one split.

    Returns the tuple ``(positive_count, negative_count)`` where positive rows
    have ``true_label == 1`` and negative rows have ``true_label == 0``.
    """
    print(heading)
    n_positive = len(split_df[split_df.true_label == 1])
    n_negative = len(split_df[split_df.true_label == 0])
    print(
        f"Positive label: {n_positive} ({np.round(n_positive / len(split_df), decimals=3)})"
    )
    print(
        f"Negative label: {n_negative} ({np.round(n_negative / len(split_df), decimals=3)})"
    )
    return n_positive, n_negative


positive_instances_train, negative_instances_train = _print_label_balance(
    "## Training split", sbic_train_df
)

positive_instances_val, negative_instances_val = _print_label_balance(
    "## Validation split", sbic_val_df
)

positive_instances_test, negative_instances_test = _print_label_balance(
    "## Test split", sbic_test_df
)

## Predicting compositions performance evaluation

_I.e., how well can the encoder model predict a composition that yields the correct label?_


In [None]:
target_data = "sbic"

for model in MODELS:
    print(f"\n\n{model}")
    print("=" * 25)

    output_dir = path.join("outputs/composition-predictions")
    test_predictions = predictions_per_composition_test_per_model[model]
    prompt_compositions = test_predictions.keys()

    scores_per_seed = {}
    for seed in RANDOM_SEED:
        # Locate the run directory of the composition-prediction encoder for
        # this model, target dataset and seed (first match in sorted order).
        run_marker = (
            f"deberta-v3-large_composition-prediction-for-{model}-on-{target_data}"
        )
        seed_marker = f"seed{seed}"
        seed_dir = [
            entry
            for entry in sorted(listdir(output_dir))
            if run_marker in entry and seed_marker in entry
        ][0]

        results_file = path.join(
            output_dir, seed_dir, f"{target_data}_test_results.parquet"
        )
        df = pd.read_parquet(results_file)
        df["post_id"] = df.input.apply(lambda x: hashlib.md5(x.encode()).hexdigest())

        # Keep only instances that also occur in the prompt predictions of the
        # first composition; anything else cannot be looked up below.
        reference_ids = test_predictions[list(prompt_compositions)[0]]["post_id"]
        df = df[df["post_id"].isin(reference_ids)]

        # Load the id -> component mapping stored next to the predictions and
        # build its inverse (neither is used further in this cell).
        map_file = path.join(output_dir, seed_dir, "id2component_map.json")
        with open(map_file, "r") as f:
            id2component_map = json.load(f)
        component2id_map = {
            component: int(component_id)
            for component_id, component in id2component_map.items()
        }

        # For every instance, check whether the composition chosen by the
        # encoder produced the correct label (1) or not (0).
        adaptive_prompting_correctness_per_sample = []
        for _, row in df.iterrows():
            comp_df = test_predictions[row["pred_best_composition"]]
            row_comp_df = comp_df[comp_df["post_id"] == row["post_id"]].iloc[0]
            try:
                predicted_label = row_comp_df[f"output_{seed}"]
            except KeyError:
                # No prediction for this seed: fall back to the seed-23 output.
                predicted_label = row_comp_df["output_23"]
            is_correct = (predicted_label == row_comp_df["true_label"]).astype(int)
            adaptive_prompting_correctness_per_sample.append(is_correct)

        n_correct = sum(adaptive_prompting_correctness_per_sample)
        scores_per_seed[seed] = n_correct / len(
            adaptive_prompting_correctness_per_sample
        )

    print(scores_per_seed)
    print(np.mean(list(scores_per_seed.values())))

## Adaptive prompting evaluation


In [None]:
composition_prediction_scores_per_model = {}


def _adaptive_scores_per_seed(selection_df, predictions_by_composition):
    """Score the adaptive-prompting predictions of one split for every seed.

    For each row of ``selection_df`` the composition named in
    ``pred_best_composition_seed<seed>`` is looked up, and the prediction that
    composition made for the row's ``post_id`` is used as the system output.
    When a composition has no ``output_<seed>`` column, ``output_23`` is used.
    Returns one ``precision_recall_fscore_support`` result per seed.
    """
    seed_scores = []
    for seed in RANDOM_SEED:
        y_true_seed, y_pred_seed = [], []
        for _, row in selection_df.iterrows():
            preds = predictions_by_composition[row[f"pred_best_composition_seed{seed}"]]
            matched = preds[preds.post_id == row.post_id].iloc[0]
            # If we don't have predictions for other seeds, use the primary seed
            output_column = (
                f"output_{seed}" if f"output_{seed}" in preds.columns else "output_23"
            )
            y_pred_seed.append(matched[output_column])
            y_true_seed.append(matched["true_label"])

        seed_scores.append(
            precision_recall_fscore_support(
                y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
            )
        )
    return seed_scores


def _macro_per_seed(seed_scores, metric_index):
    """Average the class-0 and class-1 value of one metric for every seed."""
    return [
        (scores[metric_index][0] + scores[metric_index][1]) / 2
        for scores in seed_scores
    ]


for model in composition_predictions_val_per_model:
    # Keys have the form "<model>__<training dataset>"; keep the model part.
    model_name_without_data = model[: model.find("__")]

    # Validation split
    all_seed_scores_val = _adaptive_scores_per_seed(
        composition_predictions_val_per_model[model],
        predictions_per_composition_val_per_model[model_name_without_data],
    )

    # Test split
    all_seed_scores_test = _adaptive_scores_per_seed(
        composition_predictions_test_per_model[model],
        predictions_per_composition_test_per_model[model_name_without_data],
    )

    precision_per_seed = _macro_per_seed(all_seed_scores_test, 0)
    recall_per_seed = _macro_per_seed(all_seed_scores_test, 1)
    f1_per_seed = _macro_per_seed(all_seed_scores_test, 2)

    composition_prediction_scores_per_model[model] = {
        "test_macro_precision": np.mean(precision_per_seed),
        "test_macro_precision_seed_scores": precision_per_seed,
        "test_macro_recall": np.mean(recall_per_seed),
        "test_macro_recall_seed_scores": recall_per_seed,
        "test_macro_f1": np.mean(f1_per_seed),
        "test_macro_f1_seed_scores": f1_per_seed,
    }
    model_scores = composition_prediction_scores_per_model[model]

    print(model, "=" * 50)
    print("Averaged scores")

    print(
        "Precision (macro) (over all seeds):",
        model_scores["test_macro_precision"],
    )
    print(
        "Recall (macro) (over all seeds):",
        model_scores["test_macro_recall"],
    )
    print(
        "F1 (macro) (over all seeds):",
        model_scores["test_macro_f1"],
    )

## Baselines


In [None]:
all_scores_val_per_model = {}
all_scores_test_per_model = {}


def _score_all_compositions(predictions_by_composition):
    """Compute per-seed precision/recall/F1 for every prompt composition.

    Args:
        predictions_by_composition: mapping of composition name to a frame
            with a ``true_label`` column and ``output_<seed>`` columns.

    Returns:
        Mapping of composition name to a list with one
        ``precision_recall_fscore_support`` result per entry of RANDOM_SEED.
        When any ``output_<seed>`` column is missing, the score of the primary
        seed (``RANDOM_SEED[0]``) is used for every seed.
    """
    scores_by_composition = {}
    for name, predictions in predictions_by_composition.items():
        try:
            scores = [
                precision_recall_fscore_support(
                    y_true=predictions["true_label"],
                    y_pred=predictions[f"output_{seed}"],
                    pos_label=1,
                )
                for seed in RANDOM_SEED
            ]
        # In some cases, we don't have more than one seed, so we fall back to
        # the primary seed. Its score is identical for every seed, so compute
        # it once and repeat it.
        except KeyError:
            primary_seed_score = precision_recall_fscore_support(
                y_true=predictions["true_label"],
                y_pred=predictions[f"output_{RANDOM_SEED[0]}"],
                pos_label=1,
            )
            scores = [primary_seed_score for _ in RANDOM_SEED]
        scores_by_composition[name] = scores
    return scores_by_composition


for model in MODELS:
    # Validation split
    all_scores_val = _score_all_compositions(
        predictions_per_composition_val_per_model[model]
    )
    all_scores_val_per_model[model] = all_scores_val

    # Test split
    all_scores_test = _score_all_compositions(
        predictions_per_composition_test_per_model[model]
    )
    all_scores_test_per_model[model] = all_scores_test

# Macro F1 on the test split: one list entry per composition (in dict order),
# once averaged over seeds and once as the raw per-seed values.
all_f1_scores_test_per_model = {}
all_f1_scores_test_per_model_seed_scores = {}
for model in MODELS:
    f1_per_composition = [
        [((seed_scores[2][0] + seed_scores[2][1]) / 2) for seed_scores in v]
        for v in all_scores_test_per_model[model].values()
    ]
    all_f1_scores_test_per_model_seed_scores[model] = f1_per_composition
    all_f1_scores_test_per_model[model] = [
        np.mean(seed_f1_scores) for seed_f1_scores in f1_per_composition
    ]

### Trivial baselines


In [None]:
# Majority baseline: always predict the most frequent test label.
majority_label = sbic_test_df.true_label.mode()[0]
majority_baseline_pred = [majority_label] * len(sbic_test_df)

maj_baseline_scores = precision_recall_fscore_support(
    y_true=sbic_test_df["true_label"], y_pred=majority_baseline_pred, pos_label=1
)

# The first three entries are the per-class precision, recall and F1 arrays;
# the macro average is the mean over class 0 and class 1.
maj_precision, maj_recall, maj_f1 = maj_baseline_scores[:3]
maj_baseline_precision_macro_averaged_test = (maj_precision[0] + maj_precision[1]) / 2
maj_baseline_recall_macro_averaged_test = (maj_recall[0] + maj_recall[1]) / 2
maj_baseline_f1_macro_averaged_test = (maj_f1[0] + maj_f1[1]) / 2

print("Precision (macro) (over all seeds):", maj_baseline_precision_macro_averaged_test)
print("Recall (macro) (over all seeds):", maj_baseline_recall_macro_averaged_test)
print("F1 (macro) (over all seeds):", maj_baseline_f1_macro_averaged_test)

In [None]:
# Random baseline: draw a label from {0, 1} for every test instance.
random_baseline_pred = np.random.randint(2, size=len(sbic_test_df))

random_baseline_scores = precision_recall_fscore_support(
    y_true=sbic_test_df["true_label"], y_pred=random_baseline_pred, pos_label=1
)

# Per-class precision, recall and F1 arrays; macro average = mean of both classes.
rnd_precision, rnd_recall, rnd_f1 = random_baseline_scores[:3]
random_baseline_precision_macro_averaged = (rnd_precision[0] + rnd_precision[1]) / 2
random_baseline_recall_macro_averaged = (rnd_recall[0] + rnd_recall[1]) / 2
random_baseline_f1_macro_averaged = (rnd_f1[0] + rnd_f1[1]) / 2

print("Precision (macro) (over all seeds):", random_baseline_precision_macro_averaged)
print("Recall (macro) (over all seeds):", random_baseline_recall_macro_averaged)
print("F1 (macro) (over all seeds):", random_baseline_f1_macro_averaged)

### Oracle

_Always chooses the correct label if it was predicted by any of the compositions; only chooses the wrong label if no composition predicted the correct label_


In [None]:
oracle_scores_per_model = {}


def _oracle_label(row):
    """Return the gold label if any composition predicted it.

    ``row`` holds ``true_label`` first, followed by one prediction per
    composition. When no prediction equals the gold label, the first
    composition's prediction is returned.
    """
    gold = row["true_label"]
    if gold in row.values[1:]:
        return gold
    return row.values[1]


def _macro_per_seed(seed_scores, metric_index):
    """Average the class-0 and class-1 value of one metric for every seed."""
    return [
        (scores[metric_index][0] + scores[metric_index][1]) / 2
        for scores in seed_scores
    ]


for model in MODELS:
    # Compiling oracle predictions
    oracle_predictions_test = pd.DataFrame()
    for seed in RANDOM_SEED:
        # One column per composition holding its prediction for this seed.
        all_seed_predictions = pd.DataFrame()
        for composition, df in predictions_per_composition_test_per_model[
            model
        ].items():
            for shared_column in ("input", "true_label"):
                if shared_column not in all_seed_predictions.columns:
                    all_seed_predictions[shared_column] = df[shared_column]

            # Fall back to the primary seed when this seed has no predictions.
            source_column = f"output_{seed}"
            if source_column not in df.columns:
                source_column = f"output_{RANDOM_SEED[0]}"
            all_seed_predictions[f"{composition}_{seed}"] = df[source_column]

        for shared_column in ("input", "true_label"):
            if shared_column not in oracle_predictions_test.columns:
                oracle_predictions_test[shared_column] = all_seed_predictions[
                    shared_column
                ]

        # Everything except the raw text takes part in the oracle decision;
        # "true_label" stays the first column of the selection.
        decision_columns = [
            column for column in all_seed_predictions.columns if column not in ["input"]
        ]
        oracle_predictions_test[f"output_{seed}"] = all_seed_predictions.loc[
            :, decision_columns
        ].apply(_oracle_label, axis=1)

    # Calculating scores
    oracle_seed_scores_test = [
        precision_recall_fscore_support(
            y_true=oracle_predictions_test["true_label"],
            y_pred=oracle_predictions_test[f"output_{seed}"],
            pos_label=1,
        )
        for seed in RANDOM_SEED
    ]

    oracle_scores_per_model[model] = {
        "test_macro_precision": np.mean(_macro_per_seed(oracle_seed_scores_test, 0)),
        "test_macro_recall": np.mean(_macro_per_seed(oracle_seed_scores_test, 1)),
        "test_macro_f1": np.mean(_macro_per_seed(oracle_seed_scores_test, 2)),
    }
    oracle_scores = oracle_scores_per_model[model]

    print(model, "=" * 50)
    print(
        "Oracle Precision (macro):",
        oracle_scores["test_macro_precision"],
    )
    print("Oracle Recall (macro):", oracle_scores["test_macro_recall"])
    print("Oracle F1 (macro):", oracle_scores["test_macro_f1"])

### No technique

_Task description and input text only_


In [None]:
no_technique_scores_per_model = {}


def _macro_per_seed(seed_scores, metric_index):
    """Average the class-0 and class-1 value of one metric for every seed."""
    return [
        (scores[metric_index][0] + scores[metric_index][1]) / 2
        for scores in seed_scores
    ]


for model in MODELS:
    # Position of the plain composition among all scored compositions
    # (raises ValueError if it was never scored).
    no_technique_i = list(all_scores_test_per_model[model].keys()).index(
        "task-description-only"
    )

    plain_seed_scores = all_scores_test_per_model[model]["task-description-only"]
    precision_per_seed = _macro_per_seed(plain_seed_scores, 0)
    recall_per_seed = _macro_per_seed(plain_seed_scores, 1)
    f1_per_seed = _macro_per_seed(plain_seed_scores, 2)

    no_technique_scores_per_model[model] = {
        "test_macro_precision": np.mean(precision_per_seed),
        "test_macro_precision_seed_scores": precision_per_seed,
        "test_macro_recall": np.mean(recall_per_seed),
        "test_macro_recall_seed_scores": recall_per_seed,
        "test_macro_f1": np.mean(f1_per_seed),
        "test_macro_f1_seed_scores": f1_per_seed,
    }
    plain_scores = no_technique_scores_per_model[model]

    print(model, "=" * 50)
    print(
        "No technique Precision (macro):",
        plain_scores["test_macro_precision"],
    )
    print(
        "No technique Recall (macro):",
        plain_scores["test_macro_recall"],
    )
    print(
        "No technique F1 (macro):",
        plain_scores["test_macro_f1"],
    )

### Optimal composition

_Best composition on the validation set in terms of f1 macro score, evaluated on the test set for precision, recall and f1_


In [None]:
optimal_composition_scores_per_model = {}


def _macro_per_seed(seed_scores, metric_index):
    """Average the class-0 and class-1 value of one metric for every seed."""
    return [
        (scores[metric_index][0] + scores[metric_index][1]) / 2
        for scores in seed_scores
    ]


def _seed_mean_per_composition(scores_by_composition, metric_index):
    """Seed-averaged macro value of one metric, one entry per composition."""
    return [
        np.mean(_macro_per_seed(seed_scores, metric_index))
        for seed_scores in scores_by_composition.values()
    ]


for model in MODELS:
    val_scores = all_scores_val_per_model[model]
    test_scores = all_scores_test_per_model[model]

    # Validation split
    optimal_precision_macro_averaged_scores_val = _seed_mean_per_composition(
        val_scores, 0
    )
    optimal_recall_macro_averaged_scores_val = _seed_mean_per_composition(
        val_scores, 1
    )
    optimal_f1_macro_averaged_scores_val = _seed_mean_per_composition(val_scores, 2)
    # Test split
    optimal_precision_macro_averaged_scores_test = _seed_mean_per_composition(
        test_scores, 0
    )
    optimal_recall_macro_averaged_scores_test = _seed_mean_per_composition(
        test_scores, 1
    )
    optimal_f1_macro_averaged_scores_test = _seed_mean_per_composition(test_scores, 2)

    # Pick the composition with the highest validation macro F1, then report
    # the test scores found at the same position / under the same name.
    optimal_composition_val_f1_macro_i = np.argmax(optimal_f1_macro_averaged_scores_val)
    optimal_composition_name = list(val_scores.keys())[
        optimal_composition_val_f1_macro_i
    ]
    optimal_test_seed_scores = test_scores[optimal_composition_name]

    optimal_composition_scores_per_model[model] = {
        "composition_name": optimal_composition_name,
        "test_macro_precision": optimal_precision_macro_averaged_scores_test[
            optimal_composition_val_f1_macro_i
        ],
        "test_macro_precision_seed_scores": _macro_per_seed(
            optimal_test_seed_scores, 0
        ),
        "test_macro_recall": optimal_recall_macro_averaged_scores_test[
            optimal_composition_val_f1_macro_i
        ],
        "test_macro_recall_seed_scores": _macro_per_seed(optimal_test_seed_scores, 1),
        "test_macro_f1": optimal_f1_macro_averaged_scores_test[
            optimal_composition_val_f1_macro_i
        ],
        "test_macro_f1_seed_scores": _macro_per_seed(optimal_test_seed_scores, 2),
    }
    optimal_scores = optimal_composition_scores_per_model[model]

    print(model, "=" * 50)
    print(
        "Optimal validation composition:",
        optimal_scores["composition_name"],
    )
    print(
        "Optimal composition Precision (macro):",
        optimal_scores["test_macro_precision"],
    )
    print(
        "Optimal composition Recall (macro):",
        optimal_scores["test_macro_recall"],
    )
    print(
        "Optimal composition F1 (macro):",
        optimal_scores["test_macro_f1"],
    )

### Component ensemble


In [None]:
ensemble_scores_per_model = {}

for model in MODELS:
    all_seed_scores = []

    for seed in RANDOM_SEED:
        # One row per test instance, one vote column per composition.
        seed_df = pd.DataFrame()

        for composition_name, comp_preds in predictions_per_composition_test_per_model[
            model
        ].items():
            if "input" not in seed_df.columns:
                seed_df["input"] = comp_preds["input"]
                seed_df["post_id"] = comp_preds["post_id"]
                seed_df["true_label"] = comp_preds["true_label"]

            # Fall back to the primary seed when this composition has no
            # predictions for the current seed.
            source_column = f"output_{seed}"
            if source_column not in comp_preds.columns:
                source_column = f"output_{RANDOM_SEED[0]}"

            seed_df = pd.merge(
                seed_df,
                comp_preds[["post_id", source_column]].rename(
                    columns={source_column: f"{composition_name}_{seed}"}
                ),
                on="post_id",
                how="left",
            )

        vote_columns = [
            c for c in seed_df.columns if c not in ["input", "post_id", "true_label"]
        ]
        mode = seed_df.loc[:, vote_columns].mode(axis=1)

        # `mode(axis=1)` only returns a second column when at least one row
        # has a tie, so guard the lookup instead of assuming it exists.
        if 1 in mode.columns:
            # If there is a tie, use a random value between 0 and 1. Draw one
            # value per row so that tied rows are broken independently rather
            # than all receiving the same label.
            random_votes = np.random.randint(2, size=len(seed_df))
            seed_df["majority"] = np.where(mode[1].isna(), mode[0], random_votes)
        else:
            seed_df["majority"] = mode[0].to_numpy()

        y_true_seed = seed_df["true_label"]
        y_pred_seed = seed_df["majority"]

        scores = precision_recall_fscore_support(
            y_true=y_true_seed, y_pred=y_pred_seed, pos_label=1
        )
        all_seed_scores.append(scores)

    # Macro average = mean over class 0 and class 1, then averaged over seeds.
    ensemble_scores_per_model[model] = {
        "test_macro_precision": np.mean(
            [
                ((seed_scores[0][0] + seed_scores[0][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
        "test_macro_recall": np.mean(
            [
                ((seed_scores[1][0] + seed_scores[1][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
        "test_macro_f1": np.mean(
            [
                ((seed_scores[2][0] + seed_scores[2][1]) / 2)
                for seed_scores in all_seed_scores
            ]
        ),
    }

    print(model, "=" * 50)
    print("Averaged scores")
    print(
        "Precision (macro) (over all seeds):",
        ensemble_scores_per_model[model]["test_macro_precision"],
    )
    print(
        "Recall (macro) (over all seeds):",
        ensemble_scores_per_model[model]["test_macro_recall"],
    )
    print(
        "F1 (macro) (over all seeds):",
        ensemble_scores_per_model[model]["test_macro_f1"],
    )

### Finetuning baselines


In [None]:
output_dir = "results"


def _macro_per_seed(seed_scores, metric_index):
    """Average the class-0 and class-1 value of one metric for every seed."""
    return [
        (scores[metric_index][0] + scores[metric_index][1]) / 2
        for scores in seed_scores
    ]


# Score the stored test predictions of the finetuned encoder, one run per seed.
all_seed_scores = []
for seed in RANDOM_SEED:
    model_name = f"deberta-v3-large-finetune_20240605105831_sbic-seed{seed}"
    predictions_file = path.join(
        output_dir, f"sbic-test_predictions-{model_name}.parquet"
    )
    seed_df = pd.read_parquet(predictions_file)

    # Attach the predictions to the gold labels via the shared md5 hash.
    seed_df = pd.merge(sbic_test_df, seed_df, on="md5_hash", how="left")

    scores = precision_recall_fscore_support(
        y_true=seed_df["true_label"],
        y_pred=seed_df[f"prediction_{model_name}"],
        pos_label=1,
    )
    all_seed_scores.append(scores)


finetune_precision_per_seed = _macro_per_seed(all_seed_scores, 0)
finetune_recall_per_seed = _macro_per_seed(all_seed_scores, 1)
finetune_f1_per_seed = _macro_per_seed(all_seed_scores, 2)

finetune_scores = {
    "test_macro_precision": np.mean(finetune_precision_per_seed),
    "test_macro_precision_seed_scores": finetune_precision_per_seed,
    "test_macro_recall": np.mean(finetune_recall_per_seed),
    "test_macro_recall_seed_scores": finetune_recall_per_seed,
    "test_macro_f1": np.mean(finetune_f1_per_seed),
    "test_macro_f1_seed_scores": finetune_f1_per_seed,
}

print("=" * 50)
print("Averaged scores")
print("Precision (macro) (over all seeds):", finetune_scores["test_macro_precision"])
print("Recall (macro) (over all seeds):", finetune_scores["test_macro_recall"])
print("F1 (macro) (over all seeds):", finetune_scores["test_macro_f1"])

### Self-diagnosis baseline


In [None]:
output_dir = "results"
model_name = "self-diagnosis"

# Stored test predictions of the self-diagnosis baseline (seed-23 outputs only).
sd_results_file = path.join(
    output_dir, model_name, "baseline_self_diagnosis_sbic_test.parquet"
)
sd_results_df = pd.read_parquet(sd_results_file)

sd_scores = precision_recall_fscore_support(
    y_true=sd_results_df["true_label"], y_pred=sd_results_df["output_23"], pos_label=1
)

# Per-class precision, recall and F1 arrays; macro average = mean of both classes.
sd_precision, sd_recall, sd_f1 = sd_scores[:3]
self_diagnosis_precision_macro_averaged_test = (sd_precision[0] + sd_precision[1]) / 2
self_diagnosis_recall_macro_averaged_test = (sd_recall[0] + sd_recall[1]) / 2
self_diagnosis_f1_macro_averaged_test = (sd_f1[0] + sd_f1[1]) / 2

print("=" * 50)
print("Averaged scores")
print(
    "Precision (macro) (over all seeds):", self_diagnosis_precision_macro_averaged_test
)
print("Recall (macro) (over all seeds):", self_diagnosis_recall_macro_averaged_test)
print("F1 (macro) (over all seeds):", self_diagnosis_f1_macro_averaged_test)

## Significance tests


In [None]:
from scipy.stats import ttest_rel

In [None]:
# For every model, pick the composition with the highest test macro F1 and keep
# its per-seed F1 scores for the significance tests.
best_on_test_scores = {}
for model in MODELS:
    f1_per_composition = all_f1_scores_test_per_model[model]
    seed_f1_per_composition = all_f1_scores_test_per_model_seed_scores[model]

    best_composition = np.argmax(f1_per_composition)
    best_on_test_scores[model] = dict(
        test_macro_f1_seed_scores=seed_f1_per_composition[best_composition]
    )

In [None]:
# Significance threshold the p-values are compared against in the test loop
alpha = 0.05
# Test used for the comparison: scipy's t-test for two related (paired) samples
ttest_function = ttest_rel

# Suffix of the `composition_prediction_scores_per_model` key ("<model>__<dataset>")
target_dataset_to_evaluate = "sbic"

# (display name, scores) pairs the approach is compared against. The test loop
# indexes the scores by model name, except for "Finetune", whose dict is used
# directly (it is not keyed by model).
baselines = [
    ("BaseComposition", no_technique_scores_per_model),
    ("BestOnVal", optimal_composition_scores_per_model),
    ("BestOnTest", best_on_test_scores),
    ("Finetune", finetune_scores),
]

In [None]:
# Paired significance test of the composition-prediction approach against every
# baseline, per model, on the per-seed macro F1 scores.
for model in MODELS:
    print(f"\n\n{model}")
    print("=" * 25)

    approach_name = "CompositionPrediction"
    approach_key = f"{model}__{target_dataset_to_evaluate}"
    approach_scores = composition_prediction_scores_per_model[approach_key][
        "test_macro_f1_seed_scores"
    ]
    approach_mean = np.mean(approach_scores)

    for baseline_name, baseline_scores_dict in baselines:
        # The fine-tuned baseline is model-independent; all others are keyed by model
        scores_holder = (
            baseline_scores_dict
            if baseline_name == "Finetune"
            else baseline_scores_dict[model]
        )
        baseline_scores = scores_holder["test_macro_f1_seed_scores"]
        baseline_mean = np.mean(baseline_scores)

        # Only test when the approach is ahead on average; this also guarantees the
        # direction needed for halving the two-sided p-value below.
        if not approach_mean > baseline_mean:
            print(
                f"Skipped {approach_name} vs. {baseline_name} ({approach_mean} vs. {baseline_mean})"
            )
            continue

        t_results = ttest_function(baseline_scores, approach_scores)
        # correct for one sided test, according to Hitchhiker's guide
        p_value = t_results[1] / 2

        verdict = (
            "is significantly better than"
            if p_value <= alpha
            else "is NOT significantly better than"
        )
        print(
            f"{approach_name} {verdict} {baseline_name} with p-value {p_value:.4f} (t-test)."
        )

## Graphs


### Scores


In [None]:
# When True, the score plot below also shows the per-model oracle score and the
# majority-label, random, and self-diagnosis baseline lines.
SHOW_ALL_BASELINES = True

In [None]:
def _mark_score(x_pos, score, tag, description, color, marker=None, halign="left"):
    """Draw one score as a tagged scatter point and print what the tag stands for.

    Args:
        x_pos: x position of the model's box (1-based box index).
        score: Macro F1 value to mark on the y axis.
        tag: Short label drawn next to the point (e.g. "A").
        description: Human-readable meaning of the tag, printed with the rounded score.
        color: Colour of both the point and the tag.
        marker: Matplotlib marker; ``None`` keeps matplotlib's default marker.
        halign: Horizontal alignment of the tag relative to the point.

    Returns:
        The scatter handle, so callers can reuse it for the legend.
    """
    marker_kwargs = {} if marker is None else {"marker": marker}
    handle = plt.scatter(
        x_pos, score, alpha=0.6, color=color, zorder=3, **marker_kwargs
    )
    plt.text(
        x_pos,
        score,
        tag,
        horizontalalignment=halign,
        verticalalignment="bottom",
        color=color,
        zorder=3,
    )
    print(f"{tag} ->", f"{description} ({np.round(score, decimals=3)})")
    return handle


def _mark_reference_line(score, description):
    """Draw a model-independent score as a dashed horizontal line with a text label."""
    plt.axhline(score, color="black", linestyle="dashed", alpha=0.4, zorder=3)
    plt.text(
        0,
        score,
        f"{description} ({np.round(score, decimals=3)})",
        horizontalalignment="left",
        verticalalignment="top",
        color="black",
        zorder=3,
    )


fig, ax = plt.subplots(figsize=(15, 15))
# Build the box data in MODELS order: the tick labels and every marker below are
# positioned via enumerate(MODELS), so relying on the dict's own key order could
# put a box under the wrong model name.
box = ax.boxplot(
    [all_f1_scores_test_per_model[model] for model in MODELS],
    labels=MODELS,
)
legend_entities_handlers_per_model = {}

for i, model in enumerate(MODELS):
    x_pos = i + 1  # boxes are drawn at positions 1..len(MODELS)
    model_f1_scores = all_f1_scores_test_per_model[model]

    print("")
    print(model)
    print("Median:", np.median(model_f1_scores))

    best_composition_i = np.argmax(model_f1_scores)
    worst_composition_i = np.argmin(model_f1_scores)

    # Best and worst composition on the test split
    best_composition_handle = _mark_score(
        x_pos,
        model_f1_scores[best_composition_i],
        "A",
        "Best-on-test",
        color="red",
        halign="right",
    )
    worst_composition_handle = _mark_score(
        x_pos,
        model_f1_scores[worst_composition_i],
        "B",
        "Worst-on-test",
        color="blue",
        halign="right",
    )

    # Base composition (no technique)
    _mark_score(
        x_pos,
        no_technique_scores_per_model[model]["test_macro_f1"],
        "C",
        "Base composition",
        color="black",
        marker="x",
    )

    # Composition selected on the validation split
    optimal_composition_name = optimal_composition_scores_per_model[model][
        "composition_name"
    ]
    optimal_composition_handle = _mark_score(
        x_pos,
        optimal_composition_scores_per_model[model]["test_macro_f1"],
        "D",
        "Best-on-validation",
        color="olive",
        marker="x",
    )

    # Naive majority ensemble
    _mark_score(
        x_pos,
        ensemble_scores_per_model[model]["test_macro_f1"],
        "E",
        "Majority ensemble",
        color="black",
        marker="x",
    )

    # Composition prediction, one marker per training dataset
    for k, training_dataset in enumerate(TRAINING_DATASETS):
        _mark_score(
            x_pos,
            composition_prediction_scores_per_model[f"{model}__{training_dataset}"][
                "test_macro_f1"
            ],
            f"F{k}",
            f"Composition prediction ({training_dataset})",
            color="green",
            marker="*",
        )

    if SHOW_ALL_BASELINES:
        _mark_score(
            x_pos,
            oracle_scores_per_model[model]["test_macro_f1"],
            "G",
            "Oracle",
            color="black",
            marker="x",
        )

    # Legend entries, always in the order: best-on-test, worst-on-test, best-on-validation.
    # NOTE(review): this assumes all_scores_test_per_model[model] lists the
    # compositions in the same order as all_f1_scores_test_per_model[model].
    composition_names = list(all_scores_test_per_model[model].keys())
    legend_entities_handlers_per_model[model] = {
        f"bot: {composition_names[best_composition_i]}": best_composition_handle,
        f"wot: {composition_names[worst_composition_i]}": worst_composition_handle,
        f"bov: {optimal_composition_name}": optimal_composition_handle,
    }


# Fine-tuned model as a horizontal reference line
_mark_reference_line(finetune_scores["test_macro_f1"], "DeBERTa-v3-large (finetuned)")

# Conditionally show outlier baselines as horizontal reference lines
if SHOW_ALL_BASELINES:
    _mark_reference_line(maj_baseline_f1_macro_averaged_test, "Majority label baseline")
    _mark_reference_line(random_baseline_f1_macro_averaged, "Random baseline")
    _mark_reference_line(
        self_diagnosis_f1_macro_averaged_test, "Self-diagnosis baseline"
    )

handlers = []
labels = []
for model in MODELS:
    # Each legend column starts with an invisible dummy handle that carries the
    # model name as a column header, followed by that model's entries.
    handlers.append(
        plt.scatter([0], [0], marker="None", linestyle="None", label=f"dummy-{model}")
    )
    handlers.extend(legend_entities_handlers_per_model[model].values())

    labels.append(model)
    labels.extend(legend_entities_handlers_per_model[model].keys())

legend = fig.legend(handlers, labels, ncol=len(MODELS), loc="outside lower center")

# Leave one unit of space on both sides of the boxes, whatever the number of models
ax.set_xlim(0, len(MODELS) + 1)
plt.title("SBIC data")
plt.xlabel("Model")
plt.ylabel("F1 (macro) (over all seeds)")
plt.savefig("outputs/figures/sbic__performance-box-plot.pdf")
plt.savefig("outputs/figures/sbic__performance-box-plot.svg")
plt.show()

### Composition frequency

_How often was each composition chosen (bar chart with box plot), and how often was each technique and each combination of techniques chosen (heatmap)?_


In [None]:
# For every "<model>__<dataset>" entry: count per seed how often each composition was
# predicted as best on the test split, then plot the mean count per composition.
composition_counts_per_seed_per_model = {}

for model in composition_predictions_test_per_model:
    # Keys look like "<model>__<dataset>"; partition keeps the whole key when no
    # "__" is present (slicing with str.find would silently drop the last character).
    model_name_without_data = model.partition("__")[0]
    known_compositions = predictions_per_composition_test_per_model[
        model_name_without_data
    ]

    counts_per_composition = {}
    composition_counts_per_seed_per_model[model] = counts_per_composition
    for seed in RANDOM_SEED:
        comp_count = Counter(
            composition_predictions_test_per_model[model][
                f"pred_best_composition_seed{seed}"
            ]
        )
        # A Counter yields 0 for compositions that were never predicted, so every
        # composition gets exactly one entry per seed.
        for composition in known_compositions:
            counts_per_composition.setdefault(composition, []).append(
                comp_count[composition]
            )

    if not counts_per_composition:
        print(f"No compositions found for {model}; skipping plot")
        continue

    # Bar lengths are the mean counts over seeds, sorted ascending (stable sort,
    # so ties keep their original order)
    sorted_data = sorted(
        (
            (np.mean(seed_counts), composition)
            for composition, seed_counts in counts_per_composition.items()
        ),
        key=lambda pair: pair[0],
    )
    values, compositions = zip(*sorted_data)

    # Bar chart positions
    x_pos = np.arange(len(compositions))

    # Plot horizontal bars
    plt.figure(figsize=(10, 20))
    bars = plt.barh(x_pos, values, align="center", alpha=0.7)

    # Add labels and title
    plt.yticks(x_pos, compositions, ha="right")
    plt.ylabel("Compositions")
    plt.xlabel("Count")
    plt.title(f"{model}: Composition counts (over five random seeds) on SBIC")

    plt.savefig(
        f"outputs/figures/sbic__{model}__composition-frequency.pdf", bbox_inches="tight"
    )
    plt.savefig(
        f"outputs/figures/sbic__{model}__composition-frequency.png", bbox_inches="tight"
    )
    # Show the plot
    plt.show()

In [None]:
# Count the number of times each composition produces a correct prediction for each split
target_dataset = "sbic"


def _correct_counts_per_seed(predictions_df, seeds, fallback_column="output_23"):
    """Return, for every seed, the number of rows whose prediction equals `true_label`.

    Args:
        predictions_df: DataFrame with a `true_label` column and `output_<seed>` columns.
        seeds: Seeds to evaluate, in the order the counts are returned.
        fallback_column: Column scored instead when `output_<seed>` does not exist.

    Returns:
        A list with one correct-prediction count per seed.

    Raises:
        KeyError: If `true_label`, or both the seed column and the fallback column,
            are missing.
    """
    counts = []
    for seed in seeds:
        try:
            predicted = predictions_df[f"output_{seed}"]
        except KeyError:
            # No column for this seed: score the fallback column instead
            predicted = predictions_df[fallback_column]
        counts.append(len(predictions_df[predictions_df["true_label"] == predicted]))
    return counts


sbic_train_correct_prediction_counts_per_seed_per_model = {}
sbic_val_correct_prediction_counts_per_seed_per_model = {}
sbic_test_correct_prediction_counts_per_seed_per_model = {}

# Train dataset (needs slightly different loading)
# We use the composition names from the test data loaded further above, but then load the
# predictions on the train data below from file
for (
    model,
    composition_predictions,
) in predictions_per_composition_test_per_model.items():
    train_counts = sbic_train_correct_prediction_counts_per_seed_per_model.setdefault(
        f"{model}__{target_dataset}", {}
    )

    for composition in composition_predictions:
        # Load composition predictions for the training dataset; chain-of-thought
        # compositions are stored under a separate file prefix without the "cot_" part
        if "cot" in composition:
            comp_no_cot = composition.replace("cot_", "")
            train_file = f"outputs/prompt-predictions/{target_dataset}/{target_dataset}-cot-greedy-train_{model}_{comp_no_cot}.parquet"
        else:
            train_file = f"outputs/prompt-predictions/{target_dataset}/{target_dataset}-greedy-train_{model}_{composition}.parquet"

        train_counts[composition] = _correct_counts_per_seed(
            pd.read_parquet(train_file), RANDOM_SEED
        )

# Val and test datasets (predictions are already in memory)
for split_predictions, split_counts in (
    (
        predictions_per_composition_val_per_model,
        sbic_val_correct_prediction_counts_per_seed_per_model,
    ),
    (
        predictions_per_composition_test_per_model,
        sbic_test_correct_prediction_counts_per_seed_per_model,
    ),
):
    for model, composition_predictions in split_predictions.items():
        model_counts = split_counts.setdefault(f"{model}__{target_dataset}", {})

        for composition, model_composition_df in composition_predictions.items():
            model_counts[composition] = _correct_counts_per_seed(
                model_composition_df, RANDOM_SEED
            )

In [None]:
# Build the composition frequency table: one row per composition, and per model one
# column with the mean (+- std) prediction frequency on test and one with the mean
# (+- std) number of correct predictions on train.
target_dataset = "sbic"
target_dataset_models = [
    m for m in composition_counts_per_seed_per_model if m.endswith(target_dataset)
]

composition_counts_mean_per_target_dataset_models = {}

for model in target_dataset_models:
    per_composition_counts = composition_counts_per_seed_per_model[model]
    for composition_name, composition_counts in per_composition_counts.items():
        # Mean and standard deviation of the composition frequency over seeds
        mean_frequency_counts = np.mean(composition_counts)
        stddev_frequency_counts = np.std(composition_counts)

        # Mean and standard deviation of the correct train predictions over seeds
        train_correct_counts = sbic_train_correct_prediction_counts_per_seed_per_model[
            model
        ][composition_name]
        mean_correct_prediction = np.mean(train_correct_counts)
        stddev_correct_prediction = np.std(train_correct_counts)

        table_row = composition_counts_mean_per_target_dataset_models.setdefault(
            composition_name, {}
        )
        table_row[f"{model}__mean_frequencies_test"] = (
            f"{mean_frequency_counts} (+- {np.round(stddev_frequency_counts, decimals=2):0.2f})"
        )
        table_row[f"{model}__mean_correct_prediction_train"] = (
            f"{mean_correct_prediction} (+- {np.round(stddev_correct_prediction, decimals=2):0.2f})"
        )


# Make composition names nicer for final table: every "_"-separated technique
# identifier is replaced by its display name.
composition_name_reformat_rules = {
    "cot": "Reasoning steps",
    "category-few-shot": "In-context (category)",
    "random-few-shot": "In-context (random)",
    "similar-few-shot": "In-context (similar)",
    "definitions": "Definitions",
    "directional-stimulus": "Dir. stimulus",
    "system-prompts": "Persona",
    "task-description-only": "Base composition",
}
counts_with_updated_composition_names = {}
for (
    composition_name,
    composition_counts,
) in composition_counts_mean_per_target_dataset_models.items():
    display_parts = (
        composition_name_reformat_rules[comp].capitalize()
        for comp in composition_name.split("_")
    )
    composition_name_reformat = ", ".join(display_parts)
    counts_with_updated_composition_names[composition_name_reformat] = (
        composition_counts
    )

composition_frequency_output_file = (
    f"outputs/tables/composition-frequencies-{target_dataset}.csv"
)
frequency_table = pd.DataFrame(data=counts_with_updated_composition_names)
frequency_table.transpose().sort_index().to_csv(composition_frequency_output_file)

In [None]:
# Heatmap of how often two techniques appear together in the predicted compositions.
# The names must match the technique identifiers used in the composition names;
# "category-few-shot" was previously misspelled here and therefore never matched.
techniques = [
    "category-few-shot",
    "cot",
    "definitions",
    "directional-stimulus",
    "random-few-shot",
    "similar-few-shot",
    "system-prompts",
    "task-description-only",
]
for model in composition_predictions_test_per_model:
    model_predictions = composition_predictions_test_per_model[model]

    # all_cooccurrences[outer][seed][inner]: number of predictions of that seed
    # containing both the outer and the inner technique (diagonal stays 0)
    all_cooccurrences = []
    for t_outer in techniques:
        t_cooccurrences = []
        for seed in RANDOM_SEED:
            seed_cooccurrences = np.zeros(len(techniques))
            for pred in model_predictions[f"pred_best_composition_seed{seed}"]:
                if t_outer not in pred:
                    continue
                for inner_index, t_inner in enumerate(techniques):
                    if t_inner != t_outer and t_inner in pred:
                        seed_cooccurrences[inner_index] += 1
            t_cooccurrences.append(seed_cooccurrences)
        all_cooccurrences.append(t_cooccurrences)

    # Average over the seed axis -> (technique, technique) matrix
    average_cooccurrences = np.mean(np.array(all_cooccurrences), axis=1)

    fig, ax = plt.subplots(figsize=(10, 10))
    im = ax.imshow(average_cooccurrences, cmap="plasma")

    # Show all ticks and label them with the respective list entries
    tick_positions = np.arange(len(techniques))
    ax.set_xticks(tick_positions, labels=techniques)
    ax.set_yticks(tick_positions, labels=techniques)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Annotate every cell with its value (imshow puts column j on x and row i on y)
    for (i, j), cell_value in np.ndenumerate(average_cooccurrences):
        ax.text(j, i, cell_value, ha="center", va="center", color="w")

    ax.set_title(
        f"{model}: Average (over all seeds) cooccurrence for predicted compositions on SBIC"
    )
    plt.savefig(f"outputs/figures/sbic__{model}__technique-cooccurrences.pdf")
    plt.show()