# TwitterAAE classification evaluation


## Loading & Prep


In [None]:
import pandas as pd
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from tabulate import tabulate

In [None]:
# Seed NumPy's global RNG so that draws from np.random in this notebook are repeatable.
np.random.seed(seed=42)

In [None]:
# Load the labelled test split and the three prediction files in one pass.
# The order of the paths must match the order of the target names.
(
    twitteraae_test,
    twitteraae_baseline,
    twitteraae_finetuned,
    twitteraae_finetuned_subsample,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "intermediate/twitter-aae/twitteraae-test-labeled-prep.csv",
        "output/twitteraae-test_predictions-baseline.csv",
        "output/twitteraae-test_predictions-deberta-v3-base-aee-classifier.csv",
        "output/twitteraae-test_predictions-deberta-v3-base-aee-classifier-interleaving.csv",
    )
)

In [None]:
# Majority baseline
majority_label = twitteraae_test["aae_dialect_label"].value_counts(sort=True).index[0]
majority_predictions = np.full(len(twitteraae_test), majority_label)

# Pseudo-random baseline
random_predictions = np.random.randint(2, size=len(twitteraae_test))

In [None]:
# Map each approach name to its predictions for the full test set.
# The two trivial baselines are NumPy arrays; the remaining entries are
# columns taken from the loaded prediction files.
predictions = dict(
    baseline_majority=majority_predictions,
    baseline_random=random_predictions,
    baseline_twitteraae=twitteraae_baseline["prediction_twitteraae_baseline"],
    approach_finetuned=twitteraae_finetuned[
        "prediction_deberta-v3-base-aee-classifier"
    ],
    approach_finetuned_subsample=twitteraae_finetuned_subsample[
        "prediction_deberta-v3-base-aee-classifier-interleaving"
    ],
)

In [None]:
# Copy of the test set with one "prediction_<approach>" column per approach,
# so that rows can later be filtered or split together with their predictions.
# assign() returns a new DataFrame and leaves twitteraae_test untouched.
predictions_df = twitteraae_test.assign(
    prediction_baseline_majority=majority_predictions,
    prediction_baseline_random=random_predictions,
    prediction_baseline_twitteraae=twitteraae_baseline[
        "prediction_twitteraae_baseline"
    ],
    prediction_approach_finetuned=twitteraae_finetuned[
        "prediction_deberta-v3-base-aee-classifier"
    ],
    prediction_approach_finetuned_subsample=twitteraae_finetuned_subsample[
        "prediction_deberta-v3-base-aee-classifier-interleaving"
    ],
)

In [None]:
# Build a class-balanced evaluation set: keep every label-1 row and draw an
# equally sized random sample (fixed seed) of label-0 rows.
# NOTE(review): the variable names look inverted. The class-distribution cell
# of this notebook reports label 1 as "AAE" and label 0 as "Non-AAE", so
# `no_aae_samples` holds the AAE rows and `aae_subsample` the non-AAE rows.
# The names are kept unchanged because other cells may refer to them.
gold_label = predictions_df["aae_dialect_label"]
no_aae_samples = predictions_df.loc[gold_label == 1]
aae_subsample = predictions_df.loc[gold_label == 0].sample(
    n=len(no_aae_samples),
    random_state=23,
)
# Rows are concatenated class after class, i.e. the result is NOT shuffled.
predictions_subsampled = pd.concat((no_aae_samples, aae_subsample))

## Dataset


In [None]:
# Report absolute and relative class frequencies of the full test set.
print("Class distribution (test set)")
full_labels = twitteraae_test["aae_dialect_label"]
full_total = len(twitteraae_test)
len_neg = int((full_labels == 0).sum())
len_pos = int((full_labels == 1).sum())
print(f"Non-AAE:\t{len_neg} ({len_neg / full_total:.2f})")
print(f"AAE:\t\t{len_pos} ({len_pos / full_total:.2f})")
print("--> Positive class is minority class")

In [None]:
# Report absolute and relative class frequencies of the subsampled test set.
print("Class distribution (sampled test set)")
subsampled_labels = predictions_subsampled["aae_dialect_label"]
subsampled_total = len(predictions_subsampled)
len_neg = int((subsampled_labels == 0).sum())
len_pos = int((subsampled_labels == 1).sum())
print(f"Non-AAE:\t{len_neg} ({len_neg / subsampled_total:.2f})")
print(f"AAE:\t\t{len_pos} ({len_pos / subsampled_total:.2f})")

## Confusion matrix


### Full test set


In [None]:
# Raw confusion-matrix counts per approach on the full test set.
# Each entry is (caption, gold label, predicted label).
confusion_cells = (
    ("True positive:\t", 1, 1),
    ("False positive:\t", 0, 1),
    ("True negative:\t", 0, 0),
    ("False negative:\t", 1, 0),
)

y_true = twitteraae_test["aae_dialect_label"]
for approach_name, prediction in predictions.items():
    print("=" * 20)
    print(approach_name)
    for caption, gold_value, predicted_value in confusion_cells:
        # Summing the boolean mask counts the samples in this cell.
        cell_mask = (y_true == gold_value) & (prediction == predicted_value)
        print(caption, np.sum(cell_mask))

In [None]:
# Plot one confusion matrix per approach for the full test set.
# normalize="true" scales every row (gold class) to sum to one.
for approach_name in predictions:
    print("=" * 20)
    print(approach_name)

    gold_column = predictions_df["aae_dialect_label"]
    predicted_column = predictions_df[f"prediction_{approach_name}"]
    ConfusionMatrixDisplay.from_predictions(
        gold_column, predicted_column, normalize="true"
    )
    plt.show()

### Subsampled test set


In [None]:
# Raw confusion-matrix counts per approach on the subsampled test set.
# Each entry is (caption, gold label, predicted label).
subsampled_confusion_cells = (
    ("True positive:\t", 1, 1),
    ("False positive:\t", 0, 1),
    ("True negative:\t", 0, 0),
    ("False negative:\t", 1, 0),
)

y_true = predictions_subsampled["aae_dialect_label"]
for approach_name in predictions:
    print("=" * 20)
    print(approach_name)
    approach_column = f"prediction_{approach_name}"
    for caption, gold_value, predicted_value in subsampled_confusion_cells:
        # Summing the boolean mask counts the samples in this cell.
        cell_mask = (y_true == gold_value) & (
            predictions_subsampled[approach_column] == predicted_value
        )
        print(caption, np.sum(cell_mask))

In [None]:
# Plot one confusion matrix per approach for the subsampled test set.
# normalize="true" scales every row (gold class) to sum to one.
for approach_name in predictions:
    print("=" * 20)
    print(approach_name)

    gold_column = predictions_subsampled["aae_dialect_label"]
    predicted_column = predictions_subsampled[f"prediction_{approach_name}"]
    ConfusionMatrixDisplay.from_predictions(
        gold_column, predicted_column, normalize="true"
    )
    plt.show()

## Scores


### Full test set


In [None]:
# Score tables for the full test set: one table per positive label (0 and 1)
# and one with macro-averaged scores. Every row is
# [approach name, accuracy, precision, recall, F1].
# NOTE(review): each header list has one entry fewer than the rows; this
# presumably relies on tabulate assigning headers to the last columns so the
# approach-name column stays untitled -- confirm against the tabulate docs.
table_0_headers = ["Accuracy (0)", "Precision (0)", "Recall (0)", "F1 (0)"]
table_1_headers = ["Accuracy (1)", "Precision (1)", "Recall (1)", "F1 (1)"]
table_macro_headers = ["Accuracy", "Precision (macro)", "Recall (macro)", "F1 (macro)"]
table_0, table_1, table_macro = [], [], []

gold_labels = twitteraae_test["aae_dialect_label"]

for approach_name, prediction in predictions.items():
    # Accuracy does not depend on which label is treated as positive, so the
    # same value is shown in both per-class tables.
    overall_accuracy = accuracy_score(y_true=gold_labels, y_pred=prediction)

    for class_label, class_table in ((0, table_0), (1, table_1)):
        *class_scores, _class_support = precision_recall_fscore_support(
            y_true=gold_labels,
            y_pred=prediction,
            pos_label=class_label,
            average="binary",
        )
        class_table.append(
            [approach_name]
            + [
                np.round(score, decimals=3)
                for score in (overall_accuracy, *class_scores)
            ]
        )

    *macro_scores, _macro_support = precision_recall_fscore_support(
        y_true=gold_labels,
        y_pred=prediction,
        average="macro",
    )
    # The accuracy column of the macro table is a placeholder.
    table_macro.append(
        [approach_name, "-"]
        + [np.round(score, decimals=3) for score in macro_scores]
    )

for table_title, table_rows, table_headers in (
    ("Negative class binary scores", table_0, table_0_headers),
    ("Positive class binary scores", table_1, table_1_headers),
    ("Macro averaged scores", table_macro, table_macro_headers),
):
    print("=" * 20)
    print(table_title)
    print(tabulate(table_rows, headers=table_headers, tablefmt="rounded_grid"))

### Subsampled test set


In [None]:
# Score tables for the subsampled test set: one table per positive label
# (0 and 1) and one with macro-averaged scores. Every row is
# [approach name, accuracy, precision, recall, F1].
# NOTE(review): each header list has one entry fewer than the rows; this
# presumably relies on tabulate assigning headers to the last columns so the
# approach-name column stays untitled -- confirm against the tabulate docs.
table_0_headers = ["Accuracy (0)", "Precision (0)", "Recall (0)", "F1 (0)"]
table_1_headers = ["Accuracy (1)", "Precision (1)", "Recall (1)", "F1 (1)"]
table_macro_headers = ["Accuracy", "Precision (macro)", "Recall (macro)", "F1 (macro)"]
table_0, table_1, table_macro = [], [], []

y_true = predictions_subsampled["aae_dialect_label"]

for approach_name in predictions:
    prediction = predictions_subsampled[f"prediction_{approach_name}"]

    # Accuracy does not depend on which label is treated as positive, so the
    # same value is shown in both per-class tables.
    overall_accuracy = accuracy_score(y_true=y_true, y_pred=prediction)

    for class_label, class_table in ((0, table_0), (1, table_1)):
        *class_scores, _class_support = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=prediction,
            pos_label=class_label,
            average="binary",
        )
        class_table.append(
            [approach_name]
            + [
                np.round(score, decimals=3)
                for score in (overall_accuracy, *class_scores)
            ]
        )

    *macro_scores, _macro_support = precision_recall_fscore_support(
        y_true=y_true,
        y_pred=prediction,
        average="macro",
    )
    # The accuracy column of the macro table is a placeholder.
    table_macro.append(
        [approach_name, "-"]
        + [np.round(score, decimals=3) for score in macro_scores]
    )

for table_title, table_rows, table_headers in (
    ("Negative class binary scores", table_0, table_0_headers),
    ("Positive class binary scores", table_1, table_1_headers),
    ("Macro averaged scores", table_macro, table_macro_headers),
):
    print("=" * 20)
    print(table_title)
    print(tabulate(table_rows, headers=table_headers, tablefmt="rounded_grid"))

## Significance tests


### $t$-test


In [None]:
from scipy.stats import shapiro, ttest_ind, wilcoxon

In [None]:
# alpha:    significance level shared by the normality check and the
#           significance tests below.
# n_splits: number of chunks the test data is divided into; each chunk
#           contributes one score per approach to the tests.
alpha, n_splits = 0.05, 10

#### For full test set


In [None]:
# Divide the full test set (with predictions) into n_splits contiguous,
# near-equal chunks. The split is done on integer row positions and applied
# with .iloc because calling np.array_split directly on a DataFrame goes
# through DataFrame.swapaxes, which recent pandas versions deprecate.
# The resulting chunks contain the same rows in the same order.
# NOTE(review): rows are not shuffled before splitting; if the test file is
# ordered (e.g. by label) the per-chunk scores will be skewed -- confirm.
full_split_positions = np.array_split(np.arange(len(predictions_df)), n_splits)
twitteraae_test_splits = [
    predictions_df.iloc[positions] for positions in full_split_positions
]

In [None]:
# Significance tests on the full test set: for every score (precision, recall,
# F1) and every averaging mode, check whether each approach scores
# significantly higher than the baseline across the test-set chunks.

# Imported here because the paired t-test is not part of the notebook's
# existing scipy import cell.
from scipy.stats import ttest_rel

approaches = ["approach_finetuned", "approach_finetuned_subsample"]
baselines = ["baseline_twitteraae"]

# Keys index into the tuple returned by precision_recall_fscore_support.
score_names = {0: "Precision", 1: "Recall", 2: "F1 score"}
# -1 is a sentinel meaning "macro average" instead of a concrete positive label.
label_names = {0: "Negative label", 1: "Positive label", -1: "Macro averaged"}

for score_index, score_name in score_names.items():
    print("\n\n\n")
    print(score_name)

    for pos_label, label_name in label_names.items():
        print(label_name)

        if pos_label == -1:
            averaging = {"average": "macro"}
        else:
            averaging = {"average": "binary", "pos_label": pos_label}

        # One score per chunk for every approach and baseline.
        approach_prediction_splits = {}
        for approach in [*approaches, *baselines]:
            approach_prediction_splits[approach] = [
                precision_recall_fscore_support(
                    y_pred=split[f"prediction_{approach}"],
                    y_true=split["aae_dialect_label"],
                    **averaging,
                )[score_index]
                for split in twitteraae_test_splits
            ]

        approach_pairs = [(x, y) for x in approaches for y in baselines]
        # Outcome per pair; also holds the non-significant results, which are
        # not printed.
        results = {}
        for approach_A, approach_B in approach_pairs:
            scores_A = approach_prediction_splits[approach_A]
            scores_B = approach_prediction_splits[approach_B]
            pair_key = f"{approach_A}___{approach_B}"

            # Only the hypothesis "A is better than B" is of interest.
            if not np.mean(scores_A) > np.mean(scores_B):
                continue

            # =======================================
            # Check for normality of the paired per-chunk differences.
            score_differences = [a - b for a, b in zip(scores_A, scores_B)]

            # Shapiro-Wilk p-value: if it is > alpha, the null hypothesis that
            # the differences are normally distributed can NOT be rejected.
            normality = shapiro(score_differences)[1]

            # =======================================
            # Test for significance, with the test depending on normality.
            if normality > alpha:
                # Both score lists come from the same chunks, so the samples
                # are paired: use the paired t-test, which matches the
                # normality check on the differences above.
                t_results = ttest_rel(scores_A, scores_B)
                # Halve the two-sided p-value to get the one-sided one. This
                # is valid here because mean(A) > mean(B) was checked above.
                p_value = t_results[1] / 2

                if p_value <= alpha:
                    print(
                        f"{approach_A} is significantly better than {approach_B} with p-value {p_value:.4f} (t-test)"
                    )
                    results[pair_key] = f"Significant with {p_value:.4f} (t-test)"
                else:
                    results[pair_key] = (
                        f"Not significant with {p_value:.4f} (t-test)"
                    )
            else:
                # Normality was rejected: fall back to the Wilcoxon signed-rank
                # test, which is non-parametric and thus makes no normality
                # assumption. Its p-value is left two-sided (conservative).
                w_results = wilcoxon(scores_A, scores_B)
                p_value = w_results[1]

                if p_value <= alpha:
                    print(
                        f"{approach_A} is significantly better than {approach_B} with p-value {p_value:.4f} (wilcoxon)."
                    )
                    results[pair_key] = (
                        f"Significant with {p_value:.4f} (wilcoxon)"
                    )
                else:
                    results[pair_key] = (
                        f"Not significant with {p_value:.4f} (wilcoxon)"
                    )
        print("-" * 50)

#### For subsampled test set


In [None]:
# predictions_subsampled is built by concatenating one class after the other,
# so it has to be shuffled before it is cut into contiguous chunks; otherwise
# most chunks would contain a single class only.
# A fixed random_state makes the shuffle (and therefore the significance
# results) reproducible regardless of which cells ran before this one.
shuffled_predictions_subsampled = predictions_subsampled.sample(
    frac=1, random_state=23
)
# Split on integer row positions and apply them with .iloc: calling
# np.array_split directly on a DataFrame goes through DataFrame.swapaxes,
# which recent pandas versions deprecate. Chunk sizes are unchanged.
subsampled_split_positions = np.array_split(
    np.arange(len(shuffled_predictions_subsampled)), n_splits
)
twitteraae_test_subsampled_splits = [
    shuffled_predictions_subsampled.iloc[positions]
    for positions in subsampled_split_positions
]

In [None]:
# Significance tests on the subsampled test set: for every score (precision,
# recall, F1) and every averaging mode, check whether each approach scores
# significantly higher than the baseline across the test-set chunks.

# Imported here because the paired t-test is not part of the notebook's
# existing scipy import cell.
from scipy.stats import ttest_rel

approaches = ["approach_finetuned", "approach_finetuned_subsample"]
baselines = ["baseline_twitteraae"]

# Keys index into the tuple returned by precision_recall_fscore_support.
score_names = {0: "Precision", 1: "Recall", 2: "F1 score"}
# -1 is a sentinel meaning "macro average" instead of a concrete positive label.
label_names = {0: "Negative label", 1: "Positive label", -1: "Macro averaged"}

for score_index, score_name in score_names.items():
    print("\n\n\n")
    print(score_name)

    for pos_label, label_name in label_names.items():
        print(label_name)

        if pos_label == -1:
            averaging = {"average": "macro"}
        else:
            averaging = {"average": "binary", "pos_label": pos_label}

        # One score per chunk for every approach and baseline.
        approach_prediction_splits = {}
        for approach in [*approaches, *baselines]:
            approach_prediction_splits[approach] = [
                precision_recall_fscore_support(
                    y_pred=split[f"prediction_{approach}"],
                    y_true=split["aae_dialect_label"],
                    **averaging,
                )[score_index]
                for split in twitteraae_test_subsampled_splits
            ]

        approach_pairs = [(x, y) for x in approaches for y in baselines]
        # Outcome per pair; also holds the non-significant results, which are
        # not printed.
        results = {}
        for approach_A, approach_B in approach_pairs:
            scores_A = approach_prediction_splits[approach_A]
            scores_B = approach_prediction_splits[approach_B]
            pair_key = f"{approach_A}___{approach_B}"

            # Only the hypothesis "A is better than B" is of interest.
            if not np.mean(scores_A) > np.mean(scores_B):
                continue

            # =======================================
            # Check for normality of the paired per-chunk differences.
            score_differences = [a - b for a, b in zip(scores_A, scores_B)]

            # Shapiro-Wilk p-value: if it is > alpha, the null hypothesis that
            # the differences are normally distributed can NOT be rejected.
            normality = shapiro(score_differences)[1]

            # =======================================
            # Test for significance, with the test depending on normality.
            if normality > alpha:
                # Both score lists come from the same chunks, so the samples
                # are paired: use the paired t-test, which matches the
                # normality check on the differences above.
                t_results = ttest_rel(scores_A, scores_B)
                # Halve the two-sided p-value to get the one-sided one. This
                # is valid here because mean(A) > mean(B) was checked above.
                p_value = t_results[1] / 2

                if p_value <= alpha:
                    print(
                        f"{approach_A} is significantly better than {approach_B} with p-value {p_value:.4f} (t-test)"
                    )
                    results[pair_key] = f"Significant with {p_value:.4f} (t-test)"
                else:
                    results[pair_key] = (
                        f"Not significant with {p_value:.4f} (t-test)"
                    )
            else:
                # Normality was rejected: fall back to the Wilcoxon signed-rank
                # test, which is non-parametric and thus makes no normality
                # assumption. Its p-value is left two-sided (conservative).
                w_results = wilcoxon(scores_A, scores_B)
                p_value = w_results[1]

                if p_value <= alpha:
                    print(
                        f"{approach_A} is significantly better than {approach_B} with p-value {p_value:.4f} (wilcoxon)."
                    )
                    results[pair_key] = (
                        f"Significant with {p_value:.4f} (wilcoxon)"
                    )
                else:
                    results[pair_key] = (
                        f"Not significant with {p_value:.4f} (wilcoxon)"
                    )
        print("-" * 50)