# Plots for the poster

In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys

# Make the modules under ../src importable (relative to the notebook's working directory).
sys.path.append("../src")

In [None]:
import ast
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from utils import setup_logging
from utils_ext.ml import load_wandb_history, load_wandb_summary
from utils_ext.plot import Plotter
from utils_ext.tools import parse_logs
from cache import CACHE

# Disable matplotlib's interactive mode; figures are handled through Plotter below.
plt.ioff()
setup_logging()

# Input data, exported results, and on-disk cache locations (relative to the notebook).
PATH_DATA = Path("../data")
PATH_OUTPUT = Path("../output/results")
CACHE.init(cache_dir="../output/cache")

# --- Plotter setup ---------------------------------------------------------

# Font sizes shared by all plots in this notebook.
FONTSIZE_SMALL = 6
FONTSIZE_DEFAULT = 8
FONTSIZE_LARGE = 10

Plotter.setup(
    css_patches=["overflow_auto", "gray_background"]
)
# Base configuration: figure width, font size, matplotlib rc overrides, and where /
# in which format figures are saved.
Plotter.configure(
    basewidth=3.25,
    fontsize=FONTSIZE_DEFAULT,
    latex=False,
    rcparams={
        "lines.linewidth": 1,  # matplotlib default: 1.5
        "axes.labelpad": 2,  # matplotlib default: 4
    },
    save_dir=PATH_OUTPUT / "plots_poster",
    save_format="pdf",
)
# Second call switches LaTeX rendering on and supplies the preamble.
# NOTE(review): presumably this overrides `latex=False` from the call above, which
# would make that argument redundant — confirm against Plotter.configure.
Plotter.configure(
    latex=True,
    latex_preamble="\n".join(
        [
            r"\usepackage[utf8]{inputenc}",
            r"\usepackage[T1]{fontenc}",
            r"\usepackage{microtype}",
            r"\usepackage{lmodern}",  # for 8-bit Latin Modern font
            r"\usepackage[sc]{mathpazo}",  # for Palatino font
            r"\usepackage{amsmath,amssymb,amsfonts,mathrsfs}",
        ]
    ),
)

In [None]:
# Short-name mapping, currently disabled and not referenced by any visible code.
# NAME_MAPPING = {
#     "cardiffnlp/twitter-roberta-base-sentiment-latest": "roberta-b-cardiffnlp",
#     "tabularisai/multilingual-sentiment-analysis": "distilbert-b-tabularisai",
#     "siebert/sentiment-roberta-large-english": "roberta-l-siebert",
#     "FacebookAI/roberta-large": "roberta-l",
# }

# Human-readable display names for the model identifiers used throughout this
# notebook (used to relabel table rows).
DESCRIPTION_MAPPING = {
    "cardiffnlp/twitter-roberta-base-sentiment-latest": "RoBERTa-B (CardiffNLP)",
    "tabularisai/multilingual-sentiment-analysis": "DistilBERT-B (tabularisai)",
    "siebert/sentiment-roberta-large-english": "RoBERTa-L (SiEBERT)",
    "FacebookAI/roberta-large": "RoBERTa-L",
}

def annotate_ours(ax, x, y=0.25):
    """Mark data position ``x`` with a dashed gray vertical line labelled "ours".

    The label is placed in axes-fraction coordinates, slightly to the left of the
    line. The data-to-axes conversion is evaluated when this function is called,
    so call it after the axis limits are final.

    Args:
        ax: Matplotlib axes to draw on.
        x: Position of the line in data coordinates.
        y: Vertical position of the label in axes-fraction coordinates.
    """
    ax.axvline(x=x, color="tab:gray", linestyle="--", alpha=0.5)

    # data coordinates -> display coordinates -> axes-fraction coordinates
    display_to_axes = ax.transAxes.inverted()
    line_in_display = ax.transData.transform((x, 0))
    line_in_axes = display_to_axes.transform(line_in_display)[0]

    label_style = {
        "fontsize": FONTSIZE_SMALL,
        "color": "tab:gray",
        "rotation": 90,
        "transform": ax.transAxes,
    }
    # shift the label left of the line by 0.04 of the axes width
    ax.text(line_in_axes - 0.04, y, "ours", **label_style)

## Main plots

In [None]:
# NOTE(review): presumably makes every plot group below save its figures without
# needing `save=True` — inferred from the option name; confirm against Plotter.
Plotter.configure(save_always=True)

In [None]:
def plot_finetuned_classifier_class_weights(plot_group):
    """Draw one bar chart per split (train / validation / test) of the score per
    class-weighting scheme, and print the same numbers as a LaTeX table.

    Args:
        plot_group: Plot group (as yielded by ``Plotter.group``) that receives the
            three figures.

    Raises:
        ValueError: If a run uses a class-weight setting that has no label here.
    """

    def format_class_weights(x):
        # translate the raw config value into the label shown on the x-axis
        if pd.isna(x):
            return "none"
        if x == "auto":
            return "balanced"
        if x == "[1.553279,1.038669,1.096438]":
            return "adjusted"
        raise ValueError(f"Unknown class weights: {x}")

    # run summaries exported from wandb
    scores = load_wandb_summary(
        PATH_OUTPUT / "data" / "finetuned_classifier_class_weights.csv",
        {
            "train_score": "train_score",
            "val_score": "val_score",
            "pipeline.trainer.class_weights": "class_weights",
        },
    )
    scores["class_weights"] = scores["class_weights"].apply(format_class_weights)

    # test scores are entered by hand, keyed by run name (aligned on the frame's index)
    scores["test_score"] = pd.Series({
        "20250522-231950-polite-boar-341": 0.85694,
        "20250522-232111-sneaky-sponge-426": 0.85906,
        "20250522-232140-agreeable-auk-109": 0.85491,
    })

    # index by scheme label, in presentation order
    scores = scores.set_index("class_weights").reindex(["none", "balanced", "adjusted"])

    # (column, bar color, y-label, y-limits, y-ticks, plot name) — one figure each
    panels = [
        (
            "train_score", "tab:blue", "train score",
            (0.905, 0.955), [0.91, 0.93, 0.95],
            "finetuned_classifier-class_weights-train",
        ),
        (
            "val_score", "tab:orange", "validation score",
            (0.845, 0.895), [0.85, 0.87, 0.89],
            "finetuned_classifier-class_weights-val",
        ),
        (
            "test_score", "tab:green", "(public) test score",
            (0.845, 0.895), [0.85, 0.87, 0.89],
            "finetuned_classifier-class_weights-test",
        ),
    ]
    for column, color, ylabel, ylim, yticks, plot_name in panels:
        fig, ax = Plotter.create()
        scores[column].plot.bar(
            ax=ax,
            color=color,
            xlabel="class weights",
            ylabel=ylabel,
            ylim=ylim,
            yticks=yticks,
        )
        ax.set_xticklabels(scores.index, rotation=35, ha="right")
        plot_group.add_plot(fig, plot_name)

    latex_table = scores.reset_index().to_latex(
        index=False,
        header=["class weights", 'train score', 'val score', '(public) test score'],
        float_format="%.3f",
    )
    print(latex_table)


# Class-weight comparison: three bar charts in a three-column grid.
with Plotter.group(
    figwidth=0.75,
    grid_ncols=3,
    consistent_size=True,
    save_kw={"transparent": True},
    # save=True,
) as plot_group:
    plot_finetuned_classifier_class_weights(plot_group)

In [None]:
def plot_finetuned_classifier_loss(plot_group):
    """Plot train and validation loss over epochs, one figure per model.

    For every model the histories of its runs are concatenated in the listed order.
    Each figure shows an exponentially smoothed train loss, the validation loss,
    and the raw train loss as a faint background line. The figure is added to
    ``plot_group`` as ``finetuned_classifier-loss-<model id with / and - replaced by _>``.

    Args:
        plot_group: Plot group (as yielded by ``Plotter.group``) that receives one
            figure per model, in the order of the model table below.
    """
    # load data
    history = load_wandb_history(
        PATH_OUTPUT / "data" / "finetuned_classifier_loss.csv",
        {
            "train/epoch": "epoch",
            "train/loss": "train_loss",
            "eval/loss": "val_loss",
        },
    )

    # model id -> run names whose histories together make up that model's training
    runs_by_model = {
        "cardiffnlp/twitter-roberta-base-sentiment-latest": [
            "20250523-172805-illustrious-horse-512",
        ],
        "tabularisai/multilingual-sentiment-analysis": [
            "20250527-203545-peaceful-wren-73",
            "20250528-085739-calm-shrike-582",
        ],
        "siebert/sentiment-roberta-large-english": [
            "20250528-084638-polite-sloth-517",
            "20250529-011210-bouncy-cat-232",
            "20250529-171132-inquisitive-stoat-571",
        ],
        "FacebookAI/roberta-large": [
            "20250528-203625-enthused-kite-698",
            "20250528-235716-wistful-boar-764",
            "20250528-235901-legendary-crab-346",
        ],
    }
    # pd.concat always returns a new frame, so adding the smoothed column below
    # never writes into the loaded history (previously the single-run model was
    # modified in place / via a slice of the loaded data).
    df_by_model = {
        name: pd.concat([history[run_name] for run_name in run_names])
        for name, run_names in runs_by_model.items()
    }

    for name, df_model in df_by_model.items():
        # smooth train loss
        df_model = df_model.assign(
            train_loss_ema=df_model["train_loss"].ewm(alpha=0.05).mean()
        )

        # plot — the draw order matters: the legend below labels the first two lines
        fig, ax = Plotter.create()
        df_model[["epoch", "train_loss_ema"]].dropna().plot.line(
            ax=ax,
            x="epoch",
            y="train_loss_ema",
            color="tab:blue",
            ylabel="loss",
        )
        df_model[["epoch", "val_loss"]].dropna().plot.line(
            ax=ax,
            x="epoch",
            y="val_loss",
            color="tab:orange",
        )
        # raw (unsmoothed) train loss as a faint background line, left out of the legend
        df_model[["epoch", "train_loss"]].dropna().plot.line(
            ax=ax,
            x="epoch",
            y="train_loss",
            color="tab:blue",
            alpha=0.25,
            ylabel="loss",
        )
        ax.legend(["train loss", "validation loss"], loc="lower left", fontsize=FONTSIZE_SMALL)
        annotate_ours(ax, x=2, y=0.05)
        plot_group.add_plot(fig, f"finetuned_classifier-loss-{name.replace('/', '_').replace('-', '_')}")


# Loss curves: one figure per model, arranged in a two-column grid.
with Plotter.group(
    figwidth=0.8,
    grid_ncols=2,
    consistent_size=True,
    save_kw={"transparent": True},
    # save=True,
) as plot_group:
    plot_finetuned_classifier_loss(plot_group)

In [None]:
def plot_finetuned_classifier_freeze(plot_group):
    """Plot scores and training time against how much of the model is frozen.

    Train and validation score go on the left axis, training time in hours on a
    twin right axis; the x-axis lists the last frozen module of each run. The
    figure is added to ``plot_group`` as ``finetuned_classifier-freeze``.

    Args:
        plot_group: Plot group (as yielded by ``Plotter.group``) that receives the figure.
    """

    def format_layer(x):
        """Turn the stringified list of frozen modules into an x-tick label."""
        if pd.isna(x):
            # no freeze setting recorded for this run
            return "full"
        # The value comes from a CSV file: parse it as a literal instead of
        # executing it with eval().
        last_layer = ast.literal_eval(x)[-1]
        if last_layer.endswith("embeddings"):
            return "embeddings"
        return "layer " + last_layer.split(".")[-1]

    # load data
    df = load_wandb_summary(
        PATH_OUTPUT / "data" / "finetuned_classifier_freeze.csv",
        {
            "train_score": "train_score",
            "val_score": "val_score",
            "pipeline.freeze": "freeze",
            "Runtime": "train_time",
        },
    )
    df["freeze"] = df["freeze"].apply(format_layer)
    df["train_time"] = df["train_time"] / 3600  # plotted as hours ("time (h)")

    # parse number of trainable parameters from log files
    # NOTE(review): this column is attached to the frame but not used in the figure below.
    logs = parse_logs(
        "../output/experiments_freeze/*/job-*.log",
        patterns={
            "name": (r"Job name: (.+)", None),
            "n_trainable_params": (r"Number of trainable parameters: ([\d,]+)", lambda x: int(x.replace(",", ""))),
        },
    )
    logs = pd.DataFrame(logs)
    logs = logs.set_index("name")
    # aligned on the index; assumes job names match the summary's row labels — TODO confirm
    df["n_trainable_params"] = logs["n_trainable_params"]

    # update dataframe index and order (rows are reversed relative to the loaded order)
    df = df.set_index("freeze")
    df = df.loc[::-1]

    # plot
    fig, ax = Plotter.create()
    ax2 = ax.twinx()
    df.plot.line(
        ax=ax,
        y=["train_score", "val_score"],
        marker="o",
        markersize=3,
        color={"train_score": "tab:blue", "val_score": "tab:orange"},
        ylabel="score",
    )
    Plotter.set(
        ax,
        legend=dict(loc="upper left", labels=["train score", "validation score"], fontsize=FONTSIZE_SMALL),
    )
    df.plot.line(
        ax=ax2,
        y="train_time",
        marker="o",
        markersize=3,
        linestyle="--",
        color="tab:gray",
        ylabel="time (h)",
    )
    ax.set_xlabel("parameters frozen up to")
    Plotter.set(
        ax2,
        legend=dict(loc="lower right", labels=["train time"], fontsize=FONTSIZE_SMALL),
    )
    # label every second row to keep the tick labels readable
    ax.set_xticks(ticks=range(len(df.index))[1::2], labels=df.index[1::2], rotation=35, ha="right")
    # NOTE(review): green marker at row position 7 — its meaning is not documented here.
    ax.axvline(x=7, color="tab:green", linestyle="--", alpha=0.5)
    annotate_ours(ax, x=13)
    plot_group.add_plot(fig, "finetuned_classifier-freeze")


# Combined group: the freeze plot next to a single loss plot.
with Plotter.group(
    figwidth=0.85,
    grid_ncols=2,
    consistent_size=True,
    save_kw=dict(transparent=True),
    # save=True,
) as plot_group:
    plot_finetuned_classifier_freeze(plot_group)
    # The loss function adds one plot per model (four in total). Dropping the last
    # three keeps only the first one added (cardiffnlp/twitter-roberta-base-sentiment-latest).
    # NOTE(review): the dropped figures are not closed here, and whether they were
    # already saved when added depends on Plotter — confirm.
    plot_finetuned_classifier_loss(plot_group)
    del plot_group.plots[-3:]
    # Entries are rebuilt as (first element, name) tuples: keep the first element
    # and replace the per-model name with a generic one.
    plot_group.plots[-1] = (plot_group.plots[-1][0], "finetuned_classifier-loss")

In [None]:
def plot_finetuned_classifier_post_on_hard(plot_group):
    """Plot scores and validation loss against the difficulty-filter fraction ``p``.

    Train and validation score go on the left axis, validation loss on a twin
    right axis. Runs without a filter value are shown as "none". The figure is
    added to ``plot_group`` as ``finetuned_classifier-post_on_hard``.

    Args:
        plot_group: Plot group (as yielded by ``Plotter.group``) that receives the figure.
    """
    interrupted_run = "20250529-164632-charming-stoat-395"
    p_order = ["none", 0.5, 0.4, 0.3, 0.2, 0.1]

    summary = load_wandb_summary(
        PATH_OUTPUT / "data" / "finetuned_classifier_post_on_hard.csv",
        {
            "train_score": "train_score",
            "val_score": "val_score",
            "eval/loss": "eval_loss",
            "pipeline.preprocessing.difficulty_filter.p": "p",
            "train_runtime": "train_time",
            "_wandb.runtime": "wandb_time",
        },
    )

    # runs without a difficulty filter get the label "none"
    summary["p"] = summary["p"].apply(lambda p: "none" if pd.isna(p) else p)
    # this run was interrupted: derive its train time from the wandb runtime minus 285
    summary.loc[interrupted_run, "train_time"] = (
        summary.loc[interrupted_run, "wandb_time"] - 285
    )
    # convert to hours
    summary["train_time"] = summary["train_time"] / 3600
    summary = summary.set_index("p").loc[p_order]

    fig, score_ax = Plotter.create()
    loss_ax = score_ax.twinx()

    # left axis: train / validation score
    summary.plot.line(
        ax=score_ax,
        y=["train_score", "val_score"],
        marker="o",
        markersize=4,
        color={"train_score": "tab:blue", "val_score": "tab:orange"},
        ylabel="score",
    )
    score_legend = dict(
        loc="upper left",
        labels=["train score", "validation score"],
        fontsize=FONTSIZE_SMALL,
    )
    Plotter.set(score_ax, legend=score_legend)

    # right axis: validation loss
    summary.plot.line(
        ax=loss_ax,
        y="eval_loss",
        marker="o",
        markersize=4,
        linestyle="--",
        color="tab:orange",
        ylabel="loss",
    )
    loss_legend = dict(
        loc="lower right",
        labels=["validation loss"],
        fontsize=FONTSIZE_SMALL,
    )
    Plotter.set(loss_ax, legend=loss_legend)

    score_ax.set_xlabel("fraction of hard samples")
    annotate_ours(score_ax, x=3.5)
    plot_group.add_plot(fig, "finetuned_classifier-post_on_hard")


# Scores and validation loss versus the fraction of hard samples.
with Plotter.group(
    figwidth=1,
    grid_ncols=2,
    consistent_size=True,
    save_kw={"transparent": True},
    # save=True,
) as plot_group:
    plot_finetuned_classifier_post_on_hard(plot_group)

In [None]:
from utils import evaluate_score, load_data
from cache import load_embeddings
from pipelines.pretrained_classifier import map_to_labels


def plot_model_voting(plot_group):
    """Plot the distribution of per-sample difficulty scores from model voting.

    Every model's predicted label for a training sample counts as one vote. A
    vote scores 1 if it matches the true label, 0.5 if it is one class away
    (on the negative < neutral < positive scale) and 0 if it is two classes away.
    The sample's score is the mean over all votes. The bar chart of how often
    each score occurs is added to ``plot_group`` as
    ``finetuned_classifier-difficulty_scores``.

    Args:
        plot_group: Plot group (as yielded by ``Plotter.group``) that receives the figure.
    """
    model_names = [
        "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "cardiffnlp/twitter-xlm-roberta-base-sentiment",
        "nlptown/bert-base-multilingual-uncased-sentiment",
        "siebert/sentiment-roberta-large-english",
        "tabularisai/multilingual-sentiment-analysis",
    ]

    def load_predictions(model_name):
        """Load a model's cached predictions on the training set."""
        return load_embeddings("huggingface", model_name, "predictions_train.csv", load_kwargs={"index_col": 0}, verbose=False)

    def load_labels(model_name, label_mapping=None):
        """Return a model's predicted labels, optionally remapped through ``label_mapping``."""
        predictions = load_predictions(model_name)
        labels = map_to_labels(predictions, model_name)
        if label_mapping is not None:
            labels = labels.map(label_mapping)
        return labels

    def load_labels_all(model_names, label_mapping=None):
        """Return a frame with one column of predicted labels per model."""
        label_pred = {}
        for model_name in model_names:
            label_pred[model_name] = load_labels(model_name, label_mapping)
        return pd.DataFrame(label_pred)

    # ordinal encoding used to measure how far a vote is from the true label
    label_mapping = {
        "negative": -1,
        "neutral": 0,
        "positive": 1,
    }
    train_dataset = load_data(PATH_DATA / "training.csv")

    # load predictions
    sentences = train_dataset["sentence"]
    label_true = train_dataset["label"]
    label_pred = load_labels_all(model_names)

    # count number of votes for each sample (rows: samples, columns: labels)
    model_voting = label_pred.apply(pd.Series.value_counts, axis=1).fillna(0).astype(int)

    # compute score weighted by votes for each sample
    # scores_individual[i, j]: score a vote for label column j earns on sample i
    scores_individual = 0.5 * (2 - np.abs(model_voting.columns.map(label_mapping).values - label_true.map(label_mapping).values[:, None]))
    scores = (model_voting * scores_individual).sum(axis=1) / model_voting.sum(axis=1)

    # finalize dataframe
    model_voting["label"] = label_true
    model_voting["score"] = scores
    model_voting["sentence"] = sentences

    # plot distribution of weighted scores
    fig, ax = Plotter.create()
    # `ax` must be passed by keyword: the first positional parameter of
    # Series.plot.bar is `x`, which a Series plot ignores, so a positional `ax`
    # made pandas fall back to whatever pyplot axes happened to be current.
    model_voting["score"].value_counts().sort_index().plot.bar(ax=ax)
    ax.set_xlabel("difficulty score")
    ax.set_ylabel("count")
    plot_group.add_plot(fig, "finetuned_classifier-difficulty_scores")

    # compute scores per model per score
    # NOTE(review): the result is neither plotted nor returned — either use it or
    # remove this loop.
    scores_per_bin = {}
    for score in np.sort(model_voting["score"].unique()):
        in_bin = model_voting["score"] == score
        label_true_bin = label_true[in_bin]
        label_pred_bin = label_pred[in_bin]
        scores_per_bin[score] = [evaluate_score(label_true_bin, label_pred_bin[model_name]) for model_name in model_names]

# Distribution of difficulty scores (single figure).
with Plotter.group(
    figwidth=0.6,
    grid_ncols=1,
    consistent_size=True,
    save_kw={"transparent": True},
    # save=True,
) as plot_group:
    plot_model_voting(plot_group)

In [None]:
def plot_finetuned_classifier_final():
    """Display final validation scores per model and print a LaTeX summary table.

    The displayed frame has one row per model and one column per combination of
    preprocessing (without/with) and post-tuning on hard samples (without/with).
    The printed LaTeX table lists, per model, the hard-coded pretrained score
    followed by the finetuned, post-tuned and preprocessing variants.
    """
    model_order = [
        "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "tabularisai/multilingual-sentiment-analysis",
        "siebert/sentiment-roberta-large-english",
        "FacebookAI/roberta-large",
    ]
    switch_levels = ["without", "with"]

    # load data
    runs = load_wandb_summary(
        PATH_OUTPUT / "data" / "finetuned_classifier_final.csv",
        {
            "pipeline.model.pretrained_model_name_or_path": "model",
            "pipeline.preprocessing.sanitizer": "preprocessing",
            "pipeline.preprocessing.difficulty_filter.p": "post on hard",
            "Runtime": "train_time",
            "train_score": "train_score",
            "val_score": "val_score",
        },
    )

    def format_name(x):
        # An absolute path points at another run's output: take the model name from
        # the run named by the second-to-last path component.
        if not x.startswith("/"):
            return x
        return runs.loc[x.split("/")[-2], "model"]

    def as_switch(value):
        # a missing config value means the option was not used
        return "without" if pd.isna(value) else "with"

    runs["model"] = runs["model"].apply(format_name)
    runs["preprocessing"] = runs["preprocessing"].apply(as_switch)
    runs["post on hard"] = runs["post on hard"].apply(as_switch)

    # create multi-index table: models x (preprocessing, post on hard)
    val_scores = runs.pivot(
        index=["model"],
        columns=["preprocessing", "post on hard"],
        values="val_score",
    )
    val_scores = val_scores.reindex(
        index=model_order,
        columns=pd.MultiIndex.from_product(
            [switch_levels, switch_levels],
            names=["preprocessing", "post on hard"],
        ),
    )

    # construct final table
    pretrained_scores = {
        "cardiffnlp/twitter-roberta-base-sentiment-latest": 0.829226,
        "tabularisai/multilingual-sentiment-analysis": 0.737220,
        "siebert/sentiment-roberta-large-english": 0.676876,
        "FacebookAI/roberta-large": None,
    }
    df_final = pd.DataFrame({
        "pretrained": pretrained_scores,
        "+ finetuned": val_scores[("without", "without")],
        "+ post-tuned": val_scores[("without", "with")],
        "+ preprocessing": val_scores[("with", "with")],
    })

    # map model names to their display names
    val_scores.index = val_scores.index.map(DESCRIPTION_MAPPING)
    df_final.index = df_final.index.map(DESCRIPTION_MAPPING)

    display(val_scores)
    latex_table = df_final.to_latex(
        multirow=True,
        multicolumn=True,
        float_format="%.3f"
    )
    print(latex_table)


plot_finetuned_classifier_final()

In [None]:
# Switch the save_always option (enabled at the start of the main plots) back off.
Plotter.configure(save_always=False)
# plt.close() without arguments closes only the current pyplot figure.
# NOTE(review): use plt.close("all") if every open figure should be released.
plt.close()