# Root Mean Squared Error (RMSE) and parity plots

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

## Read data function

In [None]:
def read_data(model_name, data_path, n_folds=5, n_replicates=6):
    """Load the exported measurements, predictions and labels for every fold.

    For each fold ``k`` in ``range(n_folds)`` these files are read with
    ``np.loadtxt``:

    * ``y_train_{k}.csv`` and ``y_test_{k}.csv``
    * ``ypred_{model_name}_train_{k}.csv`` and ``ypred_{model_name}_test_{k}.csv``
    * ``labels_train_{k}.csv`` and ``labels_test_{k}.csv``

    Args:
        model_name: Model identifier embedded in the prediction file names.
        data_path: String prefix prepended verbatim to every file name. When
            it is a directory it must end with a path separator.
        n_folds: Number of folds to read (default 5).
        n_replicates: Number of consecutive rows that share one label entry;
            each label is repeated this many times (default 6).

    Returns:
        A list with one dict per fold holding the keys ``train_measured``,
        ``train_pred``, ``test_measured``, ``test_pred``, ``train_labels``,
        ``test_labels``, ``train_r2`` and ``test_r2``.

    Raises:
        ValueError: If the expanded labels of a split do not have the same
            length as its measured values.
    """

    def _load(stem):
        # data_path is a plain prefix, exactly as the exported files expect
        return np.loadtxt(f"{data_path}{stem}.csv")

    def _load_labels(split, fold, measured):
        # one label per sample -> one label per replicate row
        labels = np.repeat(
            _load(f"labels_{split}_{fold}").astype(np.int64), n_replicates
        )
        if labels.shape[:1] != measured.shape[:1]:
            raise ValueError(
                f"Fold {fold} ({split}): {labels.size} labels after repeating "
                f"each {n_replicates} times, but measured data has shape "
                f"{measured.shape}"
            )
        return labels

    data = []

    for fold in range(n_folds):
        train_measured = _load(f"y_train_{fold}")
        train_pred = _load(f"ypred_{model_name}_train_{fold}")
        test_measured = _load(f"y_test_{fold}")
        test_pred = _load(f"ypred_{model_name}_test_{fold}")

        data.append(
            {
                "train_measured": train_measured,
                "train_pred": train_pred,
                "test_measured": test_measured,
                "test_pred": test_pred,
                "train_labels": _load_labels("train", fold, train_measured),
                "test_labels": _load_labels("test", fold, test_measured),
                "train_r2": r2_score(train_measured, train_pred),
                "test_r2": r2_score(test_measured, test_pred),
            }
        )

    return data

## Parity plot function

In [None]:
def _parity_legend_handles(labeldict, colors):
    """Build proxy legend handles: one per polymer type, then Train and Test."""
    handles = [
        plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor=colors[label],
            markersize=8,
            label=polymer_name,
            linestyle="",
        )
        for polymer_name, label in labeldict.items()
    ]
    handles.append(
        plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor="gray",
            markersize=8,
            label="Train",
            linestyle="",
        )
    )
    handles.append(
        plt.Line2D(
            [0],
            [0],
            marker="^",
            color="w",
            markerfacecolor="none",
            markeredgecolor="gray",
            markersize=8,
            label="Test",
            linestyle="",
        )
    )
    return handles


def _subset_r2(measured, predicted, mask):
    """Return R2 over the masked points, or NaN when fewer than two remain."""
    if np.count_nonzero(mask) < 2:
        return float("nan")
    return r2_score(measured[mask], predicted[mask])


def parity_plot(data, model_name, property_type, labeldict, zoomed=False):
    """Draw one measured-vs-predicted parity plot per fold.

    Training points are filled circles and test points are hollow triangles,
    both coloured by polymer type.

    Args:
        data: List of per-fold dicts with the keys ``train_measured``,
            ``train_pred``, ``test_measured``, ``test_pred``, ``train_labels``,
            ``test_labels``, ``train_r2`` and ``test_r2``.
        model_name: Model name shown in the title of full-range plots.
        property_type: One of ``"density"``, ``"crystallinity"`` or ``"SCB"``.
        labeldict: Mapping of polymer name to integer label. The label is
            also used as an index into a palette of ``len(labeldict)``
            colours, so labels must lie in ``range(len(labeldict))``.
        zoomed: Request the zoomed-in view. It only takes effect for
            properties that define zoom settings (currently ``"SCB"``); for
            the others the regular full-range plot is drawn.

    Raises:
        KeyError: If ``property_type`` has no settings entry.
    """

    # settings for each property
    property_settings = {
        "density": {
            "xlim": (0.87, 0.98),
            "ylim": (0.87, 0.98),
            "parity_line": [0.88, 0.970],
            "xlabel": "Measured density [g/cm$^{3}$]",
            "ylabel": "Predicted density [g/cm$^{3}$]",
            "figsize": (5, 5),
        },
        "crystallinity": {
            "xlim": (0.15, 0.75),
            "ylim": (0.15, 0.75),
            "parity_line": [0.22, 0.71],
            "xlabel": "Measured crystallinity",
            "ylabel": "Predicted crystallinity",
            "figsize": (5, 5),
        },
        "SCB": {
            "xlim": (-20, 380),
            "ylim": (-20, 380),
            "parity_line": [-10, 370],
            "xlabel": "Measured SCB [CH$_{3}$/1000C]",
            "ylabel": "Predicted SCB [CH$_{3}$/1000C]",
            "figsize": (5, 5),
            # zoomed-in plot setting
            "zoomed_xlim": (0, 40),
            "zoomed_ylim": (0, 40),
            "zoomed_parity_line": [0, 40],
            "zoomed_figsize": (3, 3),
        },
    }

    settings = property_settings[property_type]
    colors = sns.color_palette("tab10", len(labeldict))

    # A single flag drives limits, masks, marker size, title and legend, so a
    # zoom request on a property without zoom settings cannot produce a
    # half-zoomed figure.
    use_zoom = zoomed and "zoomed_xlim" in settings
    prefix = "zoomed_" if use_zoom else ""
    xlim = settings[f"{prefix}xlim"]
    ylim = settings[f"{prefix}ylim"]
    parity_line = settings[f"{prefix}parity_line"]
    figsize = settings[f"{prefix}figsize"]
    marker_size = 50 if use_zoom else 70

    # Legend handles do not depend on the fold; zoomed plots carry no legend.
    legend_handles = [] if use_zoom else _parity_legend_handles(labeldict, colors)

    for fold, fold_data in enumerate(data):
        train_measured = fold_data["train_measured"]
        train_pred = fold_data["train_pred"]
        test_measured = fold_data["test_measured"]
        test_pred = fold_data["test_pred"]

        plt.figure(figsize=figsize)

        if use_zoom:
            # keep only points whose measured value lies inside the zoom window
            train_visible = (train_measured >= xlim[0]) & (train_measured <= xlim[1])
            test_visible = (test_measured >= xlim[0]) & (test_measured <= xlim[1])
            # R2 is recomputed on the visible subset only
            train_r2 = _subset_r2(train_measured, train_pred, train_visible)
            test_r2 = _subset_r2(test_measured, test_pred, test_visible)
        else:
            train_visible = np.ones(len(train_measured), dtype=bool)
            test_visible = np.ones(len(test_measured), dtype=bool)
            train_r2 = fold_data["train_r2"]
            test_r2 = fold_data["test_r2"]

        # plot each polymer type
        for label in labeldict.values():
            color = colors[label]

            # training data (filled circles)
            train_mask = (fold_data["train_labels"] == label) & train_visible
            plt.scatter(
                train_measured[train_mask],
                train_pred[train_mask],
                color=color,
                alpha=0.8,
                s=marker_size,
            )

            # test data (empty triangles)
            test_mask = (fold_data["test_labels"] == label) & test_visible
            plt.scatter(
                test_measured[test_mask],
                test_pred[test_mask],
                facecolors="none",
                edgecolors=color,
                marker="^",
                s=marker_size,
                linewidth=1.5,
            )

        # parity line
        plt.plot(parity_line, parity_line, "k--", linewidth=1)

        plt.xlim(xlim)
        plt.ylim(ylim)
        plt.xlabel(settings["xlabel"], fontsize=14)
        plt.ylabel(settings["ylabel"], fontsize=14)

        # title and R2
        if use_zoom:
            # No legend here: there are no handles, and an empty legend would
            # still draw its frame in the corner of the small figure.
            plt.title(f"R² = {train_r2:.2f} (train), {test_r2:.2f} (test)", fontsize=10)
        else:
            plt.title(
                f"{model_name} - Fold {fold+1}\nTrain R² = {train_r2:.2f}, Test R² = {test_r2:.2f}"
            )
            plt.xticks(fontsize=14)
            plt.yticks(fontsize=14)
            plt.legend(handles=legend_handles, frameon=False, fontsize=10, loc="best")

        plt.tight_layout()
        plt.show()

## RMSE plots

In [None]:
def plot_rmse_with_errorbars(mean_rmse, std_rmse, labels, property_type):
    """Plot the mean RMSE of each model with its standard deviation as error bar.

    A dashed horizontal line marks the measurement error configured for the
    property, with a "Measurement error" text label next to it.

    Args:
        mean_rmse: Sequence (list, array or pandas Series) of mean RMSE values.
        std_rmse: Sequence of RMSE standard deviations, same length and order.
        labels: Sequence of model names used as x tick labels, same length
            and order.
        property_type: One of ``"density"``, ``"crystallinity"`` or ``"SCB"``.

    Raises:
        KeyError: If ``property_type`` has no settings entry.
        ValueError: If the three input sequences differ in length.
    """

    # Settings for each property. "text_y_offset" is the absolute y position
    # of the text label; "save_filename" is not used by this function.
    property_settings = {
        "density": {
            "measurement_error": 0.00247,
            "text_y_offset": 0.0030,
            "y_tick_interval": 0.004,
            "ylabel": "Density RMSE [g/cm$^{3}$]",
            "save_filename": "rmse_den",
        },
        "crystallinity": {
            "measurement_error": 0.028,
            "text_y_offset": 0.015,
            "y_tick_interval": 0.03,
            "ylabel": "Crystallinity RMSE",
            "save_filename": "rmse_crys",
        },
        "SCB": {
            "measurement_error": 2.65,
            "text_y_offset": 0.5,
            "y_ticks": [0, 5, 10, 15, 20],
            "ylabel": "SCB RMSE [CH$_{3}$/1000C]",
            "save_filename": "rmse_SCB",
        },
    }

    settings = property_settings[property_type]

    # Convert to positional arrays. Integer indexing on a pandas Series is a
    # label lookup, which breaks or misorders points when the Series does not
    # carry a default 0..n-1 index (e.g. after filtering rows).
    means = np.atleast_1d(np.asarray(mean_rmse, dtype=float))
    stds = np.atleast_1d(np.asarray(std_rmse, dtype=float))
    tick_labels = list(labels)

    if not (len(means) == len(stds) == len(tick_labels)):
        raise ValueError(
            "mean_rmse, std_rmse and labels must have the same length, got "
            f"{len(means)}, {len(stds)} and {len(tick_labels)}"
        )

    n_models = len(tick_labels)
    x_pos = np.arange(n_models)
    _, ax = plt.subplots(figsize=(5, 3))

    # All models in one call: black markers with capped black error bars
    ax.errorbar(
        x_pos,
        means,
        yerr=stds,
        fmt="o",
        markerfacecolor="black",
        markeredgecolor="black",
        markersize=8,
        ecolor="black",
        elinewidth=1,
        capsize=5,
        capthick=1,
    )

    # Add measurement error line
    ax.axhline(
        y=settings["measurement_error"],
        color="blue",
        linestyle="--",
        linewidth=1.5,
        alpha=0.7,
    )

    # Add text label for measurement error line, right-aligned at the last model
    ax.text(
        n_models - 1,
        settings["text_y_offset"],
        "Measurement error",
        va="bottom",
        ha="right",
        fontsize=13,
        bbox=dict(facecolor="white", alpha=0.8, edgecolor="none"),
    )

    # x-axis setting
    ax.set_xticks(x_pos)
    ax.set_xticklabels(tick_labels, fontsize=12)
    ax.set_xlim([-0.6, n_models - 0.3])
    ax.tick_params(axis="x", pad=5)

    # y-axis setting: explicit ticks when configured, otherwise evenly spaced
    # ticks that reach just past the largest mean RMSE
    if "y_ticks" in settings:
        ax.set_yticks(settings["y_ticks"])
    else:
        interval = settings["y_tick_interval"]
        ax.set_yticks(np.arange(0, np.max(means) + interval + 0.002, interval))

    ax.tick_params(axis="y", labelsize=14)
    ax.set_ylabel(settings["ylabel"], fontsize=14)

    plt.tight_layout()
    plt.show()

In [None]:
# Polymer type -> integer class label, numbered in the order listed here.
labeldict = {
    polymer: index
    for index, polymer in enumerate(
        ["HDPE", "LDPE", "LLDPE", "PP", "PE/PP blend", "PE blend"]
    )
}

# Names of the regression models.
models = [
    "PLSR",
    "PCR",
    "LASSO",
    "LR",
    "RF",
]

## Density

### RMSE plot

In [None]:
# Load the per-model density results table.
df_den = pd.read_csv("../ModelData/density/density_results.csv")
# A bare `df_den.head()` is only rendered when it is the last expression of a
# cell, so print the preview explicitly.
print(df_den.head())
plot_rmse_with_errorbars(
    df_den["Mean Test RMSE"], df_den["Std Test RMSE"], df_den["Model"], "density"
)

### Parity plot

In [None]:
# Load the PLSR density folds, then draw one parity plot per fold.
data_den = read_data(model_name="PLSR", data_path="../ModelData/density/")
parity_plot(
    data=data_den,
    model_name="PLSR",
    property_type="density",
    labeldict=labeldict,
)

## Crystallinity

### RMSE plot

In [None]:
# Load the per-model crystallinity results table.
df_crys = pd.read_csv("../ModelData/crystallinity/crystallinity_results.csv")
# A bare `df_crys.head()` is only rendered when it is the last expression of a
# cell, so print the preview explicitly.
print(df_crys.head())
plot_rmse_with_errorbars(
    df_crys["Mean Test RMSE"],
    df_crys["Std Test RMSE"],
    df_crys["Model"],
    "crystallinity",
)

### Parity plot

In [None]:
# Load the PLSR crystallinity folds, then draw one parity plot per fold.
data_crys = read_data(model_name="PLSR", data_path="../ModelData/crystallinity/")
parity_plot(
    data=data_crys,
    model_name="PLSR",
    property_type="crystallinity",
    labeldict=labeldict,
)

## SCB

### RMSE plot

In [None]:
# Load the per-model SCB results table.
df_scb = pd.read_csv("../ModelData/SCB/scb_results.csv")
# A bare `df_scb.head()` is only rendered when it is the last expression of a
# cell, so print the preview explicitly.
print(df_scb.head())
plot_rmse_with_errorbars(
    df_scb["Mean Test RMSE"], df_scb["Std Test RMSE"], df_scb["Model"], "SCB"
)

### Parity plot

In [None]:
# Load the PLSR SCB folds, then draw one full-range parity plot per fold.
data_SCB = read_data(model_name="PLSR", data_path="../ModelData/SCB/")
parity_plot(
    data=data_SCB,
    model_name="PLSR",
    property_type="SCB",
    labeldict=labeldict,
)

### SCB parity plot (zoomed-in region)

In [None]:
# Redraw the SCB parity plots with the zoomed view enabled.
parity_plot(
    data=data_SCB,
    model_name="PLSR",
    property_type="SCB",
    labeldict=labeldict,
    zoomed=True,
)