In [14]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error

# Compare MinMax and RobustScaler

In [2]:
def load_and_unscale(file_path, scaler, exp_no=0,
                     imputer_names=("gain_imputer", "missforest_imputer", "mean_imputer", "knn_imputer")):
    """Load the scaled test predictions of each imputer and undo the feature scaling.

    For every imputer the file ``testing_xhat_{imputer}_exp{exp_no}_grid.csv`` is read,
    the columns named in ``scaler.feature_names_in_`` are passed through
    ``scaler.inverse_transform`` and the metadata columns ``experiment``, ``iteration``,
    ``missing_value_proportion`` and ``imputer`` are attached to the *same* rows.

    Parameters
    ----------
    file_path : str
        Prefix of the result files. The file name is appended by plain string
        concatenation, so the prefix has to end with a path separator.
    scaler : object
        Fitted scaler exposing ``feature_names_in_`` and ``inverse_transform``.
    exp_no : int, default 0
        Experiment number used in the file name.
    imputer_names : iterable of str, optional
        Imputers whose result files are loaded; defaults to the four imputers
        that were previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-imputer frames. Each per-imputer frame is sorted by
        ``LATITUDE``, ``LONGITUDE``, ``LEV_M`` and carries a fresh 0..n-1 index, so
        index values repeat across imputers.
    """
    sort_keys = ["LATITUDE", "LONGITUDE", "LEV_M"]
    meta_columns = ["experiment", "iteration", "missing_value_proportion", "imputer"]
    feature_names = list(scaler.feature_names_in_)

    xhat_unscaled = []

    for imputer_name in imputer_names:
        print(f"Load and scale for {imputer_name}")
        # load file
        file_name = f"testing_xhat_{imputer_name}_exp{exp_no}_grid.csv"
        temp_scaled = pd.read_csv(file_path + file_name)
        temp_scaled["imputer"] = temp_scaled["imputer"].replace({"missforest": "missforest_imputer"})

        # undo scaling; inverse_transform keeps the row order, so reuse the index of the scaled frame
        features_unscaled = pd.DataFrame(scaler.inverse_transform(temp_scaled[feature_names]),
                                         columns=feature_names, index=temp_scaled.index)

        # Attach the metadata row by row BEFORE sorting. Sorting the scaled and the unscaled
        # frame independently and gluing them together by position can pair a prediction with
        # the metadata of another row whenever the sort keys tie or round differently.
        temp_unscaled = pd.concat([features_unscaled, temp_scaled[meta_columns]], axis=1)
        temp_unscaled = temp_unscaled.sort_values(sort_keys).reset_index(drop=True)
        xhat_unscaled.append(temp_unscaled)

    print()

    return pd.concat(xhat_unscaled)

In [3]:
def compute_error(df_xhat, df_y, num_decimals=12, error_name="rmse", 
                  parameters=['P_TEMPERATURE', 'P_SALINITY', 'P_OXYGEN', 'P_NITRATE','P_SILICATE', 'P_PHOSPHATE']):
    """Match predictions with the true values per location and compute a per-row error.

    Both frames are merged (left join on the predictions) on ``LATITUDE``, ``LONGITUDE``
    and ``LEV_M`` after rounding these key columns to ``num_decimals`` decimals. For every
    parameter ``p`` a column ``f"{error_name}_{p}"`` is added to the merged frame.

    Because the error is computed for a single prediction/truth pair per row, the "RMSE"
    of a row equals the absolute error ``|y - xhat|`` and the "MSE" equals ``(y - xhat)**2``.

    Parameters
    ----------
    df_xhat : pandas.DataFrame
        Predictions; must contain the key columns and every column in ``parameters``.
    df_y : pandas.DataFrame
        True values with the same key and parameter columns.
    num_decimals : int, default 12
        Decimals the key columns are rounded to before merging. Only the keys are
        rounded; the true parameter values are compared unrounded.
    error_name : str, default "rmse"
        ``"mse"`` yields squared errors; any other value yields the root of the squared
        error. The value is also used as prefix of the new columns.
    parameters : list of str
        Parameter columns an error column is computed for.

    Returns
    -------
    pandas.DataFrame
        Merged frame with ``_xhat`` / ``_y`` suffixed columns plus one error column per parameter.

    Raises
    ------
    ValueError
        If a compared column contains NaN, e.g. because a prediction has no matching
        row in ``df_y``.
    """
    key_columns = ["LATITUDE", "LONGITUDE", "LEV_M"]

    df = df_xhat.copy()
    df[key_columns] = df[key_columns].round(num_decimals)

    # round only the merge keys of the truth table, not the measured values themselves
    df_truth = df_y.copy()
    df_truth[key_columns] = df_truth[key_columns].round(num_decimals)

    df_merged = pd.merge(left=df, right=df_truth, how="left",
                         on=key_columns, suffixes=("_xhat", "_y"))

    squaring = error_name == "mse"

    for p in parameters:
        y_true = df_merged[f"{p}_y"]
        y_pred = df_merged[f"{p}_xhat"]

        # fail loudly instead of producing NaN errors that silently drop out of later means
        if y_true.isna().any() or y_pred.isna().any():
            raise ValueError(f"Input contains NaN for parameter {p}; "
                             "check that every prediction has a matching row in df_y.")

        # vectorised replacement for a row-wise sklearn mean_squared_error call, which is
        # slow and relies on the `squared` keyword that recent scikit-learn releases deprecate
        difference = y_true - y_pred
        df_merged[f"{error_name}_{p}"] = difference ** 2 if squaring else difference.abs()

    return df_merged

In [4]:
def plot_scaler_comparison(df, error_name="rmse", exp_no=0, save_as=None, param_labels=None, param_units=None):
    """Plot, per parameter, the error of both scalers and their difference over the missing value proportion.

    One figure is drawn for each of the six parameters. It shows the columns
    ``{error_name}_{param}_minmax`` and ``{error_name}_{param}_robust`` of ``df`` against
    ``missing_value_proportion`` plus their difference (robust minus minmax), so a positive
    difference means the MinMaxScaler variant has the lower error.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``missing_value_proportion`` and the error columns named above.
    error_name : str, default "rmse"
        Prefix of the error columns; also used (upper-cased) in the legend and y-label.
    exp_no : int, default 0
        Experiment number shown in the title and used in the output file name.
    save_as : object, optional
        Only used as a flag: if truthy, each figure is written to
        ``output/scalerComparison_exp{exp_no}_{param}.png``.
    param_labels, param_units : dict, optional
        Mapping from parameter name to a display label / unit. If omitted, the notebook
        globals ``map_param_label`` / ``map_param_unit`` are used when they exist;
        otherwise the raw parameter name is shown and the unit is left out.
    """
    # fall back to notebook-level mappings if present instead of failing with a NameError
    if param_labels is None:
        param_labels = globals().get("map_param_label", {})
    if param_units is None:
        param_units = globals().get("map_param_unit", {})

    error_label = error_name.upper()

    for param in ["P_TEMPERATURE", "P_SALINITY", "P_OXYGEN", "P_NITRATE", "P_SILICATE", "P_PHOSPHATE"]:
        col_minmax = f"{error_name}_{param}_minmax"
        col_robust = f"{error_name}_{param}_robust"

        fig, ax = plt.subplots()
        sns.lineplot(data=df, x="missing_value_proportion", y=col_minmax, label="MinMaxScaler", ax=ax)
        sns.lineplot(data=df, x="missing_value_proportion", y=col_robust, label="RobustScaler", ax=ax)

        # difference of the selected error measure; the legend states the same order as the computation
        delta_error = pd.DataFrame({
            "missing_value_proportion": df["missing_value_proportion"],
            "delta_error": df[col_robust] - df[col_minmax],
        })
        sns.lineplot(data=delta_error, x="missing_value_proportion", y="delta_error",
                     label=f"${error_label}_{{robust}}-{error_label}_{{minmax}}$", ax=ax)

        ax.axhline(0, color="black")

        title_label = param_labels[param].lower() if param in param_labels else param
        ax.set_title(f"Predicting {title_label} (exp{exp_no})")
        ax.set_xlabel("Missing value proportion [%]")
        if param in param_units:
            ax.set_ylabel(f"{error_label} [{param_units[param]}]")
        else:
            ax.set_ylabel(error_label)
        ax.legend()
        if save_as:
            fig.savefig(f"output/scalerComparison_exp{exp_no}_{param}.png")
        plt.show()
        # release the figure so that six figures per call do not pile up in memory
        plt.close(fig)

In [12]:
# load scalers
# NOTE(review): pickle.load can execute arbitrary code -- only load scaler files from a trusted
# source, ideally with the scikit-learn version that was used to write them.
# NOTE(review): the paths are absolute and user-specific; adjust them before running elsewhere.
with open("C:/Users/yvjennig/PycharmProjects/phd_repos/clustering/output/imputation/minmaxscaler/models/scaler.pickle", "rb") as scaler_file:
    scaler_m = pickle.load(scaler_file)
with open("C:/Users/yvjennig/PycharmProjects/phd_repos/clustering/output/imputation/robustscaler/models/scaler.pickle", "rb") as scaler_file:
    scaler_r = pickle.load(scaler_file)

# load test file
df_test_unscaled = pd.read_csv("C:/Users/yvjennig/PycharmProjects/phd_repos/clustering/data/test_table_0.8.csv")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


**Experiment 0**

In [9]:
# Experiment 0: read the scaled predictions of both scaler variants and undo the scaling
path_exp0_minmax = "C:/Users/yvjennig/PycharmProjects/phd_repos/clustering/output/imputation/minmaxscaler/test_results/exp0/"
path_exp0_robust = "C:/Users/yvjennig/PycharmProjects/phd_repos/clustering/output/imputation/robustscaler/test_results/exp0/"

xhat0_unscaled_m = load_and_unscale(path_exp0_minmax, scaler=scaler_m, exp_no=0)
xhat0_unscaled_r = load_and_unscale(path_exp0_robust, scaler=scaler_r, exp_no=0)

Load and scale for gain_imputer
Load and scale for missforest_imputer
Load and scale for mean_imputer
Load and scale for knn_imputer

Load and scale for gain_imputer
Load and scale for missforest_imputer
Load and scale for mean_imputer
Load and scale for knn_imputer



In [10]:
# average over iterations: mean of all remaining columns per proportion, imputer, experiment and location
group_keys = ["missing_value_proportion", "imputer", "experiment", "LATITUDE", "LONGITUDE", "LEV_M"]

xhat0_unscaled_m_i = xhat0_unscaled_m.groupby(group_keys).mean().reset_index()
xhat0_unscaled_r_i = xhat0_unscaled_r.groupby(group_keys).mean().reset_index()

In [None]:
# compute error (minmax first, then robust) against the unscaled test table
rmse0_unscaled_m, rmse0_unscaled_r = (
    compute_error(xhat_averaged, df_test_unscaled, num_decimals=12, error_name="rmse")
    for xhat_averaged in (xhat0_unscaled_m_i, xhat0_unscaled_r_i)
)

In [None]:
# combine both dataframes: one row per imputer, proportion and location,
# error columns of the two scalers distinguished by the _minmax / _robust suffix
df0 = rmse0_unscaled_m.merge(
    rmse0_unscaled_r,
    how="left",
    on=["imputer", "missing_value_proportion", "LATITUDE", "LONGITUDE", "LEV_M"],
    suffixes=("_minmax", "_robust"),
).reset_index()

In [None]:
# one figure per parameter for experiment 0
plot_scaler_comparison(df=df0, exp_no=0, error_name="rmse")

In [None]:
# average improvement with minmax: mean of (robust error - minmax error) per parameter
for param in ("P_TEMPERATURE", "P_SALINITY", "P_OXYGEN", "P_NITRATE", "P_SILICATE", "P_PHOSPHATE"):
    mean_delta = (df0[f"rmse_{param}_robust"] - df0[f"rmse_{param}_minmax"]).mean()
    print(param + ":    " + str(mean_delta))

**Experiment 1**

In [None]:
# Experiment 1: read the scaled predictions of both scaler variants and undo the scaling
path_exp1_minmax = "C:/Users/yvjennig/PycharmProjects/phd_repos/clustering/output/imputation/minmaxscaler/test_results/exp1/"
path_exp1_robust = "C:/Users/yvjennig/PycharmProjects/phd_repos/clustering/output/imputation/robustscaler/test_results/exp1/"

xhat1_unscaled_m = load_and_unscale(path_exp1_minmax, scaler=scaler_m, exp_no=1)
xhat1_unscaled_r = load_and_unscale(path_exp1_robust, scaler=scaler_r, exp_no=1)

In [None]:
# average over iterations: mean of all remaining columns per proportion, imputer, experiment and location
group_keys = ["missing_value_proportion", "imputer", "experiment", "LATITUDE", "LONGITUDE", "LEV_M"]

xhat1_unscaled_m_i = xhat1_unscaled_m.groupby(group_keys).mean().reset_index()
xhat1_unscaled_r_i = xhat1_unscaled_r.groupby(group_keys).mean().reset_index()

In [None]:
# compute error of the MinMaxScaler predictions against the unscaled test table
rmse1_unscaled_m = compute_error(
    xhat1_unscaled_m_i,
    df_test_unscaled,
    num_decimals=12,
    error_name="rmse",
)

In [None]:
# compute error of the RobustScaler predictions against the unscaled test table
rmse1_unscaled_r = compute_error(
    xhat1_unscaled_r_i,
    df_test_unscaled,
    num_decimals=12,
    error_name="rmse",
)

In [None]:
# combine both dataframes: one row per imputer, proportion and location,
# error columns of the two scalers distinguished by the _minmax / _robust suffix
df1 = rmse1_unscaled_m.merge(
    rmse1_unscaled_r,
    how="left",
    on=["imputer", "missing_value_proportion", "LATITUDE", "LONGITUDE", "LEV_M"],
    suffixes=("_minmax", "_robust"),
).reset_index()

In [None]:
# Remove the "_P_<PARAMETER>" substring from missing_value_proportion and convert the rest to float.
# The result is assigned back to the column: calling replace(..., inplace=True) on the selection
# df1["missing_value_proportion"] is a chained in-place operation that, under pandas Copy-on-Write,
# only changes a temporary object and leaves df1 untouched.
param_suffix_pattern = r"_P_(?:NITRATE|SILICATE|PHOSPHATE|TEMPERATURE|SALINITY|OXYGEN)"
df1["missing_value_proportion"] = (
    df1["missing_value_proportion"]
    .replace({param_suffix_pattern: ""}, regex=True)
    .astype(float)
)

In [None]:
# one figure per parameter for experiment 1
plot_scaler_comparison(df=df1, exp_no=1, error_name="rmse")

In [None]:
# average improvement with minmax: mean of (robust error - minmax error) per parameter
for param in ("P_TEMPERATURE", "P_SALINITY", "P_OXYGEN", "P_NITRATE", "P_SILICATE", "P_PHOSPHATE"):
    mean_delta = (df1[f"rmse_{param}_robust"] - df1[f"rmse_{param}_minmax"]).mean()
    print(param + ":    " + str(mean_delta))