In [1]:
from typing import Tuple
import pandas as pd

from utils.error_metrics import MulticlassErrorMetrics, DatasetCategory

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Columns discarded from every split as soon as it is loaded.
drop = ["Date", "Area"]

# Standard splits.
df_train = pd.read_csv("../data/processed/SAIS_train.csv").drop(columns=drop)
df_dev = pd.read_csv("../data/processed/SAIS_dev.csv").drop(columns=drop)
df_test = pd.read_csv("../data/processed/SAIS_test.csv").drop(columns=drop)

# Tabularised splits; the first CSV column is read as the index.
df_train_tab = pd.read_csv(
    "../data/processed/SAIS_train_tabularised.csv", index_col=0
).drop(columns=drop)
df_dev_tab = pd.read_csv(
    "../data/processed/SAIS_dev_tabularised.csv", index_col=0
).drop(columns=drop)
df_test_tab = pd.read_csv(
    "../data/processed/SAIS_test_tabularised.csv", index_col=0
).drop(columns=drop)

# Each representation's splits, keyed by dataset category so they can be
# handed to fit_and_compute_errors as a single argument.
sets_std = {
    DatasetCategory.TRAINING: df_train,
    DatasetCategory.DEVELOPMENT: df_dev,
    DatasetCategory.TEST: df_test,
}

sets_tab = {
    DatasetCategory.TRAINING: df_train_tab,
    DatasetCategory.DEVELOPMENT: df_dev_tab,
    DatasetCategory.TEST: df_test_tab,
}


def xy_split(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a dataset frame into model features and the forecast target.

    Args:
        df: Frame containing the ``mapped_hazard_forecast`` and
            ``mapped_hazard_observed`` columns alongside the feature columns.

    Returns:
        A ``(x, y)`` tuple where ``x`` is ``df`` without the two hazard
        columns and ``y`` is ``df["mapped_hazard_forecast"]``.

    Raises:
        KeyError: If either hazard column is missing from ``df``.
    """
    features = df.drop(columns=["mapped_hazard_forecast", "mapped_hazard_observed"])
    # Take the target from the frame that was passed in, not from a
    # module-level training frame, so x and y always share the same rows.
    target = df["mapped_hazard_forecast"]
    return features, target


def fit_and_compute_errors(
    model_name: str,
    pipe: Pipeline,
    sets: dict[DatasetCategory, pd.DataFrame],
    errs: MulticlassErrorMetrics,
):
    """Fit ``pipe`` on the training split and record its predictions per split.

    Args:
        model_name: Label passed to ``errs.compute_errors`` for every split.
        pipe: Pipeline to fit; it is fitted in place on the
            ``DatasetCategory.TRAINING`` entry of ``sets``.
        sets: Mapping from dataset category to the frame for that split.
            Must contain a ``DatasetCategory.TRAINING`` entry.
        errs: Metrics collector; ``compute_errors`` is called once for each
            entry of ``sets`` (the training entry included) with the
            predictions for that split.
    """
    train_features, train_target = xy_split(sets[DatasetCategory.TRAINING])
    pipe.fit(train_features, train_target)

    # Only predictions are handed over here; no targets are passed to errs.
    for category, frame in sets.items():
        features = xy_split(frame)[0]
        predictions = pipe.predict(features)
        errs.compute_errors(model_name, category, predictions)

In [2]:
forecast_train = df_train["mapped_hazard_forecast"]

# value_counts() orders labels by descending frequency, so the first index
# entry is the most common training label.
most_frequent_value = forecast_train.value_counts().index[0]

errs_summary = MulticlassErrorMetrics(
    dataset_name="sais",
    classes=sorted(forecast_train.unique()),
    y_true_train=forecast_train,
    y_true_dev=df_dev["mapped_hazard_forecast"],
    y_true_test=df_test["mapped_hazard_forecast"],
)

# Baseline 1: the most frequent training label, passed for all three splits.
errs_summary.compute_errors_all_sets(
    "constant",
    most_frequent_value,
    most_frequent_value,
    most_frequent_value,
)

# Baseline 2: the observed-hazard column used as the prediction.
errs_summary.compute_errors_all_sets(
    "observed",
    df_train["mapped_hazard_observed"],
    df_dev["mapped_hazard_observed"],
    df_test["mapped_hazard_observed"],
)

# Standardise the features, then fit a logistic-regression classifier.
scaler = StandardScaler()
classifier = LogisticRegression(random_state=1, max_iter=int(1e4))
pipeline = Pipeline([("scaler", scaler), ("model", classifier)])

fit_and_compute_errors(
    model_name="softmax regression (snow profiles)",
    pipe=pipeline,
    sets=sets_std,
    errs=errs_summary,
)

In [3]:
# Refit the same pipeline object on the tabularised splits; fit() is called
# again inside fit_and_compute_errors before any prediction is made.
fit_and_compute_errors(
    model_name="softmax regression (snow profiles tabularised)",
    pipe=pipeline,
    sets=sets_tab,
    errs=errs_summary,
)

errs_summary.save_assets()