In [None]:
from typing import Tuple
import pandas as pd

from utils.error_metrics import MulticlassErrorMetrics, DatasetCategory

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


def get_set_with_dependend_variabels(df_set: pd.DataFrame, df_dependent) -> pd.DataFrame:
    df = df_set[['Date','Area','mapped_hazard_forecast', 'mapped_hazard_observed']].copy()
    df['Date'] = pd.to_datetime(df['Date']).dt.date
    df_dependent['Date'] = pd.to_datetime(df_dependent['Date']).dt.date
    df = df.merge(df_dependent, on=["Date", "Area"])
    return df

def xy_split(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    return (
        df.drop(columns=["Date", "Area", "mapped_hazard_forecast", "mapped_hazard_observed"]),
        df_train["mapped_hazard_forecast"],
    )


def fit_and_compute_errors(
    model_name: str,
    pipe: Pipeline,
    sets: dict[DatasetCategory, pd.DataFrame],
    errs: MulticlassErrorMetrics,
):
    training_set = sets[DatasetCategory.TRAINING]
    x, y = xy_split(training_set)
    pipe.fit(x, y)

    for ds_type, ds in sets.items():
        x, _ = xy_split(ds)
        errs.compute_errors(model_name, ds_type, pipe.predict(x))

df_train = pd.read_csv("../data/processed/SAIS_train.csv", index_col=0)
df_dev = pd.read_csv("../data/processed/SAIS_dev.csv", index_col=0)
df_test = pd.read_csv("../data/processed/SAIS_test.csv", index_col=0)
df_train_tab = pd.read_csv(
    "../data/processed/SAIS_train_tabularised.csv", index_col=0
)
df_dev_tab = pd.read_csv(
    "../data/processed/SAIS_dev_tabularised.csv", index_col=0
)
df_test_tab = pd.read_csv(
    "../data/processed/SAIS_test_tabularised.csv", index_col=0
)

df_weather_data_tab = pd.read_csv("../data/proprietary/weather/visual_crossing/weather_days_tabularised.csv")

sets_std = {
    DatasetCategory.TRAINING: df_train,
    DatasetCategory.DEVELOPMENT: df_dev,
    DatasetCategory.TEST: df_test,
}

sets_tab = {
    DatasetCategory.TRAINING: df_train_tab,
    DatasetCategory.DEVELOPMENT: df_dev_tab,
    DatasetCategory.TEST: df_test_tab,
}


sets_weather_tab = {
    DatasetCategory.TRAINING: get_set_with_dependend_variabels(df_train, df_weather_data_tab),
    DatasetCategory.DEVELOPMENT: get_set_with_dependend_variabels(df_dev, df_weather_data_tab),
    DatasetCategory.TEST: get_set_with_dependend_variabels(df_test, df_weather_data_tab),
}

In [2]:
most_frequent_value = df_train["mapped_hazard_forecast"].value_counts().index[0]

errs_summary = MulticlassErrorMetrics(
    dataset_name="sais",
    classes=sorted(df_train["mapped_hazard_forecast"].unique()),
    y_true_train=df_train["mapped_hazard_forecast"],
    y_true_dev=df_dev["mapped_hazard_forecast"],
    y_true_test=df_test["mapped_hazard_forecast"],
)

errs_summary.compute_errors_all_sets(
    "constant", most_frequent_value, most_frequent_value, most_frequent_value
)
errs_summary.compute_errors_all_sets(
    "observed",
    df_train["mapped_hazard_observed"],
    df_dev["mapped_hazard_observed"],
    df_test["mapped_hazard_observed"],
)

pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(random_state=1, max_iter=int(1e4))),
    ]
)

fit_and_compute_errors(
    "softmax regression (snow profiles)", pipeline, sets_std, errs_summary
)

fit_and_compute_errors(
    "softmax regression (snow profiles across season)", pipeline, sets_tab, errs_summary
)

fit_and_compute_errors(
    "softmax regression (weather data across season)", pipeline, sets_weather_tab, errs_summary
)

errs_summary.save_assets()


In [4]:
errs_summary.get_error_table_for_set(DatasetCategory.DEVELOPMENT)

Unnamed: 0,MSE,average error,highest error,accuracy,precision (multiclass),precision (micro),precision (macro),recall (multiclass),recall (micro),recall (macro),$F_1$ (multiclass),$F_1$ (micro),$F_1$ (macro),confusion matrix
constant,1.626653,-0.892167,-2,0.328586,"[0.0, 0.0, 0.33, 0.0]",0.328586,0.082146,"[0.0, 0.0, 1.0, 0.0]",0.328586,0.25,"[0.0, 0.0, 0.49, 0.0]",0.328586,0.12366,"[[0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 0.0], [..."
observed,0.284842,0.156663,3,0.766022,"[0.77, 0.69, 0.85, 0.68]",0.766022,0.746918,"[0.94, 0.68, 0.73, 0.4]",0.766022,0.687038,"[0.85, 0.69, 0.78, 0.5]",0.766022,0.704455,"[[0.9424920127795527, 0.05750798722044728, 0.0..."
softmax regression (snow profiles),0.523906,0.057986,3,0.599186,"[0.68, 0.45, 0.61, 0.45]",0.599186,0.547548,"[0.8, 0.31, 0.75, 0.1]",0.599186,0.490051,"[0.73, 0.37, 0.67, 0.17]",0.599186,0.485679,"[[0.7955271565495208, 0.16613418530351437, 0.0..."
softmax regression (snow profiles across season),1.713123,-0.038657,3,0.307223,"[0.3, 0.31, 0.36, 0.03]",0.307223,0.250948,"[0.3, 0.29, 0.37, 0.04]",0.307223,0.25032,"[0.3, 0.3, 0.36, 0.04]",0.307223,0.25042,"[[0.2971246006389776, 0.2939297124600639, 0.34..."
softmax regression (weather data across season),0.582909,-0.017294,-3,0.574771,"[0.69, 0.44, 0.62, 0.42]",0.574771,0.544563,"[0.65, 0.48, 0.61, 0.42]",0.574771,0.539997,"[0.67, 0.46, 0.62, 0.42]",0.574771,0.541831,"[[0.645367412140575, 0.29073482428115016, 0.06..."


In [5]:
df_weather_data_tab.head()

Unnamed: 0,Date,Area,day_of_season,humidity_0,humidity_1,humidity_2,humidity_3,humidity_4,humidity_5,humidity_6,...,cloudcover_128,cloudcover_129,cloudcover_130,cloudcover_131,cloudcover_132,cloudcover_133,cloudcover_134,cloudcover_135,cloudcover_136,cloudcover_137
0,2007-12-14,Creag Meagaidh,1,73.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2007-12-15,Creag Meagaidh,2,76.0,73.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2007-12-16,Creag Meagaidh,3,76.1,76.0,73.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2007-12-17,Creag Meagaidh,4,82.1,76.1,76.0,73.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2007-12-18,Creag Meagaidh,5,81.4,82.1,76.1,76.0,73.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
