In [1]:
from typing import Dict, Tuple
import pandas as pd
import csv
import json

from utils.error_metrics import MulticlassErrorMetrics, DatasetCategory

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold
import os


def get_dataset_with_dependend_variabels(
    df_set: pd.DataFrame, df_dependent: pd.DataFrame
) -> pd.DataFrame:
    """Join the target columns of ``df_set`` with the rows of ``df_dependent``.

    Both frames are matched on ``Date`` and ``Area`` after ``Date`` has been
    normalised to ``datetime.date`` objects. Neither input frame is modified.

    Args:
        df_set: Frame providing ``Date``, ``Area``, ``mapped_hazard_forecast``
            and ``mapped_hazard_observed``; any other column is ignored.
        df_dependent: Frame providing ``Date``, ``Area`` and the columns to
            attach.

    Returns:
        The merged frame (``DataFrame.merge`` with its default join type).
    """
    targets = df_set[
        ["Date", "Area", "mapped_hazard_forecast", "mapped_hazard_observed"]
    ].copy()
    targets["Date"] = pd.to_datetime(targets["Date"]).dt.date

    # Convert the dates on a private copy: the same `df_dependent` frame is
    # passed in for several splits and must not be altered as a side effect.
    dependent = df_dependent.copy()
    dependent["Date"] = pd.to_datetime(dependent["Date"]).dt.date

    return targets.merge(dependent, on=["Date", "Area"])


def xy_split(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a dataset frame into model inputs and the prediction target.

    Args:
        df: Frame containing ``Date``, ``Area``, ``mapped_hazard_forecast`` and
            ``mapped_hazard_observed`` next to the feature columns.

    Returns:
        ``(x, y)`` where ``x`` is ``df`` without the four columns above and
        ``y`` is the ``mapped_hazard_forecast`` column (a ``Series``).
    """
    non_feature_columns = [
        "Date",
        "Area",
        "mapped_hazard_forecast",
        "mapped_hazard_observed",
    ]
    features = df.drop(columns=non_feature_columns)
    target = df["mapped_hazard_forecast"]
    return features, target


def fit_and_compute_errors(
    errs: MulticlassErrorMetrics,
    independent_variables_and_sets: Tuple[str, Dict[DatasetCategory, pd.DataFrame]],
    model_name_and_pipeline: Tuple[str, Pipeline],
):
    """Fit a model on the training split and record its errors on every split.

    Args:
        errs: Collector that receives the predictions through
            ``compute_errors``.
        independent_variables_and_sets: ``(label, splits)`` where ``label``
            names the feature set and ``splits`` maps each dataset category to
            its frame. The ``DatasetCategory.TRAINING`` entry is used to fit.
        model_name_and_pipeline: ``(label, estimator)``; the estimator is
            fitted in place.
    """
    feature_label, splits = independent_variables_and_sets
    model_label, estimator = model_name_and_pipeline

    train_x, train_y = xy_split(splits[DatasetCategory.TRAINING])
    estimator.fit(train_x, train_y)

    # Score the fitted estimator on each split, the training split included.
    for category, frame in splits.items():
        features = xy_split(frame)[0]
        errs.compute_errors(
            model_label, feature_label, category, estimator.predict(features)
        )

_DEFAULT_PARAMS_DIR = "../data/aux/params"


def _params_csv_path(file_name: str, params_dir: str) -> str:
    """Return the CSV path used for the parameter set called ``file_name``."""
    # NOTE(review): str.replace is not a regex, so this strips the literal
    # two characters ".*" only. If dropping a file extension was intended,
    # switch to os.path.splitext -- confirm before changing.
    stem = file_name.replace(".*", "")
    return f"{params_dir}/{stem}.csv"


def _json_default(value):
    """Fallback encoder for values ``json`` cannot serialise natively."""
    # Objects exposing ``item()`` (e.g. numpy scalars) are reduced to the plain
    # Python value; anything else is stored as its string form.
    if hasattr(value, "item"):
        return value.item()
    return str(value)


def _parse_param_value(raw: str):
    """Decode one stored value, accepting JSON and legacy ``str()`` output."""
    import ast

    parsers = (
        json.loads,  # current format written by params_to_file
        ast.literal_eval,  # legacy files: True / None / (100,) ...
        lambda text: json.loads(text.lower()),  # previous reader's behaviour
    )
    for parse in parsers:
        try:
            return parse(raw)
        except (ValueError, SyntaxError, TypeError):
            continue
    # Bare words such as relu or adam are plain string parameters.
    return raw


def params_to_file(
    file_name: str,
    params: dict[str, object],
    params_dir: str = _DEFAULT_PARAMS_DIR,
) -> None:
    """Write ``params`` to ``<params_dir>/<file_name>.csv``.

    The file has a ``Parameter,Value`` header followed by one row per entry.
    Values are JSON-encoded so that ``params_from_file`` restores booleans,
    numbers, ``None`` and strings with their original type. Tuples are stored
    as JSON arrays and therefore come back as lists.

    Args:
        file_name: Name of the parameter set (without ``.csv``).
        params: Mapping of parameter name to value.
        params_dir: Target directory; created if it does not exist.
    """
    path = _params_csv_path(file_name, params_dir)
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Parameter", "Value"])
        for key, value in params.items():
            writer.writerow([key, json.dumps(value, default=_json_default)])


def params_from_file(file_name: str, params_dir: str = _DEFAULT_PARAMS_DIR):
    """Read a parameter set written by ``params_to_file``.

    Files produced by the earlier plain ``str()`` format are accepted as well.

    Args:
        file_name: Name of the parameter set (without ``.csv``).
        params_dir: Directory containing the CSV file.

    Returns:
        Dict mapping each parameter name to its decoded value.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
    """
    file_name = _params_csv_path(file_name, params_dir)
    if not os.path.exists(file_name):
        raise FileNotFoundError(f"The file {file_name} does not exist.")

    hypertuned_params = {}
    with open(file_name, mode="r", newline="", encoding="utf-8") as infile:
        reader = csv.reader(infile)
        next(reader, None)  # skip the header; tolerate an empty file
        for row in reader:
            if len(row) < 2:
                continue  # blank or malformed line
            hypertuned_params[row[0]] = _parse_param_value(row[1])
    return hypertuned_params


In [2]:
def _train_test_sets(
    train: pd.DataFrame, test: pd.DataFrame
) -> Dict[DatasetCategory, pd.DataFrame]:
    """Pair a training and a test frame with their dataset categories."""
    return {DatasetCategory.TRAINING: train, DatasetCategory.TEST: test}


# Processed train/test splits; the first CSV column becomes the index.
df_train = pd.read_csv("../data/processed/SAIS_train.csv", index_col=0)
df_test = pd.read_csv("../data/processed/SAIS_test.csv", index_col=0)

# Tabularised variant, split by reusing the index labels of the frames above.
df_tab = pd.read_csv("../data/processed/SAIS_tabularised.csv", index_col=0)
df_train_tab = df_tab.loc[df_train.index]
df_test_tab = df_tab.loc[df_test.index]

# Tabularised weather observations (two variants).
df_weather_data_daily_tab = pd.read_csv(
    "../data/proprietary/weather/visual_crossing/weather_observed_days_tabularised.csv"
)
df_weather_data_48h_then_daily_tab = pd.read_csv(
    "../data/proprietary/weather/visual_crossing/weather_observed_tabularised_48_hours_then_days.csv"
)

# Each feature set is a (label, {category: frame}) pair, the shape expected by
# fit_and_compute_errors.
snowprofiles = ("snow profiles", _train_test_sets(df_train, df_test))

snowprofiles_whole_season = (
    "snow profiles across season",
    _train_test_sets(df_train_tab, df_test_tab),
)

weather_daily_whole_season = (
    "weather daily data across season",
    _train_test_sets(
        get_dataset_with_dependend_variabels(df_train, df_weather_data_daily_tab),
        get_dataset_with_dependend_variabels(df_test, df_weather_data_daily_tab),
    ),
)

weather_48h_then_daily_season = (
    "weather 48h hourly then daily data across season",
    _train_test_sets(
        get_dataset_with_dependend_variabels(
            df_train, df_weather_data_48h_then_daily_tab
        ),
        get_dataset_with_dependend_variabels(
            df_test, df_weather_data_48h_then_daily_tab
        ),
    ),
)

In [None]:
# Majority class of the training target; used by the constant baseline below.
most_frequent_value = df_train["mapped_hazard_forecast"].value_counts().index[0]

errs_summary = MulticlassErrorMetrics(
    dataset_name="sais",
    classes=sorted(df_train["mapped_hazard_forecast"].unique()),
    y_true_train=df_train["mapped_hazard_forecast"],
    y_true_dev=None,
    y_true_test=df_test["mapped_hazard_forecast"],
)


def run(model, data):
    """Fit ``model`` on ``data`` and record its errors in ``errs_summary``.

    Args:
        model: ``(name, estimator)`` pair.
        data: ``(label, {category: frame})`` pair describing the feature set.
    """
    return fit_and_compute_errors(
        errs=errs_summary,
        independent_variables_and_sets=data,
        model_name_and_pipeline=model,
    )


# Baseline 1: always predict the most frequent training class.
model_name_const = "const"
errs_summary.compute_errors_all_sets(
    model_name_const,
    None,
    most_frequent_value,
    None,
    most_frequent_value,
)
# Baseline 2: use the observed hazard column as the prediction.
model_name_obs = "observed"
errs_summary.compute_errors_all_sets(
    model_name_obs,
    None,
    df_train["mapped_hazard_observed"],
    None,
    df_test["mapped_hazard_observed"],
)


def _scaled_pipeline(model) -> Pipeline:
    """Wrap ``model`` in a pipeline with its own ``StandardScaler`` step."""
    # A Pipeline fits the step objects it is given, so every pipeline needs a
    # separate scaler; sharing one instance would let fitting any pipeline
    # overwrite the scaling statistics used by the others.
    return Pipeline([("scaler", StandardScaler()), ("model", model)])


# Kept so that cells referring to `scaler` keep working; the pipelines below no
# longer share this instance.
scaler = StandardScaler()

softmax = (
    "softmax",
    _scaled_pipeline(LogisticRegression(random_state=1, max_iter=int(1e4))),
)

random_forest = (
    "random forest",
    _scaled_pipeline(RandomForestClassifier(random_state=1)),
)

mlp = (
    "MLP",
    _scaled_pipeline(MLPClassifier(random_state=1, early_stopping=True)),
)


# NOTE(review): later cells reference `mlp_tuned`, but its definition is
# disabled here; the recorded output of this cell shows a JSONDecodeError.
# Re-enable it once the "mlp_hypertuned" parameter file loads cleanly.
# mlp_tuned = (
#     "MLP (tuned)",
#     _scaled_pipeline(
#         MLPClassifier(random_state=1, early_stopping=True).set_params(
#             **params_from_file("mlp_hypertuned")
#         )
#     ),
# )


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [34]:
# Model / feature-set combinations that are currently disabled. Each `run`
# call fits the model on the training split and records its errors in
# `errs_summary`.
# run(softmax, snowprofiles)

# run(softmax, snowprofiles_whole_season)

# run(softmax, weather_daily_whole_season)

# run(softmax, weather_48h_then_daily_season)

# run(random_forest, weather_48h_then_daily_season)

# run(mlp, weather_48h_then_daily_season)

# NOTE(review): `mlp_tuned` must exist before this line runs; its definition
# is commented out earlier in the notebook.
run(mlp_tuned, weather_48h_then_daily_season)

# Presumably persists the collected metrics/figures -- confirm in
# utils.error_metrics.
errs_summary.save_assets()

# Last expression of the cell, so its value is shown as the cell output.
errs_summary.get_error_table_for_set(DatasetCategory.TEST)

InvalidParameterError: The 'early_stopping' parameter of MLPClassifier must be an instance of 'bool' or an instance of 'numpy.bool'. Got 'True' instead.

In [5]:
# Confusion matrix on the test split for the tuned MLP with the
# "48h hourly then daily" weather features.
# NOTE(review): needs `mlp_tuned` to be defined and this model/feature-set
# combination to have been recorded via `run(...)` beforehand; the recorded
# output of this cell shows a "combination not found" ValueError.
errs_summary.show_confusion_matrix(
    model_name=mlp_tuned[0],
    independent_variables=weather_48h_then_daily_season[0],
    set_cat=DatasetCategory.TEST,
)

ValueError: Model (MLP tuned) and indebendent variables (weather 48h hourly then daily data across season) combination not found

## Hyperparameter tuning

In [None]:
# Switches for the (long-running) searches below.
rerun_mlp = False
rerun_random_forest = True

# long running
# TODO: the random-forest search is not implemented yet; this branch only
# creates an empty grid.
if rerun_random_forest:
    grid = {}

# long running
if rerun_mlp:

    # Candidate values; the "model__" prefix addresses the "model" step of the
    # pipeline.
    grid = {
        "model__hidden_layer_sizes": [
            (100,),
            (1000,),
            (100, 300, 100),
            (10, 30, 10),
            (20,),
        ],
        "model__activation": ["relu", "logistic", "tanh"],
        "model__solver": ["sgd", "adam"],
        "model__alpha": [0.0001, 0.001, 0.01],
        "model__learning_rate": ["constant", "invscaling", "adaptive"],
        "model__momentum": [0.1, 0.5, 0.9],
    }

    # 20 sampled candidates, each scored by macro F1 over 10-fold CV repeated
    # 3 times.
    random_search_mlp = RandomizedSearchCV(
        estimator=mlp[1],
        n_jobs=-1,
        cv=RepeatedKFold(n_splits=10, n_repeats=3, random_state=1),
        param_distributions=grid,
        scoring="f1_macro",
        n_iter=20,
        # Seed the candidate sampling so a re-run evaluates the same
        # combinations (same seed as used elsewhere in this notebook).
        random_state=1,
    )

    run(("MLP (tuned)", random_search_mlp), weather_48h_then_daily_season)

    # Parameters of the "model" step of the refitted best pipeline.
    # NOTE(review): not persisted in the code visible here; presumably they
    # should be written with params_to_file so a later session can reload them.
    best_params = random_search_mlp.best_estimator_["model"].get_params()

