In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import f1_score, classification_report
from catboost import Pool
import optuna
import shap
import plotly.express as px

from var import DATA_OUT
from src.forecast import instantiate_and_fit_model, objective

In [None]:
# Load the dataset from DATA_OUT.
# NOTE: unpickling can execute arbitrary code; only read files this project produced.
df = pd.read_pickle(Path(DATA_OUT, 'df_dataset.pickle'))

# Encode the target as integers (True -> 1, False -> 0).
# An explicit cast is used instead of `.replace({True: 1, False: 0})`: the
# implicit dtype downcasting `replace` relies on is deprecated in recent pandas
# and can emit a FutureWarning. The cast raises if the column holds missing
# values instead of silently passing them through.
df['tid_within_3h'] = df['tid_within_3h'].astype('int64')

In [None]:
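# Disabled: manual inverse-frequency class weights, kept for reference only.
# `params` sets 'auto_class_weights': 'Balanced' instead. Note that `y_train`
# is only defined inside the cross-validation loop further down.
# class_weights = [
#     1 / y_train.eq(False).sum(),
#     1 / y_train.eq(True).sum(),
# ]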

In [None]:
# Model inputs: the columns used as features.
feature_columns = [
    'ie_fix',
    'ie_pct_change',
    'ie_mav_3h',
    'ie_mav_6h',
    'ie_mav_12h',
    'ie_mav_24h',
    'hf',
    'f_107_adj',
    'solar_zenith_angle',
]
X = df.loc[:, feature_columns].copy()

# Target column; copied so later changes to `df` do not leak into `y`.
y = df.loc[:, 'tid_within_3h'].copy()

In [None]:
# Base model settings handed to the Optuna objective.
# Options that are currently switched off are kept as comments for reference.
params = dict(
    # iterations=2_000,  # 1_000
    eval_metric='F1:use_weights=True',  # alternatives: ['Precision', 'Recall']
    random_seed=42,
    # class_weights=class_weights,
    auto_class_weights='Balanced',
    # cat_features=cat_features,
    # verbose=True,
    od_type='Iter',
    # od_wait=200,
    use_best_model=True,
    has_time=True,
)

In [None]:
# Time-ordered cross-validation: for every fold, tune hyperparameters with
# Optuna, refit with the best ones, and record the F1 on the fold's test block.
ts_cv = TimeSeriesSplit(n_splits=5)

f1s = []  # one F1 score per fold
for train_idx, test_idx in ts_cv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # NOTE(review): the same (X_test, y_test) block is passed to the Optuna
    # objective and then used for the reported F1, so hyperparameters are
    # selected on the data they are scored on. The scores below are therefore
    # likely optimistic; consider tuning on a validation slice carved out of
    # the training fold instead.

    # Bind this fold's data and the base `params` to the Optuna objective.
    def objective_partial(trial):
        return objective(
            trial,
            X_train,
            y_train,
            X_test,
            y_test,
            params,
        )

    # Hyperparameter search with Optuna. The sampler is seeded so the search
    # (and hence the reported scores) is reproducible across runs.
    study = optuna.create_study(
        study_name='catboost_clf',
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=params['random_seed']),
    )
    study.optimize(objective_partial, n_trials=10)

    # Retrieve the best hyperparameters found for this fold.
    best_params = study.best_params

    # `study.best_params` only holds the values suggested by the trial, so the
    # base settings (class balancing, eval metric, seed, early stopping, ...)
    # are merged back in; tuned values win on key collisions.
    # NOTE(review): assumes `instantiate_and_fit_model` accepts the same keys
    # that `objective` receives through `params` - confirm in src.forecast.
    cat_model = instantiate_and_fit_model(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        loss_function='Logloss',
        params={**params, **best_params},
    )

    y_pred = cat_model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    f1s.append(f1)
    print(classification_report(y_test, y_pred))

# After the loop, `study`, `cat_model`, `X_test` and `y_test` refer to the
# last fold only; the cells below rely on that.

In [None]:
# Optimization history of `study`. The CV loop rebinds `study` on every fold,
# so this shows the search of the last fold only.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Hyperparameter importances for `study` (the last fold's study, since the CV
# loop rebinds the name on every fold).
optuna.visualization.plot_param_importances(study)

In [None]:
# Mean and sample standard deviation (ddof=1) of the per-fold F1 scores,
# rounded to two decimals.
f1_mean = np.mean(f1s)
f1_std = np.std(f1s, ddof=1)
(np.round(f1_mean, 2), np.round(f1_std, 2))

In [None]:
# Median of the per-fold F1 scores, rounded to two decimals.
f1_median = np.median(f1s)
np.round(f1_median, 2)

## SHAP

In [None]:
# SHAP values of `cat_model` on `X_test`; after the CV loop both refer to the
# last fold.
eval_pool = Pool(X_test, label=y_test)  # , cat_features=cat_features
shap_with_last_col = cat_model.get_feature_importance(
    eval_pool,
    type="ShapValues",
)

# Drop the trailing column before plotting.
# NOTE(review): presumably the expected-value term CatBoost appends - confirm.
shap_values = shap_with_last_col[:, :-1]

In [None]:
# Load SHAP's JavaScript support into the notebook; run before the
# `shap.force_plot` cell further down.
shap.initjs()

In [None]:
# Summary plot of the SHAP values computed above, with `X_test` supplying the
# feature values.
shap.summary_plot(
    shap_values,
    features=X_test,
)

In [None]:
# SHAP dependence of 'hf', with 'solar_zenith_angle' as the interaction feature.
shap.dependence_plot(
    ind='hf',
    shap_values=shap_values,
    features=X_test,
    interaction_index='solar_zenith_angle',
)

In [None]:
# SHAP dependence of 'ie_fix', with 'ie_mav_12h' as the interaction feature.
shap.dependence_plot(
    ind='ie_fix',
    shap_values=shap_values,
    features=X_test,
    interaction_index='ie_mav_12h',
)

In [None]:
# Last ten rows whose target equals 1 (used to pick a timestamp to explain).
is_positive = df['tid_within_3h'] == 1
df.loc[is_positive].tail(10)

In [None]:
# Explain a single prediction with a SHAP force plot.

# Positional index, within `X_test`, of the timestamp to explain. The label
# must be present in `X_test.index` (the last CV fold's test block), otherwise
# `get_loc` raises KeyError.
row = X_test.index.get_loc('2022-12-24 02:30') # 16:00 - 22:00

explainer = shap.TreeExplainer(cat_model)
# Compute the SHAP values with the same explainer that supplies the base value
# so both inputs of the force plot come from one source.
# NOTE(review): for CatBoost models `explainer.expected_value` appears to be
# populated by this call - keep it before `force_plot`; confirm for the
# installed shap version.
shap_values_ = explainer.shap_values(X_test)

# Previously this plotted `shap_values` left over from an earlier cell and
# never used `shap_values_`; the explainer's own output is used instead.
shap.force_plot(
    explainer.expected_value,
    shap_values_[row, :],
    X_test.iloc[row, :],
)