In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import f1_score, precision_recall_fscore_support, classification_report
from catboost import Pool, CatBoostClassifier
import mlflow
from mlflow.types.schema import Schema, ColSpec
import optuna
import shap
shap.initjs()
import plotly.express as px
import matplotlib.pyplot as plt

from var import DATA_OUT, MODEL_CATB, IMAGE_OUT, FORECAST_HOURS_IN_ADVANCE
from src.forecast import instantiate_and_fit_model, objective, get_or_create_experiment
from src.var import ML_SERVER_URI, EXPERIMENT_NAME

# Send all MLflow tracking calls in this notebook to the project's tracking server.
mlflow.set_tracking_uri(ML_SERVER_URI)

# Keep Optuna quiet: only messages at WARNING level or above are shown,
# so INFO logs are suppressed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Load the prepared dataset.
# NOTE(review): unpickling can execute arbitrary code; only load files that were
# produced by this project's own pipeline.
df = pd.read_pickle(Path(DATA_OUT, 'df_dataset.pickle'))

# Encode the True/False target as 0/1 integers.
# An explicit cast replaces `replace({True: 1, False: 0})`, which relies on
# implicit downcasting (deprecated in recent pandas releases). The cast also
# fails loudly if the target unexpectedly contains missing values.
target_col = f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h'
df[target_col] = df[target_col].astype('int64')

In [None]:
# Columns of `df` used as model inputs, in the order they are fed to the model.
feature_columns = [
    'ie_fix', 'ie_variation',
    'il_fix', 'il_variation',
    'iu_fix', 'iu_variation',
    'io_fix',
    'ie_mav_3h', 'ie_mav_6h', 'ie_mav_12h', 'ie_mav_24h',
    'iu_mav_3h', 'iu_mav_6h', 'iu_mav_12h', 'iu_mav_24h',
    'il_mav_3h', 'il_mav_6h', 'il_mav_12h', 'il_mav_24h',
    'hf',
    'f_107_adj',
    'hp_30',
    'smr',
    'solar_zenith_angle',
]

# Independent copies, so later edits to X / y never touch `df`.
X = df[feature_columns].copy()

# Target: the event flag for the configured forecast horizon.
target_name = f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h'
y = df[target_name].copy()

In [None]:
# Feature columns handed to CatBoost as categorical.
cat_features = ['ie_variation', 'il_variation', 'iu_variation']

# CatBoost settings shared by the Optuna objective and the final
# cross-validated fit; the tuned hyper-parameters are merged in later.
static_params = dict(
    eval_metric="F1:use_weights=True",
    random_seed=42,
    auto_class_weights="SqrtBalanced",  # alternative tried: "Balanced"
    cat_features=cat_features,
    od_type="Iter",
    use_best_model=True,
    has_time=True,
    od_wait=200,
)

In [None]:
# Time-ordered cross-validation splitter, shared by the Optuna search and the
# final evaluation loop.
n_cv_splits = 5
ts_cv = TimeSeriesSplit(n_splits=n_cv_splits)

## Optuna (hyper-params optimisation)

In [None]:
def obj(trial):
    """Optuna objective: score one trial using the shared data, CV splitter and fixed params."""
    return objective(
        trial,
        X=X,
        y=y,
        cv=ts_cv,
        params=static_params,
    )


# Seed the sampler so that a fresh run of the notebook explores the same
# sequence of trials; an unseeded sampler makes the search (and therefore the
# selected hyper-parameters) change from run to run.
sampler = optuna.samplers.TPESampler(seed=static_params["random_seed"])

study = optuna.create_study(
    study_name='catboost_clf',
    direction='maximize',
    sampler=sampler,
)
study.optimize(obj, n_trials=40, show_progress_bar=True)

# Best trial found by the search; inspected in the following cells.
trial = study.best_trial

In [None]:
# Text report of the best trial: its objective value, then one line per
# sampled hyper-parameter.
report_lines = ["Best trial:", f"  F1: {trial.value:.3f}", "  Params: "]
report_lines.extend(
    f"    {name}: {setting}" for name, setting in trial.params.items()
)
print("\n".join(report_lines))

In [None]:
# Optuna's optimisation-history figure for the study; the bare last expression
# renders it in the notebook.
history_fig = optuna.visualization.plot_optimization_history(study)
history_fig

In [None]:
# Optuna's hyper-parameter importance figure for the study; the bare last
# expression renders it in the notebook.
importances_fig = optuna.visualization.plot_param_importances(study)
importances_fig

In [None]:
# Overlay the tuned hyper-parameters on the fixed settings.
# `static_params` is modified in place: from here on it holds fixed + tuned values.
best_params = study.best_params
for param_name, param_value in best_params.items():
    static_params[param_name] = param_value

## Fine-tuned and cross-validated model

**[Model signatures](https://www.mlflow.org/docs/latest/models.html#model-signature)** define what the model expects (input, output and parameters) and enforce it later in deployment.

Signatures are fetched by the Tracking UI and the Model Registry UI to display the model's inputs, outputs and params; they are also used by MLflow's model deployment tools to validate inference inputs against the model's assigned signature.

In [None]:
# Resolve the experiment id via the project helper, then make it the active
# experiment. The trailing `;` keeps the returned object out of the cell output.
experiment_id = get_or_create_experiment(EXPERIMENT_NAME)
mlflow.set_experiment(experiment_id=experiment_id);

In [None]:
# Input schema: one "double" column per feature, in the column order of X.
# NOTE(review): the columns listed in `cat_features` are passed to CatBoost as
# categorical features; confirm that "double" is the right MLflow type for them.
feature_specs = [ColSpec("double", feature_name) for feature_name in X.columns]
input_schema = Schema(feature_specs)

# Output schema: a single "long" column named after the target series.
target_spec = ColSpec("long", y.name)
output_schema = Schema([target_spec])

signature = mlflow.models.ModelSignature(
    inputs=input_schema,
    outputs=output_schema,
)

In [None]:
# Cross-validate the tuned model inside a single MLflow run.
# Each fold's parameters and metrics are logged; the model of the last
# (most recent) fold is logged once at the end.
with mlflow.start_run() as ml_run:
    f1s = []  # positive-class F1, one entry per fold
    for i, (train_idx, test_idx) in enumerate(ts_cv.split(X)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        cat_model = instantiate_and_fit_model(
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
            loss_function='Logloss',
            params=static_params,
        )

        # See GitHub issue #8044 (MLflow) to understand why we need i
        params = {f'{param}_{i}': value for param, value in cat_model.get_all_params().items()}
        mlflow.log_params(params)

        y_pred = cat_model.predict(X_test)

        # Pin the label order so that index 0 / 1 always refers to class 0 / 1.
        # Without `labels`, a fold containing a single class yields length-1
        # arrays and the `[1]` lookups below raise IndexError.
        prcs, rcll, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, labels=[0, 1]
        )
        f1s.append(f1[1])

        # Use the fold index as the step so every fold is stored as its own
        # point in the metric history instead of all folds sharing one step.
        mlflow.log_metrics(
            {
                'f1_0': f1[0], 'precision_0': prcs[0], 'recall_0': rcll[0],
                'f1_1': f1[1], 'precision_1': prcs[1], 'recall_1': rcll[1]
            },
            step=i,
        )

    # Log the model once, after the loop. Logging inside the loop wrote every
    # fold to the same 'model' path, so only the last fold's model was the one
    # available afterwards anyway.
    mlflow.catboost.log_model(cat_model, 'model', signature=signature)

In [None]:
# Mean and sample standard deviation (ddof=1) of the per-fold positive-class F1.
f1_mean = np.mean(f1s)
f1_spread = np.std(f1s, ddof=1)
print(f'Achieved F1-score: {f1_mean:.2f} ± {f1_spread:.2f}')

## Load the model from an MLflow run

In [None]:
# MLflow run whose logged model is analysed in the cells below.
# NOTE(review): hard-coded run id; it must exist on the configured tracking server.
run_id = 'b286bda4b728430d86c901f7efcede74'

cat_model = mlflow.catboost.load_model(f'runs:/{run_id}/model')

# Recompute the predictions with the loaded model. Otherwise `y_pred` still
# holds the output of the last cross-validation fold's model, which is not
# necessarily the model explained and evaluated below.
y_pred = cat_model.predict(X_test)

## SHAP

In [None]:
# SHAP values from CatBoost for the rows of the last test fold
# (X_test / y_test are left over from the cross-validation loop).
test_pool = Pool(X_test, label=y_test, cat_features=cat_features)
shap_with_extra_column = cat_model.get_feature_importance(
    test_pool,
    type="ShapValues",
)

# Drop the last column so that exactly one column per feature remains
# (per the CatBoost docs that column holds the expected value; confirm).
shap_values = shap_with_extra_column[:, :-1]

In [None]:
# SHAP summary plot over the test fold, limited to the top features.
# To export the figure instead of only showing it, call summary_plot with
# show=False and then plt.savefig(Path(IMAGE_OUT, 'shap_summary.png'), dpi=300).
max_features_shown = 25
shap.summary_plot(shap_values, X_test, max_display=max_features_shown)

In [None]:
# SHAP dependence plot for one feature, with a second feature selected as the
# interaction index.
main_feature = 'iu_mav_24h'
interaction_feature = 'hf'

shap.dependence_plot(
    main_feature,
    shap_values,
    X_test,
    interaction_index=interaction_feature,
)

In [None]:
# Last 10 rows where the target is positive. The column name is built from
# FORECAST_HOURS_IN_ADVANCE (as everywhere else in the notebook) instead of
# hard-coding the 3-hour horizon.
is_positive = df[f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h'].eq(1)
df[is_positive].tail(10)

In [None]:
# Position, within the test fold, of the timestamp to explain.
row = X_test.index.get_loc('2022-12-01 14:00') # 16:00 - 22:00

explainer = shap.TreeExplainer(cat_model)
# Keep this call: the plot below uses its output, so the SHAP values and the
# expected value both come from the same explainer. Previously the result was
# computed but never used, and the plot read an array from an earlier cell.
shap_values_ = explainer.shap_values(X_test)

# Force plot for the single selected row.
shap.force_plot(
    explainer.expected_value,
    shap_values_[row, :],
    X_test.iloc[row, :],
    link='logit',
)

In [None]:
# Observed label and predicted label for the explained row, shown as a pair.
actual_label = y_test.iloc[row]
predicted_label = y_pred[row]
(actual_label, predicted_label)

In [None]:
# Share of test-fold predictions that are the positive class.
n_predicted_positive = np.count_nonzero(y_pred == 1)
n_predicted_positive / len(y_pred)

## Evaluation of classification

In [None]:
# Evaluation frame: all features plus the true label and the model prediction.
# NOTE(review): predictions are made on the whole of X, not only on a held-out
# fold; confirm this is intended for the plots below.
predictions = cat_model.predict(X)
df_eval = X.copy(deep=True).assign(true=y, pred=predictions)

In [None]:
# Line chart of true vs. predicted labels for the rows selected by '2022-11'.
true_vs_pred = df_eval.loc['2022-11', ['true', 'pred']]
px.line(true_vs_pred)

Study the lag of interest by computing a rolling, lagged correlation between the L1 parameters and the target.

In [None]:
# Rolling-window length in samples.
# NOTE(review): 2 * 24 per day presumably reflects 30-minute sampling; confirm.
n_days = 1
window = 2 * 24 * n_days

# Predictions shifted 4 samples earlier, then correlated with the true labels
# over the rolling window; the median of the rolling correlations is displayed.
shifted_pred = df_eval['pred'].shift(-4)
rolling_corr = df_eval['true'].rolling(window=window).corr(shifted_pred)
rolling_corr.median()