In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (
    f1_score,
    precision_recall_fscore_support,
    roc_auc_score,
    roc_curve,
    auc,
    precision_recall_curve,
)
from catboost import Pool, CatBoostClassifier
import mlflow
from mlflow.types.schema import Schema, ColSpec
import optuna
import shap
shap.initjs()
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from var import DATA_OUT, MODEL_CATB, IMAGE_OUT, FORECAST_HOURS_IN_ADVANCE
from src.forecast import instantiate_and_fit_model, objective, get_or_create_experiment
from src.var import ML_SERVER_URI, EXPERIMENT_NAME

# Optuna logging level: WARNING (INFO logs, e.g. one line per trial, are suppressed)
optuna.logging.set_verbosity(optuna.logging.WARNING)

# MLflow web server URI: every mlflow call below talks to this tracking server
mlflow.set_tracking_uri(ML_SERVER_URI)

In [None]:
# Load the pre-built dataset.
# NOTE: unpickling can execute arbitrary code - only read pickles produced by this project.
df = pd.read_pickle(Path(DATA_OUT, 'df_dataset.pickle'))

# Encode the boolean target as 0/1 integers.
# `astype(int)` replaces `.replace({True: 1, False: 0})`, which only produced an
# integer column thanks to pandas' deprecated silent downcasting. The cast is
# explicit, raises on missing labels, and is a no-op when the cell is re-run.
target_col = f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h'
df[target_col] = df[target_col].astype(int)

In [None]:
# Explicitly named feature columns
base_features = [
    'ie_fix',
    'ie_variation',
    'il_fix',
    'il_variation',
    'iu_fix',
    'iu_variation',
    'io_fix',
]

# <index>_mav_<hours>h columns: ie, iu, il (in this order), each at 3/6/12/24 hours
mav_features = [
    f'{index_}_mav_{hours_}h'
    for index_ in ('ie', 'iu', 'il')
    for hours_ in (3, 6, 12, 24)
]

other_features = [
    'hf',
    'f_107_adj',
    'hp_30',
    'smr',
    'solar_zenith_angle',
    'bz',
    'vx',
    'rho',
]

# Column families selected by name prefix; within a family the order of df.columns is kept
prefixed_features = [
    col_
    for prefix_ in ('local_warning', 'spectral_contribution', 'azimuth', 'velocity')
    for col_ in df.columns
    if col_.startswith(prefix_)
]

feature_columns = base_features + mav_features + other_features + prefixed_features

# Feature matrix and target are copies, so later edits never write back into df
X = df[feature_columns].copy()

target_name = f'tid_within_{FORECAST_HOURS_IN_ADVANCE}h'
y = df[target_name].copy()

In [None]:
# Categorical features: every '*variation' and 'local_warning*' column of the
# feature matrix. The names are taken from X (not df) so that a column present
# in df but not selected as a feature can never be declared categorical -
# CatBoost rejects categorical names that are missing from the data it receives.
cat_features = [
    *[col_ for col_ in X.columns if col_.endswith('variation')],
    *[col_ for col_ in X.columns if col_.startswith('local_warning')],
]

# CatBoost parameters that stay fixed during the hyper-parameter search
static_params = {
    "eval_metric": "F1:use_weights=True",
    "random_seed": 42,
    # "auto_class_weights": "SqrtBalanced", # "Balanced",
    "cat_features": cat_features,
    "od_type": "Iter",
    "use_best_model": True,
    "has_time": True,
    "od_wait": 200,
}

In [None]:
# Alternative kept for reference: fixed-size test folds of `n_days_for_testing` days
# n_days_for_testing = 365
# ts_cv = TimeSeriesSplit(n_splits=5, test_size=n_days_for_testing*24*2)

# Time-ordered cross-validation with 5 splits and scikit-learn's default test size
ts_cv = TimeSeriesSplit(n_splits=5)

In [None]:
# Disabled: visualise each train/test fold of `ts_cv` on the 'hf' column.
# (The savefig call below was fixed while commented out: dpi / bbox_inches are
# arguments of plt.savefig, not of Path.)
#
# fig, axs = plt.subplots(5, 1, figsize=(40, 20), sharex=True)
# 
# for fold, (train_idx, val_idx) in enumerate(ts_cv.split(X)):
#     train = X.iloc[train_idx]
#     test = X.iloc[val_idx]
#     train['hf'].plot(
#         ax=axs[fold],
#         title=f'Train/Test split - fold {fold + 1}',
#     )
#     test['hf'].plot(ax=axs[fold])
#     axs[fold].axvline(test.index.min(), color='black', ls='--')
#     axs[fold].set_ylabel('HF index')
# 
# plt.savefig(
#     Path(IMAGE_OUT, 'train_test_split.png'), dpi=500, bbox_inches='tight'
# )
# plt.show()

## Optuna (hyper-params optimisation)

In [None]:
def obj(trial):
    """Optuna objective: score one trial with `objective` on the time-series CV folds."""
    return objective(
        trial,
        X=X,
        y=y,
        cv=ts_cv,
        params=static_params,
    )


# TPE is Optuna's default sampler; it is instantiated explicitly only to seed it
# (with the same seed as the model), so that re-running the notebook reproduces
# the same sequence of trials and therefore the same best parameters.
study = optuna.create_study(
    study_name='catboost_clf',
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=static_params['random_seed']),
)
study.optimize(obj, n_trials=25, show_progress_bar=True)

trial = study.best_trial

In [None]:
# Summary of the best trial: objective value first, then one line per tuned parameter
report_lines = [
    "Best trial:",
    f"  F1: {trial.value:.3f}",
    "  Params: ",
    *(f"    {key}: {value}" for key, value in trial.params.items()),
]
print("\n".join(report_lines))

In [None]:
# Objective value of every trial, in trial order
optuna.visualization.plot_optimization_history(study)

In [None]:
# Relative importance of each tuned hyper-parameter for the objective
optuna.visualization.plot_param_importances(study)

In [None]:
# Fine-tuned hyperparams
best_params = study.best_params
# Merged in place: from here on `static_params` holds fixed + tuned parameters
static_params.update(best_params)

## Fine-tuned and cross-validated model

**[Model signatures](https://www.mlflow.org/docs/latest/models.html#model-signature)** define what the model expects (input, output and parameters) and enforce it later in deployment.

Signatures are fetched by the Tracking UI and Model Registry UI to display model inputs, outputs and params; they are also used by MLflow model deployment tools to validate inference inputs according to the model’s assigned signature.

In [None]:
# Select the MLflow experiment whose id is returned by get_or_create_experiment;
# the trailing `;` hides the returned object from the cell output
mlflow.set_experiment(
    experiment_id=get_or_create_experiment(EXPERIMENT_NAME)
);

In [None]:
# Input schema: one column spec per feature of X.
# NOTE(review): every column is declared "double", including the ones listed in
# `cat_features` - confirm this matches their actual dtype.
input_schema = Schema(
    [ColSpec("double", col_) for col_ in X.columns]
)

# Output schema: a single "long" column named after the target
output_schema = Schema(
    [ColSpec("long", y.name)]
)

signature = mlflow.models.ModelSignature(inputs=input_schema, outputs=output_schema)

In [None]:
# Score an already trained model (loaded from a previous MLflow run) on the test
# part of every time-series fold and collect the F1 of the positive class.
# NOTE(review): the same pre-trained model is used for all folds; if it was fitted
# on data overlapping these test windows the scores are optimistic - confirm.
with mlflow.start_run() as ml_run:
    run_id = '1928ac3df36f475ea43404e29ba66793'
    cat_model = mlflow.catboost.load_model(f'runs:/{run_id}/model')

    mlflow.log_params(
        cat_model.get_all_params()
    )

    f1s = []
    for i, (train_idx, test_idx) in enumerate(ts_cv.split(X)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # cat_model = instantiate_and_fit_model(
        #     X_train=X_train,
        #     y_train=y_train,
        #     X_test=X_test,
        #     y_test=y_test,
        #     loss_function='Logloss',
        #     params=static_params,
        # )
        # 
        # # See GitHub issue #8044 (MLflow) to understand why we need i
        # params = {f'{param}_{i}': value for param, value in cat_model.get_all_params().items()}
        # mlflow.log_params(params)

        y_pred = cat_model.predict(X_test)

        # `labels=[0, 1]` pins the layout of the returned arrays: without it a fold
        # containing a single class (in both y_test and y_pred) yields length-1
        # arrays and `f1[1]` raises IndexError.
        prcs, rcll, f1, supp = precision_recall_fscore_support(
            y_test, y_pred, labels=[0, 1]
        )
        f1s.append(f1[1])
        # mlflow.log_metrics(
        #     {
        #         'f1_0': f1[0], 'precision_0': prcs[0], 'recall_0': rcll[0],
        #         'f1_1': f1[1], 'precision_1': prcs[1], 'recall_1': rcll[1]
        #     }
        # )
        # mlflow.catboost.log_model(cat_model, 'model', signature=signature)

In [None]:
# Mean and sample standard deviation (ddof=1) of the per-fold positive-class F1
print(f'Achieved F1-score: {np.mean(f1s):.2f} ± {np.std(f1s, ddof=1):.2f}')

In [None]:
# Weight of a fold = share of all samples that fall in its training split
n_samples = X.shape[0]
weights = [len(tr_idx) / n_samples for tr_idx, _ in ts_cv.split(X)]

# Weighted mean of the per-fold F1 scores
weighted_total = sum(w_ * f1 for f1, w_ in zip(f1s, weights))
w_avg = weighted_total / sum(weights)

# The spread reported next to the weighted mean is the unweighted sample std
print(f'Achieved F1-score (weighted): {w_avg:.2f} ± {np.std(f1s, ddof=1):.2f}')

## Load model from an MLflow run

In [None]:
# Id of the MLflow run whose logged 'model' artifact is loaded
# (same id as in the cross-validation cell above)
run_id = '1928ac3df36f475ea43404e29ba66793'

cat_model = mlflow.catboost.load_model(f'runs:/{run_id}/model')

## SHAP

In [None]:
# `feat_importances` was used here without ever being defined, so the cell failed
# with NameError on a fresh kernel. Build it from the loaded model: CatBoost's
# default feature importance, indexed by the model's own feature names.
feat_importances = pd.Series(
    cat_model.get_feature_importance(),
    index=cat_model.feature_names_,
    name='importance',
)

# Horizontal bar chart, least important feature at the bottom
px.bar(
    feat_importances.sort_values(),
    orientation='h',
    template='simple_white',
)

In [None]:
# X_test / y_test are the ones left behind by the last fold of the CV loop
shap_pool = Pool(X_test, label=y_test, cat_features=cat_features)
shap_matrix = cat_model.get_feature_importance(shap_pool, type="ShapValues")

# Keep one column per feature by dropping the trailing column
# (presumably the expected value, per the CatBoost docs - confirm)
shap_values = shap_matrix[:, :-1]

In [None]:
# Disabled variant that saves the summary plot (top 11 features) to IMAGE_OUT
# shap.summary_plot(shap_values, X_test, show=False, max_display=11)
# plt.savefig(
#     Path(IMAGE_OUT, 'shap_summary.png'),
#     dpi=300,
# )

# SHAP summary plot on the last test fold, limited to 20 features
shap.summary_plot(shap_values, X_test, max_display=20)

In [None]:
# SHAP dependence of 'spectral_contribution_ro', using 'hf' as the
# interaction feature
shap.dependence_plot(
    'spectral_contribution_ro',
    shap_values,
    X_test,
    interaction_index='hf',
)

In [None]:
from src.io import read_time_series
from var import DATA_IN

# Read the TID catalogue with the project's time-series reader, naming its columns
# explicitly. NOTE(review): presumably 'datetime' becomes the index, since the next
# cell slices by month with .loc - confirm in src.io.read_time_series.
df_tid = read_time_series(
    Path(DATA_IN, 'TID_catalog.csv'),
    column_names=[
        'duration',
        'period',
        'amplitude',
        'spectral_contribution',
        'velocity',
        'azimuth',
        'quality_index',
        'datetime',
    ],
)

In [None]:
# Catalogue entries selected by the partial-date label '2022-12'
df_tid.loc['2022-12']

In [None]:
# Positional index, within the last test fold, of the timestamp to explain
row = X_test.index.get_loc('2022-03-11 23:00') # 16:00 - 22:00

explainer = shap.TreeExplainer(cat_model)
# NOTE(review): `shap_values_` is computed here but never used - the force plot
# below reads `shap_values` from the earlier CatBoost cell. Either drop this
# computation or plot `shap_values_[row, :]`; confirm which one was intended.
shap_values_ = explainer.shap_values(X_test)

# Force plot for a single row, displayed through the logit link
shap.force_plot(
    explainer.expected_value,
    shap_values[row,:],
    X_test.iloc[row,:],
    link='logit',
)

#.savefig(
#    Path(IMAGE_OUT, 'shap_force.png'),
#    dpi=300,
#)

In [None]:
# True label vs predicted label for the explained row (last test fold)
y_test.iloc[row], y_pred[row]

In [None]:
# Share of last-fold predictions that are the positive class
np.count_nonzero(y_pred == 1) / len(y_pred)

## Evaluation of classification

In [None]:
# Evaluation frame: a copy of all features plus the true label, the predicted
# label and the predicted probability of the positive class.
# Predictions cover the whole of X, not only a held-out part.
df_eval = X.assign(
    true=y,
    pred=cat_model.predict(X),
    pred_pr=cat_model.predict_proba(X)[:, 1],
)

In [None]:
# F1 of the predicted labels over the whole evaluation frame
f1_score(
    df_eval['true'],
    df_eval['pred'],
)

In [None]:
# Area under the ROC curve, from the predicted positive-class probabilities
roc_auc = roc_auc_score(
    df_eval['true'],
    df_eval['pred_pr'],
)

In [None]:
# Points of the ROC curve (false/true positive rate per threshold)
fpr, tpr, thresholds = roc_curve(
    df_eval['true'],
    df_eval['pred_pr'],
)

In [None]:
# Points of the precision-recall curve: p = precision, r = recall, t = thresholds
# (scikit-learn returns one more precision/recall value than thresholds)
p, r, t = precision_recall_curve(
    df_eval['true'],
    df_eval['pred_pr'],
    drop_intermediate=True
)

In [None]:
# Area under the precision-recall curve (recall on x, precision on y)
pr_auc = auc(r, p)

In [None]:
# F1 at every point of the PR curve. Where precision and recall are both 0
# (thresholds above which only negatives are retrieved) the plain formula is 0/0
# = NaN, and np.argmax returns the position of the first NaN instead of the best
# F1. F1 is therefore defined as 0 at those points.
pr_sum = p + r
f1_scores = np.divide(
    2 * (p * r),
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# p and r have one more entry than t (the final precision=1 / recall=0 point has
# no threshold), so the best threshold is searched only where t is defined.
opt_thr = t[np.argmax(f1_scores[:-1])]

In [None]:
# Threshold with the highest F1, and that F1 value
opt_thr, f1_scores[np.argmax(f1_scores)]

In [None]:
# Positions on the PR curve where precision lies in [0.80, 0.8001)
np.where(np.logical_and(p>=0.80, p<0.8001))

In [None]:
# NOTE(review): hard-coded position on the PR curve, presumably copied by hand
# from the np.where output of the previous cell - confirm. It goes stale (or out
# of range) as soon as the data or the model changes.
idx = 13980

# Precision, recall and F1 at that position
p[idx].round(3), r[idx].round(3), f1_scores[idx].round(3)

In [None]:
# Dashed diagonal from (0, 0) to (1, 1), drawn as a reference line
roc_diagonal = dict(
    type='line',
    x0=0,
    y0=0,
    x1=1,
    y1=1,
    line=dict(color='navy', width=2, dash='dash'),
)

# ROC curve: one marker per (FPR, TPR) point
fig = px.scatter(x=fpr, y=tpr)
fig.update_layout(
    # height=800,
    # width=800,
    # autosize=False,
    shapes=[roc_diagonal],
    title=f'ROC Curve (ROC-AUC: {roc_auc:.2f})',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    template='ggplot2',
)

# fig.write_html(
#     Path(IMAGE_OUT,f'plot_roc_curve.html')
# )

fig.show()

In [None]:
# No-skill reference for a precision-recall curve: a classifier that ignores the
# features has, at every recall level, a precision equal to the share of
# positives. The reference is therefore a horizontal line at the prevalence
# (the former (0, 1) -> (1, 0) diagonal has no such meaning on a PR plot).
prevalence = df_eval['true'].mean()

# PR curve: one marker per (recall, precision) point
fig = px.scatter(x=r, y=p)

fig.update_layout(
    # height=800,
    # width=800,
    # autosize=False,
    shapes=[
        dict(
            type='line',
            x0=0,
            y0=prevalence,
            x1=1,
            y1=prevalence,
            line=dict(color='navy', width=2, dash='dash'),
        )
    ],
    title=f'PR Curve (PR-AUC: {pr_auc:.2f})',
    xaxis=dict(title='Recall'),
    yaxis=dict(title='Precision'),
    template='ggplot2',
)

# fig.write_html(
#     Path(IMAGE_OUT,f'plot_pr_curve.html')
# )

fig.show()

## Plot features vs target

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Period shown in the figure (partial-date label used to slice df_eval)
period = '2022-03'

# Columns to draw, one stacked subplot each
plot_columns = ['iu_mav_6h','iu_fix','hf','smr','true','pred']
df_plt = df_eval.loc[period, plot_columns]

n_cols = len(df_plt.columns)

fig = make_subplots(
    rows=n_cols,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.04,
    subplot_titles=df_plt.columns,
)

# One line trace per column, placed in its own row (rows are 1-based)
for i, col in enumerate(df_plt.columns, start=1):
    series_ = df_plt[col]
    trace_ = go.Scatter(
        x=series_.index,
        y=series_.values,
        name=col,
    )
    fig.add_trace(trace_, row=i, col=1)

layout_options = dict(
    template='plotly_white',
    height=800,
    width=1_000,
    autosize=False,
    title=f'Period: <b>{period}</b>',
)
fig.update_layout(**layout_options)

fig.show()

# fig.write_html(
#     Path(IMAGE_OUT,f'plot_features_target_{period}.html')
# )