In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import f1_score, classification_report
from catboost import Pool, CatBoostClassifier
import optuna
import shap
import plotly.express as px

from var import DATA_OUT, MODEL_CATB, IMAGE_OUT
from src.forecast import instantiate_and_fit_model, objective

# Raise Optuna's log threshold to WARNING so per-trial INFO messages are not printed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Load the prepared dataset.
# NOTE(review): unpickling can execute arbitrary code — only read pickles that
# were produced by this project's own pipeline.
df = pd.read_pickle(Path(DATA_OUT, 'df_dataset.pickle'))

# Encode the target as 0/1 integers with an explicit cast. This replaces
# `replace({True: 1, False: 0})`, which depended on pandas silently downcasting
# the result dtype (deprecated in recent pandas). The cast is also idempotent:
# re-running the cell on an already-encoded column leaves it unchanged.
# A missing label makes the cast raise instead of passing through silently.
df['tid_within_3h'] = df['tid_within_3h'].astype('int64')

In [None]:
# Disabled exploratory plot (kept for reference, not executed):
# rolling correlation between `hf` and the `tid_within_3h` target for 2015-2017,
# with a window of 48 * n_days rows (presumably 48 rows per day — TODO confirm).
#
# df_plt = df.loc['2015':'2017']
# 
# n_days = 10
# window = 48 * n_days
# 
# px.line(
#     df_plt['hf'].rolling(window=window, min_periods=1).corr(
#         df_plt['tid_within_3h']
#     )
# )

In [None]:
# Disabled exploratory plot (kept for reference, not executed):
# rolling correlation between `hf` and `ie_fix` during April 2015, with the
# `tid_within_3h` target overlaid as a scatter trace on the same figure.
# The window is 48 * n_days rows (presumably 48 rows per day — TODO confirm).
#
# df_plt = df.loc['2015-04']
# 
# n_days = 1
# window = 48 * n_days
# 
# fig = px.line(
#     df_plt['hf'].rolling(window=window, min_periods=1).corr(
#         df_plt['ie_fix']
#     )
# )
# 
# fig.add_scatter(
#     x=df_plt['tid_within_3h'].index,
#     y=df_plt['tid_within_3h'],
# )

In [None]:
# Columns of `df` that are fed to the model, in the order the model sees them.
feature_columns = [
    'ie_fix', 'ie_variation',
    'il_fix', 'il_variation',
    'iu_fix', 'iu_variation',
    'ie_mav_3h', 'ie_mav_6h', 'ie_mav_12h', 'ie_mav_24h',
    'hf',
    'f_107_adj',
    'solar_zenith_angle',
]

# Column holding the 0/1 label to predict.
target_column = 'tid_within_3h'

# Work on independent copies so later cells cannot write back into `df`.
X = df.loc[:, feature_columns].copy()
y = df.loc[:, target_column].copy()

In [None]:
# Feature columns that CatBoost is told to treat as categorical.
cat_features = ['ie_variation', 'il_variation', 'iu_variation']

# CatBoost settings that stay fixed for every fit; tuned hyperparameters are
# layered on top of these later in the notebook.
static_params = dict(
    eval_metric="F1:use_weights=True",
    random_seed=42,
    auto_class_weights="SqrtBalanced",  # alternative tried: "Balanced"
    cat_features=cat_features,
    # Overfitting detector: stop once the eval metric has not improved for
    # `od_wait` iterations, and keep the best iteration.
    od_type="Iter",
    use_best_model=True,
    # Rows are time-ordered; tell CatBoost not to shuffle them.
    has_time=True,
    od_wait=200,
)

In [None]:
# Time-ordered cross-validation splitter shared by the tuning and evaluation cells.
n_cv_splits = 5
ts_cv = TimeSeriesSplit(n_splits=n_cv_splits)

## Optuna (Bayesian optimisation)

In [None]:
def obj(trial):
    """Optuna objective for a single trial.

    Forwards the trial to `objective` together with the features, target,
    cross-validation splitter and fixed CatBoost settings defined in earlier
    cells, and returns whatever `objective` returns (the value the study
    maximises).
    """
    return objective(
        trial,
        X=X,
        y=y,
        cv=ts_cv,
        params=static_params,
    )


# Seed the sampler so that a fresh Restart & Run All proposes the same sequence
# of trials; without a seed the tuned hyperparameters (and every downstream
# result) change from run to run. TPE is Optuna's default sampler, so only the
# seed differs from the previous behaviour. Reuses the CatBoost seed to keep a
# single source of randomness configuration.
sampler = optuna.samplers.TPESampler(seed=static_params["random_seed"])

study = optuna.create_study(
    study_name='catboost_clf',
    direction='maximize',
    sampler=sampler,
)
study.optimize(obj, n_trials=40, show_progress_bar=True)

# Best-scoring trial, used by the reporting cells below.
trial = study.best_trial

In [None]:
# Report the best trial: its objective value followed by one line per hyperparameter.
summary_lines = [
    "Best trial:",
    f"  F1: {trial.value:.3f}",
    "  Params: ",
]
summary_lines.extend(
    f"    {name}: {setting}" for name, setting in trial.params.items()
)
print("\n".join(summary_lines))

In [None]:
# Show how the objective value evolved over the trials of the finished study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Show which tuned hyperparameters mattered most for the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# Layer the tuned hyperparameters on top of the fixed settings. This updates
# `static_params` in place, so every later fit uses fixed + tuned values.
best_params = study.best_params
for param_name, tuned_value in best_params.items():
    static_params[param_name] = tuned_value

## Fine-tuned and cross-validated model

In [None]:
# Re-fit the classifier on every time-ordered fold with the tuned settings and
# record one F1 score per fold. After the loop, `cat_model`, `X_test`, `y_test`
# and `y_pred` hold the values from the LAST fold; later cells rely on them.
#
# NOTE(review): the test fold is handed to `instantiate_and_fit_model`. If that
# helper uses it as the early-stopping / best-model eval set (use_best_model is
# enabled), the F1 scores below are optimistic — confirm against src.forecast.
f1s = []

for fit_rows, eval_rows in ts_cv.split(X):
    X_train, y_train = X.iloc[fit_rows], y.iloc[fit_rows]
    X_test, y_test = X.iloc[eval_rows], y.iloc[eval_rows]

    cat_model = instantiate_and_fit_model(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        loss_function='Logloss',
        params=static_params,
    )

    # To score a previously saved model instead of re-fitting, swap in:
    # cat_model = CatBoostClassifier().load_model(
    #     Path(MODEL_CATB, 'classifier_optuna')
    # )

    y_pred = cat_model.predict(X_test)
    f1s.append(f1_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [None]:
# Mean and sample standard deviation (ddof=1) of the per-fold F1 scores.
f1_mean = np.mean(f1s)
f1_spread = np.std(f1s, ddof=1)
print(f'Achieved F1-score: {f1_mean:.2f} ± {f1_spread:.2f}')

In [None]:
# Persist the classifier fitted in the last cross-validation fold.
model_path = Path(MODEL_CATB, 'classifier_sqrt_blcd_opt')
cat_model.save_model(model_path)

## SHAP

In [None]:
# SHAP values for the last fold's test rows, computed by CatBoost itself.
explain_pool = Pool(X_test, label=y_test, cat_features=cat_features)
shap_with_base = cat_model.get_feature_importance(
    explain_pool,
    type="ShapValues",
)

# Drop the final column (CatBoost appends the expected value there) so that
# one column per feature remains, aligned with the columns of X_test.
shap_values = shap_with_base[:, :-1]

In [None]:
# Load SHAP's JavaScript into the notebook so its interactive plots can render.
shap.initjs()

In [None]:
# Overview of the SHAP values of every feature over the last fold's test rows.
shap.summary_plot(shap_values, X_test)

In [None]:
# SHAP value of `hf` against its feature value, coloured by `solar_zenith_angle`
# to expose any interaction between the two.
shap.dependence_plot(
    'hf',
    shap_values,
    X_test,
    interaction_index='solar_zenith_angle',
)

In [None]:
# SHAP value of `ie_fix` against its feature value, coloured by `ie_mav_12h`
# to expose any interaction between the two.
shap.dependence_plot(
    'ie_fix',
    shap_values,
    X_test,
    interaction_index='ie_mav_12h',
)

In [None]:
# Last ten rows of the dataset whose target equals 1, used to pick a timestamp
# worth explaining below.
is_positive = df['tid_within_3h'] == 1
df.loc[is_positive].tail(10)

In [None]:
# Explain one individual prediction from the last fold's test rows.
# Positional index of the chosen timestamp within X_test.
# Original note on this timestamp: "16:00 - 22:00" (presumably the time span of
# the surrounding event — TODO confirm).
row = X_test.index.get_loc('2022-12-23 19:30')

explainer = shap.TreeExplainer(cat_model)
shap_values_ = explainer.shap_values(X_test)

# Plot the explainer's own SHAP values together with its own base value.
# Previously `shap_values_` was computed but never used, and the plot combined
# `explainer.expected_value` with `shap_values` left over from an earlier cell,
# so this cell silently depended on that cell's state.
shap.force_plot(
    explainer.expected_value,
    shap_values_[row, :],
    X_test.iloc[row, :],
)

In [None]:
# Fraction of the last fold's predictions that are the positive class (1).
np.count_nonzero(y_pred == 1) / len(y_pred)