In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import f1_score, classification_report
# import plotly.express as px
# from catboost import Pool
# import shap

from var import DATA_OUT
from src.forecast import instantiate_and_fit_model

In [None]:
# Load the prepared dataset from the project's output directory.
# NOTE(review): unpickling can execute arbitrary code — only read files that
# were produced by this project's own pipeline.
df = pd.read_pickle(Path(DATA_OUT, 'df_dataset.pickle'))

# Encode the (presumably boolean) target as 0/1 integers.
# `astype(int)` is used instead of `.replace({True: 1, False: 0})`: the
# implicit dtype downcasting that `replace` relied on is deprecated in
# pandas 2.2, and `astype` raises on missing values instead of silently
# passing them through to the model.
df['tid_within_3h'] = df['tid_within_3h'].astype(int)

In [None]:
# Quick sanity check: display one randomly chosen row of the dataset.
df.sample(n=1)

In [None]:
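# Manual inverse-frequency class weights, kept for reference only: the
# `params` cell below uses 'auto_class_weights': 'Balanced' instead.
# Needs `y_train`, which is only defined inside the cross-validation loop.
# class_weights = [
#     1 / y_train.eq(False).sum(),
#     1 / y_train.eq(True).sum(),
# ]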

In [None]:
# Predictor columns; their order here is the column order of X.
feature_columns = [
    'ie_fix',
    'ie_pct_change',
    'ie_mav_3h',
    'ie_mav_6h',
    'ie_mav_12h',
    'ie_mav_24h',
    'hf',
    'f_107_adj',
    'solar_zenith_angle',
]

# Independent copies, so later changes to X / y never touch df.
X = df.loc[:, feature_columns].copy()
y = df.loc[:, 'tid_within_3h'].copy()

In [None]:
# Settings forwarded as `params` to instantiate_and_fit_model
# (presumably CatBoost training parameters — TODO confirm against the helper).
params = dict(
    iterations=2_000,  # alternative value: 1_000
    eval_metric='F1:use_weights=True',  # alternatives: ['Precision', 'Recall']
    random_seed=42,
    # class_weights=class_weights,
    auto_class_weights='Balanced',
    # cat_features=cat_features,
    # verbose=True,
    od_type='Iter',
    od_wait=200,
    use_best_model=True,
)

In [None]:
# Time-ordered cross-validation: every split trains on an initial segment of
# the rows and tests on the segment that immediately follows it.
# NOTE(review): this assumes the rows of X / y are in chronological order — confirm.
ts_cv = TimeSeriesSplit(n_splits=5)

# F1 of every fold, kept so the folds can be summarised after the loop.
fold_f1_scores = []

for fold, (train_idx, test_idx) in enumerate(ts_cv.split(X), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # NOTE(review): the test fold is also handed to the fitting helper. If the
    # helper uses it as the evaluation set for early stopping / best-model
    # selection (`params` enables 'use_best_model' and 'od_*'), the scores
    # below are optimistically biased — confirm, and consider carving a
    # separate validation slice out of the training fold.
    cat_model = instantiate_and_fit_model(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        loss_function='Logloss',
        params=params,
    )

    y_pred = cat_model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    fold_f1_scores.append(f1)

    # Label each report so the output can be matched to its fold.
    print(f'Fold {fold} (train={len(train_idx)}, test={len(test_idx)})')
    print(f'F1-score: {f1:.2f}')
    print(classification_report(y_test, y_pred))

# Aggregate view across folds; a single fold's score is noisy on its own.
print(
    f'Mean F1 over {len(fold_f1_scores)} folds: '
    f'{np.mean(fold_f1_scores):.2f} ± {np.std(fold_f1_scores):.2f}'
)

In [None]:
# Feature importances labelled by feature name, largest first, instead of two
# unaligned parallel sequences.
# `cat_model` is whichever model was fitted last (the cross-validation loop
# reassigns it on every fold), so this reflects that single fit only.
pd.Series(
    cat_model.feature_importances_,
    index=cat_model.feature_names_,
    name='importance',
).sort_values(ascending=False)