In [1]:
import datetime
import tqdm

import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

from py import sqlite_utils as utils

In [2]:
# Open the SQLite connection that every query and prediction insert in this
# notebook passes as `conn=`; it is closed in the final cell.
conn = utils.connect_sqlite()

In [3]:
# Pull both feature tables in full. Later cells read the `hour` column by name
# and slice the remaining columns by position, so the column order returned by
# these queries matters.
train_query = """
SELECT *
FROM power_features_train
"""
test_query = """
SELECT *
FROM power_features_test
"""

df_train = utils.execute_sql_query(train_query, conn=conn)
df_test = utils.execute_sql_query(test_query, conn=conn)

## Baseline CatBoost model

In this experiment the model is trained with the MAE loss directly, so that the training objective and the evaluation metric are the same.

In [4]:
# Settings shared by every CatBoost model built in this notebook:
# MAE as the training loss, a fixed iteration count and a fixed seed.
base_params = dict(
    loss_function='MAE',
    iterations=5000,
    random_seed=338,
)

In [5]:
# Baseline regressor: only the shared settings are passed, no tuned
# hyperparameters.
model = catboost.CatBoostRegressor(**base_params)

In [6]:
# Columns are addressed by position: column 1 is taken as the target and
# columns 2 onwards as features (column 0 is not used here).
# NOTE(review): the test frame is sliced with the same offset, which assumes
# it shares the train table's column layout - confirm against the schema.
feature_cols = slice(2, None)
target_col = 1

X_train = df_train.iloc[:, feature_cols].values
y_train = df_train.iloc[:, target_col].values

X_test = df_test.iloc[:, feature_cols].values

In [7]:
# Fit the baseline on the full training set, then predict for both tables.
model.fit(X_train, y_train, silent=True)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# Stack train and test predictions into one long frame keyed by `hour`
# (train rows first); row labels of the source frames are kept as they are.
preds = pd.concat([
    pd.DataFrame({'hour': frame['hour'], 'pred_value': values})
    for frame, values in ((df_train, train_preds), (df_test, test_preds))
])

In [8]:
# Hand the stacked train+test predictions to the project helper under the
# label 'baseline_catboost_mae'; the recorded output below shows it reporting
# a model version and the number of inserted rows.
# NOTE(review): storage details live in py/sqlite_utils - not visible here.
utils.insert_predictions(preds, 'baseline_catboost_mae', conn=conn)

Model version is 0
Inserted 34407 rows


Values used by the fitted baseline model for the hyperparameters tuned in the grid search below:

In [9]:
# Show what the fitted model reports for the three hyperparameters that the
# grid search varies; iteration follows the order of get_all_params().
tuned_names = ('depth', 'learning_rate', 'l2_leaf_reg')
all_params = model.get_all_params()
{name: value for name, value in all_params.items() if name in tuned_names}

{'l2_leaf_reg': 3, 'depth': 6, 'learning_rate': 0.029999999329447743}

## CatBoost parameter optimization

In [10]:
def calculate_cv_score(X, y, params, n_splits=5):
    """
    Cross-validate CatBoostRegressor with given params on TimeSeriesSplit.
    Returns mean and std of per-split MAE values on eval subsets.

    For every split a fresh regressor is fitted on the training indices and
    scored with mean absolute error on the evaluation indices.

    Parameters
    ----------
    X, y : array-like
        Features and target. Both are indexed with the integer index arrays
        produced by the splitter (``X[train_idx]``), so they must support that
        kind of indexing (e.g. numpy arrays).
    params : dict or None
        Keyword arguments for ``CatBoostRegressor``. They are layered on top
        of the notebook-level ``base_params``; on overlapping keys the value
        from ``params`` wins. ``None`` is treated as an empty dict.
    n_splits : int, default 5
        Number of splits passed to ``TimeSeriesSplit``.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-split MAE values, computed with ``np.mean``
        and ``np.std`` (numpy's default, i.e. population standard deviation).
    """
    # Merge into one dict before the call: unpacking both dicts directly into
    # the constructor raises TypeError as soon as they share a key
    # (e.g. when a caller wants to override 'iterations').
    model_params = {**base_params, **(params or {})}

    splitter = TimeSeriesSplit(n_splits=n_splits)
    fold_maes = []

    for train_idx, eval_idx in splitter.split(X):
        # A new model per split so nothing learned on one split leaks into
        # the next.
        fold_model = catboost.CatBoostRegressor(**model_params)
        fold_model.fit(X[train_idx], y[train_idx], silent=True)

        fold_preds = fold_model.predict(X[eval_idx])
        fold_maes.append(mean_absolute_error(y[eval_idx], fold_preds))

    return np.mean(fold_maes), np.std(fold_maes)

In [11]:
# Exhaustive grid over depth, learning rate and L2 leaf regularisation
# (3 * 4 * 4 = 48 combinations), enumerated depth-major.
grid = [
    {'depth': depth, 'learning_rate': lr, 'l2_leaf_reg': l2}
    for depth in [5, 6, 7]
    for lr in [0.01, 0.02, 0.03, 0.05]
    for l2 in [1, 3, 10, 30]
]

# Score every combination with 3-split time-series CV. Only the mean MAE is
# stored next to the parameters; the per-split std is not kept.
result_list = []
for candidate in tqdm.tqdm(grid):
    mae_mean, _unused_std = calculate_cv_score(X_train, y_train, candidate, n_splits=3)
    result_list.append({**candidate, 'mae': mae_mean})

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [2:00:05<00:00, 150.11s/it]


In [12]:
# Five parameter combinations with the lowest mean CV MAE, best first.
results_df = pd.DataFrame(result_list)
results_df.sort_values('mae').head(5)

Unnamed: 0,depth,learning_rate,l2_leaf_reg,mae
3,5,0.01,30,0.344643
19,6,0.01,30,0.345064
35,7,0.01,30,0.347595
33,7,0.01,3,0.348085
18,6,0.01,10,0.348131


In [13]:
# Take the grid entry with the lowest mean CV MAE and strip the score, so
# that only model hyperparameters remain.
best_result = min(result_list, key=lambda row: row['mae'])
best_params = {name: value for name, value in best_result.items() if name != 'mae'}
best_params

{'depth': 5, 'learning_rate': 0.01, 'l2_leaf_reg': 30}

In [14]:
# Tuned regressor: shared settings plus the best hyperparameters found by the
# grid search. Rebinds `model`, replacing the baseline.
model = catboost.CatBoostRegressor(**base_params, **best_params)

In [15]:
# Refit with the tuned hyperparameters on the full training set, then predict
# for both tables.
model.fit(X_train, y_train, silent=True)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# Stack train and test predictions into one long frame keyed by `hour`
# (train rows first); row labels of the source frames are kept as they are.
preds = pd.concat([
    pd.DataFrame({'hour': frame['hour'], 'pred_value': values})
    for frame, values in ((df_train, train_preds), (df_test, test_preds))
])

In [16]:
# Hand the tuned model's train+test predictions to the project helper under
# the label 'catboost_mae'; the recorded output below shows it reporting a
# model version and the number of inserted rows.
# NOTE(review): storage details live in py/sqlite_utils - not visible here.
utils.insert_predictions(preds, 'catboost_mae', conn=conn)

Model version is 0
Inserted 34407 rows


In [17]:
# Write the tuned model to disk. The path is relative to the notebook's
# working directory.
model_path = 'data/catboost_model_mae0.cbm'
model.save_model(model_path)

In [18]:
# Release the SQLite connection opened at the top of the notebook; cells that
# use `conn` cannot be re-run after this without reconnecting.
conn.close()