In [1]:
import datetime
import tqdm

import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

from py import sqlite_utils as utils

In [2]:
# Open the SQLite connection that every later cell reuses for reading features
# and writing predictions (helper comes from the project-local py/sqlite_utils).
conn = utils.connect_sqlite()

In [3]:
# Load the complete train and test feature tables.
# NOTE(review): the CatBoost cells below select target/features by column
# position, so they depend on the column order that SELECT * returns — confirm
# the table schema stays stable or list the columns explicitly.
df_train = utils.execute_sql_query("""
SELECT *
FROM power_features_train
""", conn=conn)
df_test = utils.execute_sql_query("""
SELECT *
FROM power_features_test
""", conn=conn)

## Baseline - LAG(y, 1) as prediction

In [4]:
# Naive baseline: the precomputed lag-1 column is used directly as the forecast.
# Hours whose lag value is missing are dropped, so they get no prediction.
lag_cols = ['hour', 'y_kw_mean_lag1']
preds = (
    pd.concat([df_train[lag_cols], df_test[lag_cols]])
    .rename(columns={'y_kw_mean_lag1': 'pred_value'})
    .reset_index(drop=True)
    .dropna()
)

In [5]:
# Store the lag-1 baseline under the model name 'baseline_lag1'; per the cell
# output the helper reports the model version and the number of inserted rows.
utils.insert_predictions(preds, 'baseline_lag1', conn=conn)

Model version is 0
Inserted 34162 rows


In [6]:
# Second naive baseline: forward-fill gaps in the observed target first, then
# shift by one row so each hour is predicted by the latest known earlier value.
# Only rows that are still NaN after the shift (e.g. the very first) are dropped.
target_cols = ['hour', 'y_kw_mean']
preds = (
    pd.concat([df_train[target_cols], df_test[target_cols]])
    .rename(columns={'y_kw_mean': 'pred_value'})
    .reset_index(drop=True)
    .assign(pred_value=lambda frame: frame['pred_value'].ffill().shift(1))
    .dropna()
)

In [7]:
# Store the forward-filled lag baseline under the model name 'baseline_ffill_lag1'.
utils.insert_predictions(preds, 'baseline_ffill_lag1', conn=conn)

Model version is 0
Inserted 34406 rows


## Baseline CatBoost model

In [16]:
# Untuned CatBoost reference model: MAE objective, fixed iteration budget and
# a fixed seed so repeated runs are comparable.
baseline_params = dict(
    loss_function='MAE',
    iterations=2000,
    random_seed=338,
)
model = catboost.CatBoostRegressor(**baseline_params)

In [17]:
# Target and features are picked by column position, not by name.
# NOTE(review): presumably column 0 is 'hour' and column 1 is 'y_kw_mean' —
# confirm against the feature-table schema.
TARGET_POS = 1
FEATURES_FROM = 2

X_train, y_train = (
    df_train.iloc[:, FEATURES_FROM:].values,
    df_train.iloc[:, TARGET_POS].values,
)
X_test = df_test.iloc[:, FEATURES_FROM:].values

In [18]:
# Fit the untuned model on the full training set; silent=True keeps the
# per-iteration training log out of the notebook output.
model.fit(X_train, y_train, silent=True)

<catboost.core.CatBoostRegressor at 0x260d0040950>

In [19]:
# Score both periods and stack them into one (hour, pred_value) frame.
# Note that the train-period predictions are in-sample.
train_preds, test_preds = model.predict(X_train), model.predict(X_test)
preds = pd.concat([
    df_train[['hour']].assign(pred_value=train_preds),
    df_test[['hour']].assign(pred_value=test_preds),
])

In [20]:
# Store the untuned CatBoost predictions under the model name 'catboost_baseline'.
utils.insert_predictions(preds, 'catboost_baseline', conn=conn)

Model version is 1
Inserted 34407 rows


## CatBoost parameter optimization

In [21]:
def calculate_cv_score(X, y, params, n_splits=5):
    """Estimate the out-of-sample MAE of a CatBoost regressor with time-ordered CV.

    Folds come from ``TimeSeriesSplit``: each fold trains on a leading block of
    rows and evaluates on the rows that follow it, so the rows of ``X``/``y``
    are expected to be in chronological order.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per observation. It is indexed with integer
        index arrays (``X[train_idx]``), so pass an array, not a DataFrame.
    y : numpy.ndarray
        Target values aligned row-by-row with ``X``.
    params : dict or None
        Keyword arguments for ``catboost.CatBoostRegressor``. They are layered
        on top of the defaults ``loss_function='MAE'``, ``iterations=2000`` and
        ``random_seed=338`` and may override any of them. ``None`` or an empty
        dict means "defaults only".
    n_splits : int, default 5
        Number of folds produced by ``TimeSeriesSplit``.

    Returns
    -------
    tuple of (float, float)
        Mean and population standard deviation (``np.std``) of the per-fold MAE.
    """
    # Merge instead of passing the defaults as explicit keywords next to
    # **params: the latter raises TypeError as soon as params contains one of
    # the default keys (e.g. when tuning 'iterations').
    model_params = {
        'loss_function': 'MAE',
        'iterations': 2000,
        'random_seed': 338,
        **(params or {}),
    }

    splitter = TimeSeriesSplit(n_splits=n_splits)
    fold_scores = []

    # split() returns a generator without a length, so give tqdm the fold count
    # explicitly to get a real progress bar instead of a bare "Nit" counter.
    for train_idx, eval_idx in tqdm.tqdm(splitter.split(X), total=n_splits):
        X_tr, X_ev = X[train_idx], X[eval_idx]
        y_tr, y_ev = y[train_idx], y[eval_idx]

        # A fresh model per fold so no state leaks between folds.
        model = catboost.CatBoostRegressor(**model_params)
        model.fit(X_tr, y_tr, silent=True)
        fold_scores.append(mean_absolute_error(y_ev, model.predict(X_ev)))

    return np.mean(fold_scores), np.std(fold_scores)

In [24]:
# Exhaustive 3 x 3 x 3 grid over tree depth, learning rate and L2 leaf
# regularisation, enumerated depth-major so the log order stays the same.
grid = [
    {"depth": depth, "learning_rate": lr, "l2_leaf_reg": l2}
    for depth in [6, 7, 8]
    for lr in [0.03, 0.07, 0.1]
    for l2 in [3, 10, 30]
]

# Score each candidate with 4-fold time-series CV; keep the mean MAE per candidate.
result_list = []
for p in grid:
    mae_mean, mae_std = calculate_cv_score(X_train, y_train, p, n_splits=4)
    print(p)
    print(f'MAE = {mae_mean:.4f} +- {mae_std:.4f}')
    result_list.append({**p, 'mae': mae_mean})

4it [01:00, 15.11s/it]


{'depth': 6, 'learning_rate': 0.03, 'l2_leaf_reg': 3}
MAE = 0.3649 +- 0.0399


4it [01:01, 15.39s/it]


{'depth': 6, 'learning_rate': 0.03, 'l2_leaf_reg': 10}
MAE = 0.3655 +- 0.0453


4it [01:02, 15.66s/it]


{'depth': 6, 'learning_rate': 0.03, 'l2_leaf_reg': 30}
MAE = 0.3614 +- 0.0400


4it [01:04, 16.24s/it]


{'depth': 6, 'learning_rate': 0.07, 'l2_leaf_reg': 3}
MAE = 0.3879 +- 0.0452


4it [00:56, 14.19s/it]


{'depth': 6, 'learning_rate': 0.07, 'l2_leaf_reg': 10}
MAE = 0.3807 +- 0.0488


4it [00:55, 13.94s/it]


{'depth': 6, 'learning_rate': 0.07, 'l2_leaf_reg': 30}
MAE = 0.3722 +- 0.0438


4it [00:55, 13.93s/it]


{'depth': 6, 'learning_rate': 0.1, 'l2_leaf_reg': 3}
MAE = 0.3911 +- 0.0553


4it [00:56, 14.22s/it]


{'depth': 6, 'learning_rate': 0.1, 'l2_leaf_reg': 10}
MAE = 0.3877 +- 0.0482


4it [00:56, 14.04s/it]


{'depth': 6, 'learning_rate': 0.1, 'l2_leaf_reg': 30}
MAE = 0.3821 +- 0.0418


4it [01:38, 24.60s/it]


{'depth': 7, 'learning_rate': 0.03, 'l2_leaf_reg': 3}
MAE = 0.3658 +- 0.0445


4it [01:38, 24.51s/it]


{'depth': 7, 'learning_rate': 0.03, 'l2_leaf_reg': 10}
MAE = 0.3666 +- 0.0454


4it [01:37, 24.41s/it]


{'depth': 7, 'learning_rate': 0.03, 'l2_leaf_reg': 30}
MAE = 0.3579 +- 0.0379


4it [01:35, 23.97s/it]


{'depth': 7, 'learning_rate': 0.07, 'l2_leaf_reg': 3}
MAE = 0.3799 +- 0.0421


4it [01:36, 24.22s/it]


{'depth': 7, 'learning_rate': 0.07, 'l2_leaf_reg': 10}
MAE = 0.3773 +- 0.0444


4it [01:41, 25.31s/it]


{'depth': 7, 'learning_rate': 0.07, 'l2_leaf_reg': 30}
MAE = 0.3654 +- 0.0339


4it [01:35, 23.85s/it]


{'depth': 7, 'learning_rate': 0.1, 'l2_leaf_reg': 3}
MAE = 0.3926 +- 0.0522


4it [01:32, 23.24s/it]


{'depth': 7, 'learning_rate': 0.1, 'l2_leaf_reg': 10}
MAE = 0.3827 +- 0.0434


4it [01:30, 22.75s/it]


{'depth': 7, 'learning_rate': 0.1, 'l2_leaf_reg': 30}
MAE = 0.3798 +- 0.0425


4it [02:33, 38.32s/it]


{'depth': 8, 'learning_rate': 0.03, 'l2_leaf_reg': 3}
MAE = 0.3627 +- 0.0380


4it [02:35, 38.82s/it]


{'depth': 8, 'learning_rate': 0.03, 'l2_leaf_reg': 10}
MAE = 0.3639 +- 0.0406


4it [02:41, 40.42s/it]


{'depth': 8, 'learning_rate': 0.03, 'l2_leaf_reg': 30}
MAE = 0.3629 +- 0.0395


4it [02:39, 39.85s/it]


{'depth': 8, 'learning_rate': 0.07, 'l2_leaf_reg': 3}
MAE = 0.3900 +- 0.0569


4it [02:41, 40.28s/it]


{'depth': 8, 'learning_rate': 0.07, 'l2_leaf_reg': 10}
MAE = 0.3810 +- 0.0439


4it [02:47, 41.87s/it]


{'depth': 8, 'learning_rate': 0.07, 'l2_leaf_reg': 30}
MAE = 0.3785 +- 0.0469


4it [02:44, 41.11s/it]


{'depth': 8, 'learning_rate': 0.1, 'l2_leaf_reg': 3}
MAE = 0.3886 +- 0.0475


4it [02:49, 42.30s/it]


{'depth': 8, 'learning_rate': 0.1, 'l2_leaf_reg': 10}
MAE = 0.3846 +- 0.0436


4it [02:45, 41.38s/it]

{'depth': 8, 'learning_rate': 0.1, 'l2_leaf_reg': 30}
MAE = 0.3874 +- 0.0452





In [28]:
# Five best candidates by mean CV MAE (lower is better).
pd.DataFrame(result_list).sort_values('mae').head(5)

Unnamed: 0,depth,learning_rate,l2_leaf_reg,mae
11,7,0.03,30,0.357888
2,6,0.03,30,0.361365
18,8,0.03,3,0.36265
20,8,0.03,30,0.362888
19,8,0.03,10,0.36387


In [29]:
# Take the candidate with the lowest mean CV MAE and strip the score so that
# only CatBoost keyword arguments remain.
best_result = min(result_list, key=lambda entry: entry['mae'])
best_params = {
    name: value
    for name, value in best_result.items()
    if name != 'mae'
}
best_params

{'depth': 7, 'learning_rate': 0.03, 'l2_leaf_reg': 30}

In [30]:
# Final model: same fixed settings as the baseline plus the tuned hyper-parameters.
tuned_params = dict(
    loss_function='MAE',
    iterations=2000,
    random_seed=338,
    **best_params
)
model = catboost.CatBoostRegressor(**tuned_params)

In [31]:
# Refit on the full training set with the tuned parameters; silent=True keeps
# the per-iteration training log out of the notebook output.
model.fit(X_train, y_train, silent=True)

<catboost.core.CatBoostRegressor at 0x260cff81370>

In [32]:
# Score both periods with the tuned model and stack them into one
# (hour, pred_value) frame. The train-period predictions are in-sample.
train_preds, test_preds = model.predict(X_train), model.predict(X_test)
preds = pd.concat([
    df_train[['hour']].assign(pred_value=train_preds),
    df_test[['hour']].assign(pred_value=test_preds),
])

In [33]:
# Store the tuned CatBoost predictions under the model name 'catboost'.
utils.insert_predictions(preds, 'catboost', conn=conn)

Model version is 0
Inserted 34407 rows


In [34]:
# Close the SQLite connection opened at the top of the notebook.
conn.close()