In [1]:
import datetime

import catboost
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

from py import sqlite_utils as utils

In [2]:
# Open the SQLite connection that the feature queries and the first
# prediction insert in this notebook pass explicitly via `conn=`.
conn = utils.connect_sqlite()

In [3]:
# Read the complete train and test feature tables through the shared
# connection; both queries select every column of their table.
train_query = """
SELECT *
FROM power_features_train
"""
test_query = """
SELECT *
FROM power_features_test
"""

df_train = utils.execute_sql_query(train_query, conn=conn)
df_test = utils.execute_sql_query(test_query, conn=conn)

## Baseline - LAG(y, 1) as prediction

In [4]:
# Lag-1 baseline: the precomputed `y_kw_mean_lag1` feature is used directly
# as the prediction for each hour, over train followed by test.
lag_columns = ['hour', 'y_kw_mean_lag1']
preds = (
    pd.concat([df_train[lag_columns], df_test[lag_columns]])
    .rename(columns={'y_kw_mean_lag1': 'pred_value'})
    .reset_index(drop=True)
    .dropna()  # rows without a lag value cannot be scored
)

In [5]:
# Persist the lag-1 baseline predictions through the open connection.
# The second argument appears to be the model identifier the rows are
# stored under (the helper reports a "Model version" for it).
utils.insert_predictions(preds, 'baseline_lag1', conn=conn)

Model version is 2
Inserted 34162 rows


In [6]:
# Release the connection opened at the top of the notebook.
# NOTE(review): the later insert_predictions calls omit `conn=` — presumably
# the helper opens its own connection in that case; confirm.
conn.close()

In [7]:
# Forward-filled lag baseline: take the observed target over train + test,
# fill gaps with the last known value, then shift by one row so each hour
# is predicted by the most recent earlier value.
target_columns = ['hour', 'y_kw_mean']
preds = (
    pd.concat([df_train[target_columns], df_test[target_columns]])
    .rename(columns={'y_kw_mean': 'pred_value'})
    .reset_index(drop=True)
    .assign(pred_value=lambda frame: frame['pred_value'].ffill().shift(1))
    .dropna()  # drops the first row, which has nothing to shift from
)

In [8]:
# Persist the forward-filled lag baseline. No `conn=` is passed here (the
# notebook's connection was closed in an earlier cell).
# NOTE(review): presumably the helper opens its own connection — confirm.
utils.insert_predictions(preds, 'baseline_ffill_lag1')

Model version is 0
Inserted 34406 rows


## Baseline CatBoost model

In [10]:
# Baseline CatBoost regressor: MAE is both the training loss and the
# reported metric; the seed is fixed so runs are repeatable.
catboost_params = {
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'iterations': 5000,
    'early_stopping_rounds': 200,
    'random_seed': 338,
}
model = catboost.CatBoostRegressor(**catboost_params)

In [12]:
# Split the feature tables positionally into model inputs and target.
# NOTE(review): assumes column 0 is `hour`, column 1 is the target and the
# remaining columns are features, in both tables — confirm against the schema.
target_position = 1
first_feature_position = 2

y_train = df_train.iloc[:, target_position].values
X_train, X_test = (
    frame.iloc[:, first_feature_position:].values
    for frame in (df_train, df_test)
)

In [13]:
# Fit the baseline regressor with a chronological hold-out as eval_set.
# The model is configured with early_stopping_rounds, but CatBoost's
# overfitting detector needs a validation set to monitor; without one the
# setting is inert and every configured iteration is always trained.
# NOTE(review): assumes the rows of X_train are in time order, so the last
# 20% is the most recent data — confirm against the feature table.
val_start = int(len(X_train) * 0.8)
model.fit(
    X_train[:val_start],
    y_train[:val_start],
    eval_set=(X_train[val_start:], y_train[val_start:]),
    silent=True,
)

<catboost.core.CatBoostRegressor at 0x2806d447410>

In [14]:
# Score both splits with the fitted model and stack them into one
# (hour, pred_value) frame, train rows first.
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)
preds = pd.concat(
    [
        pd.DataFrame({'hour': df_train['hour'], 'pred_value': train_preds}),
        pd.DataFrame({'hour': df_test['hour'], 'pred_value': test_preds}),
    ],
    # Fresh 0..n-1 index, matching the other prediction frames in this
    # notebook, instead of repeating the test frame's labels after train's.
    ignore_index=True,
)

In [15]:
# Persist the CatBoost baseline predictions (train + test rows). No `conn=`
# is passed here.
# NOTE(review): presumably the helper opens its own connection — confirm.
utils.insert_predictions(preds, 'catboost_baseline')

Model version is 0
Inserted 34407 rows


## CatBoost parameter optimization

In [27]:
# Inspect the folds produced by TimeSeriesSplit with default settings:
# each (train_idx, val_idx) pair uses a growing prefix of the rows for
# training and the contiguous block that follows it for validation.
list(TimeSeriesSplit().split(X_train))

[(array([   0,    1,    2, ..., 4581, 4582, 4583]),
  array([4584, 4585, 4586, ..., 9162, 9163, 9164])),
 (array([   0,    1,    2, ..., 9162, 9163, 9164]),
  array([ 9165,  9166,  9167, ..., 13743, 13744, 13745])),
 (array([    0,     1,     2, ..., 13743, 13744, 13745]),
  array([13746, 13747, 13748, ..., 18324, 18325, 18326])),
 (array([    0,     1,     2, ..., 18324, 18325, 18326]),
  array([18327, 18328, 18329, ..., 22905, 22906, 22907])),
 (array([    0,     1,     2, ..., 22905, 22906, 22907]),
  array([22908, 22909, 22910, ..., 27486, 27487, 27488]))]