In [1]:
import os
# Move the working directory one level up (to the project root) so the
# relative './data/...' paths used by later cells resolve.
# The starting directory is captured only the first time this cell runs, so
# re-executing the cell lands in the same place instead of climbing one more
# directory level on every run.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(_NOTEBOOK_START_DIR))

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from scipy.interpolate import interp1d
from scipy.stats import norm

# Predictor columns selected from the data frames when training/predicting.
features = [
    'Month',
    'Hour',
    'hour_x',
    'hour_y',
    'month_x',
    'month_y',
    'net_target-23',
    'diffuse_solar_radiation+1',
    'relative_humidity+1',
    'drybulb_temp+1',
]
# Column the model is fitted to predict.
target = 'net_target'

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [22]:
def _read_timestamp_indexed(csv_path):
    """Read a CSV and index it by its parsed 'timestamp' column (column kept)."""
    frame = pd.read_csv(csv_path, index_col=0, parse_dates=['timestamp'])
    frame.index = frame['timestamp']
    return frame


# Load the train and test tables from the data folder
data_train = _read_timestamp_indexed('./data/extra_train.csv')
data_test = _read_timestamp_indexed('./data/extra_test.csv')

In [23]:
# Show the timestamps of the last 20% of rows (everything from the 80% position on)
tail_start = int(len(data_train) * 0.8)
data_train['timestamp'].iloc[tail_start:]

timestamp
2021-10-13 14:00:00   2021-10-13 14:00:00
2021-10-13 15:00:00   2021-10-13 15:00:00
2021-10-13 16:00:00   2021-10-13 16:00:00
2021-10-13 17:00:00   2021-10-13 17:00:00
2021-10-13 18:00:00   2021-10-13 18:00:00
                              ...        
2022-07-30 19:00:00   2022-07-30 19:00:00
2022-07-30 20:00:00   2022-07-30 20:00:00
2022-07-30 21:00:00   2022-07-30 21:00:00
2022-07-30 22:00:00   2022-07-30 22:00:00
2022-07-30 23:00:00   2022-07-30 23:00:00
Name: timestamp, Length: 6970, dtype: datetime64[ns]

In [28]:
def run_lgb(data, datat, features, target, seed=42, split_time='2021-10-13 14:00:00'):
    """Fit a LightGBM point-forecast regressor and predict on two frames.

    Rows of ``data`` whose 'timestamp' is before ``split_time`` are used for
    training; the remaining rows are the validation set that drives early
    stopping.

    Parameters
    ----------
    data : pandas.DataFrame
        Training table. Must contain a 'timestamp' column plus the
        ``features`` and ``target`` columns.
    datat : pandas.DataFrame
        Test table. Must contain a 'timestamp' column and the ``features``
        columns.
    features : list of str
        Names of the predictor columns.
    target : str
        Name of the column to predict; also used as the column name of the
        returned forecast frames.
    seed : int, default 42
        Seed applied to NumPy's global RNG and passed to LightGBM.
    split_time : str, timestamp or None, default '2021-10-13 14:00:00'
        First timestamp of the validation set. ``None`` derives it from the
        row at the 80% position of ``data`` (assumes ``data`` is sorted by
        time).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Predictions for every row of ``data`` (training and validation rows
        alike) and for every row of ``datat``. Each frame is indexed by the
        corresponding 'timestamp' values and has the single column ``target``.

    Raises
    ------
    ValueError
        If the split leaves the training or the validation set empty.
    """
    # Seed NumPy (as before) and hand the same seed to LightGBM itself;
    # seeding NumPy alone does not reach the library's own random generators.
    np.random.seed(seed)
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting': 'gbdt',
        'seed': seed,
    }

    # Chronological train/validation split
    if split_time is None:
        if len(data) == 0:
            raise ValueError("data is empty; cannot derive a train/validation split")
        split_time = data['timestamp'].iloc[int(len(data) * 0.8)]
    train = data.loc[data['timestamp'] < split_time]
    valid = data.loc[data['timestamp'] >= split_time]
    if train.empty or valid.empty:
        raise ValueError(
            "split_time=%r leaves %d training and %d validation rows; both must be non-empty"
            % (split_time, len(train), len(valid))
        )

    lgb_train = lgb.Dataset(train[features], train[target])
    lgb_valid = lgb.Dataset(valid[features], valid[target])

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keywords of lgb.train are
    # deprecated and no longer accepted by LightGBM >= 4.0. The logging
    # callback is named `print_evaluation` in releases older than 3.3.
    log_eval = getattr(lgb, 'log_evaluation', None) or getattr(lgb, 'print_evaluation')
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_train, lgb_valid],
        callbacks=[lgb.early_stopping(stopping_rounds=50), log_eval(period=100)],
    )

    # Predict with the best iteration found by early stopping
    y_pred = model.predict(data[features], num_iteration=model.best_iteration)
    y_pred_test = model.predict(datat[features], num_iteration=model.best_iteration)

    # Attach the timestamps as index so the forecasts can be aligned later
    y_pred = pd.DataFrame(y_pred, index=data.timestamp, columns=[target])
    y_pred_test = pd.DataFrame(y_pred_test, index=datat.timestamp, columns=[target])
    return y_pred, y_pred_test

In [29]:
y_pred, y_pred_test = run_lgb(data_train, data_test, features, target, seed=42)
# Tag every forecast row with the hour of day taken from its timestamp index
for fcst in (y_pred, y_pred_test):
    fcst['hour'] = fcst.index.hour

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 669
[LightGBM] [Info] Number of data points in the train set: 6968, number of used features: 10
[LightGBM] [Info] Start training from score 0.398869
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[26]	training's rmse: 0.0840273	valid_1's rmse: 0.101215


In [30]:
# Save predictions to the point-forecast folder. The folder is created first
# because to_csv does not create missing parent directories and would fail
# on a fresh checkout.
os.makedirs('./data/point', exist_ok=True)
y_pred.to_csv('./data/point/train_fcst.csv')
y_pred_test.to_csv('./data/point/test_fcst.csv')