In [3]:
import os
# Make the parent of the current working directory the new working directory;
# later cells use paths relative to it (./data, ./models).
# Gotcha: each re-run of this cell climbs one more level, so execute it only
# once per kernel session.
parent_dir = os.path.dirname(os.getcwd())
os.chdir(parent_dir)

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from scipy.interpolate import interp1d
from scipy.stats import norm

# Columns handed to the model as inputs; the names must match the columns of
# the frames they are selected from.
features = [
    'Month',
    'Hour',
    'hour_x',
    'hour_y',
    'month_x',
    'month_y',
    'net_target-23',
    'net_target',
    'diffuse_solar_radiation+1',
    'relative_humidity+1',
    'drybulb_temp+1',
]
# Column the model is fitted against.
target = 'net_target+1'

In [5]:
def _read_indexed_csv(csv_path):
    """Read a CSV and index it by its 'timestamp' column.

    The first CSV column is consumed as the initial index and 'timestamp' is
    parsed as datetimes; the index is then replaced by the 'timestamp' values,
    which also remain available as a regular column.
    """
    frame = pd.read_csv(csv_path, index_col=0, parse_dates=['timestamp'])
    frame.index = frame['timestamp']
    return frame


# Load the training and test tables from the ./data folder.
data_train = _read_indexed_csv('./data/extra_train.csv')
data_test = _read_indexed_csv('./data/extra_test.csv')

In [6]:
def run_lgb(data, datat, features, target, seed=42,
            split_time='2021-10-13 14:00:00',
            model_path='./models/point/lgb_next_step_1.pkl'):
    """Train a LightGBM point-forecast regressor and predict on two frames.

    The rows of ``data`` are split chronologically at ``split_time``: earlier
    rows are used for fitting, the remaining rows for validation and early
    stopping. The fitted model is written to ``model_path`` and then used to
    predict on all of ``data`` (training and validation rows alike) and on
    ``datat``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'timestamp' column, the ``features`` columns and the
        ``target`` column.
    datat : pandas.DataFrame
        Frame with a 'timestamp' column and the ``features`` columns.
    features : list of str
        Names of the input columns.
    target : str
        Name of the column to fit; also used as the name of the prediction
        column in both returned frames.
    seed : int, default 42
        Seeds NumPy's global RNG (as before) and is now also forwarded to
        LightGBM so that it actually controls bagging / feature sub-sampling.
    split_time : str, default '2021-10-13 14:00:00'
        Rows with ``timestamp < split_time`` are used for training, rows with
        ``timestamp >= split_time`` for validation. The original author
        describes the default cut as an 80/20 split.
    model_path : str, default './models/point/lgb_next_step_1.pkl'
        Destination passed to ``Booster.save_model``; the directory must
        already exist.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Predictions for ``data`` and for ``datat``; each frame is indexed by
        the source frame's 'timestamp' values and has one column named
        ``target``.

    Raises
    ------
    ValueError
        If the split leaves either the training or the validation part empty.
    """
    # Kept for backward compatibility: callers may rely on the global NumPy
    # RNG being seeded by this function.
    np.random.seed(seed)

    # Parameters for the point-forecast regression.
    params = {
        'objective': 'regression',
        'metric': 'mae',
        'boosting': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'bagging_fraction': 0.7,
        'bagging_freq': 1,
        'feature_fraction': 0.7,
        'verbose': -1,
        # LightGBM draws its bagging / feature subsamples from its own RNG,
        # which np.random.seed does not influence, so pass the seed explicitly.
        'seed': seed,
    }

    # Chronological train / validation split.
    train = data.loc[data['timestamp'] < split_time]
    valid = data.loc[data['timestamp'] >= split_time]
    if train.empty or valid.empty:
        raise ValueError(
            f"split_time={split_time!r} leaves {len(train)} training rows and "
            f"{len(valid)} validation rows; both parts must be non-empty"
        )

    lgb_train = lgb.Dataset(train[features], train[target])
    lgb_valid = lgb.Dataset(valid[features], valid[target])

    # Early stopping and periodic logging are configured through callbacks:
    # the early_stopping_rounds / verbose_eval keyword arguments of lgb.train
    # were deprecated in LightGBM 3.3 and removed in 4.0.
    callbacks = [
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
    ]
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_train, lgb_valid],
        callbacks=callbacks,
    )

    # NOTE: Booster.save_model writes LightGBM's own model format, not a
    # pickle, despite the '.pkl' extension of the default path.
    model.save_model(model_path)

    # Predict with the best iteration found by early stopping.
    best_iteration = model.best_iteration
    y_pred = model.predict(data[features], num_iteration=best_iteration)
    y_pred_test = model.predict(datat[features], num_iteration=best_iteration)

    # Attach the timestamps as index so the forecasts can be aligned later.
    y_pred = pd.DataFrame(y_pred, index=data['timestamp'], columns=[target])
    y_pred_test = pd.DataFrame(y_pred_test, index=datat['timestamp'], columns=[target])
    return y_pred, y_pred_test

In [7]:
# Fit the model and collect the forecasts for the training and test frames.
y_pred, y_pred_test = run_lgb(data_train, data_test, features, target, seed=42)
# Tag every forecast row with the hour taken from its timestamp index.
for _fcst in (y_pred, y_pred_test):
    _fcst['hour'] = _fcst.index.hour

Training until validation scores don't improve for 50 rounds
[100]	training's l1: 0.0493819	valid_1's l1: 0.0508842
Early stopping, best iteration is:
[147]	training's l1: 0.0476575	valid_1's l1: 0.0508264


In [12]:
# Write both forecast tables as CSV files under ./data/point.
for _frame, _csv_path in (
    (y_pred, './data/point/train_fcst.csv'),
    (y_pred_test, './data/point/test_fcst.csv'),
):
    _frame.to_csv(_csv_path)