In [None]:
import pandas as pd
import numpy as np
from shap_flow_util import read_csv_between

import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import datetime
import os

In [None]:
# Train one XGBoost regressor per (target, period) combination.
#
# For every combination this cell:
#   1. loads the feature matrix and the target for the period,
#   2. splits the rows into train/test sets in whole 4-day blocks,
#   3. writes the four resulting tables to ./data/<version>/,
#   4. runs a randomized hyper-parameter search with 5-fold CV,
#   5. saves the refitted best model to ./models/<version>/.
from scipy.stats import randint, uniform

version = 'v2'
# NOTE(review): `date` is not used anywhere in this cell; it is kept in case
# other cells rely on it.
date = datetime.datetime.now().strftime("%Y-%m-%d")

periods = [('2018-01-01', '2021-09-30'),
           ('2021-10-01', '2023-12-31'),
           ('2018-01-01', '2023-12-31')]

targets = ['price', 'export']

# The version is encoded in the directory names, not in the file names.
data_dir = './data/{}'.format(version)
directory = './models/{}'.format(version)
# Create the output directory up front; exist_ok avoids the
# check-then-create race and makes re-runs a no-op.
os.makedirs(directory, exist_ok=True)

# Rows are assigned to train/test in whole blocks of this length, so that
# neighbouring time steps end up on the same side of the split.
block_size = '4D'

# Search space for the randomized search. scipy's uniform(loc, scale) samples
# from [loc, loc + scale]; randint(low, high) samples integers in [low, high).
param_dist = {
    'max_depth': randint(3, 12),            # integers 3..11
    'learning_rate': uniform(0.01, 0.3),    # [0.01, 0.31]
    'subsample': uniform(0.5, 0.5),         # [0.5, 1.0]
    'min_child_weight': randint(1, 31),     # integers 1..30
    'reg_lambda': uniform(0, 1),
    'reg_alpha': uniform(0, 1),
}

for target in targets:
    for start_date, end_date in periods:
        model_name = 'xgb_{}_start_{}_end_{}'.format(target, start_date, end_date)

        # The feature matrix is shared by all targets; only y depends on target.
        X = read_csv_between('{}/X_full.csv'.format(data_dir), start_date, end_date)
        # fixes problem with boolean data types (by making boolean type a float)
        X['isworkingday'] = X['isworkingday'] * 1.0
        y = read_csv_between('{}/y_{}_full.csv'.format(data_dir, target), start_date, end_date)

        # Split data into test and train set.
        # Non-overlapping 4-day blocks are shuffled as units to prevent
        # memorization of the target.
        # NOTE(review): the Grouper requires X to carry a datetime-like index,
        # and the .loc lookups below assume y shares that index - confirm
        # against read_csv_between.
        masker = [pd.Series(g.index) for n, g in X.groupby(pd.Grouper(freq=block_size))]
        train_mask, test_mask = train_test_split(masker, test_size=0.2, random_state=7)

        train_index = pd.concat(train_mask)
        test_index = pd.concat(test_mask)

        X_train = X.loc[train_index]
        y_train = y.loc[train_index]
        X_test = X.loc[test_index]
        y_test = y.loc[test_index]

        # save test data in order to calculate shap values later
        X_test.to_csv('{}/X_test_{}.csv'.format(data_dir, model_name), sep=',', index=True)
        y_test.to_csv('{}/y_test_{}.csv'.format(data_dir, model_name), sep=',', index=True)
        # save train data
        X_train.to_csv('{}/X_train_{}.csv'.format(data_dir, model_name), sep=',', index=True)
        y_train.to_csv('{}/y_train_{}.csv'.format(data_dir, model_name), sep=',', index=True)

        # Conventional 5-fold CV on the training rows.
        xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                                     n_estimators=1200,
                                     verbosity=1,
                                     n_jobs=40,
                                     # start boosting from the mean of the first target column
                                     base_score=y_train.iloc[:, 0].mean(),
                                     random_state=42)
        # The search itself runs sequentially (n_jobs=1); parallelism comes
        # from the n_jobs setting of the estimator above.
        random_search = RandomizedSearchCV(
            estimator=xgb_model,
            param_distributions=param_dist,
            cv=5,
            n_iter=60,
            n_jobs=1,
            # with several scorers, refit names the one that selects the best model
            refit='neg_root_mean_squared_error',
            random_state=42,
            pre_dispatch='2*n_jobs',
            verbose=3,
            scoring=['neg_root_mean_squared_error', 'neg_mean_absolute_error', 'r2'])
        random_search.fit(X_train, y_train)

        best_model = random_search.best_estimator_
        best_parameters = random_search.best_params_
        print("Best set of hyperparameters: ", best_parameters)
        print("Best score: ", random_search.best_score_)

        # save model
        best_model.save_model('{}/{}_best.json'.format(directory, model_name))