In [None]:
import sys
import os
from pathlib import Path


import numpy as np
import pandas as pd
import json
import copy
import joblib
from datetime import datetime, date, timedelta
import random as rn

In [None]:
# Fix both the stdlib and the NumPy global generators to the same seed
# so that anything drawing from them is repeatable between runs.
seed = 1234
rn.seed(seed)
np.random.seed(seed)

## load data

In [None]:
# Load the full feature table: a bz2-compressed, pipe-delimited text file.
df_all = pd.read_csv(
    '../../data/y_1/10features',
    compression='bz2',
    sep='|',
)

In [None]:
# NOTE(review): star import. The cells below use all_test, k1_test, k2_test,
# k3_test and final_feats without defining them, so they presumably come from
# train_setup — TODO confirm and switch to an explicit import list.
from train_setup import *

In [None]:
# Distinct time_id values present in the data, as plain Python scalars.
time_list = df_all.loc[:, 'time_id'].unique().tolist()
len(time_list)

In [None]:
# Every time_id that is not reserved in all_test is available for training.
all_train = [*(set(time_list) - set(all_test))]
len(all_train), len(all_test)

In [None]:
# For each fold, the training ids are the training pool minus that fold's test ids.
k1_train, k2_train, k3_train = (
    list(set(all_train) - set(fold_test))
    for fold_test in (k1_test, k2_test, k3_test)
)

In [None]:
# Fold 1: (number of training ids, number of test ids)
tuple(map(len, (k1_train, k1_test)))

In [None]:
# Fold 2: (number of training ids, number of test ids)
tuple(map(len, (k2_train, k2_test)))

In [None]:
# Fold 3: (number of training ids, number of test ids)
tuple(map(len, (k3_train, k3_test)))

In [None]:
# Overall: (number of held-out ids, number of training-pool ids)
tuple(map(len, (all_test, all_train)))

In [None]:
# Map each entry of final_feats to a positional alias: feat1, feat2, ...
rename_map = {
    original_name: f'feat{position}'
    for position, original_name in enumerate(final_feats, start=1)
}

# Columns not listed in rename_map keep their names.
df_all.rename(columns=rename_map, inplace=True)
df_all.head()

In [None]:
# One [train_frame, test_frame] pair per fold, selected by time_id membership.
kfold_list_ = [
    [
        df_all[df_all['time_id'].isin(train_ids)],
        df_all[df_all['time_id'].isin(test_ids)],
    ]
    for train_ids, test_ids in (
        (k1_train, k1_test),
        (k2_train, k2_test),
        (k3_train, k3_test),
    )
]

In [None]:
# Model input columns: feat1 .. feat10.
features = [f'feat{number}' for number in range(1, 11)]

In [None]:
# Build the per-fold model inputs: clip each feature to mean +/- scaler*std
# (statistics taken from the training fold only), then split into X / y.
scaler = 5  # clipping half-width, in training-fold standard deviations
kfold_list = []
for df_train_, df_test_ in kfold_list_:
    print(df_train_.shape, df_test_.shape)
    # Deep copies: the clipping below must not leak into the raw fold frames.
    df_train = df_train_.copy(deep=True)
    df_test = df_test_.copy(deep=True)

    df_train.set_index(keys=['stock_id', 'time_id'], inplace=True)
    df_test.set_index(keys=['stock_id', 'time_id'], inplace=True)

    for i in range(1, len(final_feats)+1):
        col = f'feat{i}'

        # Bounds are derived from the training fold and reused for the test
        # fold, so no test-fold statistics enter the preprocessing.
        avg = df_train[col].mean()
        std = df_train[col].std()
        lower = avg - scaler*std
        upper = avg + scaler*std

        # Clip this column only. Assigning through a row mask
        # (`df[df[col] > upper] = upper`) writes the bound into EVERY column
        # of the matching rows, including the other features and `target`.
        df_train[col] = df_train[col].clip(lower=lower, upper=upper)
        df_test[col] = df_test[col].clip(lower=lower, upper=upper)

    X_train = df_train[features].copy(deep=True)
    # `target` is never modified by the per-column clipping, so it can be
    # taken straight from the indexed frame.
    y_train = df_train[['target']].copy(deep=True)

    X_test = df_test[features].copy(deep=True)
    y_test = df_test[['target']].copy(deep=True)

    kfold_list.append([X_train, y_train, X_test, y_test])

In [None]:
# Drop the raw per-fold frames; kfold_list was built from deep copies of them,
# so nothing below refers to kfold_list_ any more.
del kfold_list_

### hyperopt setup

In [None]:
import lightgbm as lgb

In [None]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, anneal, rand

In [None]:
from hyperopt import hp
import numpy as np

# Candidate values per hyper-parameter. Every dimension is sampled with
# hp.choice, i.e. as a categorical pick from the listed options.
# LightGBM parameter reference:
# https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric-parameters
_choice_grid = {
    'num_boost_round': list(range(300, 1201, 1)),
    'boosting': ['gbdt'],
    'objective': ['regression_l2'],  # ,'regression_l1'
    'metric': ['mae'],
    'max_leaves': range(30, 301, 5),  # int
    'learning_rate': np.round(np.arange(0.01, 0.75, 0.01), 3),
    'feature_fraction': np.round(np.arange(0.45, 0.86, 0.01), 3),
    'max_depth': range(9, 24, 1),  # int
    'min_data_in_leaf': range(30, 501, 1),  # int
    'lambda_l1': range(5, 15, 1),  # reg_alpha
    'lambda_l2': range(5, 15, 1),  # reg_lambda
    'max_bin': range(20, 350, 5),  # int; alternative: hp.quniform('max_bin', 100, 500, 50)
    'min_data_in_bin': range(10, 100, 1),
    #'min_split_gain': np.round(np.arange(0.0005, 0.01, 0.0001),5),
    'bagging_fraction': np.round(np.arange(0.5, 0.86, 0.01), 3),
    'bagging_freq': range(20, 101, 1),  # int
    #'min_child_weight': range(300, 1000, 5),  # int
}

# The hyperopt label of each dimension is the parameter name itself.
search_space = {
    param_name: hp.choice(param_name, options)
    for param_name, options in _choice_grid.items()
}

In [None]:
def make_lgb_preds(X_train, y_train,X_test, num_round=100, params=None, verbose=False):
    """Train one LightGBM model and predict on ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``lgb.Dataset``.
    X_test
        Features to predict on.
    num_round : int
        Passed to ``lgb.train`` as ``num_boost_round``.
    params : dict or None
        LightGBM training parameters; ``None`` means an empty dict.
    verbose
        Passed to ``lgb.train`` as ``verbose_eval``.

    Returns
    -------
    tuple
        ``(y_preds, df_scores, tree_model)``: the predictions, a DataFrame
        with columns ``feature`` and ``gain`` (one row per column of
        ``X_train``), and the trained booster.
    """
    # Avoid a shared mutable default argument.
    if params is None:
        params = {}

    dtrain = lgb.Dataset(X_train, y_train)
    # NOTE(review): `verbose_eval` was deprecated in LightGBM 3.3 and removed
    # in 4.0 (replaced by the lgb.log_evaluation callback) — confirm the
    # pinned LightGBM version before upgrading.
    tree_model = lgb.train(params,
                dtrain,
                num_boost_round=num_round,
                verbose_eval=verbose)

    # NOTE(review): no validation set or early stopping is configured here, so
    # best_iteration presumably does not truncate the model — confirm.
    y_preds = tree_model.predict(X_test, num_iteration=tree_model.best_iteration)
    scores = tree_model.feature_importance(importance_type='gain', iteration=tree_model.best_iteration)
    df_scores = pd.DataFrame({'feature':list(X_train.columns), 'gain': list(scores)})

    return y_preds, df_scores, tree_model

In [None]:
from sklearn.metrics import mean_squared_error
def score(params):
    """Hyperopt objective: pooled out-of-fold MSE of one LightGBM configuration.

    Trains one model per entry of the global ``kfold_list``, concatenates the
    test-fold predictions and returns their mean squared error as the loss.

    Side effects (all on module-level state):
    - appends a deep copy of ``params`` to ``all_params``;
    - appends ``[trial_id, params_copy, -loss]`` to ``all_metrics``;
    - writes the pooled predictions to ``save_dir / 'kfold_<trial_id>'``;
    - periodically checkpoints ``all_metrics`` (Excel) and ``trials`` (pickle)
      next to ``save_dir``.

    Parameters
    ----------
    params : dict
        Sampled configuration. Must contain ``'num_boost_round'``; all other
        entries are forwarded to LightGBM. The dict is not modified.

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK}``. A configuration that was
        already evaluated returns the sentinel loss ``99999`` without
        retraining.
    """
    num_boost_round = params['num_boost_round']

    # Exact repeat of an earlier configuration: skip the expensive training.
    if params in all_params:
        return {'loss': 99999, 'status': STATUS_OK}

    recorded_params = copy.deepcopy(params)
    all_params.append(recorded_params)
    trial_id = len(all_params)  # 1-based id of this evaluation

    # Build the LightGBM parameters in a separate dict so the caller's
    # `params` is left untouched (no key deleted, no key added).
    lgb_params = {key: value for key, value in params.items() if key != 'num_boost_round'}
    lgb_params['verbose'] = -1

    pred_list = []
    for j, (X_train, y_train, X_test, y_test) in enumerate(kfold_list):
        y_preds, df_scores, i_model = make_lgb_preds(X_train, y_train, X_test, params=lgb_params,
                                                       num_round=num_boost_round,  verbose=False)
        df_pred = y_test.copy(deep=True)
        df_pred['pred'] = y_preds
        df_pred['fold'] = j+1
        pred_list.append(df_pred)

    df_pred_all = pd.concat(pred_list, axis=0)

    loss = mean_squared_error(df_pred_all['target'], df_pred_all['pred'])

    # The log stores the negated loss in its `metric` column.
    all_metrics.append([trial_id, recorded_params, -loss])
    df_pred_all.to_csv(save_dir.joinpath('kfold_'+str(trial_id)), sep='|', index=True, compression='bz2')

    # Checkpoint cadence kept as before: fires when trial_id + 1 is a multiple
    # of 10, i.e. after trials 9, 19, 29, ...
    if (trial_id + 1) % 10 == 0:
        save_metric(all_metrics, save_dir.parent, trial_folder+'.xlsx')
        joblib.dump(trials, save_dir.parent.joinpath(trial_folder+'.pkl'))

    return {'loss': loss, 'status': STATUS_OK}



In [None]:
def save_metric(metric_list, trials_dir, file_name):
    """Dump the trial log to an Excel file at ``trials_dir / file_name``.

    ``metric_list`` holds one ``[trial_id, params, metric]`` row per trial;
    the sheet's index column is labelled ``row_nr``.
    """
    log_frame = pd.DataFrame(metric_list, columns=['trial_id', 'params', 'metric'])
    log_frame.index.name = 'row_nr'
    destination = trials_dir.joinpath(file_name)
    log_frame.to_excel(destination)

In [None]:
from functools import partial
def optimize(space, evals, cores, trials, optimizer=tpe.suggest, random_state=1234, n_startup_jobs=50):
    """Run the hyperopt search over ``space`` with ``score`` as the objective.

    Parameters
    ----------
    space : dict
        Hyperopt search space. It is copied, not modified.
    evals : int
        Passed to ``fmin`` as ``max_evals``.
    cores : int
        Added to the space as the fixed entry ``'nthread'``.
    trials
        Hyperopt trials object passed to ``fmin``.
    optimizer
        Suggestion algorithm; it is called with an ``n_startup_jobs`` keyword,
        so it must accept one (``tpe.suggest`` does).
    random_state : int
        Added to the space as the fixed entry ``'seed'``. It is not passed to
        ``fmin`` itself.
    n_startup_jobs : int
        Forwarded to ``optimizer``.

    Returns
    -------
    dict
        The result of ``fmin``. For ``hp.choice`` dimensions hyperopt reports
        the index of the chosen option, not the option itself; see
        ``hyperopt.space_eval``.
    """
    # Work on a shallow copy so the caller's search-space dict does not gain
    # 'nthread' / 'seed' entries as a side effect of this call.
    space = dict(space)
    # Fixed entries: they reach `score` inside its params dict.
    space['nthread'] = cores
    space['seed'] = random_state

    algo = partial(optimizer, n_startup_jobs=n_startup_jobs)
    best = fmin(score, space, algo=algo, max_evals=evals, trials=trials)
    print(best)
    return best

## training

In [None]:
# Output locations for this tuning run. score() writes its per-trial files
# into save_dir, so the final expression shows whether that directory exists.
trial_folder = '10features_all2'
trials_dir = Path('../../trials')
save_dir = trials_dir / trial_folder
save_dir.exists()

In [None]:
# Search budget: total evaluations, and how many of them are passed to the
# optimizer as n_startup_jobs.
n_trials, n_random_trials = 8000, 2500

In [None]:
# Runtime settings for the search.
cores, verbose = 4, False
n = n_trials  # number of evaluations handed to optimize()

In [None]:
# Fresh bookkeeping for a new search: score() appends to both lists and
# pickles `trials` at its checkpoints.
all_metrics, all_params = [], []
trials = Trials()

In [None]:
# Launch the search; n_random_trials is forwarded as the optimizer's n_startup_jobs.
best_param = optimize(
    space=search_space,
    evals=n,
    cores=cores,
    trials=trials,
    optimizer=tpe.suggest,
    random_state=1234,
    n_startup_jobs=n_random_trials,
)