<a href="https://colab.research.google.com/github/yuriao/DataScienceProjects/blob/main/optiver_lgbm_featureeng.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.8/226.8 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.12.1 colorlog-6.7.0 optuna-3.4.0


In [2]:
import copy
import gc
import warnings
from itertools import combinations
from warnings import simplefilter

import joblib
import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMRegressor
from scipy.stats import kurtosis, skew
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from tqdm import tqdm

warnings.filterwarnings('ignore')
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)


In [None]:
# Mount Google Drive at the relative mount point 'optiver_data'.
# NOTE(review): the read_csv cell below uses the absolute path
# /content/optiver_data/..., so this presumably relies on the working
# directory being /content — confirm.
from google.colab import drive
drive.mount('optiver_data')

# 1. read data

In [None]:
# Load the training table from the mounted Drive folder into `train`.
train = pd.read_csv('/content/optiver_data/MyDrive/optiver_data/train.csv')

In [None]:
# Show the loaded frame as this cell's output.
train

# 2. feature engineering

In [None]:
class Preprocessor:
    """Feature engineering for the training frame.

    ``preprocessing`` adds a signed imbalance column and then attaches, to
    every row, per-``stock_id`` summary statistics of the columns listed in
    ``FEATS_STAT``.
    """

    # Summary statistics computed for every continuous column.
    _BASE_STATS = ['mean', 'std', 'min', 'max', 'last', 'first', 'median', 'sum', 'skew']

    # (column, [aggregations]) pairs. The order here fixes the order of the
    # generated columns, each named ``<column>_<aggregation>``.
    FEATS_STAT = [
        ('imbalance_size', _BASE_STATS),
        ('imbalance_buy_sell_flag', ['nunique', 'mean']),
        ('signed_imbalance_size', _BASE_STATS),
        ('reference_price', _BASE_STATS),
        ('matched_size', _BASE_STATS),
        ('far_price', _BASE_STATS),
        ('near_price', _BASE_STATS),
        ('bid_price', _BASE_STATS),
        ('bid_size', _BASE_STATS),
        ('ask_price', _BASE_STATS),
        ('ask_size', _BASE_STATS),
        ('wap', _BASE_STATS),
    ]

    def preprocessing(self,df_ori):
        """Return a copy of ``df_ori`` enriched with engineered features.

        Parameters
        ----------
        df_ori : pandas.DataFrame
            Must contain ``stock_id``, ``imbalance_size``,
            ``imbalance_buy_sell_flag`` and every other column named in
            ``FEATS_STAT``. It is not modified.

        Returns
        -------
        pandas.DataFrame
            The input columns, then ``signed_imbalance_size``, then one
            ``<column>_<aggregation>`` column per entry of ``FEATS_STAT``.
            Row order follows the input; the index is a fresh RangeIndex
            because the statistics are attached with a left merge.

        Notes
        -----
        The statistics are computed per ``stock_id`` over *all* rows of the
        frame that is passed in, so each row of a given stock receives the
        same values.
        """
        df = df_ori.copy()

        # 1. additional columns
        df['signed_imbalance_size'] = df['imbalance_size'] * df['imbalance_buy_sell_flag']

        # 2. statistical features for each stock
        # One named aggregation computes every statistic in a single groupby
        # pass, and one merge attaches them all, instead of a separate
        # groupby + full-frame merge per statistic.
        named_aggs = {
            f'{column}_{method}': (column, method)
            for column, methods in self.FEATS_STAT
            for method in methods
        }
        stock_stats = df.groupby('stock_id').agg(**named_aggs).reset_index()

        return df.merge(stock_stats, on='stock_id', how='left')

In [None]:
preprocessor=Preprocessor()
train=preprocessor.preprocessing(train)

# 3. LGBM model + k-fold cross-validation

In [None]:
def KFold_model_training(train_feats,train_score,params,k):
    """Fit one ``LGBMRegressor`` per fold of a shuffled K-fold split.

    Parameters
    ----------
    train_feats : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    train_score : pandas.Series or single-column pandas.DataFrame
        Regression target aligned row-for-row with ``train_feats``.
    params : dict
        Keyword arguments forwarded to ``LGBMRegressor``.
    k : int
        Number of folds.

    Returns
    -------
    Y_pred : list
        Validation-fold predictions, concatenated in fold order.
    Y_ori : list
        The matching true target values, in the same order as ``Y_pred``.
    df_importance : pandas.DataFrame
        Columns ``column`` and ``importance``: the per-feature importance
        averaged over the folds, sorted in descending order.
    final_models : list
        The fitted model of each fold.
    """
    # Fixed seed so the fold assignment is the same on every call.
    kfold = KFold(n_splits=k, shuffle=True, random_state=42)

    Y_pred = []
    Y_ori = []
    df_importance_list = []
    final_models = []

    for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_feats)):
        X_train = train_feats.iloc[trn_idx,:]
        Y_train = train_score.iloc[trn_idx]

        X_val = train_feats.iloc[val_idx,:]
        Y_val = train_score.iloc[val_idx]

        print('\nFold_{} Training ================================\n'.format(fold_id+1))
        model = LGBMRegressor(**params)
        lgb_model = model.fit(X_train,
                              Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              eval_metric='rmse')

        pred_val = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration_)

        df_importance_list.append(pd.DataFrame({
            'column': list(train_feats.columns),
            'importance': lgb_model.feature_importances_,
        }))

        final_models.append(lgb_model)
        Y_pred.extend(pred_val)
        # Flatten to plain values first: iterating a DataFrame yields its
        # column labels, so extending with a single-column DataFrame would
        # store the label instead of the targets.
        Y_ori.extend(np.asarray(Y_val).ravel().tolist())

    df_importance = pd.concat(df_importance_list)
    df_importance = df_importance.groupby(['column'])['importance'].agg('mean').sort_values(ascending=False).reset_index()

    return Y_pred, Y_ori, df_importance, final_models

In [None]:
def cross_validation_training(all_train_features_full,train_score_full,params,k,
                              n_repeats=50,test_size=0.3,random_state=None):
    """Repeated hold-out evaluation of the K-fold LightGBM ensemble.

    Each repeat draws a fresh shuffled train/test split, trains ``k`` fold
    models on the training part with ``KFold_model_training``, predicts the
    held-out part with ``KFold_model_predict`` and records the errors.

    Parameters
    ----------
    all_train_features_full : pandas.DataFrame
        Features including an ``id`` column (dropped before training).
    train_score_full : pandas.DataFrame
        Targets with an ``id`` column and a ``score`` column.
    params : dict
        Keyword arguments forwarded to ``LGBMRegressor``.
    k : int
        Number of folds used inside each repeat.
    n_repeats : int, default 50
        Number of random train/test splits.
    test_size : float, default 0.3
        Fraction of rows held out in each split.
    random_state : int or None, default None
        ``None`` keeps the splits unseeded. An integer makes the splits
        reproducible; repeat ``i`` uses ``random_state + i``.

    Returns
    -------
    worst_rmse : float
        The largest hold-out RMSE over all repeats.
    errs : list
        Per repeat, the element-wise absolute errors on the hold-out set.
    test_scores : list
        Per repeat, the hold-out ``score`` Series.
    preds : list
        Per repeat, the hold-out predictions.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError('n_repeats must be at least 1')

    errs = []
    errs_rmse = []
    test_scores = []
    preds = []

    for repeat in range(n_repeats):
        split_seed = None if random_state is None else random_state + repeat
        feats_trn, feats_tst, score_trn, score_tst = train_test_split(
            all_train_features_full,
            train_score_full,
            test_size=test_size,
            shuffle=True,
            random_state=split_seed,
        )

        feats_trn = feats_trn.drop('id', axis=1)
        feats_tst = feats_tst.drop('id', axis=1)
        # Use the 1-D `score` column on both sides, so the model is fitted on
        # the same kind of target it is evaluated against.
        score_trn = score_trn['score']
        score_tst = score_tst['score']

        # training
        print('Fitting Model')
        _, _, _, final_models = KFold_model_training(feats_trn, score_trn, params, k)

        # NOTE(review): KFold_model_predict is not defined in the visible part
        # of this notebook — it must already exist in the kernel and return a
        # (predictions, <second value>) pair. Confirm before a fresh run.
        predictions, _ = KFold_model_predict(final_models, feats_tst)

        residual = predictions - score_tst.to_numpy()
        errs.append(np.abs(residual))
        # Take the square root explicitly rather than passing squared=False,
        # which newer scikit-learn releases no longer accept.
        errs_rmse.append(np.sqrt(mean_squared_error(score_tst, predictions)))

        test_scores.append(score_tst)
        preds.append(predictions)

    return np.max(errs_rmse), errs, test_scores, preds

## Hyperparameter optimization with Optuna

In [None]:
def objective(trial):
    """Optuna objective: worst hold-out RMSE for one sampled configuration.

    Samples LightGBM hyperparameters and the fold count ``k`` from ``trial``,
    then evaluates them with ``cross_validation_training`` on the
    module-level ``data`` and ``target`` frames, which must be assigned
    before ``study.optimize`` is called.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the search space.

    Returns
    -------
    float
        The first value returned by ``cross_validation_training`` (the
        largest RMSE over its repeats); the study minimises it.
    """
    param = {
        'feature_fraction': trial.suggest_float('feature_fraction', 0.3, 1.0),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'lambda_l1': trial.suggest_float('lambda_l1', 0.1, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.1, 10.0, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 300),
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        # `bagging_fraction` is a LightGBM alias of `subsample`, and the old
        # second suggestion reused the name 'subsample', so it only repeated
        # this value. The duplicate key is dropped.
        # NOTE(review): per the LightGBM docs bagging only takes effect when
        # the bagging frequency (`subsample_freq`) is non-zero — confirm
        # whether it should be enabled here.
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.0005, 0.1),
        'max_depth': trial.suggest_int('max_depth', 2, 50),
        'max_bin': trial.suggest_int('max_bin', 10, 50),
        'num_leaves': trial.suggest_int('num_leaves', 3, 50),
        'metric': 'rmse',
        'random_state': 42,
        # Correctly spelled keys: the previous 'early_stoppping' and 'vervose'
        # were not parameters LightGBM recognises.
        'early_stopping_round': 75,
        'verbose': -1,
    }

    k = trial.suggest_int('k', 3, 32)
    rmse, _, _, _ = cross_validation_training(data, target, param, k)

    return rmse

In [None]:
# Run the Optuna search and report how long it took.
import time
start_time = time.time()

# Only show Optuna log messages of level CRITICAL.
optuna.logging.set_verbosity(optuna.logging.CRITICAL)
study = optuna.create_study(direction='minimize')
# `objective` reads the module-level names `data` and `target`.
# NOTE(review): all_train_features_full_ori and train_score_ori are not
# defined in the visible part of this notebook — confirm they exist on a
# fresh kernel before running this cell.
data=all_train_features_full_ori
target=train_score_ori
study.optimize(objective, n_trials=1500)

# Elapsed wall-clock time of the search, in seconds.
print(time.time() - start_time)


In [None]:
# Summarise the finished study and keep the best parameter set for later use.
print(f'Number of finished trials: {len(study.trials)}')
print(f'Best trial: {study.best_trial.params}')
print(f'Best score: {study.best_value}')
# Includes every value suggested in `objective`, i.e. also the fold count 'k'.
best_param=study.best_trial.params

#