In [1]:
import warnings
warnings.filterwarnings("ignore")

# reading file
from tqdm import tqdm
import pandas as pd
import numpy as np


# Modelling
import os
import random
import pickle
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
from typing import List, Tuple
from collections import Counter, defaultdict


In [2]:
# Input data directory and output directory for trained-model artifacts.
root_path = "data"
save_dir = "checkpoints/artifacts/"

## 4.4 LightGBM

In [3]:
# Read scaled_data.csv from the data directory into the training frame.
scaled_csv_path = f'{root_path}/scaled_data.csv'
df_train = pd.read_csv(scaled_csv_path)

In [4]:
# Print the frame's row range, column count, dtypes and memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428932 entries, 0 to 428931
Columns: 114 entries, row_id to stock_id
dtypes: float64(112), int64(1), object(1)
memory usage: 373.1+ MB


In [5]:
# Split row_id on "-" and keep the second field as the time_id column.
row_id_parts = df_train['row_id'].str.split('-', expand=True)
df_train['time_id'] = row_id_parts[1]
df_train

Unnamed: 0,row_id,book.wap1.mean,book.wap1.std,book.wap1.sum,book.wap1.amin,book.wap1.amax,book.wap2.mean,book.wap2.std,book.wap2.sum,book.wap2.amin,...,trade.trade_volumn.amax,trade.weighted_price.realized_volatility,trade.weighted_price.mean,trade.weighted_price.std,trade.weighted_price.sum,trade.weighted_price.amin,trade.weighted_price.amax,target,stock_id,time_id
0,0-5,0.533363,0.000693,0.457273,0.711488,0.318289,0.006905,0.003301,0.006713,0.059265,...,0.000685,0.221584,0.505270,0.000578,0.063568,0.701747,0.334114,0.004136,0,5
1,0-11,0.511839,0.000262,0.288406,0.701066,0.295490,0.003434,0.000518,0.002298,0.032674,...,0.000383,0.185708,0.484476,0.000304,0.047108,0.685883,0.312804,0.001445,0,11
2,0-16,0.507535,0.000864,0.268528,0.686183,0.295734,0.001506,0.000182,0.001055,0.003873,...,0.000534,0.165812,0.478549,0.000932,0.038952,0.671526,0.310739,0.002168,0,16
3,0-31,0.503150,0.000757,0.157038,0.687526,0.293131,0.002702,0.000438,0.001059,0.094997,...,0.000614,0.119330,0.477461,0.000729,0.022733,0.675102,0.309503,0.002195,0,31
4,0-62,0.508009,0.000258,0.248900,0.698248,0.291719,0.003609,0.001814,0.002100,0.007725,...,0.000466,0.153132,0.480999,0.000182,0.034103,0.682849,0.308352,0.001747,0,62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,99-32751,0.514338,0.000459,0.762298,0.700714,0.298106,0.002725,0.000180,0.004761,0.203034,...,0.004004,0.378631,0.487010,0.000431,0.165666,0.685309,0.314554,0.001279,99,32751
428928,99-32753,0.513508,0.000641,0.567139,0.699965,0.302769,0.002386,0.000286,0.003191,0.075675,...,0.005794,0.355655,0.487363,0.000664,0.147813,0.685130,0.320784,0.000890,99,32753
428929,99-32758,0.500177,0.000643,0.781722,0.687706,0.290493,0.003294,0.000310,0.005757,0.065839,...,0.012904,0.337123,0.472882,0.000552,0.134488,0.673030,0.307772,0.001782,99,32758
428930,99-32763,0.503555,0.000303,0.788718,0.691726,0.290497,0.003460,0.000718,0.006058,0.008582,...,0.003992,0.490609,0.476782,0.000309,0.267498,0.676454,0.307878,0.003100,99,32763


In [6]:
def get_X_y(data):
    """Split a frame into model inputs and the target column.

    Returns a tuple ``(Xfeature, y)``: ``Xfeature`` is ``data`` without the
    "target" and "row_id" columns, ``y`` is the "target" column.
    """
    non_feature_cols = ["target", "row_id"]
    return data.drop(columns=non_feature_cols), data["target"]
 

def rmspe(y_true, y_pred):
    """Root mean squared percentage error.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``; the error of
    each element is taken relative to its true value.
    """
    relative_error = (y_true - y_pred) / y_true
    squared_error = np.square(relative_error)
    return np.sqrt(np.mean(squared_error))


def feval_rmspe(y_pred, model, is_xgb=True):
    """Custom evaluation callback reporting RMSPE against ``model.get_label()``.

    Returns ``("RMSPE", value)`` when ``is_xgb`` is true, otherwise
    ``("RMSPE", value, False)``. The trailing ``False`` fills the
    is-higher-better slot of the three-element form that LightGBM's ``feval``
    expects.
    """
    score = rmspe(model.get_label(), y_pred)

    if not is_xgb:
        return "RMSPE", score, False

    return "RMSPE", score


def feval_wrapper(y_pred, model):
    """Two-argument adapter around ``feval_rmspe`` that returns its three-element form."""
    name, value, higher_is_better = feval_rmspe(y_pred, model, is_xgb=False)
    return name, value, higher_is_better


def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield ``k`` train/test index splits that keep groups whole while balancing labels.

    Adapted from
    https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation

    Every group is assigned to exactly one fold. Groups are placed greedily:
    each one goes to the fold that gives the lowest mean (over labels) of the
    standard deviation (over folds) of each fold's share of that label.

    Parameters
    ----------
    X : not referenced in the body.
    y : sequence of non-negative integer labels; each value is used as an
        index into a count vector of length ``max(y) + 1``.
    groups : sequence of hashable group ids, aligned element-wise with ``y``.
    k : number of folds.
    seed : seed for the shuffle applied to the groups before they are sorted.

    Yields
    ------
    (train_indices, test_indices) : two lists of positions into ``groups``.
        For fold ``i`` the test list holds the rows whose group was assigned
        to fold ``i`` and the train list holds all remaining rows.
    """
    # Labels index a count vector, so they must be integers in [0, labels_num).
    labels_num = np.max(y) + 1
    # Per-group histogram of labels, and the overall count of each label.
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    # Running label histogram and set of assigned groups for every fold.
    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        """Score placing ``y_counts`` into ``fold`` (lower is better) without keeping the change."""
        # Tentatively add the group's counts; they are removed again below.
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            # Spread across folds of the fraction of this label held by each fold.
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)
    
    # Seeded shuffle first: sorted() is stable, so the shuffled order decides
    # the relative order of groups whose sort key is equal.
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # Visit groups in descending order of the std of their label counts.
    for g, y_counts in tqdm(sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])), total=len(groups_and_y_counts)):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            # Strict "<" keeps the lowest-numbered fold when scores tie.
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    # Turn the group-to-fold assignment into positional row indices.
    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [7]:
# Build the feature frame and the target from the full training frame.
X, y = get_X_y(df_train)

In [8]:
# Replace the index of both objects with the default integer index, in place.
X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

In [9]:
# Column lists: everything except the id/target columns, with and without stock_id.
_non_features = {"time_id", "target", "row_id"}
features = [col for col in df_train.columns if col not in _non_features]
feats_nostock = [col for col in features if col != "stock_id"]

In [10]:
# Regression target for the whole training frame.
y = df_train['target']

# Cross-validation settings.
try_seed = 42
KFOLD = 5
CV_SPLIT = 'default'

# Fail early, and say why, if an unimplemented split strategy is requested.
if CV_SPLIT != 'default':
    raise ValueError(f"Unsupported CV_SPLIT {CV_SPLIT!r}; only 'default' is implemented")

# Each time_id stays inside a single fold while stock_id is balanced across folds.
skf = stratified_group_k_fold(
    X=df_train[feats_nostock],
    y=df_train['stock_id'].astype('category').cat.codes.values,
    groups=np.array(df_train['time_id'].astype('category').cat.codes.values),
    k=KFOLD,
    seed=try_seed,
)

# Materialise the generator so the (train, valid) index pairs can be reused.
folds = []
for i, (idx_train, idx_valid) in enumerate(skf):
    folds.append((idx_train, idx_valid))
    print(f'Fold {i}: {len(idx_train)} train, {len(idx_valid)} valid')

100%|██████████| 3830/3830 [03:13<00:00, 19.76it/s]


Fold 0: 343146 train, 85786 valid
Fold 1: 343145 train, 85787 valid
Fold 2: 343146 train, 85786 valid
Fold 3: 343146 train, 85786 valid
Fold 4: 343145 train, 85787 valid


In [20]:
# Baseline LightGBM parameters; passed to train_lgbm as `params`.
# NOTE(review): every value is wrapped in a one-element list (grid-search
# style). Confirm the installed LightGBM interprets these as the intended
# scalar settings when given to lgb.train.
lgb_bl = {
    "boosting_type": ["gbdt"],
    "max_depth": [7],
    "num_leaves": [128],
    #"early_stopping_rounds":[10],

    "learning_rate": [0.05],
    # NOTE(review): LightGBM documents subsample / subsample_freq as aliases of
    # bagging_fraction / bagging_freq, yet both pairs are set below with
    # different values — confirm which pair is intended.
    "subsample": [0.72],
    "subsample_freq": [4],
    "feature_fraction": [0.4],
    "feature_fraction_bynode": [0.8],
    "bagging_fraction": [0.75],
    "bagging_freq": [25],

    "min_data_in_leaf": [1000],
    "min_sum_hessian_in_leaf": [20],

    "lambda_l1": [2],
    "lambda_l2": [4],
        
    "extra_trees": [True],
    "force_col_wise": [True],
        
    # NOTE(review): index 0 refers to the first column of the frame handed to
    # lgb.Dataset — verify that this is the intended categorical feature
    # (e.g. stock_id) and not a continuous one.
    "categorical_column": [0],
    "n_jobs": [-1],
    "verbose": [-1],
}

In [21]:
def train_lgbm(X: pd.DataFrame, y: pd.Series, folds: List[Tuple], output_dir, params):
    """Train one LightGBM model per cross-validation fold and pickle each model.

    Parameters
    ----------
    X : feature frame; a "time_id" column, if present, is dropped before training.
    y : target values aligned positionally with ``X``.
    folds : iterable of ``(train_indices, valid_indices)`` positional index pairs.
    output_dir : directory that receives ``lgb_bl_<fold>.pkl``; created if missing.
    params : LightGBM parameter dict forwarded to ``lgb.train``.

    Returns
    -------
    (best_losses, best_predictions) : the per-fold validation RMSPE values and
        the per-fold validation predictions, both in fold order.

    Raises
    ------
    ValueError : if ``folds`` is empty.
    """
    # Accept any iterable (including a generator) and validate it up front.
    folds = list(folds)
    if not folds:
        raise ValueError("folds must contain at least one (train, valid) index pair")

    # Make sure the model files have somewhere to go before training starts.
    os.makedirs(output_dir, exist_ok=True)

    # time_id is dropped here so it is never passed to the model as a feature.
    model_inputs = X.drop(columns=["time_id"], errors="ignore")

    best_losses = []
    best_predictions = []

    for cv_idx, (idx_train, idx_valid) in enumerate(folds):
        X_tr, X_va = model_inputs.iloc[idx_train], model_inputs.iloc[idx_valid]
        y_tr, y_va = y.iloc[idx_train], y.iloc[idx_valid]

        # Row weights 1 / target^2 scale a squared error (y - p)^2 into the
        # squared percentage error ((y - p) / y)^2 that rmspe measures.
        dtrain = lgb.Dataset(X_tr, y_tr, weight=1 / np.square(y_tr))
        dval = lgb.Dataset(X_va, y_va, weight=1 / np.square(y_va))

        print(f"fold {cv_idx} train: {X_tr.shape}, valid: {X_va.shape}")

        # NOTE(review): the verbose_eval / early_stopping_rounds keywords were
        # removed from lgb.train in LightGBM 4.0; switch to the
        # lgb.log_evaluation / lgb.early_stopping callbacks when upgrading.
        model = lgb.train(params=params,
                          num_boost_round=10000,
                          train_set=dtrain,
                          valid_sets=[dtrain, dval],
                          verbose_eval=250,
                          early_stopping_rounds=200,
                          feval=feval_wrapper)

        fold_preds = model.predict(X_va)
        valid_rmspe = rmspe(y_va, fold_preds)

        print(f"\nRMSPE of fold {cv_idx}: {valid_rmspe}")

        # Context manager guarantees the file is flushed and closed.
        model_path = os.path.join(output_dir, f"lgb_bl_{cv_idx}.pkl")
        with open(model_path, "wb") as model_file:
            pickle.dump(model, model_file)

        best_predictions.append(fold_preds)
        best_losses.append(valid_rmspe)

    return best_losses, best_predictions

In [22]:
# Train one model per fold; keep the per-fold RMSPE values and validation predictions.
lgbm_losses, lgbm_preds = train_lgbm(
    X=X,
    y=y,
    folds=folds,
    output_dir=save_dir,
    params=lgb_bl,
)

100%|██████████| 3830/3830 [02:56<00:00, 21.72it/s]


fold 0 train: (343146, 112), valid: (85786, 112)
Training until validation scores don't improve for 200 rounds
[250]	training's RMSPE: 0.235759	valid_1's RMSPE: 0.250779
[500]	training's RMSPE: 0.230668	valid_1's RMSPE: 0.247403
[750]	training's RMSPE: 0.22782	valid_1's RMSPE: 0.245468
[1000]	training's RMSPE: 0.225753	valid_1's RMSPE: 0.244788
[1250]	training's RMSPE: 0.224071	valid_1's RMSPE: 0.244188
[1500]	training's RMSPE: 0.222679	valid_1's RMSPE: 0.244001
[1750]	training's RMSPE: 0.221361	valid_1's RMSPE: 0.244231
Early stopping, best iteration is:
[1650]	training's RMSPE: 0.221876	valid_1's RMSPE: 0.243692

RMSPE of fold 0: 0.2436921386092969
fold 1 train: (343145, 112), valid: (85787, 112)
Training until validation scores don't improve for 200 rounds
[250]	training's RMSPE: 0.236959	valid_1's RMSPE: 0.238966
[500]	training's RMSPE: 0.231803	valid_1's RMSPE: 0.235901
[750]	training's RMSPE: 0.228933	valid_1's RMSPE: 0.234656
[1000]	training's RMSPE: 0.226827	valid_1's RMSPE: 0.

In [28]:
# Mean validation RMSPE over the folds.
np.mean(lgbm_losses)

0.23470429159543058

Load saved models

In [17]:
# with open(os.path.join(save_dir, f"lgb_bl_0.pkl"), 'rb') as f:
#    model_0 = pickle.load(f) 
# with open(os.path.join(save_dir, f"lgb_bl_1.pkl"), 'rb') as f:
#    model_1 = pickle.load(f) 
# with open(os.path.join(save_dir, f"lgb_bl_2.pkl"), 'rb') as f:
#    model_2 = pickle.load(f) 