In [1]:
import warnings
warnings.filterwarnings("ignore")

# reading file
from tqdm import tqdm
import pandas as pd
import numpy as np


# Modelling
import os
import random
import pickle
from sklearn.model_selection import GroupKFold
from sklearn.cluster import KMeans
import lightgbm as lgb
from typing import List, Tuple, Optional, Union
from collections import Counter, defaultdict


In [2]:
# Notebook-wide paths: where the input CSVs are read from, and where the
# pickled LightGBM models are written.
root_path = "data"
save_dir = "checkpoints/artifacts/"

## 4.4 LightGBM

In [3]:
# Load the feature table from scaled_data.csv under root_path.
df_train = pd.read_csv(f'{root_path}/scaled_data.csv')

In [4]:
# Column count, dtypes and memory footprint of the loaded frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425343 entries, 0 to 425342
Columns: 114 entries, row_id to stock_id
dtypes: float64(112), int64(1), object(1)
memory usage: 369.9+ MB


In [5]:
# row_id has the form "<stock_id>-<time_id>"; keep the part after the dash.
# The resulting time_id column holds strings (it is the result of a str.split),
# not integers.
df_train['time_id'] = df_train['row_id'].str.split('-', expand = True)[1]
df_train

Unnamed: 0,row_id,book.wap1.mean,book.wap1.std,book.wap1.sum,book.wap1.amin,book.wap1.amax,book.wap2.mean,book.wap2.std,book.wap2.sum,book.wap2.amin,...,trade.trade_volumn.amax,trade.weighted_price.realized_volatility,trade.weighted_price.mean,trade.weighted_price.std,trade.weighted_price.sum,trade.weighted_price.amin,trade.weighted_price.amax,target,stock_id,time_id
0,0-5,0.533363,0.000693,0.457273,0.711488,0.318289,0.006905,0.003301,0.006713,0.059265,...,0.000685,0.221584,0.505270,0.000578,0.063568,0.701747,0.334114,0.004136,0,5
1,0-11,0.511839,0.000262,0.288406,0.701066,0.295490,0.003434,0.000518,0.002298,0.032674,...,0.000383,0.185708,0.484476,0.000304,0.047108,0.685883,0.312804,0.001445,0,11
2,0-16,0.507535,0.000864,0.268528,0.686183,0.295734,0.001506,0.000182,0.001055,0.003873,...,0.000534,0.165812,0.478549,0.000932,0.038952,0.671526,0.310739,0.002168,0,16
3,0-31,0.503150,0.000757,0.157038,0.687526,0.293131,0.002702,0.000438,0.001059,0.094997,...,0.000614,0.119330,0.477461,0.000729,0.022733,0.675102,0.309503,0.002195,0,31
4,0-62,0.508009,0.000258,0.248900,0.698248,0.291719,0.003609,0.001814,0.002100,0.007725,...,0.000466,0.153132,0.480999,0.000182,0.034103,0.682849,0.308352,0.001747,0,62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425338,99-1962,0.507617,0.000399,0.627149,0.694414,0.293331,0.003781,0.000379,0.005243,0.167695,...,0.005762,0.316797,0.480570,0.000435,0.120067,0.678839,0.310094,0.001122,99,1962
425339,99-1980,0.496448,0.005809,0.871126,0.631582,0.347846,0.003782,0.001613,0.007226,0.006395,...,0.026894,0.726181,0.468218,0.005909,0.560080,0.614216,0.364131,0.005230,99,1980
425340,99-1981,0.529883,0.001138,0.757741,0.711353,0.328254,0.002148,0.000342,0.003874,0.031994,...,0.003076,0.560824,0.502123,0.001087,0.341909,0.696562,0.343720,0.004505,99,1981
425341,99-1991,0.506296,0.000249,0.741608,0.694887,0.291730,0.002348,0.000109,0.004097,0.190385,...,0.003259,0.289489,0.479031,0.000277,0.102198,0.679863,0.308145,0.001242,99,1991


In [6]:
def get_X_y(data):
    """Split a frame into a feature matrix and the target column.

    Args:
        data: DataFrame containing at least the columns "target",
            "time_id" and "row_id".

    Returns:
        (Xfeature, y): ``data`` without the three columns above, and the
        "target" column as a Series. ``data`` itself is not modified.
    """
    non_feature_columns = ["target", "time_id", "row_id"]
    target = data["target"]
    feature_frame = data.drop(columns=non_feature_columns)
    return feature_frame, target
 

def rmspe(y_true, y_pred):
    """Root mean squared percentage error.

    Computes sqrt(mean(((y_true - y_pred) / y_true) ** 2)).

    Args:
        y_true: Ground-truth values (array-like supporting arithmetic).
        y_pred: Predicted values, same shape as ``y_true``.

    Returns:
        The RMSPE as a float. The error is divided by ``y_true`` with no
        guard, so zeros in ``y_true`` are not handled specially.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)


def feval_rmspe(y_pred, model, is_xgb=True):
    """Custom evaluation metric: RMSPE between a dataset's labels and predictions.

    Args:
        y_pred: Predictions for the dataset being evaluated.
        model: Object exposing ``get_label()`` that returns the true values
            (the dataset handed to the metric by the boosting library).
        is_xgb: When truthy, return a 2-tuple ``("RMSPE", value)``. Otherwise
            return a 3-tuple ``("RMSPE", value, False)``; the trailing False
            is presumably LightGBM's "is higher better" flag.

    Returns:
        A tuple as described above.
    """
    score = rmspe(model.get_label(), y_pred)
    result = ("RMSPE", score)
    if not is_xgb:
        result = result + (False,)
    return result


def feval_wrapper(y_pred, model):
    """Two-argument adapter around feval_rmspe that requests the 3-tuple form.

    Args:
        y_pred: Predictions for the dataset being evaluated.
        model: Object exposing ``get_label()``.

    Returns:
        ``("RMSPE", value, False)``.
    """
    metric_triplet = feval_rmspe(y_pred, model, is_xgb=False)
    return metric_triplet


def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield k (train, test) index splits that keep groups intact and balance labels.

    Source: https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation

    Every group is assigned to exactly one fold, greedily, so that each label's
    share of samples is as even as possible across folds.

    Args:
        X: Not used by this function.
        y: Per-sample integer labels. They are used directly as array indices,
            and the number of labels is taken to be ``max(y) + 1``.
        groups: Per-sample group identifier (hashable), aligned with ``y``.
        k: Number of folds.
        seed: Seed for the shuffle applied to the groups before they are
            sorted; it influences the order of groups whose sort key ties.

    Yields:
        ``(train_indices, test_indices)`` for each fold ``0..k-1``: lists of
        positional indices into ``groups``.
    """
    labels_num = np.max(y) + 1
    # Per group: how many samples of each label it contains.
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    # Overall number of samples per label.
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    # Running label counts and member groups of each fold.
    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # Score a candidate assignment: temporarily add the group's counts to
        # `fold`, measure how unevenly each label is spread across the k folds
        # (std of the per-fold fraction of that label), then undo the addition.
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            # NOTE(review): y_distr[label] is 0 for a label index that never
            # occurs in y, which would make this a division by zero.
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)
    
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # Process groups with the most uneven label counts first; each one goes to
    # the fold with the lowest score (the first such fold on ties).
    for g, y_counts in tqdm(sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])), total=len(groups_and_y_counts)):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        # The comprehension variable `i` is local to the comprehension and
        # does not disturb the fold counter of the enclosing loop.
        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [16]:
# Baseline LightGBM parameters, passed as `params` to train_lgbm.
# Every value is wrapped in a one-element list.
#
# NOTE(review): subsample / subsample_freq look like aliases of
# bagging_fraction / bagging_freq in LightGBM, yet both pairs are set to
# different values here -- confirm which pair is meant to take effect.
# NOTE(review): categorical_column=[0] refers to the first column of the
# training frame; df_train.info() lists stock_id as the last column, so
# verify that index 0 is really the intended categorical feature.
lgb_bl = dict(
    boosting_type=["gbdt"],
    max_depth=[7],
    num_leaves=[128],
    # early_stopping_rounds=[10],

    learning_rate=[0.05],
    subsample=[0.72],
    subsample_freq=[4],
    feature_fraction=[0.4],
    feature_fraction_bynode=[0.8],
    bagging_fraction=[0.75],
    bagging_freq=[25],

    min_data_in_leaf=[1000],
    min_sum_hessian_in_leaf=[20],

    lambda_l1=[2],
    lambda_l2=[4],

    extra_trees=[True],
    force_col_wise=[True],

    categorical_column=[0],
    n_jobs=[-1],
    verbose=[-1],
)

In [8]:
def train_lgbm(df_train: pd.DataFrame, l, output_dir, params):
    """Train one LightGBM model per (stock group, CV fold) and collect validation RMSPE.

    For every stock group in ``l`` the matching rows of ``df_train`` are split
    into KFOLD folds with ``stratified_group_k_fold`` (labels = stock_id codes,
    groups = time_id codes). One model is trained per fold and pickled to
    ``output_dir``.

    Relies on notebook-level names that must be defined before the call:
    ``CV_SPLIT``, ``KFOLD``, ``try_seed``, ``feats_nostock``,
    ``stratified_group_k_fold``, ``rmspe`` and ``feval_wrapper``.

    Args:
        df_train: Frame with at least the columns "stock_id", "time_id",
            "row_id" and "target", plus the feature columns.
        l: Iterable of stock-id collections; each one is trained separately.
        output_dir: Directory the pickled models are written to. It is created
            if it does not exist.
        params: LightGBM parameter dict forwarded to ``lgb.train``.

    Returns:
        (best_losses, best_predictions): two lists with one entry per trained
        model, ordered by stock group and then by fold. They hold the
        validation RMSPE and the validation-set predictions.

    Raises:
        ValueError: If ``CV_SPLIT`` is not 'default', the only implemented split.
    """
    if CV_SPLIT != 'default':
        raise ValueError(f"unsupported CV_SPLIT: {CV_SPLIT!r}")

    os.makedirs(output_dir, exist_ok=True)

    best_losses = []
    best_predictions = []

    for group_idx, stock_ids in enumerate(l):
        print(f'stock group: {stock_ids}')
        X_stock = df_train[df_train.stock_id.isin(stock_ids)]

        y = X_stock['target']
        X_stock = X_stock.drop(['target'], axis=1)

        # Stratify on stock_id and keep every time_id inside a single fold.
        skf = stratified_group_k_fold(
            X=X_stock[feats_nostock],
            y=X_stock['stock_id'].astype('category').cat.codes.values,
            groups=np.array(X_stock['time_id'].astype('category').cat.codes.values),
            k=KFOLD,
            seed=try_seed,
        )

        for cv_idx, (idx_train, idx_valid) in enumerate(skf):
            X_tr = X_stock.iloc[idx_train].drop(columns=["time_id", "row_id"])
            X_va = X_stock.iloc[idx_valid].drop(columns=["time_id", "row_id"])
            y_tr, y_va = y.iloc[idx_train], y.iloc[idx_valid]

            # A 1/y^2 sample weight turns a squared error into a squared
            # *percentage* error, which matches the RMSPE evaluation metric.
            dtrain = lgb.Dataset(X_tr, y_tr, weight=1/np.square(y_tr))
            dval = lgb.Dataset(X_va, y_va, weight=1/np.square(y_va))

            print(f"fold {cv_idx} train: {X_tr.shape}, valid: {X_va.shape}")

            # NOTE(review): `verbose_eval` and `early_stopping_rounds` were
            # removed from lgb.train in LightGBM 4.0 in favour of callbacks.
            # They are kept because the recorded output shows this notebook
            # ran on a version that accepts them.
            model = lgb.train(params=params,
                              num_boost_round=10000,
                              train_set=dtrain,
                              valid_sets=[dtrain, dval],
                              verbose_eval=250,
                              early_stopping_rounds=200,
                              feval=feval_wrapper)

            fold_preds = model.predict(X_va)
            valid_rmspe = rmspe(y_va, fold_preds)

            print(f"\nRMSPE of fold {cv_idx}: {valid_rmspe}")

            # The stock-group index is part of the file name; without it every
            # group would overwrite the previous group's lgb_bl_<fold>.pkl.
            model_path = os.path.join(output_dir, f"lgb_bl_g{group_idx}_{cv_idx}.pkl")
            with open(model_path, "wb") as model_file:
                pickle.dump(model, model_file)

            best_predictions.append(fold_preds)
            best_losses.append(valid_rmspe)

    return best_losses, best_predictions

In [9]:
# # # full data
# Feature matrix / target split over the full training frame.
X, y = get_X_y(df_train)

In [10]:
# Feature column names in df_train order: everything except the identifier
# and target columns, and the same list with stock_id also removed.
features = [name for name in df_train.columns if name not in ("time_id", "target", "row_id")]
feats_nostock = [name for name in features if name != "stock_id"]

In [None]:
# # Create out of folds array
# y = df_train['target']
# # Iterate through each fold
# try_seed = 42
# KFOLD = 5
# CV_SPLIT = 'default' 

# if CV_SPLIT == 'default':
#     #gkf = GroupKFold(n_splits=KFOLD)
#     skf = stratified_group_k_fold(X=df_train[feats_nostock], y=df_train['stock_id'].astype('category').cat.codes.values, 
#                               groups=np.array(df_train['time_id'].astype('category').cat.codes.values), k=KFOLD, seed=try_seed)
#     folds = []

#     for i, (idx_train, idx_valid) in enumerate(skf):
#       # x_train, x_val = train.iloc[idx_train], train.iloc[idx_valid]
#       # y_train, y_val = y.iloc[idx_train], y.iloc[idx_valid]
#       folds.append((idx_train, idx_valid))
#       print(f'Fold {i}: {len(idx_train)} train, {len(idx_valid)} valid')
# else:
#     raise ValueError()

In [12]:
# Cluster stocks by how similarly their targets move over time:
# pivot train.csv to a time_id x stock_id table of targets, take the Kendall
# correlation between stocks, and run KMeans on the rows of that matrix.
train_p = (
    pd.read_csv(f'{root_path}/train.csv')
    .pivot(index='time_id', columns='stock_id', values='target')
)

corr = train_p.corr(method='kendall')

# Stock ids in the same order as the rows of `corr` (and as kmeans.labels_).
ids = corr.index

kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(corr.values)
print(kmeans.labels_)

[2 1 2 4 0 0 4 2 3 4 1 1 2 2 2 2 2 4 2 2 0 1 2 2 0 2 1 2 4 2 0 2 2 1 0 0 2
 0 2 2 2 1 2 2 2 1 2 2 2 1 1 4 2 0 4 1 1 2 2 2 2 1 2 1 1 0 4 1 4 1 3 3 0 0
 1 2 0 1 0 4 0 2 2 2 1 4 0 0 2 0 2 0 2 2 2 2 2 4 2 1 1 2 2 1 2 2 2 1 2 1 2
 1]


In [13]:
# Cross-validation settings. train_lgbm reads these as notebook-level names.
try_seed = 42
KFOLD = 5
CV_SPLIT = 'default'

if CV_SPLIT != 'default':
    # Only the stratified group k-fold split is implemented.
    raise ValueError(f"unsupported CV_SPLIT: {CV_SPLIT!r}")

# One list of stock ids per KMeans cluster, in cluster-label order.
# `ids` is aligned with kmeans.labels_, so a boolean mask selects the members.
l = [ids[kmeans.labels_ == n].tolist() for n in range(kmeans.n_clusters)]

# Dry run of the split used by train_lgbm: report the fold sizes per stock group.
# The loop uses its own variable names so that it does not overwrite the
# notebook-level `y` (full-data target) or reuse one counter for both loops.
for stock_ids in l:
    print(f'stock group: {stock_ids}')
    group_rows = df_train[df_train.stock_id.isin(stock_ids)]

    skf = stratified_group_k_fold(
        X=group_rows[feats_nostock],
        y=group_rows['stock_id'].astype('category').cat.codes.values,
        groups=np.array(group_rows['time_id'].astype('category').cat.codes.values),
        k=KFOLD,
        seed=try_seed,
    )

    # (train, valid) index lists of the current group; reset for every group.
    folds = []
    for fold_idx, (idx_train, idx_valid) in enumerate(skf):
        folds.append((idx_train, idx_valid))
        print(f'Fold {fold_idx}: {len(idx_train)} train, {len(idx_valid)} valid')


stock group: [4, 5, 21, 27, 33, 37, 38, 40, 60, 74, 82, 83, 86, 88, 90, 98, 99, 101, 103]


100%|██████████| 3830/3830 [00:27<00:00, 138.14it/s]


Fold 0: 55333 train, 13833 valid
Fold 1: 55333 train, 13833 valid
Fold 2: 55333 train, 13833 valid
Fold 3: 55333 train, 13833 valid
Fold 4: 55332 train, 13834 valid
stock group: [1, 10, 11, 22, 29, 36, 44, 50, 55, 56, 62, 63, 69, 72, 73, 76, 78, 84, 87, 96, 112, 113, 116, 122, 124, 126]


100%|██████████| 3830/3830 [00:48<00:00, 79.00it/s] 


Fold 0: 79664 train, 19916 valid
Fold 1: 79664 train, 19916 valid
Fold 2: 79664 train, 19916 valid
Fold 3: 79664 train, 19916 valid
Fold 4: 79664 train, 19916 valid
stock group: [0, 2, 7, 13, 14, 15, 16, 17, 19, 20, 23, 26, 28, 30, 32, 34, 35, 39, 41, 42, 43, 46, 47, 48, 51, 52, 53, 59, 64, 66, 67, 68, 70, 85, 93, 94, 95, 100, 102, 104, 105, 107, 108, 109, 111, 114, 115, 118, 119, 120, 123, 125]


100%|██████████| 3830/3830 [01:53<00:00, 33.76it/s]


Fold 0: 159327 train, 39831 valid
Fold 1: 159327 train, 39831 valid
Fold 2: 159326 train, 39832 valid
Fold 3: 159326 train, 39832 valid
Fold 4: 159326 train, 39832 valid
stock group: [8, 80, 81]


100%|██████████| 3830/3830 [00:08<00:00, 464.71it/s]


Fold 0: 9184 train, 2296 valid
Fold 1: 9184 train, 2296 valid
Fold 2: 9184 train, 2296 valid
Fold 3: 9184 train, 2296 valid
Fold 4: 9184 train, 2296 valid
stock group: [3, 6, 9, 18, 31, 58, 61, 75, 77, 89, 97, 110]


100%|██████████| 3830/3830 [00:22<00:00, 167.90it/s]

Fold 0: 36768 train, 9191 valid
Fold 1: 36767 train, 9192 valid
Fold 2: 36767 train, 9192 valid
Fold 3: 36767 train, 9192 valid
Fold 4: 36767 train, 9192 valid





In [14]:
# Give X and y a fresh 0..n-1 index, in place. X and y are expected to be
# the full-data split returned by get_X_y when this cell runs.
X.reset_index(drop=True, inplace=True); y.reset_index(drop=True, inplace=True)

In [17]:
# Train one model per (stock group, fold); models are pickled under save_dir.
# Long-running: 10000 boosting rounds max with early stopping, for every fold
# of every stock group.
lgbm_losses, lgbm_preds = train_lgbm(df_train,l = l, 
                              output_dir=save_dir,
                              params= lgb_bl)

stock group: [4, 5, 21, 27, 33, 37, 38, 40, 60, 74, 82, 83, 86, 88, 90, 98, 99, 101, 103]


100%|██████████| 3830/3830 [00:54<00:00, 69.66it/s] 


fold 0 train: (55333, 112), valid: (13833, 112)
Training until validation scores don't improve for 200 rounds
[250]	training's RMSPE: 0.263012	valid_1's RMSPE: 0.264363
[500]	training's RMSPE: 0.257579	valid_1's RMSPE: 0.26244
[750]	training's RMSPE: 0.253796	valid_1's RMSPE: 0.261456
[1000]	training's RMSPE: 0.250743	valid_1's RMSPE: 0.260964
[1250]	training's RMSPE: 0.248086	valid_1's RMSPE: 0.2608
[1500]	training's RMSPE: 0.245577	valid_1's RMSPE: 0.260567
[1750]	training's RMSPE: 0.243473	valid_1's RMSPE: 0.26047
[2000]	training's RMSPE: 0.24151	valid_1's RMSPE: 0.260505
Early stopping, best iteration is:
[1837]	training's RMSPE: 0.242767	valid_1's RMSPE: 0.260375

RMSPE of fold 0: 0.2603750203088708
fold 1 train: (55333, 112), valid: (13833, 112)
Training until validation scores don't improve for 200 rounds
[250]	training's RMSPE: 0.261754	valid_1's RMSPE: 0.273977
[500]	training's RMSPE: 0.256318	valid_1's RMSPE: 0.271675
[750]	training's RMSPE: 0.25239	valid_1's RMSPE: 0.270473


KeyboardInterrupt: 

In [None]:
# Validation RMSPE per trained model, in the order train_lgbm appended them
# (stock group by stock group, fold by fold).
# NOTE(review): the training cell above ends in KeyboardInterrupt, so the
# values shown here presumably come from an earlier, completed run.
lgbm_losses

[0.2281209816687572,
 0.21700990661736327,
 0.22441095293406976,
 0.2170015148121175,
 0.22123825265447938,
 0.2690769809335837,
 0.2642502810387935,
 0.26503750437013474,
 0.28367435149565395,
 0.2643990561289135,
 0.2220524477783735,
 0.2207926229267815,
 0.22233575068123035,
 0.21904276479078513,
 0.22546162648628454,
 0.24420483359555645,
 0.24961161451136596,
 0.28861180306826734,
 0.2493914839034596,
 0.25357001073907326,
 0.20484109119198965,
 0.19257327598365936,
 0.197961206000214,
 0.19132198979071394,
 0.19350023119613577]

In [None]:
# Unweighted mean of the per-model validation RMSPE values.
np.mean(lgbm_losses)

0.23317970141191025

In [None]:
# 0.2324505127007035

## Test

In [None]:
# with open(os.path.join(save_dir, f"lgb_bl_0.pkl"), 'rb') as f:
#    model_0 = pickle.load(f) 
# with open(os.path.join(save_dir, f"lgb_bl_1.pkl"), 'rb') as f:
#    model_1 = pickle.load(f) 
# with open(os.path.join(save_dir, f"lgb_bl_2.pkl"), 'rb') as f:
#    model_2 = pickle.load(f) 