In [1]:
import os
import math
import warnings
import gc
warnings.filterwarnings('ignore')
from tqdm import tqdm
import pickle

import bloscpack as bp

import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split, RepeatedStratifiedKFold
from sklearn import metrics

pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)

In [7]:
dat_files = [f for f in os.listdir('../input') if ('_dat_' in f) and ('srs' not in f)]
lbl_files = [f for f in os.listdir('../input') if ('_lbl_' in f) and ('srs' not in f)]
# feat_files = [f for f in os.listdir('../input') if '_feat_' in f]

def load_group_data(group):
    assert isinstance(group, str)
    dat_to_load = [f for f in dat_files if group in f]
    lbl_to_load = [f for f in lbl_files if group in f]
    
    dat = np.concatenate([bp.unpack_ndarray_from_file(os.path.join('../input/', f)) for f in dat_to_load], axis=1)
    lbl = [bp.unpack_ndarray_from_file(os.path.join('../input/', f)) for f in lbl_to_load]
    assert np.all([np.all(a==b) for a, b in zip(lbl[:-1], lbl[1:])])
    return dat, lbl[0]

In [10]:
g0_dat, g0_lbl = load_group_data('g0')
g1_dat, g1_lbl = load_group_data('g1')
g2_dat, g2_lbl = load_group_data('g2')
g3_dat, g3_lbl = load_group_data('g3')
g4_dat, g4_lbl = load_group_data('g4')
g5_dat, g5_lbl = load_group_data('g5')
g6_dat, g6_lbl = load_group_data('g6')
g7_dat, g7_lbl = load_group_data('g7')
g8_dat, g8_lbl = load_group_data('g8')
g9_dat, g9_lbl = load_group_data('g9')

In [9]:
dat = np.concatenate([g0_dat, g1_dat, g2_dat, g3_dat, g4_dat, g5_dat, g6_dat, g7_dat, g8_dat, g9_dat], axis=0)
lbl = np.concatenate([g0_lbl, g1_lbl, g2_lbl, g3_lbl, g4_lbl, g5_lbl, g6_lbl, g7_lbl, g8_lbl, g9_lbl], axis=0)
# dat = np.concatenate([g0_dat, g1_dat, g2_dat, g3_dat, g6_dat, g7_dat], axis=0)
# lbl = np.concatenate([g0_lbl, g1_lbl, g2_lbl, g3_lbl, g6_lbl, g7_lbl], axis=0)
# dat = np.concatenate([g3_dat, g4_dat, g5_dat, g7_dat, g8_dat, g9_dat], axis=0)
# lbl = np.concatenate([g3_lbl, g4_lbl, g5_lbl, g7_lbl, g8_lbl, g9_lbl], axis=0)
# dat = np.concatenate([g4_dat, g5_dat, g8_dat, g9_dat], axis=0)
# lbl = np.concatenate([g4_lbl, g5_lbl, g8_lbl, g9_lbl], axis=0)
# dat = g9_dat
# lbl = g9_lbl

NameError: name 'g0_dat' is not defined

In [None]:
# del g0_dat, g1_dat, g2_dat, g3_dat, g4_dat, g5_dat, g6_dat, g7_dat, g8_dat, g9_dat
# del g0_lbl, g1_lbl, g2_lbl, g3_lbl, g4_lbl, g5_lbl, g6_lbl, g7_lbl, g8_lbl, g9_lbl
# del g0_dat, g1_dat, g2_dat
# del g0_lbl, g1_lbl, g2_lbl

In [4]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [7]:
for fold, (trn_ndcs, vld_ndcs) in enumerate(kf.split(dat, lbl)):
    x_trn, x_vld = dat[trn_ndcs], dat[vld_ndcs]
    y_trn, y_vld = lbl[trn_ndcs], lbl[vld_ndcs]
    #trn_set = lgb.Dataset(x_trn, y_trn)
    #vld_set = lgb.Dataset(x_vld, y_vld)
    break

In [8]:
params = {
    "boosting": "gbdt",
    "metric": 'rmse',
    'objective': 'regression',
    'random_state': 236,
    'num_leaves': 280,
    'learning_rate': 0.026623466966581126,
    'max_depth': 80,
    'reg_alpha': 2.959759088169741, # L1
    'reg_lambda': 1.331172832164913, # L2
    "bagging_fraction": 0.9655406551472153,
    "bagging_freq": 9,
    'colsample_bytree': 0.6867118652742716
}

In [9]:
# model = lgb.train(params, trn_set, num_boost_round=10000, early_stopping_rounds=100, valid_sets=[vld_set], verbose_eval=50)
model = lgb.LGBMRegressor(**params, n_estimators=10000, n_jobs=12)
model.fit(X=x_trn, y=y_trn, eval_set=[(x_vld, y_vld)], eval_metric='rmse', verbose=50, early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 0.646549
[100]	valid_0's rmse: 0.255594
[150]	valid_0's rmse: 0.202702
[200]	valid_0's rmse: 0.198086
[250]	valid_0's rmse: 0.197368
[300]	valid_0's rmse: 0.197023
[350]	valid_0's rmse: 0.19681
[400]	valid_0's rmse: 0.19665
[450]	valid_0's rmse: 0.196541
[500]	valid_0's rmse: 0.196456
[550]	valid_0's rmse: 0.196385
[600]	valid_0's rmse: 0.196301
[650]	valid_0's rmse: 0.196238
[700]	valid_0's rmse: 0.19617
[750]	valid_0's rmse: 0.196109
[800]	valid_0's rmse: 0.196064
[850]	valid_0's rmse: 0.196014
[900]	valid_0's rmse: 0.195978
[950]	valid_0's rmse: 0.195931
[1000]	valid_0's rmse: 0.195898
[1050]	valid_0's rmse: 0.195884
[1100]	valid_0's rmse: 0.195848
[1150]	valid_0's rmse: 0.19582
[1200]	valid_0's rmse: 0.195794
[1250]	valid_0's rmse: 0.195781
[1300]	valid_0's rmse: 0.195753
[1350]	valid_0's rmse: 0.195728
[1400]	valid_0's rmse: 0.19571
[1450]	valid_0's rmse: 0.195693
[1500]	valid_0's rmse: 0.195671
[15

LGBMRegressor(bagging_fraction=0.9655406551472153, bagging_freq=9,
              boosting='gbdt', boosting_type='gbdt', class_weight=None,
              colsample_bytree=0.6867118652742716, importance_type='split',
              learning_rate=0.026623466966581126, max_depth=80, metric='rmse',
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=10000, n_jobs=12, num_leaves=280,
              objective='regression', random_state=236,
              reg_alpha=2.959759088169741, reg_lambda=1.331172832164913,
              silent=True, subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0)

In [10]:
vld_pred = model.predict(x_vld, num_iteration=model.best_iteration_)
vld_pred = np.round(np.clip(vld_pred, 0, 10)).astype(int)
f1 = metrics.f1_score(y_vld.astype(int), vld_pred, average = 'macro')
print(f1)

0.934983763304277


In [10]:
vld_pred_g0= model.predict(g0_dat, num_iteration=model.best_iteration_)
vld_pred_g0 = np.round(np.clip(vld_pred_g0, 0, 10)).astype(int)
f1_g0 = metrics.f1_score(g0_lbl.astype(int), vld_pred_g0, average = 'macro')
print(f1_g0)

0.6648643063234626


In [11]:
vld_pred_g1= model.predict(g1_dat, num_iteration=model.best_iteration_)
vld_pred_g1 = np.round(np.clip(vld_pred_g1, 0, 10)).astype(int)
f1_g1 = metrics.f1_score(g1_lbl.astype(int), vld_pred_g1, average = 'macro')
print(f1_g1)

0.9968103630824334


In [12]:
vld_pred_g2= model.predict(g2_dat, num_iteration=model.best_iteration_)
vld_pred_g2 = np.round(np.clip(vld_pred_g2, 0, 10)).astype(int)
f1_g2 = metrics.f1_score(g2_lbl.astype(int), vld_pred_g2, average = 'macro')
print(f1_g2)

0.6647126573075317


In [21]:
vld_pred_g3= model.predict(g3_dat, num_iteration=model.best_iteration_)
vld_pred_g3 = np.round(np.clip(vld_pred_g3, 0, 10)).astype(int)
f1_g3 = metrics.f1_score(g3_lbl.astype(int), vld_pred_g3, average = 'macro')
print(f1_g3)

0.986656035463432


In [22]:
vld_pred_g4= model.predict(g4_dat, num_iteration=model.best_iteration_)
vld_pred_g4 = np.round(np.clip(vld_pred_g4, 0, 10)).astype(int)
f1_g4 = metrics.f1_score(g4_lbl.astype(int), vld_pred_g4, average = 'macro')
print(f1_g4)

0.8942446664286391


In [23]:
vld_pred_g5= model.predict(g5_dat, num_iteration=model.best_iteration_)
vld_pred_g5 = np.round(np.clip(vld_pred_g5, 0, 10)).astype(int)
f1_g5 = metrics.f1_score(g5_lbl.astype(int), vld_pred_g5, average = 'macro')
print(f1_g5)

0.9732927691543368


In [34]:
vld_pred_g6= model.predict(g6_dat, num_iteration=model.best_iteration_)
vld_pred_g6 = np.round(np.clip(vld_pred_g6, 0, 10)).astype(int)
f1_g6 = metrics.f1_score(g6_lbl.astype(int), vld_pred_g6, average = 'macro')
print(f1_g6)

0.6646888239380492


In [15]:
vld_pred_g7 = model.predict(g7_dat, num_iteration=model.best_iteration_)
vld_pred_g7 = np.round(np.clip(vld_pred_g7, 0, 10)).astype(int)
f1_g7 = metrics.f1_score(g7_lbl.astype(int), vld_pred_g7, average = 'macro')
print(f1_g7)

0.9729210756681197


In [16]:
vld_pred_g8 = model.predict(g8_dat, num_iteration=model.best_iteration_)
vld_pred_g8 = np.round(np.clip(vld_pred_g8, 0, 10)).astype(int)
f1_g8 = metrics.f1_score(g8_lbl.astype(int), vld_pred_g8, average = 'macro')
print(f1_g8)

0.9756024481983628


In [17]:
vld_pred_g9 = model.predict(g9_dat, num_iteration=model.best_iteration_)
vld_pred_g9 = np.round(np.clip(vld_pred_g9, 0, 10)).astype(int)
f1_g9 = metrics.f1_score(g9_lbl.astype(int), vld_pred_g9, average = 'macro')
print(f1_g9)

0.9147493146748186


In [47]:
pre_train[features].columns

Index(['signal', 'signal_clean', 'signal_clean', 'lag_t1', 'lag_t2', 'lag_t3',
       'lead_t1', 'lead_t2', 'lead_t3', 'signalmean_t1000',
       ...
       'ewm_mean_10', 'ewm_std_10', 'ewm_mean_50', 'ewm_std_50',
       'ewm_mean_100', 'ewm_std_100', 'ewm_mean_500', 'ewm_std_500',
       'ewm_mean_1000', 'ewm_std_1000'],
      dtype='object', length=209)

In [46]:
pre_train[features].columns[np.argsort(model.feature_importances_)]

Index(['abs_avgbatch_25000', 'signal_clean', 'abs_maxbatch_25000',
       'signal_clean', 'p25batch_25000', 'medianbatch_25000', 'p75batch_25000',
       'maxtominbatch_25000', 'rangebatch_25000', 'p90batch_25000',
       ...
       'highpass_lf_0.7943', 'highpass_ff_0.7943', 'grad_1', 'grad_3',
       'lag_t3', 'ewm_std_50', 'lead_t2', 'lead_t3', 'lead_t1', 'ewm_std_10'],
      dtype='object', length=209)