In [1]:
import os
import joblib
import math
import warnings
import gc
warnings.filterwarnings('ignore')
from tqdm import tqdm
import pickle

import bloscpack as bp

import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split, RepeatedStratifiedKFold
from sklearn import metrics

pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)

In [2]:
with open('../input/batch_ids_trn.pkl', 'rb') as f:
    batch_id_trn = pickle.load(f)
with open('../input/batch_ids_tst.pkl', 'rb') as f:
    batch_id_tst = pickle.load(f)

In [3]:
dat_w500 = bp.unpack_ndarray_from_file('../input/feats_tblr/trn_dat_all_w500_fixed.bp')
lbl_w200 = bp.unpack_ndarray_from_file('../input/feats_tblr/trn_lbl_all_w200.bp')

dat_orig = pd.read_pickle('../input/feats_tblr/trn_dat_orig_v2_all.pkl')
lbl_orig = pd.read_pickle('../input/feats_tblr/trn_lbl_orig_v2_all.pkl')

dat_orig = dat_orig.loc[:, [c for c in dat_orig.columns if c not in ('time', 'batch', 'open_channels')]]

In [4]:
lbl = lbl_w200
dat = np.concatenate([dat_orig, dat_w500], axis=1)
del dat_orig, dat_w500

In [5]:
new_lbl = [str(a) + '_' + str(b) for a, b in zip(lbl.astype('uint32'), np.concatenate([np.ones(500000).astype('uint32') * i for i in range(10)]))]
unq_l = np.unique(new_lbl)
lbl_map = {str_l: i for str_l, i in zip(unq_l, np.arange(len(unq_l)))}
new_lbl = [lbl_map[s] for s in new_lbl]

In [6]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [32]:
del x_trn, x_vld, y_trn, y_vld
gc.collect()

70

In [33]:
for fold, (trn_ndcs, vld_ndcs) in enumerate(kf.split(dat, lbl)):
    if fold == 4:
        x_trn, x_vld = dat[trn_ndcs], dat[vld_ndcs]
        y_trn, y_vld = lbl[trn_ndcs], lbl[vld_ndcs]
        break
    #trn_set = lgb.Dataset(x_trn, y_trn)
    #vld_set = lgb.Dataset(x_vld, y_vld)

In [34]:
params = {
    "boosting": "gbdt",
    "metric": 'rmse',
    'objective': 'regression',
    'random_state': 236,
    'num_leaves': 280,
    'learning_rate': 0.026623466966581126,
    'max_depth': 80,
    'reg_alpha': 2.959759088169741, # L1
    'reg_lambda': 1.331172832164913, # L2
    "bagging_fraction": 0.9655406551472153,
    "bagging_freq": 9,
    'colsample_bytree': 0.6867118652742716
}

In [35]:
# model = lgb.train(params, trn_set, num_boost_round=10000, early_stopping_rounds=100, valid_sets=[vld_set], verbose_eval=50)
model = lgb.LGBMRegressor(**params, n_estimators=10000, n_jobs=14)
model.fit(X=x_trn, y=y_trn, eval_set=[(x_vld, y_vld)], eval_metric='rmse', verbose=50, early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 0.708414
[100]	valid_0's rmse: 0.237627
[150]	valid_0's rmse: 0.161821
[200]	valid_0's rmse: 0.155014
[250]	valid_0's rmse: 0.154253
[300]	valid_0's rmse: 0.153986
[350]	valid_0's rmse: 0.153813
[400]	valid_0's rmse: 0.153685
[450]	valid_0's rmse: 0.153571
[500]	valid_0's rmse: 0.153488
[550]	valid_0's rmse: 0.153422
[600]	valid_0's rmse: 0.153365
[650]	valid_0's rmse: 0.153315
[700]	valid_0's rmse: 0.153277
[750]	valid_0's rmse: 0.153233
[800]	valid_0's rmse: 0.153201
[850]	valid_0's rmse: 0.153171
[900]	valid_0's rmse: 0.153144
[950]	valid_0's rmse: 0.153113
[1000]	valid_0's rmse: 0.153092
[1050]	valid_0's rmse: 0.153076
[1100]	valid_0's rmse: 0.153054
[1150]	valid_0's rmse: 0.153036
[1200]	valid_0's rmse: 0.153012
[1250]	valid_0's rmse: 0.152996
[1300]	valid_0's rmse: 0.152979
[1350]	valid_0's rmse: 0.152967
[1400]	valid_0's rmse: 0.152953
[1450]	valid_0's rmse: 0.152939
[1500]	valid_0's rmse: 0.15293

LGBMRegressor(bagging_fraction=0.9655406551472153, bagging_freq=9,
              boosting='gbdt', boosting_type='gbdt', class_weight=None,
              colsample_bytree=0.6867118652742716, importance_type='split',
              learning_rate=0.026623466966581126, max_depth=80, metric='rmse',
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=10000, n_jobs=14, num_leaves=280,
              objective='regression', random_state=236,
              reg_alpha=2.959759088169741, reg_lambda=1.331172832164913,
              silent=True, subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0)

In [36]:
joblib.dump(model, './saved_models/lgbm_feats_origv2_myw500_fld{:d}.pkl'.format(fold))

['./saved_models/lgbm_feats_origv2_myw500_fld4.pkl']

In [37]:
vld_pred = model.predict(x_vld, num_iteration=model.best_iteration_)
vld_pred = np.round(np.clip(vld_pred, 0, 10)).astype(int)
f1 = metrics.f1_score(y_vld.astype(int), vld_pred, average = 'macro')
print(f1)

0.9394773018394715


In [14]:
del x_trn, x_vld, y_trn, y_vld

In [17]:
# g0_dat, g0_lbl = dat[batch_id_trn[0]], lbl[batch_id_trn[0]]
del g0_dat, g0_lbl

In [16]:
vld_pred_g0= model.predict(g0_dat, num_iteration=model.best_iteration_)
vld_pred_g0 = np.round(np.clip(vld_pred_g0, 0, 10)).astype(int)
f1_g0 = metrics.f1_score(g0_lbl.astype(int), vld_pred_g0, average = 'macro')
print(f1_g0)

0.6651485865682973


In [20]:
# g1_dat, g1_lbl = dat[batch_id_trn[1]], lbl[batch_id_trn[1]]
del g1_dat, g1_lbl

In [19]:
vld_pred_g1= model.predict(g1_dat, num_iteration=model.best_iteration_)
vld_pred_g1 = np.round(np.clip(vld_pred_g1, 0, 10)).astype(int)
f1_g1 = metrics.f1_score(g1_lbl.astype(int), vld_pred_g1, average = 'macro')
print(f1_g1)

0.9970837314032912


In [23]:
# g2_dat, g2_lbl = dat[batch_id_trn[2]], lbl[batch_id_trn[2]]
del g2_dat, g2_lbl

In [22]:
vld_pred_g2= model.predict(g2_dat, num_iteration=model.best_iteration_)
vld_pred_g2 = np.round(np.clip(vld_pred_g2, 0, 10)).astype(int)
f1_g2 = metrics.f1_score(g2_lbl.astype(int), vld_pred_g2, average = 'macro')
print(f1_g2)

0.6647289234525314


In [26]:
# g3_dat, g3_lbl = dat[batch_id_trn[3]], lbl[batch_id_trn[3]]
del g3_dat, g3_lbl

In [25]:
vld_pred_g3= model.predict(g3_dat, num_iteration=model.best_iteration_)
vld_pred_g3 = np.round(np.clip(vld_pred_g3, 0, 10)).astype(int)
f1_g3 = metrics.f1_score(g3_lbl.astype(int), vld_pred_g3, average = 'macro')
print(f1_g3)

0.7901239867368004


In [29]:
# g4_dat, g4_lbl = dat[batch_id_trn[4]], lbl[batch_id_trn[4]]
del g4_dat, g4_lbl

In [28]:
vld_pred_g4= model.predict(g4_dat, num_iteration=model.best_iteration_)
vld_pred_g4 = np.round(np.clip(vld_pred_g4, 0, 10)).astype(int)
f1_g4 = metrics.f1_score(g4_lbl.astype(int), vld_pred_g4, average = 'macro')
print(f1_g4)

0.8373076786675838


In [32]:
# g5_dat, g5_lbl = dat[batch_id_trn[5]], lbl[batch_id_trn[5]]
del g5_dat, g5_lbl

In [31]:
vld_pred_g5= model.predict(g5_dat, num_iteration=model.best_iteration_)
vld_pred_g5 = np.round(np.clip(vld_pred_g5, 0, 10)).astype(int)
f1_g5 = metrics.f1_score(g5_lbl.astype(int), vld_pred_g5, average = 'macro')
print(f1_g5)

0.9749530604325839


In [35]:
# g6_dat, g6_lbl = dat[batch_id_trn[6]], lbl[batch_id_trn[6]]
del g6_dat, g6_lbl

In [34]:
vld_pred_g6= model.predict(g6_dat, num_iteration=model.best_iteration_)
vld_pred_g6 = np.round(np.clip(vld_pred_g6, 0, 10)).astype(int)
f1_g6 = metrics.f1_score(g6_lbl.astype(int), vld_pred_g6, average = 'macro')
print(f1_g6)

0.6646614145124898


In [38]:
# g7_dat, g7_lbl = dat[batch_id_trn[7]], lbl[batch_id_trn[7]]
del g7_dat, g7_lbl

In [37]:
vld_pred_g7 = model.predict(g7_dat, num_iteration=model.best_iteration_)
vld_pred_g7 = np.round(np.clip(vld_pred_g7, 0, 10)).astype(int)
f1_g7 = metrics.f1_score(g7_lbl.astype(int), vld_pred_g7, average = 'macro')
print(f1_g7)

0.779383838259999


In [41]:
# g8_dat, g8_lbl = dat[batch_id_trn[8]], lbl[batch_id_trn[8]]
del g8_dat, g8_lbl

In [40]:
vld_pred_g8 = model.predict(g8_dat, num_iteration=model.best_iteration_)
vld_pred_g8 = np.round(np.clip(vld_pred_g8, 0, 10)).astype(int)
f1_g8 = metrics.f1_score(g8_lbl.astype(int), vld_pred_g8, average = 'macro')
print(f1_g8)

0.9750107890468044


In [42]:
g9_dat, g9_lbl = dat[batch_id_trn[9]], lbl[batch_id_trn[9]]
# del g9_dat, g9_lbl

In [43]:
vld_pred_g9 = model.predict(g9_dat, num_iteration=model.best_iteration_)
vld_pred_g9 = np.round(np.clip(vld_pred_g9, 0, 10)).astype(int)
f1_g9 = metrics.f1_score(g9_lbl.astype(int), vld_pred_g9, average = 'macro')
print(f1_g9)

0.9159880733912752


### inference for submission

In [6]:
tst_fs_w500

['tst_dat_all_w500.bp']

In [7]:
# tst_fs_w500 = sorted([f for f in os.listdir('../input/feats_tblr') if ('tst_dat' in f) and ('w500' in f)])
# tst_fs_w500 = [tst_fs_w500[i] for i in [0, 11, 12, 13, 14, 15, 16, 17, 18, 19]] + tst_fs_w500[1:11]
# tst_fs_w500 = np.concatenate(
#     [
#         bp.unpack_ndarray_from_file(os.path.join('../input/feats_tblr', f)) for f in tst_fs_w500
#     ],
#     0
# )

tst_fs_w500 = bp.unpack_ndarray_from_file(os.path.join('../input/feats_tblr/', 'tst_dat_all_w500.bp'))

tst_dat_orig = pd.read_pickle(os.path.join('../input/feats_tblr', 'tst_dat_orig_v2_all.pkl'))
tst_dat_orig = tst_dat_orig.loc[:, [c for c in tst_dat_orig.columns if c not in ('time', 'batch', 'open_channels')]]

tst_dat = np.concatenate([tst_dat_orig, tst_fs_w500], axis=1)

In [8]:
submission = pd.read_csv('../input/sample_submission.csv', dtype={'time': str, 'open_channels': 'Int64'})

In [9]:
for fld in range(5):
    mdl = joblib.load('./saved_models/lgbm_feats_origv2_myw500_fld{:d}.pkl'.format(fld))
    if fld == 0:
        predictions = mdl.predict(tst_dat, num_iteration=mdl.best_iteration_)
    else:
        predictions += mdl.predict(tst_dat, num_iteration=mdl.best_iteration_)

In [11]:
predictions.max()

50.47457739798004

In [12]:
predictions_sub = np.round(np.clip(predictions / 5, 0, 10)).astype(int)

In [13]:
submission['open_channels'] = predictions_sub

In [14]:
submission.to_csv("../submissions/sub_lgbm_origv2_w500feats_cvbygroupandclass.csv", index=False)