In [1]:
import os
import joblib
import math
import warnings
import gc
warnings.filterwarnings('ignore')
from tqdm import tqdm
import pickle

import bloscpack as bp

import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split, RepeatedStratifiedKFold
from sklearn import metrics

# Raise pandas' display limits so frames with many columns/rows are not truncated when shown.
pd.options.display.max_columns = 10000
pd.options.display.max_rows = 10000

In [2]:
def _load_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    # NOTE(review): pickle.load executes arbitrary code from the file — only use on trusted inputs.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Batch ids for the train and test sets, loaded in that order.
batch_id_trn = _load_pickle('../input/batch_ids_trn.pkl')
batch_id_tst = _load_pickle('../input/batch_ids_tst.pkl')

In [4]:
# Training features / labels stored as bloscpack arrays (the "w200" feature set).
dat_w200 = bp.unpack_ndarray_from_file('../input/feats_tblr/trn_dat_all_w200.bp')
lbl_w200 = bp.unpack_ndarray_from_file('../input/feats_tblr/trn_lbl_all_w200.bp')

# Training features / labels stored as pandas pickles (the "orig v2" feature set).
dat_orig = pd.read_pickle('../input/feats_tblr/trn_dat_orig_v2_all.pkl')
lbl_orig = pd.read_pickle('../input/feats_tblr/trn_lbl_orig_v2_all.pkl')

# Keep every column except the raw/meta ones listed here (and the target itself).
excluded_cols = ('time', 'signal', 'batch', 'open_channels')
kept_cols = [col for col in dat_orig.columns if col not in excluded_cols]
dat_orig = dat_orig.loc[:, kept_cols]

In [5]:
# Stack both feature sets column-wise into one training matrix; labels come from the w200 file.
dat = np.concatenate([dat_orig.values, dat_w200], axis=-1)
lbl = lbl_w200

# Release the intermediates that this notebook has definitely created by now.
del dat_w200, dat_orig, lbl_orig

# The test-set arrays are only loaded in a later cell, so an unconditional `del`
# raises NameError on a top-to-bottom run. Drop them only if they happen to exist
# (e.g. when the test-loading cell was executed first).
for _stale in ('tst_w200', 'tst_orig'):
    globals().pop(_stale, None)

In [8]:
# Build one stratification class per "<label>_<group>" pair.
# The group index is generated as ten consecutive runs of 500000 rows (0..9);
# zip() stops at the shorter input, so this assumes `lbl` has 5,000,000 rows — TODO confirm.
grp_of_row = np.repeat(np.arange(10, dtype='uint32'), 500000)
new_lbl = [
    str(chan) + '_' + str(grp)
    for chan, grp in zip(lbl.astype('uint32'), grp_of_row)
]

# Map each distinct "<label>_<group>" string to a dense integer id (sorted-string order).
unq_l = np.unique(new_lbl)
lbl_map = dict(zip(unq_l, np.arange(len(unq_l))))
new_lbl = [lbl_map[key] for key in new_lbl]

In [9]:
# 5-fold stratified splitter with shuffling; the fixed seed keeps fold membership repeatable.
kf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [38]:
# Free the arrays of a previously trained fold before slicing the next one.
# On a fresh kernel these names do not exist yet (they are first assigned in the
# fold-selection cell below), so a plain `del` would raise NameError; remove them
# only when present.
for _fold_var in ('x_trn', 'x_vld', 'y_trn', 'y_vld'):
    globals().pop(_fold_var, None)
gc.collect()

48

In [39]:
# Walk the splits (stratified on `new_lbl`) and materialise only fold 4.
# `fold` is left at 4 after the loop and is reused when naming the saved model.
for fold, (trn_ndcs, vld_ndcs) in enumerate(kf.split(dat, new_lbl)):
    if fold != 4:
        continue
    x_trn, y_trn = dat[trn_ndcs], lbl[trn_ndcs]
    x_vld, y_vld = dat[vld_ndcs], lbl[vld_ndcs]
    break

In [40]:
# LightGBM hyper-parameters forwarded to LGBMRegressor(**params).
params = dict(
    # core setup
    boosting="gbdt",
    metric='rmse',
    objective='regression',
    random_state=236,
    # tree shape and step size
    num_leaves=280,
    learning_rate=0.026623466966581126,
    max_depth=80,
    # regularisation
    reg_alpha=2.959759088169741,   # L1
    reg_lambda=1.331172832164913,  # L2
    # row / column subsampling
    bagging_fraction=0.9655406551472153,
    bagging_freq=9,
    colsample_bytree=0.6867118652742716,
)

In [41]:
# Regressor with a large tree budget; early stopping on the validation fold decides the real size.
model = lgb.LGBMRegressor(n_estimators=10000, n_jobs=12, **params)

# Evaluate RMSE on the held-out fold, log every 50 rounds, stop after 100 rounds without improvement.
# NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords depend on the installed
# LightGBM version (newer releases expect callbacks instead) — confirm before upgrading.
model.fit(
    X=x_trn,
    y=y_trn,
    eval_set=[(x_vld, y_vld)],
    eval_metric='rmse',
    verbose=50,
    early_stopping_rounds=100,
)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 0.708458
[100]	valid_0's rmse: 0.237957
[150]	valid_0's rmse: 0.162392
[200]	valid_0's rmse: 0.155546
[250]	valid_0's rmse: 0.154789
[300]	valid_0's rmse: 0.154534
[350]	valid_0's rmse: 0.154353
[400]	valid_0's rmse: 0.154238
[450]	valid_0's rmse: 0.154132
[500]	valid_0's rmse: 0.154063
[550]	valid_0's rmse: 0.153996
[600]	valid_0's rmse: 0.153939
[650]	valid_0's rmse: 0.153877
[700]	valid_0's rmse: 0.153824
[750]	valid_0's rmse: 0.153768
[800]	valid_0's rmse: 0.153726
[850]	valid_0's rmse: 0.153691
[900]	valid_0's rmse: 0.153649
[950]	valid_0's rmse: 0.153623
[1000]	valid_0's rmse: 0.153597
[1050]	valid_0's rmse: 0.153572
[1100]	valid_0's rmse: 0.153549
[1150]	valid_0's rmse: 0.153527
[1200]	valid_0's rmse: 0.153508
[1250]	valid_0's rmse: 0.153487
[1300]	valid_0's rmse: 0.153464
[1350]	valid_0's rmse: 0.15345
[1400]	valid_0's rmse: 0.153427
[1450]	valid_0's rmse: 0.153413
[1500]	valid_0's rmse: 0.153402

LGBMRegressor(bagging_fraction=0.9655406551472153, bagging_freq=9,
              boosting='gbdt', boosting_type='gbdt', class_weight=None,
              colsample_bytree=0.6867118652742716, importance_type='split',
              learning_rate=0.026623466966581126, max_depth=80, metric='rmse',
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=10000, n_jobs=12, num_leaves=280,
              objective='regression', random_state=236,
              reg_alpha=2.959759088169741, reg_lambda=1.331172832164913,
              silent=True, subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0)

In [42]:
# Persist the fitted model for the current fold (`fold` is set by the fold-selection loop).
# joblib.dump does not create missing parent directories, so make sure the target exists first.
os.makedirs('./saved_models', exist_ok=True)
joblib.dump(model, './saved_models/lgbm_feats_origv2_myw200_fld{:d}.pkl'.format(fold))

['./saved_models/lgbm_feats_origv2_myw200_fld4.pkl']

In [43]:
# Predict the validation fold at the best iteration, clamp to [0, 10], round to integer
# classes, then score with macro-averaged F1.
vld_pred = (
    np.clip(model.predict(x_vld, num_iteration=model.best_iteration_), 0, 10)
    .round()
    .astype(int)
)
f1 = metrics.f1_score(y_vld.astype(int), vld_pred, average='macro')
print(f1)

0.9386508905001126


In [10]:
# Macro-F1 on the g0 subset: predict, clamp to [0, 10], round to integer classes.
# NOTE(review): `g0_dat` / `g0_lbl` are not defined in the visible cells — confirm their origin.
vld_pred_g0 = (
    np.clip(model.predict(g0_dat, num_iteration=model.best_iteration_), 0, 10)
    .round()
    .astype(int)
)
f1_g0 = metrics.f1_score(g0_lbl.astype(int), vld_pred_g0, average='macro')
print(f1_g0)

0.9968903682469297


In [11]:
# Macro-F1 on the g1 subset: predict, clamp to [0, 10], round to integer classes.
# NOTE(review): `g1_dat` / `g1_lbl` are not defined in the visible cells — confirm their origin.
vld_pred_g1 = (
    np.clip(model.predict(g1_dat, num_iteration=model.best_iteration_), 0, 10)
    .round()
    .astype(int)
)
f1_g1 = metrics.f1_score(g1_lbl.astype(int), vld_pred_g1, average='macro')
print(f1_g1)

0.9968424553280641


In [12]:
# Macro-F1 on the g2 subset: predict, clamp to [0, 10], round to integer classes.
# NOTE(review): `g2_dat` / `g2_lbl` are not defined in the visible cells — confirm their origin.
vld_pred_g2 = (
    np.clip(model.predict(g2_dat, num_iteration=model.best_iteration_), 0, 10)
    .round()
    .astype(int)
)
f1_g2 = metrics.f1_score(g2_lbl.astype(int), vld_pred_g2, average='macro')
print(f1_g2)

0.6646538107371066


In [13]:
# Macro-F1 on the g3 subset: predict, clamp to [0, 10], round to integer classes.
# NOTE(review): `g3_dat` / `g3_lbl` are not defined in the visible cells — confirm their origin.
vld_pred_g3 = (
    np.clip(model.predict(g3_dat, num_iteration=model.best_iteration_), 0, 10)
    .round()
    .astype(int)
)
f1_g3 = metrics.f1_score(g3_lbl.astype(int), vld_pred_g3, average='macro')
print(f1_g3)

0.9868084651120028


In [14]:
# Macro-F1 on the g4 subset: predict, clamp to [0, 10], round to integer classes.
# NOTE(review): `g4_dat` / `g4_lbl` are not defined in the visible cells — confirm their origin.
vld_pred_g4 = (
    np.clip(model.predict(g4_dat, num_iteration=model.best_iteration_), 0, 10)
    .round()
    .astype(int)
)
f1_g4 = metrics.f1_score(g4_lbl.astype(int), vld_pred_g4, average='macro')
print(f1_g4)

0.8212875579648767


In [15]:
# Macro-F1 on the g5 subset: predict, clamp to [0, 10], round to integer classes.
# NOTE(review): `g5_dat` / `g5_lbl` are not defined in the visible cells — confirm their origin.
vld_pred_g5 = (
    np.clip(model.predict(g5_dat, num_iteration=model.best_iteration_), 0, 10)
    .round()
    .astype(int)
)
f1_g5 = metrics.f1_score(g5_lbl.astype(int), vld_pred_g5, average='macro')
print(f1_g5)

0.9736932581052553


In [16]:
# Macro-F1 on the g6 subset: predict, clamp to [0, 10], round to integer classes.
# NOTE(review): `g6_dat` / `g6_lbl` are not defined in the visible cells — confirm their origin.
vld_pred_g6 = (
    np.clip(model.predict(g6_dat, num_iteration=model.best_iteration_), 0, 10)
    .round()
    .astype(int)
)
f1_g6 = metrics.f1_score(g6_lbl.astype(int), vld_pred_g6, average='macro')
print(f1_g6)

0.9966063601403695


In [17]:
# Macro-F1 on the g7 subset: predict, clamp to [0, 10], round to integer classes.
# NOTE(review): `g7_dat` / `g7_lbl` are not defined in the visible cells — confirm their origin.
vld_pred_g7 = (
    np.clip(model.predict(g7_dat, num_iteration=model.best_iteration_), 0, 10)
    .round()
    .astype(int)
)
f1_g7 = metrics.f1_score(g7_lbl.astype(int), vld_pred_g7, average='macro')
print(f1_g7)

0.7780210018430778


In [18]:
# Macro-F1 on the g8 subset: predict, clamp to [0, 10], round to integer classes.
# NOTE(review): `g8_dat` / `g8_lbl` are not defined in the visible cells — confirm their origin.
vld_pred_g8 = (
    np.clip(model.predict(g8_dat, num_iteration=model.best_iteration_), 0, 10)
    .round()
    .astype(int)
)
f1_g8 = metrics.f1_score(g8_lbl.astype(int), vld_pred_g8, average='macro')
print(f1_g8)

0.9749084848835565


In [19]:
# Macro-F1 on the g9 subset: predict, clamp to [0, 10], round to integer classes.
# NOTE(review): `g9_dat` / `g9_lbl` are not defined in the visible cells — confirm their origin.
vld_pred_g9 = (
    np.clip(model.predict(g9_dat, num_iteration=model.best_iteration_), 0, 10)
    .round()
    .astype(int)
)
f1_g9 = metrics.f1_score(g9_lbl.astype(int), vld_pred_g9, average='macro')
print(f1_g9)

0.8977718030245324


In [3]:
# Test-set features from the same two sources used for training.
tst_w200 = bp.unpack_ndarray_from_file('../input/feats_tblr/tst_dat_all_w200.bp')
tst_orig = pd.read_pickle('../input/feats_tblr/tst_dat_orig_v2_all.pkl')

# Apply the same column exclusion as on the training frame.
excluded_cols = ('time', 'signal', 'batch', 'open_channels')
kept_cols = [col for col in tst_orig.columns if col not in excluded_cols]
tst_orig = tst_orig.loc[:, kept_cols]

# Column-wise stack, in the same order as the training matrix (orig first, then w200).
tst = np.concatenate([tst_orig.values, tst_w200], axis=-1)

In [4]:
# Load the submission template; `time` is read as text (presumably so it is written back
# unchanged) and `open_channels` as a nullable integer column.
submission = pd.read_csv(
    '../input/sample_submission.csv',
    dtype={'time': str, 'open_channels': 'Int64'},
)

In [7]:
# Sum the test predictions of the five saved fold models (averaging happens in a later step).
# NOTE(review): joblib.load unpickles the file — only load models you saved yourself.
predictions = None
for fold in range(5):
    mdl = joblib.load('./saved_models/lgbm_feats_origv2_myw200_fld{:d}.pkl'.format(fold))
    fold_pred = mdl.predict(tst, num_iteration=mdl.best_iteration_)
    predictions = fold_pred if predictions is None else predictions + fold_pred

In [8]:
# Average the summed predictions over the 5 folds, clamp to [0, 10], round to integer classes.
predictions_sub = np.clip(predictions / 5, 0, 10).round().astype(int)

In [9]:
# Replace the template's target column with the rounded, fold-averaged predictions.
submission['open_channels'] = predictions_sub

In [10]:
# Write the submission file without the index column.
# DataFrame.to_csv does not create missing parent directories, so ensure the folder exists first.
os.makedirs('../submissions', exist_ok=True)
submission.to_csv("../submissions/sub_lgbm_origv2_w200feats_cvbygroupandclass.csv", index=False)