In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gc; gc.enable()
import pickle
from tqdm import tqdm
import warnings ; warnings.filterwarnings('ignore')
import os
from GridSearcher import data_loader, model_loader, fit_params, get_oof_predictions

In [3]:
folder = 'final oofs 519/'
os.listdir(folder)

['baseline_xgb_519_oof_test_pred.csv',
 'baseline_xgb_519_oof_val_pred.csv',
 'cgb519_with_categorical_oof_test_pred.csv',
 'cgb519_with_categorical_oof_val_pred.csv',
 'lgb519_goss_append_keypoint_oof_test_pred.csv',
 'lgb519_goss_append_keypoint_oof_val_pred.csv',
 'rf_519_oof_test_pred.csv',
 'rf_519_oof_val_pred.csv',
 'ridge_new_519_oof_test_pred.csv',
 'ridge_new_519_oof_val_pred.csv',
 'selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_oof_test_pred.csv',
 'selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_oof_val_pred.csv',
 'xentropy_add_lotsof_image_features_lgb519_oof_test_pred.csv',
 'xentropy_add_lotsof_image_features_lgb519_oof_val_pred.csv']

In [4]:
prefixs = [
    'xentropy_add_lotsof_image_features_lgb519',
    'baseline_xgb_519',
    'ridge_new_519',
    'rf_519',
    'rf_519_7000_leaves',
    'lgb519_goss_append_keypoint',
    'cgb519_with_categorical',
    'selftrained_bigru_conv1d_merged_with_image_adv_519_rnn'
]

In [5]:
train = pd.DataFrame()
test = pd.DataFrame()

In [6]:
from scipy.stats import hmean
from scipy.stats.mstats import gmean

def get_clipped_values(a):
    return np.clip(a, 1e-15, 1.)

for prefix in prefixs:
    train_f = folder + prefix + '_oof_val_pred.csv'
    test_f = folder + prefix + '_oof_test_pred.csv'

    train_df = pd.read_csv(train_f)
    test_df = pd.read_csv(test_f)

    original_col = train_df.columns.tolist()[0]
    col = prefix
    print('Add ', col)
    
    train.loc[:,col] = train_df[original_col]
    test.loc[:,col] = test_df[original_col]

    del train_df, test_df; gc.collect()

for df in [train, test]:    
    # within sample group
    df.loc[:, 'all_mean'] = df[prefixs].mean(axis=1)
    df.loc[:, 'all_med'] = df[prefixs].median(axis=1)
    df.loc[:, 'all_max'] = df[prefixs].max(axis=1)
    df.loc[:, 'all_min'] = df[prefixs].min(axis=1)
    df.loc[:, 'all_std'] = df[prefixs].std(axis=1)

    col_len = len(prefixs)
    for i in range(col_len-1):
        for j in range(i+1, col_len):
            cols = [prefixs[i], prefixs[j]]
            feat_name = cols[0]+'_'+cols[1]+'_inter'
            print('Add ', feat_name, ' statistcs')        
            df.loc[:, feat_name+'_mean'] = df[cols].mean(axis=1)
            df.loc[:, feat_name+'_gmean'] = gmean(get_clipped_values(df[cols].values), axis=1)
            df.loc[:, feat_name+'_hmean'] = hmean(get_clipped_values(df[cols].values), axis=1)
    
    col_len = len(prefixs)
    for i in range(col_len-2):
        for j in range(i+1, col_len-1):
            for k in range(j+1, col_len):
                cols = [prefixs[i], prefixs[j], prefixs[k]]
                feat_name = cols[0]+'_'+cols[1]+'_'+cols[2]+'_inter'  
                print('Add ', feat_name, ' statistcs')                    
                df.loc[:, feat_name+'_mean'] = df[cols].mean(axis=1)
                df.loc[:, feat_name+'_gmean'] = gmean(get_clipped_values(df[cols].values), axis=1)
                df.loc[:, feat_name+'_hmean'] = hmean(get_clipped_values(df[cols].values), axis=1)
                df.loc[:, feat_name+'_med'] = df[cols].median(axis=1)
                df.loc[:, feat_name+'_std'] = df[cols].std(axis=1)                        
    #df = df.astype(np.float32)

Add  xentropy_add_lotsof_image_features_lgb519
Add  baseline_xgb_519
Add  ridge_new_519
Add  rf_519
Add  lgb519_goss_append_keypoint
Add  cgb519_with_categorical
Add  selftrained_bigru_conv1d_merged_with_image_adv_519_rnn
Add  xentropy_add_lotsof_image_features_lgb519_baseline_xgb_519_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb519_ridge_new_519_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb519_rf_519_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb519_lgb519_goss_append_keypoint_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb519_cgb519_with_categorical_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb519_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter  statistcs
Add  baseline_xgb_519_ridge_new_519_inter  statistcs
Add  baseline_xgb_519_rf_519_inter  statistcs
Add  baseline_xgb_519_lgb519_goss_append_keypoint_inter  statistcs
Add  baseline_xgb_519_cgb519_with_categorical_inter  statistcs
Add  baseline_xgb_51

Add  xentropy_add_lotsof_image_features_lgb519_rf_519_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb519_lgb519_goss_append_keypoint_cgb519_with_categorical_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb519_lgb519_goss_append_keypoint_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb519_cgb519_with_categorical_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter  statistcs
Add  baseline_xgb_519_ridge_new_519_rf_519_inter  statistcs
Add  baseline_xgb_519_ridge_new_519_lgb519_goss_append_keypoint_inter  statistcs
Add  baseline_xgb_519_ridge_new_519_cgb519_with_categorical_inter  statistcs
Add  baseline_xgb_519_ridge_new_519_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter  statistcs
Add  baseline_xgb_519_rf_519_lgb519_goss_append_keypoint_inter  statistcs
Add  baseline_xgb_519_rf_519_cgb519_with_categorical_inter  statistc

In [7]:
train.head(3)

Unnamed: 0,xentropy_add_lotsof_image_features_lgb519,baseline_xgb_519,ridge_new_519,rf_519,lgb519_goss_append_keypoint,cgb519_with_categorical,selftrained_bigru_conv1d_merged_with_image_adv_519_rnn,all_mean,all_med,all_max,...,rf_519_cgb519_with_categorical_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter_mean,rf_519_cgb519_with_categorical_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter_gmean,rf_519_cgb519_with_categorical_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter_hmean,rf_519_cgb519_with_categorical_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter_med,rf_519_cgb519_with_categorical_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter_std,lgb519_goss_append_keypoint_cgb519_with_categorical_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter_mean,lgb519_goss_append_keypoint_cgb519_with_categorical_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter_gmean,lgb519_goss_append_keypoint_cgb519_with_categorical_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter_hmean,lgb519_goss_append_keypoint_cgb519_with_categorical_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter_med,lgb519_goss_append_keypoint_cgb519_with_categorical_selftrained_bigru_conv1d_merged_with_image_adv_519_rnn_inter_std
0,0.074102,0.051428,0.07122,0.057937,0.063916,0.084068,0.035421,0.062585,0.063916,0.084068,...,0.059142,0.055669,0.052277,0.057937,0.024345,0.061135,0.057522,0.05379,0.063916,0.024442
1,0.050559,0.03965,0.049997,0.024884,0.050641,0.056013,0.035403,0.043878,0.049997,0.056013,...,0.038767,0.036679,0.034769,0.035403,0.015835,0.047353,0.046481,0.045561,0.050641,0.010691
2,0.057541,0.046559,0.057304,0.050777,0.040465,0.051884,0.070741,0.05361,0.051884,0.070741,...,0.057801,0.05712,0.056493,0.051884,0.011221,0.054363,0.052958,0.051615,0.051884,0.01529


In [8]:
pd.options.display.max_rows=100
pd.options.display.max_columns=100
train[prefixs].corr()

Unnamed: 0,xentropy_add_lotsof_image_features_lgb519,baseline_xgb_519,ridge_new_519,rf_519,lgb519_goss_append_keypoint,cgb519_with_categorical,selftrained_bigru_conv1d_merged_with_image_adv_519_rnn
xentropy_add_lotsof_image_features_lgb519,1.0,0.978546,0.872219,0.921237,0.988949,0.969467,0.917382
baseline_xgb_519,0.978546,1.0,0.863354,0.917336,0.973241,0.958279,0.907119
ridge_new_519,0.872219,0.863354,1.0,0.944246,0.873256,0.881002,0.840756
rf_519,0.921237,0.917336,0.944246,1.0,0.922042,0.929737,0.880811
lgb519_goss_append_keypoint,0.988949,0.973241,0.873256,0.922042,1.0,0.975097,0.914658
cgb519_with_categorical,0.969467,0.958279,0.881002,0.929737,0.975097,1.0,0.907761
selftrained_bigru_conv1d_merged_with_image_adv_519_rnn,0.917382,0.907119,0.840756,0.880811,0.914658,0.907761,1.0


In [9]:
train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 250 columns):
xentropy_add_lotsof_image_features_lgb519                                                                                                   float64
baseline_xgb_519                                                                                                                            float64
ridge_new_519                                                                                                                               float64
rf_519                                                                                                                                      float64
lgb519_goss_append_keypoint                                                                                                                 float64
cgb519_with_categorical                                                                                                                     float6

In [10]:
train.shape, test.shape

((1503424, 250), (508438, 250))

In [11]:
print(train.isnull().sum().max()) 
print(test.isnull().sum().max())

0
0


In [12]:
for col in tqdm(train.columns):
    train[col] = train[col].astype(np.float32)
    test[col] = test[col].astype(np.float32)

100%|████████████████████████████████████████████████████████████████████████████| 250/250 [03:30<00:00,  1.19it/s]


In [13]:
train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 250 columns):
xentropy_add_lotsof_image_features_lgb519                                                                                                   float32
baseline_xgb_519                                                                                                                            float32
ridge_new_519                                                                                                                               float32
rf_519                                                                                                                                      float32
lgb519_goss_append_keypoint                                                                                                                 float32
cgb519_with_categorical                                                                                                                     float3

In [14]:
with open('meta_train_519.pickle', 'wb') as handle:
    pickle.dump(train, handle)
    
with open('meta_test_519.pickle', 'wb') as handle:
    pickle.dump(test, handle)

## Meta Model Tuning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gc; gc.enable()
import pickle
from tqdm import tqdm
import warnings ; warnings.filterwarnings('ignore')
import os
from GridSearcher import data_loader, model_loader, fit_params, get_oof_predictions, clip_rmse

In [2]:
SEED=519
train_y = pd.read_csv("regression_target.csv").deal_probability.values

In [3]:
with open('meta_train_519.pickle', 'rb') as handle:
    train = pickle.load(handle)
    
with open('meta_test_519.pickle', 'rb') as handle:
    test = pickle.load(handle)

In [4]:
train.shape

(1503424, 250)

### Lightgbm-gbdt

In [4]:
ml = model_loader(model_type='lgb')

In [5]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.0, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 4
}

fit_param = None

try_params = {
    'min_split_gain': [.0, .1, .2, .3, .4]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'min_split_gain': 0.0} train loss: 0.210645, valid loss:0.211457, loss_diff:0.000812
{'min_split_gain': 0.0} train loss: 0.210498, valid loss:0.212041, loss_diff:0.001543
{'min_split_gain': 0.0} train loss: 0.210793, valid loss:0.210877, loss_diff:0.000084
{'min_split_gain': 0.0} train loss: 0.210728, valid loss:0.211135, loss_diff:0.000406
{'min_split_gain': 0.0} train loss: 0.210518, valid loss:0.211959, loss_diff:0.001441
{'min_split_gain': 0.1} train loss: 0.210725, valid loss:0.211458, loss_diff:0.000733
{'min_split_gain': 0.1} train loss: 0.210554, valid loss:0.212049, loss_diff:0.001496
{'min_split_gain': 0.1} train loss: 0.210843, valid loss:0.210883, loss_diff:0.000040
{'min_split_gain': 0.1} train loss: 0.210776, valid loss:0.211141, loss_diff:0.000366
{'min_split_gain': 0.1} train loss: 0.210569, valid loss:0.211961, loss_diff:0.001392
{'min_split_gain': 0.2} train loss: 0.210953, valid loss:0.211434, loss_diff:0.000481
{'min_split_gain': 0.2} train loss: 0.210808, valid lo

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'min_split_gain': 0.0},0.211494,0.000453
1,{'min_split_gain': 0.1},0.211499,0.000453
2,{'min_split_gain': 0.2},0.211482,0.000461
3,{'min_split_gain': 0.3},0.211482,0.000463
4,{'min_split_gain': 0.4},0.211479,0.00046


In [1]:
'''
best 
{'min_split_gain': 0.1} train loss: 0.208348, valid loss:0.209236, loss_diff:0.000887
{'min_split_gain': 0.1} train loss: 0.208166, valid loss:0.209885, loss_diff:0.001718
{'min_split_gain': 0.1} train loss: 0.208426, valid loss:0.208832, loss_diff:0.000405
{'min_split_gain': 0.1} train loss: 0.208379, valid loss:0.208949, loss_diff:0.000570
{'min_split_gain': 0.1} train loss: 0.208165, valid loss:0.209724, loss_diff:0.001559
=================>{'min_split_gain': 0.1} loss:0.209325
'''



In [6]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.4, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 4
}

fit_param = None

try_params = {
    'colsample_bytree':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'colsample_bytree': 0.6} train loss: 0.211111, valid loss:0.211442, loss_diff:0.000330
{'colsample_bytree': 0.6} train loss: 0.210956, valid loss:0.212038, loss_diff:0.001082
{'colsample_bytree': 0.6} train loss: 0.211254, valid loss:0.210841, loss_diff:-0.000413
{'colsample_bytree': 0.6} train loss: 0.211184, valid loss:0.211114, loss_diff:-0.000071
{'colsample_bytree': 0.6} train loss: 0.210972, valid loss:0.211979, loss_diff:0.001007
{'colsample_bytree': 0.7} train loss: 0.211096, valid loss:0.211449, loss_diff:0.000353
{'colsample_bytree': 0.7} train loss: 0.210942, valid loss:0.212042, loss_diff:0.001100
{'colsample_bytree': 0.7} train loss: 0.211245, valid loss:0.210844, loss_diff:-0.000401
{'colsample_bytree': 0.7} train loss: 0.211173, valid loss:0.211126, loss_diff:-0.000047
{'colsample_bytree': 0.7} train loss: 0.210961, valid loss:0.211970, loss_diff:0.001009
{'colsample_bytree': 0.8} train loss: 0.211091, valid loss:0.211445, loss_diff:0.000353
{'colsample_bytree': 0.8} tr

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'colsample_bytree': 0.6},0.211483,0.00047
1,{'colsample_bytree': 0.7},0.211486,0.000466
2,{'colsample_bytree': 0.8},0.21148,0.000465
3,{'colsample_bytree': 0.9},0.211478,0.000461
4,{'colsample_bytree': 1.0},0.211479,0.00046


In [7]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.4, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':.9, 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 4
}


fit_param = None

try_params = {
    'subsample':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'subsample': 0.6} train loss: 0.211087, valid loss:0.211456, loss_diff:0.000369
{'subsample': 0.6} train loss: 0.210944, valid loss:0.212045, loss_diff:0.001101
{'subsample': 0.6} train loss: 0.211238, valid loss:0.210853, loss_diff:-0.000385
{'subsample': 0.6} train loss: 0.211180, valid loss:0.211149, loss_diff:-0.000031
{'subsample': 0.6} train loss: 0.210966, valid loss:0.211961, loss_diff:0.000995
{'subsample': 0.7} train loss: 0.211094, valid loss:0.211445, loss_diff:0.000351
{'subsample': 0.7} train loss: 0.210953, valid loss:0.212032, loss_diff:0.001079
{'subsample': 0.7} train loss: 0.211236, valid loss:0.210854, loss_diff:-0.000383
{'subsample': 0.7} train loss: 0.211172, valid loss:0.211139, loss_diff:-0.000033
{'subsample': 0.7} train loss: 0.210972, valid loss:0.211964, loss_diff:0.000993
{'subsample': 0.8} train loss: 0.211099, valid loss:0.211440, loss_diff:0.000341
{'subsample': 0.8} train loss: 0.210938, valid loss:0.212041, loss_diff:0.001103
{'subsample': 0.8} train

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'subsample': 0.6},0.211493,0.000459
1,{'subsample': 0.7},0.211487,0.000458
2,{'subsample': 0.8},0.211482,0.000465
3,{'subsample': 0.9},0.211486,0.000467
4,{'subsample': 1.0},0.211478,0.000461


In [8]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.4, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':.9, 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 4
}

fit_param = None

try_params = {
    'reg_alpha':[1.0, 1.5, 2.0, 2.5, 3.0]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_alpha': 1.0} train loss: 0.211058, valid loss:0.211447, loss_diff:0.000390
{'reg_alpha': 1.0} train loss: 0.210912, valid loss:0.212036, loss_diff:0.001124
{'reg_alpha': 1.0} train loss: 0.211200, valid loss:0.210845, loss_diff:-0.000355
{'reg_alpha': 1.0} train loss: 0.211122, valid loss:0.211134, loss_diff:0.000013
{'reg_alpha': 1.0} train loss: 0.210939, valid loss:0.211959, loss_diff:0.001020
{'reg_alpha': 1.5} train loss: 0.211077, valid loss:0.211430, loss_diff:0.000353
{'reg_alpha': 1.5} train loss: 0.210921, valid loss:0.212034, loss_diff:0.001114
{'reg_alpha': 1.5} train loss: 0.211218, valid loss:0.210847, loss_diff:-0.000371
{'reg_alpha': 1.5} train loss: 0.211143, valid loss:0.211126, loss_diff:-0.000017
{'reg_alpha': 1.5} train loss: 0.210939, valid loss:0.211960, loss_diff:0.001021
{'reg_alpha': 2.0} train loss: 0.211086, valid loss:0.211436, loss_diff:0.000351
{'reg_alpha': 2.0} train loss: 0.210929, valid loss:0.212030, loss_diff:0.001101
{'reg_alpha': 2.0} train 

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_alpha': 1.0},0.211484,0.000461
1,{'reg_alpha': 1.5},0.21148,0.000462
2,{'reg_alpha': 2.0},0.211478,0.000461
3,{'reg_alpha': 2.5},0.211481,0.000462
4,{'reg_alpha': 3.0},0.211479,0.000466


In [9]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.1, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_alpha':[3.0, 3.5, 4.0, 4.5]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_alpha': 3.0} train loss: 0.208879, valid loss:0.209649, loss_diff:0.000770
{'reg_alpha': 3.0} train loss: 0.208732, valid loss:0.210378, loss_diff:0.001646
{'reg_alpha': 3.0} train loss: 0.208991, valid loss:0.209223, loss_diff:0.000232
{'reg_alpha': 3.0} train loss: 0.208932, valid loss:0.209408, loss_diff:0.000476
{'reg_alpha': 3.0} train loss: 0.208730, valid loss:0.210266, loss_diff:0.001537
{'reg_alpha': 3.5} train loss: 0.208910, valid loss:0.209681, loss_diff:0.000772
{'reg_alpha': 3.5} train loss: 0.208761, valid loss:0.210416, loss_diff:0.001655
{'reg_alpha': 3.5} train loss: 0.209030, valid loss:0.209252, loss_diff:0.000222
{'reg_alpha': 3.5} train loss: 0.208949, valid loss:0.209423, loss_diff:0.000474
{'reg_alpha': 3.5} train loss: 0.208732, valid loss:0.210279, loss_diff:0.001547
{'reg_alpha': 4.0} train loss: 0.208930, valid loss:0.209668, loss_diff:0.000738
{'reg_alpha': 4.0} train loss: 0.208730, valid loss:0.210366, loss_diff:0.001636
{'reg_alpha': 4.0} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_alpha': 3.0},0.209785,0.00046
1,{'reg_alpha': 3.5},0.20981,0.000462
2,{'reg_alpha': 4.0},0.209782,0.000457
3,{'reg_alpha': 4.5},0.209796,0.000456


In [9]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.4, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':.9, 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 4
}
fit_param = None

try_params = {
    'reg_lambda':[i/10.0 for i in range(0,11,1)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_lambda': 0.0} train loss: 0.211086, valid loss:0.211436, loss_diff:0.000351
{'reg_lambda': 0.0} train loss: 0.210929, valid loss:0.212030, loss_diff:0.001101
{'reg_lambda': 0.0} train loss: 0.211240, valid loss:0.210844, loss_diff:-0.000396
{'reg_lambda': 0.0} train loss: 0.211162, valid loss:0.211123, loss_diff:-0.000039
{'reg_lambda': 0.0} train loss: 0.210954, valid loss:0.211955, loss_diff:0.001001
{'reg_lambda': 0.1} train loss: 0.211094, valid loss:0.211440, loss_diff:0.000346
{'reg_lambda': 0.1} train loss: 0.210930, valid loss:0.212030, loss_diff:0.001101
{'reg_lambda': 0.1} train loss: 0.211229, valid loss:0.210841, loss_diff:-0.000388
{'reg_lambda': 0.1} train loss: 0.211162, valid loss:0.211123, loss_diff:-0.000038
{'reg_lambda': 0.1} train loss: 0.210951, valid loss:0.211955, loss_diff:0.001005
{'reg_lambda': 0.2} train loss: 0.211087, valid loss:0.211435, loss_diff:0.000347
{'reg_lambda': 0.2} train loss: 0.210930, valid loss:0.212030, loss_diff:0.001101
{'reg_lambda

KeyboardInterrupt: 

In [None]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.02, 
    'n_estimators':3000, 
    'min_split_gain':0.1, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':1.5, 
    'reg_lambda':0.7, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = {
    'early_stopping_rounds': 50,
    'verbose': 100,
    'eval_metric': 'rmse'
}

_, ret_test, _ = get_oof_predictions(train, train_y, test, ml, 
                                     default_params, seed=19, fit_params=fit_param, use_eval_set=True)

In [None]:
test_df = pd.read_csv("test.csv", usecols=['item_id'])
pd.DataFrame(np.clip(ret_test,0,1), 
             index=test_df.item_id,
             columns=['deal_probability']).to_csv('lgb_meta_no_bagging_exclude_knn.csv')

### Lightgbm-dart

In [6]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.0, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'min_split_gain': [.0, .1, .2, .3, .4]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'min_split_gain': 0.0} train loss: 0.213927, valid loss:0.214264, loss_diff:0.000337
{'min_split_gain': 0.0} train loss: 0.213748, valid loss:0.215104, loss_diff:0.001356
{'min_split_gain': 0.0} train loss: 0.214077, valid loss:0.213627, loss_diff:-0.000450
{'min_split_gain': 0.0} train loss: 0.214023, valid loss:0.213778, loss_diff:-0.000245
{'min_split_gain': 0.0} train loss: 0.213804, valid loss:0.214721, loss_diff:0.000917
{'min_split_gain': 0.1} train loss: 0.213920, valid loss:0.214252, loss_diff:0.000331
{'min_split_gain': 0.1} train loss: 0.213755, valid loss:0.215107, loss_diff:0.001352
{'min_split_gain': 0.1} train loss: 0.214055, valid loss:0.213606, loss_diff:-0.000449
{'min_split_gain': 0.1} train loss: 0.214021, valid loss:0.213776, loss_diff:-0.000245
{'min_split_gain': 0.1} train loss: 0.213790, valid loss:0.214729, loss_diff:0.000938
{'min_split_gain': 0.2} train loss: 0.213925, valid loss:0.214274, loss_diff:0.000349
{'min_split_gain': 0.2} train loss: 0.213742, vali

KeyboardInterrupt: 

In [23]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.3, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'colsample_bytree':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'colsample_bytree': 0.6} train loss: 0.212777, valid loss:0.212907, loss_diff:0.000130
{'colsample_bytree': 0.6} train loss: 0.212682, valid loss:0.213379, loss_diff:0.000696
{'colsample_bytree': 0.6} train loss: 0.212666, valid loss:0.213448, loss_diff:0.000782
{'colsample_bytree': 0.6} train loss: 0.212721, valid loss:0.213123, loss_diff:0.000403
{'colsample_bytree': 0.6} train loss: 0.212778, valid loss:0.212892, loss_diff:0.000114
{'colsample_bytree': 0.7} train loss: 0.212761, valid loss:0.212934, loss_diff:0.000172
{'colsample_bytree': 0.7} train loss: 0.212644, valid loss:0.213358, loss_diff:0.000714
{'colsample_bytree': 0.7} train loss: 0.212643, valid loss:0.213420, loss_diff:0.000777
{'colsample_bytree': 0.7} train loss: 0.212692, valid loss:0.213109, loss_diff:0.000417
{'colsample_bytree': 0.7} train loss: 0.212771, valid loss:0.212889, loss_diff:0.000118
{'colsample_bytree': 0.8} train loss: 0.212740, valid loss:0.212878, loss_diff:0.000138
{'colsample_bytree': 0.8} train 

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'colsample_bytree': 0.6},0.21315,0.000231
1,{'colsample_bytree': 0.7},0.213142,0.000216
2,{'colsample_bytree': 0.8},0.213123,0.000223
3,{'colsample_bytree': 0.9},0.21314,0.000212
4,{'colsample_bytree': 1.0},0.21311,0.000217


In [24]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.3, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'subsample':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'subsample': 0.6} train loss: 0.212709, valid loss:0.212853, loss_diff:0.000144
{'subsample': 0.6} train loss: 0.212645, valid loss:0.213329, loss_diff:0.000684
{'subsample': 0.6} train loss: 0.212630, valid loss:0.213384, loss_diff:0.000754
{'subsample': 0.6} train loss: 0.212655, valid loss:0.213066, loss_diff:0.000410
{'subsample': 0.6} train loss: 0.212758, valid loss:0.212853, loss_diff:0.000095
{'subsample': 0.7} train loss: 0.212711, valid loss:0.212867, loss_diff:0.000157
{'subsample': 0.7} train loss: 0.212632, valid loss:0.213330, loss_diff:0.000698
{'subsample': 0.7} train loss: 0.212620, valid loss:0.213375, loss_diff:0.000755
{'subsample': 0.7} train loss: 0.212664, valid loss:0.213054, loss_diff:0.000390
{'subsample': 0.7} train loss: 0.212737, valid loss:0.212844, loss_diff:0.000107
{'subsample': 0.8} train loss: 0.212731, valid loss:0.212885, loss_diff:0.000154
{'subsample': 0.8} train loss: 0.212635, valid loss:0.213348, loss_diff:0.000713
{'subsample': 0.8} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'subsample': 0.6},0.213097,0.000226
1,{'subsample': 0.7},0.213094,0.000223
2,{'subsample': 0.8},0.213108,0.000221
3,{'subsample': 0.9},0.213126,0.000219
4,{'subsample': 1.0},0.21311,0.000217


In [25]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.3, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_alpha':[1.0, 1.5, 2.0, 2.5, 3.0]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_alpha': 1.0} train loss: 0.212702, valid loss:0.212864, loss_diff:0.000162
{'reg_alpha': 1.0} train loss: 0.212614, valid loss:0.213305, loss_diff:0.000691
{'reg_alpha': 1.0} train loss: 0.212582, valid loss:0.213371, loss_diff:0.000788
{'reg_alpha': 1.0} train loss: 0.212675, valid loss:0.213072, loss_diff:0.000397
{'reg_alpha': 1.0} train loss: 0.212702, valid loss:0.212821, loss_diff:0.000118
{'reg_alpha': 1.5} train loss: 0.212732, valid loss:0.212871, loss_diff:0.000139
{'reg_alpha': 1.5} train loss: 0.212631, valid loss:0.213312, loss_diff:0.000681
{'reg_alpha': 1.5} train loss: 0.212605, valid loss:0.213374, loss_diff:0.000770
{'reg_alpha': 1.5} train loss: 0.212669, valid loss:0.213057, loss_diff:0.000388
{'reg_alpha': 1.5} train loss: 0.212743, valid loss:0.212854, loss_diff:0.000111
{'reg_alpha': 2.0} train loss: 0.212711, valid loss:0.212867, loss_diff:0.000157
{'reg_alpha': 2.0} train loss: 0.212632, valid loss:0.213330, loss_diff:0.000698
{'reg_alpha': 2.0} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_alpha': 1.0},0.213086,0.000223
1,{'reg_alpha': 1.5},0.213093,0.000217
2,{'reg_alpha': 2.0},0.213094,0.000223
3,{'reg_alpha': 2.5},0.213104,0.000216
4,{'reg_alpha': 3.0},0.21311,0.000222


In [26]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.3, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':1.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_lambda':[i/10.0 for i in range(0,11,1)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_lambda': 0.0} train loss: 0.212702, valid loss:0.212864, loss_diff:0.000162
{'reg_lambda': 0.0} train loss: 0.212614, valid loss:0.213305, loss_diff:0.000691
{'reg_lambda': 0.0} train loss: 0.212582, valid loss:0.213371, loss_diff:0.000788
{'reg_lambda': 0.0} train loss: 0.212675, valid loss:0.213072, loss_diff:0.000397
{'reg_lambda': 0.0} train loss: 0.212702, valid loss:0.212821, loss_diff:0.000118
{'reg_lambda': 0.1} train loss: 0.212705, valid loss:0.212876, loss_diff:0.000171
{'reg_lambda': 0.1} train loss: 0.212607, valid loss:0.213300, loss_diff:0.000693
{'reg_lambda': 0.1} train loss: 0.212582, valid loss:0.213371, loss_diff:0.000788
{'reg_lambda': 0.1} train loss: 0.212675, valid loss:0.213072, loss_diff:0.000397
{'reg_lambda': 0.1} train loss: 0.212718, valid loss:0.212836, loss_diff:0.000118
{'reg_lambda': 0.2} train loss: 0.212700, valid loss:0.212859, loss_diff:0.000159
{'reg_lambda': 0.2} train loss: 0.212611, valid loss:0.213313, loss_diff:0.000701
{'reg_lambda': 0

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_lambda': 0.0},0.213086,0.000223
1,{'reg_lambda': 0.1},0.213091,0.000216
2,{'reg_lambda': 0.2},0.213094,0.000223
3,{'reg_lambda': 0.3},0.213092,0.000222
4,{'reg_lambda': 0.4},0.213097,0.000217
5,{'reg_lambda': 0.5},0.213094,0.000217
6,{'reg_lambda': 0.6},0.213097,0.000216
7,{'reg_lambda': 0.7},0.213096,0.000212
8,{'reg_lambda': 0.8},0.213091,0.000222
9,{'reg_lambda': 0.9},0.213083,0.000214


### XGB-gbdt

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'min_child_weight':[0.001, 0.1, 2, 4, 8]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'gamma':[.0, .1, .2, .3, .4]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'colsample_bytree':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'subsample':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_alpha':[1.0, 1.5, 2.0, 2.5, 3.0]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_lambda':[i/10.0 for i in range(0,11,1)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

## Rigde

In [10]:
ml = model_loader(model_type='rg')

default_params = {
    'alpha': 1.0, 
    'fit_intercept': True, 
    'normalize': False, 
    'copy_X': True, 
    'max_iter': None, 
    'tol': 0.001, 
    'solver':'auto', 
    'random_state': SEED
}

fit_param = None

try_params = {
    'alpha':[1,2,4,8]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'alpha': 1} train loss: 0.209141, valid loss:0.208948, loss_diff:-0.000193
{'alpha': 1} train loss: 0.208965, valid loss:0.209659, loss_diff:0.000694
{'alpha': 1} train loss: 0.209151, valid loss:0.208757, loss_diff:-0.000394
{'alpha': 1} train loss: 0.209166, valid loss:0.208811, loss_diff:-0.000355
{'alpha': 1} train loss: 0.208998, valid loss:0.209512, loss_diff:0.000514
{'alpha': 2} train loss: 0.209148, valid loss:0.208957, loss_diff:-0.000192
{'alpha': 2} train loss: 0.208973, valid loss:0.209665, loss_diff:0.000692
{'alpha': 2} train loss: 0.209158, valid loss:0.208762, loss_diff:-0.000396
{'alpha': 2} train loss: 0.209174, valid loss:0.208818, loss_diff:-0.000356
{'alpha': 2} train loss: 0.209006, valid loss:0.209519, loss_diff:0.000514
{'alpha': 4} train loss: 0.209164, valid loss:0.208974, loss_diff:-0.000190
{'alpha': 4} train loss: 0.208989, valid loss:0.209679, loss_diff:0.000690
{'alpha': 4} train loss: 0.209174, valid loss:0.208775, loss_diff:-0.000400
{'alpha': 4} trai

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'alpha': 1},0.209137,0.000374
1,{'alpha': 2},0.209144,0.000374
2,{'alpha': 4},0.209159,0.000374
3,{'alpha': 8},0.209187,0.000374


In [11]:
ml = model_loader(model_type='rg')

default_params = {
    'alpha': 1.0, 
    'fit_intercept': True, 
    'normalize': False, 
    'copy_X': True, 
    'max_iter': None, 
    'tol': 0.001, 
    'solver':'auto', 
    'random_state': SEED
}

fit_param = None

try_params = {
    'alpha':[0.05, 0.1, 0.5]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'alpha': 0.05} train loss: 0.209136, valid loss:0.208942, loss_diff:-0.000194
{'alpha': 0.05} train loss: 0.208961, valid loss:0.209658, loss_diff:0.000697
{'alpha': 0.05} train loss: 0.209146, valid loss:0.208755, loss_diff:-0.000392
{'alpha': 0.05} train loss: 0.209162, valid loss:0.208807, loss_diff:-0.000355
{'alpha': 0.05} train loss: 0.208993, valid loss:0.209507, loss_diff:0.000513
{'alpha': 0.1} train loss: 0.209136, valid loss:0.208942, loss_diff:-0.000194
{'alpha': 0.1} train loss: 0.208961, valid loss:0.209658, loss_diff:0.000697
{'alpha': 0.1} train loss: 0.209146, valid loss:0.208755, loss_diff:-0.000392
{'alpha': 0.1} train loss: 0.209162, valid loss:0.208807, loss_diff:-0.000355
{'alpha': 0.1} train loss: 0.208993, valid loss:0.209507, loss_diff:0.000513
{'alpha': 0.5} train loss: 0.209138, valid loss:0.208944, loss_diff:-0.000194
{'alpha': 0.5} train loss: 0.208962, valid loss:0.209657, loss_diff:0.000695
{'alpha': 0.5} train loss: 0.209148, valid loss:0.208755, loss_d

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'alpha': 0.05},0.209134,0.000375
1,{'alpha': 0.1},0.209134,0.000374
2,{'alpha': 0.5},0.209135,0.000374


## Bagging + Ultimate Blending

In [12]:
seeds = [789]
config = {
    'lgb_gbdt':{
        'ml': model_loader(model_type='lgb'),
        'param': {
            'boosting_type':'gbdt', 
            'num_leaves':31, 
            'max_depth':5, 
            'learning_rate':0.02, 
            'n_estimators':3000, 
            'min_split_gain':0.0, 
            'min_child_weight':0.001, 
            'min_child_samples':20, 
            'subsample':1.,  
            'colsample_bytree':.9, 
            'reg_alpha':2.0, 
            'reg_lambda':0.2, 
            'random_state':SEED, 
            'n_jobs': 4
        },
        'fit_param': {
            'early_stopping_rounds': 100,
            'verbose': 100,
            'eval_metric': 'rmse'
        }
    },
    'ridge':{
        'ml': model_loader(model_type='rg'),
        'param': {
            'alpha': 0.05, 
            'fit_intercept': True, 
            'normalize': False, 
            'copy_X': True, 
            'max_iter': None, 
            'tol': 0.001, 
            'solver':'auto', 
            'random_state': SEED
        },
        'fit_param': None
    }
}

In [13]:
results = []

for k,v in config.items():
    print('Training & bagging: ', k)
    res = {
        'val_oof': np.zeros((len(train_y),)),
        'test_oof': np.zeros((test.shape[0],))
    }
    
    for seed in seeds:
        print('Training seed =', seed)
        if 'random_state' in v['param']:
            v['param']['random_state'] = seed
            
        oof_val_pred, oof_test_pred, _ = get_oof_predictions(train, train_y, test, v['ml'], 
                                                          v['param'], seed=SEED, fit_params=v['fit_param'], 
                                                          use_eval_set= v['fit_param'] is not None)
        
        res['val_oof'] += oof_val_pred
        res['test_oof'] += oof_test_pred
    
    res['val_oof'] /= len(seeds)
    res['test_oof'] /= len(seeds)
    
    results.append(res)

Training & bagging:  lgb_gbdt
Training seed = 789
Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 0.212327	valid's rmse: 0.21247
[200]	train's rmse: 0.211153	valid's rmse: 0.211449
[300]	train's rmse: 0.210969	valid's rmse: 0.211429
[400]	train's rmse: 0.210819	valid's rmse: 0.211431
Early stopping, best iteration is:
[320]	train's rmse: 0.210938	valid's rmse: 0.211427
Fold 1 completed.
Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 0.212176	valid's rmse: 0.213114
[200]	train's rmse: 0.210997	valid's rmse: 0.212057
[300]	train's rmse: 0.210804	valid's rmse: 0.212029
[400]	train's rmse: 0.21065	valid's rmse: 0.212031
Early stopping, best iteration is:
[328]	train's rmse: 0.210758	valid's rmse: 0.212028
Fold 2 completed.
Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 0.212475	valid's rmse: 0.211819
[200]	train's rmse: 0.211297	valid's rmse: 0.210855
[300]	train's rmse: 0.211109	vali

In [17]:
new_train = pd.DataFrame()
new_test = pd.DataFrame()

new_train['f1'] = results[0]['val_oof']
new_train['f2'] = results[1]['val_oof']

new_test['f1'] = results[0]['test_oof']
new_test['f2'] = results[1]['test_oof']

ml = model_loader(model_type='rg')
default_params = {
    'alpha': 1.0, 
    'fit_intercept': True, 
    'normalize': False, 
    'copy_X': True, 
    'max_iter': None, 
    'tol': 0.001, 
    'solver':'auto', 
    'random_state': SEED
}

fit_param = None

try_params = {
    'alpha':[1,2,4,8]
}

fit_params(new_train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'alpha': 1} train loss: 0.211455, valid loss:0.211383, loss_diff:-0.000072
{'alpha': 1} train loss: 0.211300, valid loss:0.212001, loss_diff:0.000701
{'alpha': 1} train loss: 0.211598, valid loss:0.210808, loss_diff:-0.000791
{'alpha': 1} train loss: 0.211528, valid loss:0.211089, loss_diff:-0.000439
{'alpha': 1} train loss: 0.211320, valid loss:0.211923, loss_diff:0.000603
{'alpha': 2} train loss: 0.211455, valid loss:0.211383, loss_diff:-0.000072
{'alpha': 2} train loss: 0.211300, valid loss:0.212001, loss_diff:0.000701
{'alpha': 2} train loss: 0.211598, valid loss:0.210808, loss_diff:-0.000791
{'alpha': 2} train loss: 0.211528, valid loss:0.211089, loss_diff:-0.000439
{'alpha': 2} train loss: 0.211320, valid loss:0.211923, loss_diff:0.000603
{'alpha': 4} train loss: 0.211455, valid loss:0.211383, loss_diff:-0.000072
{'alpha': 4} train loss: 0.211300, valid loss:0.212001, loss_diff:0.000701
{'alpha': 4} train loss: 0.211598, valid loss:0.210808, loss_diff:-0.000791
{'alpha': 4} trai

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'alpha': 1},0.211441,0.000464
1,{'alpha': 2},0.211441,0.000464
2,{'alpha': 4},0.211441,0.000464
3,{'alpha': 8},0.211441,0.000464


In [18]:
new_train[['f1', 'f2']].corr()

Unnamed: 0,f1,f2
f1,1.0,0.998852
f2,0.998852,1.0


In [19]:
default_params = {
    'alpha': 1.0, 
    'fit_intercept': True, 
    'normalize': False, 
    'copy_X': True, 
    'max_iter': None, 
    'tol': 0.001, 
    'solver':'auto', 
    'random_state': SEED
}

oof_val_pred, oof_test_pred, _ = get_oof_predictions(new_train, train_y, new_test, ml, 
                                          default_params, seed=SEED, fit_params=fit_param, 
                                          use_eval_set= False)

Fold 1 completed.
Fold 2 completed.
Fold 3 completed.
Fold 4 completed.
Fold 5 completed.


In [20]:
test_df = pd.read_csv("data/test.csv", usecols=['item_id'])
pd.DataFrame(np.clip(oof_test_pred,0,1), 
             index=test_df.item_id,
             columns=['deal_probability']).to_csv('stack_bagging_blending_519_simple.csv')

In [21]:
pd.DataFrame(np.clip(oof_val_pred,0,1), 
             columns=['deal_probability']).to_csv('stack_bagging_blending_519_simple_val.csv')

In [12]:
best_blend_test = None
best_score = None
min_w = 0.01
for a in tqdm(np.arange(min_w, 1+min_w-min_w*2, min_w)):
    for b in np.arange(min_w, 1-a+min_w-min_w*1, min_w):
        c = 1-a-b
        combined_res = a*results[0]['val_oof'] + \
                       b*results[1]['val_oof'] + \
                       c*results[2]['val_oof']

        score = clip_rmse(train_y, combined_res)
        if best_score is None or score < best_score:
            best_score = score
            print('best score updated: {:.6f}'.format(best_score), ' coefficient=> {}, {}, {}'.format(a, b, c))
            best_blend_test =  a*results[0]['test_oof'] + \
                               b*results[1]['test_oof'] + \
                               c*results[2]['test_oof']

  0%|                                                                                       | 0/98 [00:00<?, ?it/s]

best score updated: 0.209618  coefficient=> 0.01, 0.01, 0.98
best score updated: 0.209601  coefficient=> 0.01, 0.02, 0.97
best score updated: 0.209584  coefficient=> 0.01, 0.03, 0.96
best score updated: 0.209568  coefficient=> 0.01, 0.04, 0.95
best score updated: 0.209552  coefficient=> 0.01, 0.05, 0.94
best score updated: 0.209536  coefficient=> 0.01, 0.060000000000000005, 0.9299999999999999
best score updated: 0.209520  coefficient=> 0.01, 0.06999999999999999, 0.92
best score updated: 0.209504  coefficient=> 0.01, 0.08, 0.91
best score updated: 0.209489  coefficient=> 0.01, 0.09, 0.9
best score updated: 0.209473  coefficient=> 0.01, 0.09999999999999999, 0.89
best score updated: 0.209458  coefficient=> 0.01, 0.11, 0.88
best score updated: 0.209444  coefficient=> 0.01, 0.12, 0.87
best score updated: 0.209429  coefficient=> 0.01, 0.13, 0.86
best score updated: 0.209414  coefficient=> 0.01, 0.14, 0.85
best score updated: 0.209400  coefficient=> 0.01, 0.15000000000000002, 0.84
best score 

100%|██████████████████████████████████████████████████████████████████████████████| 98/98 [03:51<00:00,  2.36s/it]


In [None]:
best_blend_test = None
best_score = None
min_w = 0.01
for a in np.arange(min_w, 1+min_w-min_w*4, min_w):
    for b in np.arange(min_w, 1-a+min_w-min_w*3, min_w):
        for c in np.arange(min_w, 1-a-b+min_w-min_w*2, min_w):
            for d in np.arange(min_w, 1-a-b-c+min_w-min_w*1, min_w):
                e = 1-a-b-c-d
                combined_res = a*results[0]['val_oof'] + \
                               b*results[1]['val_oof'] + \
                               c*results[2]['val_oof'] + \
                               d*results[3]['val_oof'] + \ 
                               e*results[4]['val_oof']
                
                score = clip_rmse(train_y, combined_res)
                if best_score is None or score < best_score:
                    best_score = score
                    print('best score updated:', best_score)
                    best_blend_test =  a*results[0]['test_oof'] + \
                                       b*results[1]['test_oof'] + \
                                       c*results[2]['test_oof'] + \
                                       d*results[3]['test_oof'] + \ 
                                       e*results[4]['test_oof']

In [8]:
test_df = pd.read_csv("data/test.csv", usecols=['item_id'])
pd.DataFrame(np.clip(best_blend_test,0,1), 
             index=test_df.item_id,
             columns=['deal_probability']).to_csv('stack_bagging_blend_no_xgb_meta.csv')