In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gc; gc.enable()
import pickle
from tqdm import tqdm
import warnings ; warnings.filterwarnings('ignore')
import os
from GridSearcher import data_loader, model_loader, fit_params, get_oof_predictions

In [2]:
folder = 'final oofs 719/'
os.listdir(folder)

['baseline_xgb_719_oof_test_pred.csv',
 'baseline_xgb_719_oof_val_pred.csv',
 'cgb719_with_categorical_oof_test_pred.csv',
 'cgb719_with_categorical_oof_val_pred.csv',
 'lgb719_goss_append_keypoint_oof_test_pred.csv',
 'lgb719_goss_append_keypoint_oof_val_pred.csv',
 'rf719_oof_test_pred.csv',
 'rf719_oof_val_pred.csv',
 'ridge_new_719_oof_test_pred.csv',
 'ridge_new_719_oof_val_pred.csv',
 'selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_oof_test_pred.csv',
 'selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_oof_val_pred.csv',
 'xentropy_add_lotsof_image_features_lgb719_oof_test_pred.csv',
 'xentropy_add_lotsof_image_features_lgb719_oof_val_pred.csv']

In [3]:
prefixs = [
    'xentropy_add_lotsof_image_features_lgb719',
    'baseline_xgb_719',
    'cgb719_with_categorical',
    'lgb719_goss_append_keypoint',
    'rf719',
    'ridge_new_719',
    'selftrained_bigru_conv1d_merged_with_image_adv_719_rnn'
]

In [4]:
train = pd.DataFrame()
test = pd.DataFrame()

In [5]:
from scipy.stats import hmean
from scipy.stats.mstats import gmean

def get_clipped_values(a):
    return np.clip(a, 1e-15, 1.)

for prefix in prefixs:
    train_f = folder + prefix + '_oof_val_pred.csv'
    test_f = folder + prefix + '_oof_test_pred.csv'

    train_df = pd.read_csv(train_f)
    test_df = pd.read_csv(test_f)

    original_col = train_df.columns.tolist()[0]
    col = prefix
    print('Add ', col)
    
    train.loc[:,col] = train_df[original_col]
    test.loc[:,col] = test_df[original_col]

    del train_df, test_df; gc.collect()

for df in [train, test]:    
    # within sample group
    df.loc[:, 'all_mean'] = df[prefixs].mean(axis=1)
    df.loc[:, 'all_med'] = df[prefixs].median(axis=1)
    df.loc[:, 'all_max'] = df[prefixs].max(axis=1)
    df.loc[:, 'all_min'] = df[prefixs].min(axis=1)
    df.loc[:, 'all_std'] = df[prefixs].std(axis=1)

    col_len = len(prefixs)
    for i in range(col_len-1):
        for j in range(i+1, col_len):
            cols = [prefixs[i], prefixs[j]]
            feat_name = cols[0]+'_'+cols[1]+'_inter'
            print('Add ', feat_name, ' statistcs')        
            df.loc[:, feat_name+'_mean'] = df[cols].mean(axis=1)
            df.loc[:, feat_name+'_gmean'] = gmean(get_clipped_values(df[cols].values), axis=1)
            df.loc[:, feat_name+'_hmean'] = hmean(get_clipped_values(df[cols].values), axis=1)
    
    col_len = len(prefixs)
    for i in range(col_len-2):
        for j in range(i+1, col_len-1):
            for k in range(j+1, col_len):
                cols = [prefixs[i], prefixs[j], prefixs[k]]
                feat_name = cols[0]+'_'+cols[1]+'_'+cols[2]+'_inter'  
                print('Add ', feat_name, ' statistcs')                    
                df.loc[:, feat_name+'_mean'] = df[cols].mean(axis=1)
                df.loc[:, feat_name+'_gmean'] = gmean(get_clipped_values(df[cols].values), axis=1)
                df.loc[:, feat_name+'_hmean'] = hmean(get_clipped_values(df[cols].values), axis=1)
                df.loc[:, feat_name+'_med'] = df[cols].median(axis=1)
                df.loc[:, feat_name+'_std'] = df[cols].std(axis=1)                        
    #df = df.astype(np.float32)

Add  xentropy_add_lotsof_image_features_lgb719
Add  baseline_xgb_719
Add  cgb719_with_categorical
Add  lgb719_goss_append_keypoint
Add  rf719
Add  ridge_new_719
Add  selftrained_bigru_conv1d_merged_with_image_adv_719_rnn
Add  xentropy_add_lotsof_image_features_lgb719_baseline_xgb_719_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb719_cgb719_with_categorical_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb719_lgb719_goss_append_keypoint_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb719_rf719_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb719_ridge_new_719_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter  statistcs
Add  baseline_xgb_719_cgb719_with_categorical_inter  statistcs
Add  baseline_xgb_719_lgb719_goss_append_keypoint_inter  statistcs
Add  baseline_xgb_719_rf719_inter  statistcs
Add  baseline_xgb_719_ridge_new_719_inter  statistcs
Add  baseline_xgb_719_s

Add  xentropy_add_lotsof_image_features_lgb719_lgb719_goss_append_keypoint_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb719_rf719_ridge_new_719_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb719_rf719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter  statistcs
Add  xentropy_add_lotsof_image_features_lgb719_ridge_new_719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter  statistcs
Add  baseline_xgb_719_cgb719_with_categorical_lgb719_goss_append_keypoint_inter  statistcs
Add  baseline_xgb_719_cgb719_with_categorical_rf719_inter  statistcs
Add  baseline_xgb_719_cgb719_with_categorical_ridge_new_719_inter  statistcs
Add  baseline_xgb_719_cgb719_with_categorical_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter  statistcs
Add  baseline_xgb_719_lgb719_goss_append_keypoint_rf719_inter  statistcs
Add  baseline_xgb_719_lgb719_goss_append_keypoint_ridge_new_719_inter  statistcs
Ad

In [6]:
train.head(3)

Unnamed: 0,xentropy_add_lotsof_image_features_lgb719,baseline_xgb_719,cgb719_with_categorical,lgb719_goss_append_keypoint,rf719,ridge_new_719,selftrained_bigru_conv1d_merged_with_image_adv_719_rnn,all_mean,all_med,all_max,...,lgb719_goss_append_keypoint_ridge_new_719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter_mean,lgb719_goss_append_keypoint_ridge_new_719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter_gmean,lgb719_goss_append_keypoint_ridge_new_719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter_hmean,lgb719_goss_append_keypoint_ridge_new_719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter_med,lgb719_goss_append_keypoint_ridge_new_719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter_std,rf719_ridge_new_719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter_mean,rf719_ridge_new_719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter_gmean,rf719_ridge_new_719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter_hmean,rf719_ridge_new_719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter_med,rf719_ridge_new_719_selftrained_bigru_conv1d_merged_with_image_adv_719_rnn_inter_std
0,0.060275,0.079386,0.07796,0.068296,0.055568,0.060719,0.055605,0.065401,0.060719,0.079386,...,0.06154,0.061321,0.061105,0.060719,0.006385,0.057297,0.057247,0.057198,0.055605,0.002964
1,0.049319,0.049824,0.03587,0.050909,0.02882,0.04904,0.042389,0.043739,0.04904,0.050909,...,0.047446,0.0473,0.047151,0.04904,0.004478,0.040083,0.039129,0.038129,0.042389,0.010305
2,0.039271,0.032743,0.030674,0.03298,0.051158,0.055162,0.044281,0.040896,0.039271,0.055162,...,0.044141,0.043189,0.042234,0.044281,0.011091,0.0502,0.049995,0.049785,0.051158,0.005503


In [7]:
pd.options.display.max_rows=100
pd.options.display.max_columns=100
train[prefixs].corr()

Unnamed: 0,xentropy_add_lotsof_image_features_lgb719,baseline_xgb_719,cgb719_with_categorical,lgb719_goss_append_keypoint,rf719,ridge_new_719,selftrained_bigru_conv1d_merged_with_image_adv_719_rnn
xentropy_add_lotsof_image_features_lgb719,1.0,0.978145,0.971084,0.98926,0.922805,0.873556,0.916805
baseline_xgb_719,0.978145,1.0,0.958515,0.972856,0.915871,0.86175,0.90489
cgb719_with_categorical,0.971084,0.958515,1.0,0.97594,0.934553,0.885376,0.908931
lgb719_goss_append_keypoint,0.98926,0.972856,0.97594,1.0,0.922859,0.873763,0.914048
rf719,0.922805,0.915871,0.934553,0.922859,1.0,0.945654,0.880436
ridge_new_719,0.873556,0.86175,0.885376,0.873763,0.945654,1.0,0.840825
selftrained_bigru_conv1d_merged_with_image_adv_719_rnn,0.916805,0.90489,0.908931,0.914048,0.880436,0.840825,1.0


In [8]:
train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 250 columns):
xentropy_add_lotsof_image_features_lgb719                                                                                                   float64
baseline_xgb_719                                                                                                                            float64
cgb719_with_categorical                                                                                                                     float64
lgb719_goss_append_keypoint                                                                                                                 float64
rf719                                                                                                                                       float64
ridge_new_719                                                                                                                               float6

In [9]:
train.shape, test.shape

((1503424, 250), (508438, 250))

In [10]:
print(train.isnull().sum().max()) 
print(test.isnull().sum().max())

0
0


In [11]:
for col in tqdm(train.columns):
    train[col] = train[col].astype(np.float32)
    test[col] = test[col].astype(np.float32)

100%|████████████████████████████████████████████████████████████████████████████| 250/250 [03:25<00:00,  1.22it/s]


In [12]:
train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 250 columns):
xentropy_add_lotsof_image_features_lgb719                                                                                                   float32
baseline_xgb_719                                                                                                                            float32
cgb719_with_categorical                                                                                                                     float32
lgb719_goss_append_keypoint                                                                                                                 float32
rf719                                                                                                                                       float32
ridge_new_719                                                                                                                               float3

In [13]:
with open('meta_train_719.pickle', 'wb') as handle:
    pickle.dump(train, handle)
    
with open('meta_test_719.pickle', 'wb') as handle:
    pickle.dump(test, handle)

## Meta Model Tuning

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gc; gc.enable()
import pickle
from tqdm import tqdm
import warnings ; warnings.filterwarnings('ignore')
import os
from GridSearcher import data_loader, model_loader, fit_params, get_oof_predictions, clip_rmse

In [4]:
SEED=719
train_y = pd.read_csv("regression_target.csv").deal_probability.values

In [5]:
with open('meta_train_719.pickle', 'rb') as handle:
    train = pickle.load(handle)
    
with open('meta_test_719.pickle', 'rb') as handle:
    test = pickle.load(handle)

### Lightgbm-gbdt

In [5]:
ml = model_loader(model_type='lgb')

In [5]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.0, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 4
}

fit_param = None

try_params = {
    'min_split_gain': [.0, .1, .2, .3, .4]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'min_split_gain': 0.0} train loss: 0.207978, valid loss:0.208974, loss_diff:0.000996
{'min_split_gain': 0.0} train loss: 0.207808, valid loss:0.209675, loss_diff:0.001867
{'min_split_gain': 0.0} train loss: 0.208046, valid loss:0.208600, loss_diff:0.000554
{'min_split_gain': 0.0} train loss: 0.207981, valid loss:0.208823, loss_diff:0.000843
{'min_split_gain': 0.0} train loss: 0.207785, valid loss:0.209536, loss_diff:0.001751
{'min_split_gain': 0.1} train loss: 0.207993, valid loss:0.208984, loss_diff:0.000991
{'min_split_gain': 0.1} train loss: 0.207808, valid loss:0.209692, loss_diff:0.001884
{'min_split_gain': 0.1} train loss: 0.208039, valid loss:0.208583, loss_diff:0.000544
{'min_split_gain': 0.1} train loss: 0.207969, valid loss:0.208843, loss_diff:0.000874
{'min_split_gain': 0.1} train loss: 0.207823, valid loss:0.209519, loss_diff:0.001695
{'min_split_gain': 0.2} train loss: 0.207997, valid loss:0.208965, loss_diff:0.000967
{'min_split_gain': 0.2} train loss: 0.207846, valid lo

KeyboardInterrupt: 

In [1]:
'''
best 
{'min_split_gain': 0.1} train loss: 0.208348, valid loss:0.209236, loss_diff:0.000887
{'min_split_gain': 0.1} train loss: 0.208166, valid loss:0.209885, loss_diff:0.001718
{'min_split_gain': 0.1} train loss: 0.208426, valid loss:0.208832, loss_diff:0.000405
{'min_split_gain': 0.1} train loss: 0.208379, valid loss:0.208949, loss_diff:0.000570
{'min_split_gain': 0.1} train loss: 0.208165, valid loss:0.209724, loss_diff:0.001559
=================>{'min_split_gain': 0.1} loss:0.209325
'''



In [None]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.0, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 4
}

fit_param = None

try_params = {
    'colsample_bytree':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [7]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.1, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}


fit_param = None

try_params = {
    'subsample':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'subsample': 0.6} train loss: 0.208383, valid loss:0.209283, loss_diff:0.000900
{'subsample': 0.6} train loss: 0.208222, valid loss:0.209865, loss_diff:0.001642
{'subsample': 0.6} train loss: 0.208485, valid loss:0.208844, loss_diff:0.000359
{'subsample': 0.6} train loss: 0.208434, valid loss:0.208999, loss_diff:0.000565
{'subsample': 0.6} train loss: 0.208252, valid loss:0.209756, loss_diff:0.001504
{'subsample': 0.7} train loss: 0.208363, valid loss:0.209219, loss_diff:0.000857
{'subsample': 0.7} train loss: 0.208175, valid loss:0.209866, loss_diff:0.001690
{'subsample': 0.7} train loss: 0.208406, valid loss:0.208786, loss_diff:0.000380
{'subsample': 0.7} train loss: 0.208416, valid loss:0.209000, loss_diff:0.000583
{'subsample': 0.7} train loss: 0.208231, valid loss:0.209730, loss_diff:0.001500
{'subsample': 0.8} train loss: 0.208352, valid loss:0.209221, loss_diff:0.000869
{'subsample': 0.8} train loss: 0.208169, valid loss:0.209877, loss_diff:0.001707
{'subsample': 0.8} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'subsample': 0.6},0.209349,0.000403
1,{'subsample': 0.7},0.20932,0.000416
2,{'subsample': 0.8},0.209341,0.00041
3,{'subsample': 0.9},0.20936,0.000403
4,{'subsample': 1.0},0.209325,0.000416


In [8]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.1, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_alpha':[1.0, 1.5, 2.0, 2.5, 3.0]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_alpha': 1.0} train loss: 0.208339, valid loss:0.209249, loss_diff:0.000909
{'reg_alpha': 1.0} train loss: 0.208163, valid loss:0.209913, loss_diff:0.001750
{'reg_alpha': 1.0} train loss: 0.208432, valid loss:0.208866, loss_diff:0.000434
{'reg_alpha': 1.0} train loss: 0.208347, valid loss:0.208971, loss_diff:0.000624
{'reg_alpha': 1.0} train loss: 0.208211, valid loss:0.209750, loss_diff:0.001539
{'reg_alpha': 1.5} train loss: 0.208347, valid loss:0.209243, loss_diff:0.000897
{'reg_alpha': 1.5} train loss: 0.208158, valid loss:0.209846, loss_diff:0.001688
{'reg_alpha': 1.5} train loss: 0.208416, valid loss:0.208855, loss_diff:0.000439
{'reg_alpha': 1.5} train loss: 0.208388, valid loss:0.209011, loss_diff:0.000623
{'reg_alpha': 1.5} train loss: 0.208205, valid loss:0.209701, loss_diff:0.001496
{'reg_alpha': 2.0} train loss: 0.208363, valid loss:0.209219, loss_diff:0.000857
{'reg_alpha': 2.0} train loss: 0.208175, valid loss:0.209866, loss_diff:0.001690
{'reg_alpha': 2.0} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_alpha': 1.0},0.20935,0.000416
1,{'reg_alpha': 1.5},0.209331,0.000384
2,{'reg_alpha': 2.0},0.20932,0.000416
3,{'reg_alpha': 2.5},0.209324,0.000401
4,{'reg_alpha': 3.0},0.209334,0.000414


In [9]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.1, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_alpha':[3.0, 3.5, 4.0, 4.5]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_alpha': 3.0} train loss: 0.208879, valid loss:0.209649, loss_diff:0.000770
{'reg_alpha': 3.0} train loss: 0.208732, valid loss:0.210378, loss_diff:0.001646
{'reg_alpha': 3.0} train loss: 0.208991, valid loss:0.209223, loss_diff:0.000232
{'reg_alpha': 3.0} train loss: 0.208932, valid loss:0.209408, loss_diff:0.000476
{'reg_alpha': 3.0} train loss: 0.208730, valid loss:0.210266, loss_diff:0.001537
{'reg_alpha': 3.5} train loss: 0.208910, valid loss:0.209681, loss_diff:0.000772
{'reg_alpha': 3.5} train loss: 0.208761, valid loss:0.210416, loss_diff:0.001655
{'reg_alpha': 3.5} train loss: 0.209030, valid loss:0.209252, loss_diff:0.000222
{'reg_alpha': 3.5} train loss: 0.208949, valid loss:0.209423, loss_diff:0.000474
{'reg_alpha': 3.5} train loss: 0.208732, valid loss:0.210279, loss_diff:0.001547
{'reg_alpha': 4.0} train loss: 0.208930, valid loss:0.209668, loss_diff:0.000738
{'reg_alpha': 4.0} train loss: 0.208730, valid loss:0.210366, loss_diff:0.001636
{'reg_alpha': 4.0} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_alpha': 3.0},0.209785,0.00046
1,{'reg_alpha': 3.5},0.20981,0.000462
2,{'reg_alpha': 4.0},0.209782,0.000457
3,{'reg_alpha': 4.5},0.209796,0.000456


In [9]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.1, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}
fit_param = None

try_params = {
    'reg_lambda':[i/10.0 for i in range(0,11,1)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_lambda': 0.0} train loss: 0.208363, valid loss:0.209219, loss_diff:0.000857
{'reg_lambda': 0.0} train loss: 0.208175, valid loss:0.209866, loss_diff:0.001690
{'reg_lambda': 0.0} train loss: 0.208406, valid loss:0.208786, loss_diff:0.000380
{'reg_lambda': 0.0} train loss: 0.208416, valid loss:0.209000, loss_diff:0.000583
{'reg_lambda': 0.0} train loss: 0.208231, valid loss:0.209730, loss_diff:0.001500
{'reg_lambda': 0.1} train loss: 0.208362, valid loss:0.209209, loss_diff:0.000847
{'reg_lambda': 0.1} train loss: 0.208193, valid loss:0.209875, loss_diff:0.001682
{'reg_lambda': 0.1} train loss: 0.208406, valid loss:0.208786, loss_diff:0.000380
{'reg_lambda': 0.1} train loss: 0.208408, valid loss:0.208995, loss_diff:0.000586
{'reg_lambda': 0.1} train loss: 0.208231, valid loss:0.209731, loss_diff:0.001500
{'reg_lambda': 0.2} train loss: 0.208375, valid loss:0.209236, loss_diff:0.000861
{'reg_lambda': 0.2} train loss: 0.208174, valid loss:0.209854, loss_diff:0.001680
{'reg_lambda': 0

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_lambda': 0.0},0.20932,0.000416
1,{'reg_lambda': 0.1},0.209319,0.00042
2,{'reg_lambda': 0.2},0.209318,0.000403
3,{'reg_lambda': 0.3},0.209322,0.000409
4,{'reg_lambda': 0.4},0.209335,0.000424
5,{'reg_lambda': 0.5},0.209344,0.000408
6,{'reg_lambda': 0.6},0.209343,0.000421
7,{'reg_lambda': 0.7},0.209337,0.000427
8,{'reg_lambda': 0.8},0.209343,0.000431
9,{'reg_lambda': 0.9},0.209349,0.000429


In [None]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.02, 
    'n_estimators':3000, 
    'min_split_gain':0.1, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':1.5, 
    'reg_lambda':0.7, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = {
    'early_stopping_rounds': 50,
    'verbose': 100,
    'eval_metric': 'rmse'
}

_, ret_test, _ = get_oof_predictions(train, train_y, test, ml, 
                                     default_params, seed=19, fit_params=fit_param, use_eval_set=True)

In [None]:
test_df = pd.read_csv("test.csv", usecols=['item_id'])
pd.DataFrame(np.clip(ret_test,0,1), 
             index=test_df.item_id,
             columns=['deal_probability']).to_csv('lgb_meta_no_bagging_exclude_knn.csv')

### Lightgbm-dart

In [6]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.0, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'min_split_gain': [.0, .1, .2, .3, .4]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'min_split_gain': 0.0} train loss: 0.213927, valid loss:0.214264, loss_diff:0.000337
{'min_split_gain': 0.0} train loss: 0.213748, valid loss:0.215104, loss_diff:0.001356
{'min_split_gain': 0.0} train loss: 0.214077, valid loss:0.213627, loss_diff:-0.000450
{'min_split_gain': 0.0} train loss: 0.214023, valid loss:0.213778, loss_diff:-0.000245
{'min_split_gain': 0.0} train loss: 0.213804, valid loss:0.214721, loss_diff:0.000917
{'min_split_gain': 0.1} train loss: 0.213920, valid loss:0.214252, loss_diff:0.000331
{'min_split_gain': 0.1} train loss: 0.213755, valid loss:0.215107, loss_diff:0.001352
{'min_split_gain': 0.1} train loss: 0.214055, valid loss:0.213606, loss_diff:-0.000449
{'min_split_gain': 0.1} train loss: 0.214021, valid loss:0.213776, loss_diff:-0.000245
{'min_split_gain': 0.1} train loss: 0.213790, valid loss:0.214729, loss_diff:0.000938
{'min_split_gain': 0.2} train loss: 0.213925, valid loss:0.214274, loss_diff:0.000349
{'min_split_gain': 0.2} train loss: 0.213742, vali

KeyboardInterrupt: 

In [23]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.3, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'colsample_bytree':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'colsample_bytree': 0.6} train loss: 0.212777, valid loss:0.212907, loss_diff:0.000130
{'colsample_bytree': 0.6} train loss: 0.212682, valid loss:0.213379, loss_diff:0.000696
{'colsample_bytree': 0.6} train loss: 0.212666, valid loss:0.213448, loss_diff:0.000782
{'colsample_bytree': 0.6} train loss: 0.212721, valid loss:0.213123, loss_diff:0.000403
{'colsample_bytree': 0.6} train loss: 0.212778, valid loss:0.212892, loss_diff:0.000114
{'colsample_bytree': 0.7} train loss: 0.212761, valid loss:0.212934, loss_diff:0.000172
{'colsample_bytree': 0.7} train loss: 0.212644, valid loss:0.213358, loss_diff:0.000714
{'colsample_bytree': 0.7} train loss: 0.212643, valid loss:0.213420, loss_diff:0.000777
{'colsample_bytree': 0.7} train loss: 0.212692, valid loss:0.213109, loss_diff:0.000417
{'colsample_bytree': 0.7} train loss: 0.212771, valid loss:0.212889, loss_diff:0.000118
{'colsample_bytree': 0.8} train loss: 0.212740, valid loss:0.212878, loss_diff:0.000138
{'colsample_bytree': 0.8} train 

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'colsample_bytree': 0.6},0.21315,0.000231
1,{'colsample_bytree': 0.7},0.213142,0.000216
2,{'colsample_bytree': 0.8},0.213123,0.000223
3,{'colsample_bytree': 0.9},0.21314,0.000212
4,{'colsample_bytree': 1.0},0.21311,0.000217


In [24]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.3, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'subsample':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'subsample': 0.6} train loss: 0.212709, valid loss:0.212853, loss_diff:0.000144
{'subsample': 0.6} train loss: 0.212645, valid loss:0.213329, loss_diff:0.000684
{'subsample': 0.6} train loss: 0.212630, valid loss:0.213384, loss_diff:0.000754
{'subsample': 0.6} train loss: 0.212655, valid loss:0.213066, loss_diff:0.000410
{'subsample': 0.6} train loss: 0.212758, valid loss:0.212853, loss_diff:0.000095
{'subsample': 0.7} train loss: 0.212711, valid loss:0.212867, loss_diff:0.000157
{'subsample': 0.7} train loss: 0.212632, valid loss:0.213330, loss_diff:0.000698
{'subsample': 0.7} train loss: 0.212620, valid loss:0.213375, loss_diff:0.000755
{'subsample': 0.7} train loss: 0.212664, valid loss:0.213054, loss_diff:0.000390
{'subsample': 0.7} train loss: 0.212737, valid loss:0.212844, loss_diff:0.000107
{'subsample': 0.8} train loss: 0.212731, valid loss:0.212885, loss_diff:0.000154
{'subsample': 0.8} train loss: 0.212635, valid loss:0.213348, loss_diff:0.000713
{'subsample': 0.8} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'subsample': 0.6},0.213097,0.000226
1,{'subsample': 0.7},0.213094,0.000223
2,{'subsample': 0.8},0.213108,0.000221
3,{'subsample': 0.9},0.213126,0.000219
4,{'subsample': 1.0},0.21311,0.000217


In [25]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.3, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_alpha':[1.0, 1.5, 2.0, 2.5, 3.0]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_alpha': 1.0} train loss: 0.212702, valid loss:0.212864, loss_diff:0.000162
{'reg_alpha': 1.0} train loss: 0.212614, valid loss:0.213305, loss_diff:0.000691
{'reg_alpha': 1.0} train loss: 0.212582, valid loss:0.213371, loss_diff:0.000788
{'reg_alpha': 1.0} train loss: 0.212675, valid loss:0.213072, loss_diff:0.000397
{'reg_alpha': 1.0} train loss: 0.212702, valid loss:0.212821, loss_diff:0.000118
{'reg_alpha': 1.5} train loss: 0.212732, valid loss:0.212871, loss_diff:0.000139
{'reg_alpha': 1.5} train loss: 0.212631, valid loss:0.213312, loss_diff:0.000681
{'reg_alpha': 1.5} train loss: 0.212605, valid loss:0.213374, loss_diff:0.000770
{'reg_alpha': 1.5} train loss: 0.212669, valid loss:0.213057, loss_diff:0.000388
{'reg_alpha': 1.5} train loss: 0.212743, valid loss:0.212854, loss_diff:0.000111
{'reg_alpha': 2.0} train loss: 0.212711, valid loss:0.212867, loss_diff:0.000157
{'reg_alpha': 2.0} train loss: 0.212632, valid loss:0.213330, loss_diff:0.000698
{'reg_alpha': 2.0} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_alpha': 1.0},0.213086,0.000223
1,{'reg_alpha': 1.5},0.213093,0.000217
2,{'reg_alpha': 2.0},0.213094,0.000223
3,{'reg_alpha': 2.5},0.213104,0.000216
4,{'reg_alpha': 3.0},0.21311,0.000222


In [26]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.3, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':1.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_lambda':[i/10.0 for i in range(0,11,1)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_lambda': 0.0} train loss: 0.212702, valid loss:0.212864, loss_diff:0.000162
{'reg_lambda': 0.0} train loss: 0.212614, valid loss:0.213305, loss_diff:0.000691
{'reg_lambda': 0.0} train loss: 0.212582, valid loss:0.213371, loss_diff:0.000788
{'reg_lambda': 0.0} train loss: 0.212675, valid loss:0.213072, loss_diff:0.000397
{'reg_lambda': 0.0} train loss: 0.212702, valid loss:0.212821, loss_diff:0.000118
{'reg_lambda': 0.1} train loss: 0.212705, valid loss:0.212876, loss_diff:0.000171
{'reg_lambda': 0.1} train loss: 0.212607, valid loss:0.213300, loss_diff:0.000693
{'reg_lambda': 0.1} train loss: 0.212582, valid loss:0.213371, loss_diff:0.000788
{'reg_lambda': 0.1} train loss: 0.212675, valid loss:0.213072, loss_diff:0.000397
{'reg_lambda': 0.1} train loss: 0.212718, valid loss:0.212836, loss_diff:0.000118
{'reg_lambda': 0.2} train loss: 0.212700, valid loss:0.212859, loss_diff:0.000159
{'reg_lambda': 0.2} train loss: 0.212611, valid loss:0.213313, loss_diff:0.000701
{'reg_lambda': 0

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_lambda': 0.0},0.213086,0.000223
1,{'reg_lambda': 0.1},0.213091,0.000216
2,{'reg_lambda': 0.2},0.213094,0.000223
3,{'reg_lambda': 0.3},0.213092,0.000222
4,{'reg_lambda': 0.4},0.213097,0.000217
5,{'reg_lambda': 0.5},0.213094,0.000217
6,{'reg_lambda': 0.6},0.213097,0.000216
7,{'reg_lambda': 0.7},0.213096,0.000212
8,{'reg_lambda': 0.8},0.213091,0.000222
9,{'reg_lambda': 0.9},0.213083,0.000214


### XGB-gbdt

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'min_child_weight':[0.001, 0.1, 2, 4, 8]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'gamma':[.0, .1, .2, .3, .4]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'colsample_bytree':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'subsample':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_alpha':[1.0, 1.5, 2.0, 2.5, 3.0]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_lambda':[i/10.0 for i in range(0,11,1)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

## Rigde

In [4]:
ml = model_loader(model_type='rg')

default_params = {
    'alpha': 1.0, 
    'fit_intercept': True, 
    'normalize': False, 
    'copy_X': True, 
    'max_iter': None, 
    'tol': 0.001, 
    'solver':'auto', 
    'random_state': SEED
}

fit_param = None

try_params = {
    'alpha':[1,2,4,8]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'alpha': 1} train loss: 0.211353, valid loss:0.211762, loss_diff:0.000409
{'alpha': 1} train loss: 0.211446, valid loss:0.211393, loss_diff:-0.000052
{'alpha': 1} train loss: 0.211476, valid loss:0.211268, loss_diff:-0.000208
{'alpha': 1} train loss: 0.211306, valid loss:0.211946, loss_diff:0.000640
{'alpha': 1} train loss: 0.211565, valid loss:0.210914, loss_diff:-0.000651
{'alpha': 2} train loss: 0.211354, valid loss:0.211761, loss_diff:0.000407
{'alpha': 2} train loss: 0.211447, valid loss:0.211392, loss_diff:-0.000055
{'alpha': 2} train loss: 0.211477, valid loss:0.211268, loss_diff:-0.000210
{'alpha': 2} train loss: 0.211308, valid loss:0.211945, loss_diff:0.000637
{'alpha': 2} train loss: 0.211567, valid loss:0.210915, loss_diff:-0.000652
{'alpha': 4} train loss: 0.211356, valid loss:0.211760, loss_diff:0.000404
{'alpha': 4} train loss: 0.211449, valid loss:0.211390, loss_diff:-0.000059
{'alpha': 4} train loss: 0.211479, valid loss:0.211267, loss_diff:-0.000211
{'alpha': 4} trai

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'alpha': 1},0.211457,0.000365
1,{'alpha': 2},0.211456,0.000365
2,{'alpha': 4},0.211455,0.000364
3,{'alpha': 8},0.211455,0.000363


In [5]:
ml = model_loader(model_type='rg')

default_params = {
    'alpha': 1.0, 
    'fit_intercept': True, 
    'normalize': False, 
    'copy_X': True, 
    'max_iter': None, 
    'tol': 0.001, 
    'solver':'auto', 
    'random_state': SEED
}

fit_param = None

try_params = {
    'alpha':[0.05, 0.1, 0.5]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'alpha': 0.05} train loss: 0.211348, valid loss:0.211766, loss_diff:0.000418
{'alpha': 0.05} train loss: 0.211440, valid loss:0.211399, loss_diff:-0.000042
{'alpha': 0.05} train loss: 0.211471, valid loss:0.211271, loss_diff:-0.000199
{'alpha': 0.05} train loss: 0.211301, valid loss:0.211952, loss_diff:0.000651
{'alpha': 0.05} train loss: 0.211561, valid loss:0.210914, loss_diff:-0.000647
{'alpha': 0.1} train loss: 0.211349, valid loss:0.211765, loss_diff:0.000416
{'alpha': 0.1} train loss: 0.211441, valid loss:0.211398, loss_diff:-0.000044
{'alpha': 0.1} train loss: 0.211472, valid loss:0.211270, loss_diff:-0.000201
{'alpha': 0.1} train loss: 0.211302, valid loss:0.211951, loss_diff:0.000649
{'alpha': 0.1} train loss: 0.211562, valid loss:0.210914, loss_diff:-0.000648
{'alpha': 0.5} train loss: 0.211352, valid loss:0.211764, loss_diff:0.000412
{'alpha': 0.5} train loss: 0.211444, valid loss:0.211395, loss_diff:-0.000049
{'alpha': 0.5} train loss: 0.211475, valid loss:0.211269, loss_d

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'alpha': 0.05},0.21146,0.000367
1,{'alpha': 0.1},0.21146,0.000367
2,{'alpha': 0.5},0.211458,0.000366


## Bagging + Ultimate Blending

In [6]:
seeds = [598754]
config = {
    'lgb_gbdt':{
        'ml': model_loader(model_type='lgb'),
        'param': {
            'boosting_type':'gbdt', 
            'num_leaves':31, 
            'max_depth':5, 
            'learning_rate':0.02, 
            'n_estimators':3000, 
            'min_split_gain':0.4, 
            'min_child_weight':0.001, 
            'min_child_samples':20, 
            'subsample':1.,  
            'colsample_bytree':.9, 
            'reg_alpha':2.0, 
            'reg_lambda':0.2, 
            'random_state':SEED, 
            'n_jobs': 4
        },
        'fit_param': {
            'early_stopping_rounds': 100,
            'verbose': 100,
            'eval_metric': 'rmse'
        }
    },
    'ridge':{
        'ml': model_loader(model_type='rg'),
        'param': {
            'alpha': 8, 
            'fit_intercept': True, 
            'normalize': False, 
            'copy_X': True, 
            'max_iter': None, 
            'tol': 0.001, 
            'solver':'auto', 
            'random_state': SEED
        },
        'fit_param': None
    }
}

In [7]:
results = []

for k,v in config.items():
    print('Training & bagging: ', k)
    res = {
        'val_oof': np.zeros((len(train_y),)),
        'test_oof': np.zeros((test.shape[0],))
    }
    
    for seed in seeds:
        print('Training seed =', seed)
        if 'random_state' in v['param']:
            v['param']['random_state'] = seed
            
        oof_val_pred, oof_test_pred, _ = get_oof_predictions(train, train_y, test, v['ml'], 
                                                          v['param'], seed=SEED, fit_params=v['fit_param'], 
                                                          use_eval_set= v['fit_param'] is not None)
        
        res['val_oof'] += oof_val_pred
        res['test_oof'] += oof_test_pred
    
    res['val_oof'] /= len(seeds)
    res['test_oof'] /= len(seeds)
    
    results.append(res)

Training & bagging:  lgb_gbdt
Training seed = 598754
Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 0.212246	valid's rmse: 0.212833
[200]	train's rmse: 0.211101	valid's rmse: 0.211827
[300]	train's rmse: 0.210993	valid's rmse: 0.211805
[400]	train's rmse: 0.210993	valid's rmse: 0.211805
Early stopping, best iteration is:
[306]	train's rmse: 0.210993	valid's rmse: 0.211805
Fold 1 completed.
Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 0.212335	valid's rmse: 0.212475
[200]	train's rmse: 0.211191	valid's rmse: 0.211454
[300]	train's rmse: 0.211073	valid's rmse: 0.211437
Early stopping, best iteration is:
[272]	train's rmse: 0.211083	valid's rmse: 0.211436
Fold 2 completed.
Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 0.212379	valid's rmse: 0.212243
[200]	train's rmse: 0.21123	valid's rmse: 0.211283
[300]	train's rmse: 0.211122	valid's rmse: 0.211267
Early stopping, best iteratio

In [8]:
results

[{'test_oof': array([ 0.43012976,  0.11989575,  0.18681456, ...,  0.04950588,
          0.48096084,  0.11337298]),
  'val_oof': array([ 0.06805332,  0.05085494,  0.03708395, ...,  0.23074029,
          0.17018002,  0.13708617])},
 {'test_oof': array([ 0.43973295,  0.11383719,  0.18686005, ...,  0.05050436,
          0.49309947,  0.11329323]),
  'val_oof': array([ 0.07231319,  0.05345392,  0.03705866, ...,  0.22465725,
          0.16191074,  0.13212057])}]

In [9]:
import pickle
pickle.dump(results, open('719_meta_oof_result', 'wb'))

In [6]:
test_df = pd.read_csv("data/test.csv", usecols=['item_id'])
pd.DataFrame(np.clip(results[1]['test_oof'],0,1), 
             index=test_df.item_id,
             columns=['deal_probability']).to_csv('lgb_gbdt_meta_bagging.csv')

In [6]:
import pickle
results = pickle.load(open('719_meta_oof_result', 'rb'))

In [8]:
new_train = pd.DataFrame()
new_test = pd.DataFrame()

new_train['f1'] = results[0]['val_oof']
new_train['f2'] = results[1]['val_oof']

new_test['f1'] = results[0]['test_oof']
new_test['f2'] = results[1]['test_oof']

ml = model_loader(model_type='rg')
default_params = {
    'alpha': 1.0, 
    'fit_intercept': True, 
    'normalize': False, 
    'copy_X': True, 
    'max_iter': None, 
    'tol': 0.001, 
    'solver':'auto', 
    'random_state': SEED
}

fit_param = None

try_params = {
    'alpha':[1,2,4,8]
}

fit_params(new_train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'alpha': 1} train loss: 0.211362, valid loss:0.211747, loss_diff:0.000385
{'alpha': 1} train loss: 0.211455, valid loss:0.211375, loss_diff:-0.000080
{'alpha': 1} train loss: 0.211489, valid loss:0.211242, loss_diff:-0.000246
{'alpha': 1} train loss: 0.211319, valid loss:0.211920, loss_diff:0.000601
{'alpha': 1} train loss: 0.211571, valid loss:0.210914, loss_diff:-0.000656
{'alpha': 2} train loss: 0.211362, valid loss:0.211747, loss_diff:0.000385
{'alpha': 2} train loss: 0.211455, valid loss:0.211375, loss_diff:-0.000080
{'alpha': 2} train loss: 0.211489, valid loss:0.211242, loss_diff:-0.000247
{'alpha': 2} train loss: 0.211319, valid loss:0.211920, loss_diff:0.000601
{'alpha': 2} train loss: 0.211571, valid loss:0.210914, loss_diff:-0.000656
{'alpha': 4} train loss: 0.211362, valid loss:0.211747, loss_diff:0.000385
{'alpha': 4} train loss: 0.211455, valid loss:0.211375, loss_diff:-0.000080
{'alpha': 4} train loss: 0.211489, valid loss:0.211242, loss_diff:-0.000247
{'alpha': 4} trai

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'alpha': 1},0.21144,0.000359
1,{'alpha': 2},0.21144,0.000359
2,{'alpha': 4},0.21144,0.000359
3,{'alpha': 8},0.21144,0.000359


In [9]:
new_train[['f1', 'f2']].corr()

Unnamed: 0,f1,f2
f1,1.0,0.998866
f2,0.998866,1.0


In [10]:
default_params = {
    'alpha': 1.0, 
    'fit_intercept': True, 
    'normalize': False, 
    'copy_X': True, 
    'max_iter': None, 
    'tol': 0.001, 
    'solver':'auto', 
    'random_state': SEED
}

oof_val_pred, oof_test_pred, _ = get_oof_predictions(new_train, train_y, new_test, ml, 
                                          default_params, seed=SEED, fit_params=fit_param, 
                                          use_eval_set= False)

Fold 1 completed.
Fold 2 completed.
Fold 3 completed.
Fold 4 completed.
Fold 5 completed.


In [11]:
test_df = pd.read_csv("data/test.csv", usecols=['item_id'])
pd.DataFrame(np.clip(oof_test_pred,0,1), 
             index=test_df.item_id,
             columns=['deal_probability']).to_csv('stack_bagging_blending_719_simple.csv')

In [13]:
pd.DataFrame(np.clip(oof_val_pred,0,1), 
             columns=['deal_probability']).to_csv('stack_bagging_blending_719_simple_val.csv')

In [12]:
best_blend_test = None
best_score = None
min_w = 0.01
for a in tqdm(np.arange(min_w, 1+min_w-min_w*2, min_w)):
    for b in np.arange(min_w, 1-a+min_w-min_w*1, min_w):
        c = 1-a-b
        combined_res = a*results[0]['val_oof'] + \
                       b*results[1]['val_oof'] + \
                       c*results[2]['val_oof']

        score = clip_rmse(train_y, combined_res)
        if best_score is None or score < best_score:
            best_score = score
            print('best score updated: {:.6f}'.format(best_score), ' coefficient=> {}, {}, {}'.format(a, b, c))
            best_blend_test =  a*results[0]['test_oof'] + \
                               b*results[1]['test_oof'] + \
                               c*results[2]['test_oof']

  0%|                                                                                       | 0/98 [00:00<?, ?it/s]

best score updated: 0.209618  coefficient=> 0.01, 0.01, 0.98
best score updated: 0.209601  coefficient=> 0.01, 0.02, 0.97
best score updated: 0.209584  coefficient=> 0.01, 0.03, 0.96
best score updated: 0.209568  coefficient=> 0.01, 0.04, 0.95
best score updated: 0.209552  coefficient=> 0.01, 0.05, 0.94
best score updated: 0.209536  coefficient=> 0.01, 0.060000000000000005, 0.9299999999999999
best score updated: 0.209520  coefficient=> 0.01, 0.06999999999999999, 0.92
best score updated: 0.209504  coefficient=> 0.01, 0.08, 0.91
best score updated: 0.209489  coefficient=> 0.01, 0.09, 0.9
best score updated: 0.209473  coefficient=> 0.01, 0.09999999999999999, 0.89
best score updated: 0.209458  coefficient=> 0.01, 0.11, 0.88
best score updated: 0.209444  coefficient=> 0.01, 0.12, 0.87
best score updated: 0.209429  coefficient=> 0.01, 0.13, 0.86
best score updated: 0.209414  coefficient=> 0.01, 0.14, 0.85
best score updated: 0.209400  coefficient=> 0.01, 0.15000000000000002, 0.84
best score 

100%|██████████████████████████████████████████████████████████████████████████████| 98/98 [03:51<00:00,  2.36s/it]


In [None]:
best_blend_test = None
best_score = None
min_w = 0.01
for a in np.arange(min_w, 1+min_w-min_w*4, min_w):
    for b in np.arange(min_w, 1-a+min_w-min_w*3, min_w):
        for c in np.arange(min_w, 1-a-b+min_w-min_w*2, min_w):
            for d in np.arange(min_w, 1-a-b-c+min_w-min_w*1, min_w):
                e = 1-a-b-c-d
                combined_res = a*results[0]['val_oof'] + \
                               b*results[1]['val_oof'] + \
                               c*results[2]['val_oof'] + \
                               d*results[3]['val_oof'] + \ 
                               e*results[4]['val_oof']
                
                score = clip_rmse(train_y, combined_res)
                if best_score is None or score < best_score:
                    best_score = score
                    print('best score updated:', best_score)
                    best_blend_test =  a*results[0]['test_oof'] + \
                                       b*results[1]['test_oof'] + \
                                       c*results[2]['test_oof'] + \
                                       d*results[3]['test_oof'] + \ 
                                       e*results[4]['test_oof']

In [8]:
test_df = pd.read_csv("data/test.csv", usecols=['item_id'])
pd.DataFrame(np.clip(best_blend_test,0,1), 
             index=test_df.item_id,
             columns=['deal_probability']).to_csv('stack_bagging_blend_no_xgb_meta.csv')