In [4]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
import dill
import gc
import time

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures, OrdinalEncoder
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import auc, precision_recall_curve, f1_score, recall_score, precision_score, accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

from cross_validation import cross_validation
from Preprocessor import Preprocessor
from NanImputer import NanImputer
from hyperparameters_optimization import hyperparameters_optimization

from lightgbm import LGBMClassifier
import lightgbm as lgb
from hyperopt import hp
import catboost

if not os.path.exists('./best_models'):
    os.mkdir('./best_models')
if not os.path.exists('./submits'):
    os.mkdir('./submits')

In [6]:
train = pd.read_csv('./data/train.csv')
train.drop('id', axis=1, inplace=True)
train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


# LGBoost

### Preproc data

In [63]:
label_encoding_columns = []
object_cols = train.select_dtypes('object').columns
for col in object_cols:
    if col[0] != 'o':# and col not in target_encoder_columns:# special transform for ord columns
        label_encoding_columns.append(col)
        
target_encoder_columns = [f'nom_{i}' for i in range(4, 10)] + ['ord_5']
preproc_params = {
    'label_encoding_columns': label_encoding_columns,
    'target_encoder_columns': target_encoder_columns,
    'custom_transform': {
        'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
        'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
        'ord_3': lambda x: ord(x) - 97,
        'ord_4': lambda x: ord(x) - 65,
    },
}
preproc_pipepline = Pipeline([
    ('preprocessor', Preprocessor(**preproc_params)),
    ('nan_imputer', NanImputer(('fillna', -1))),
])

preproc_pipepline[0].isTrain = True
train_preproc = preproc_pipepline.fit_transform(train, train.target)


### Hyperparameters tuning using hyperopt

In [52]:
X_columns = [column for column in train_preproc.columns if column != 'target']
X, y = train_preproc[X_columns].values, train_preproc.target.values
train_params = {'params': {
                    'num_leaves': 18,
                    'min_data_in_leaf': 10, 
                    'objective':'binary',
                    'reg_alpha': 1,
                    'reg_lambda': 1,
                    'learning_rate': 0.1,
                    "boosting": "gbdt",
                    "feature_fraction": 0.85,
                    "bagging_freq": 1,
                    "bagging_fraction": 0.95 ,
                    "seed": 123,
                    'num_threads': 1,
                    'is_unbalance': True,
                    'boost_from_average': False,
                    "metric": 'auc',
                    'n_estimators': 3000, 
                    },
                'fit_params': {'early_stopping_rounds': 50, 'verbose': 1000, },
}
model = LGBMClassifier(**train_params['params'])
# define cross_validation
cv_params = {
    'n_splits': 4,
    'shuffle': True,
    'random_state': 234,
}
cv = StratifiedKFold(**cv_params)

# cross_validation(cv, model, X.values, y.values, train_params=train_params)
# hyperparameters tuning
search_space = {
    'num_leaves': hp.uniformint('num_leaves', 6, 32), 
    'min_data_in_leaf': hp.uniformint('min_data_in_leaf', 10, 1000),
    'feature_fraction': hp.uniform('feature_fraction', 0.05, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.6, 1.0),
}
max_eval = 30
best_params, hp_tuning_results = hyperparameters_optimization(cv, X, y, model, search_space, max_eval, train_params, mode='')
best_params, hp_tuning_results

Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[258]	valid_0's auc: 0.787803
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[279]	valid_0's auc: 0.783231
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[293]	valid_0's auc: 0.786282
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                    

Early stopping, best iteration is:                                                                                     
[271]	valid_0's auc: 0.786041
roc_auc_score                                                                                                          
TRAIN: [0.79666 0.79754 0.79663 0.79746] (0.79707)                                                                     
VAL: [0.78823 0.78384 0.7867  0.78604] (0.7862)                                                                        
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[308]	valid_0's auc: 0.787286
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                  

Early stopping, best iteration is:                                                                                     
[267]	valid_0's auc: 0.782542
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[266]	valid_0's auc: 0.786031
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[365]	valid_0's auc: 0.785148
roc_auc_score                                                                                                          
TRAIN: [0.79774 0.79867 0.79784 0.80143] (0.79892)                                                                     
VAL: [0.78729 0.78254 0.78603 0.78515] (0.78525)                      

VAL: [0.7866  0.78271 0.78529 0.78519] (0.78495)                                                                       
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[284]	valid_0's auc: 0.787904
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[284]	valid_0's auc: 0.783345
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[275]	valid_0's auc: 0.786168
Training until validation scores don't improve for 50 rounds.         

Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[432]	valid_0's auc: 0.787256
roc_auc_score                                                                                                          
TRAIN: [0.79271 0.7939  0.79313 0.79414] (0.79347)                                                                     
VAL: [0.7893  0.78486 0.78771 0.78726] (0.78728)                                                                       
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[371]	valid_0's auc: 0.789359
Training until validation scores don't improve for 50 rounds.                                       

Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[391]	valid_0's auc: 0.785165
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[397]	valid_0's auc: 0.78809
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[570]	valid_0's auc: 0.78754
roc_auc_score                                                                                                          
TRAIN: [0.79176 0.79218 0.79117 0.7926 ] (0.79193)                      

({'bagging_fraction': 0.8705368154360354,
  'feature_fraction': 0.11054504850071879,
  'min_data_in_leaf': 231.0,
  'num_leaves': 13.0},
 [{'loss': -0.7876,
   'status': 'ok',
   'scores': {'roc_auc_score': {'train': array([0.79176, 0.79218, 0.79117, 0.7926 ]),
     'val': array([0.7896 , 0.78516, 0.78809, 0.78754]),
     'train_mean': 0.79193,
     'val_mean': 0.7876}},
   'params': {'bagging_fraction': 0.8705368154360354,
    'feature_fraction': 0.11054504850071879,
    'min_data_in_leaf': 231,
    'num_leaves': 13}},
  {'loss': -0.78758,
   'status': 'ok',
   'scores': {'roc_auc_score': {'train': array([0.79027, 0.79283, 0.79088, 0.79333]),
     'val': array([0.78946, 0.78515, 0.78802, 0.78768]),
     'train_mean': 0.79183,
     'val_mean': 0.78758}},
   'params': {'bagging_fraction': 0.8741056320141718,
    'feature_fraction': 0.11594574331705182,
    'min_data_in_leaf': 273,
    'num_leaves': 13}},
  {'loss': -0.7875,
   'status': 'ok',
   'scores': {'roc_auc_score': {'train': arr

### Fit model with best hyperparameters 

In [59]:
X_columns = [column for column in train_preproc.columns if column != 'target']
X, y = train_preproc[X_columns].values, train_preproc.target.values
train_params = {'params': {
                    'num_leaves': 13,
                    'min_data_in_leaf': 231, 
                    'objective':'binary',
                    'reg_alpha': 1,
                    'reg_lambda': 1,
                    'learning_rate': 0.1,
                    "boosting": "gbdt",
                    "feature_fraction": 0.11054,
                    "bagging_freq": 1,
                    "bagging_fraction": 0.87053 ,
                    "seed": 123,
                    'num_threads': 1,
                    'is_unbalance': True,
                    'boost_from_average': False,
                    "metric": 'auc',
                    'n_estimators': 3000, 
                    },
                'fit_params': {'early_stopping_rounds': 50, 'verbose': 50, },
}
model = LGBMClassifier(**train_params['params'])
# define cross_validation
cv_params = {
    'n_splits': 1,
    'test_size': 0.15,
    'random_state': 234,
}
cv = StratifiedShuffleSplit(**cv_params)

model = cross_validation(cv, model, X, y, train_params=train_params)[1]

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.748208
[100]	valid_0's auc: 0.776297
[150]	valid_0's auc: 0.783675
[200]	valid_0's auc: 0.785786
[250]	valid_0's auc: 0.786764
[300]	valid_0's auc: 0.787386
[350]	valid_0's auc: 0.78753
[400]	valid_0's auc: 0.787627
[450]	valid_0's auc: 0.787661
[500]	valid_0's auc: 0.78769
[550]	valid_0's auc: 0.787708
Early stopping, best iteration is:
[532]	valid_0's auc: 0.787723
roc_auc_score
TRAIN: [0.79201] (0.79201)
VAL: [0.78772] (0.78772)


### Save model and it's params

In [64]:
with open('./best_models/lgb.params', 'w') as f:
    f.write(str(param))

with open('./best_models/lgb.model', 'wb') as f:
    pkl.dump(model, f)
# model.save_model('./best_models/lgb.model')
with open('./best_models/lgb_preproc_pipeline.ppln', 'wb') as f:
    dill.dump(preproc_pipepline, f)

### Make submission

In [65]:
test = pd.read_csv('./data/test.csv')
preproc_pipepline['preprocessor'].isTrain = False
test = preproc_pipepline.transform(test)
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,600000,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,0.140327,0.157827,0.151261,0.125761,0.182075,0.146751,3.0,0,4,5,20,0.235641,3.0,9.0
1,600001,0.0,0.0,0.0,0.0,1.0,2.0,0.0,4.0,5.0,0.207375,0.155556,0.213166,0.175406,0.234127,0.191781,1.0,0,1,13,13,-1.0,2.0,8.0
2,600002,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.179573,0.210823,0.174699,0.251001,0.144341,0.162252,1.0,2,2,8,13,0.125748,2.0,6.0
3,600003,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,2.0,0.207375,0.241458,0.204947,0.184991,0.169082,0.173469,1.0,2,3,12,1,0.108688,1.0,6.0
4,600004,0.0,0.0,1.0,0.0,1.0,2.0,0.0,-1.0,3.0,0.179573,0.194495,0.153646,0.25029,0.121985,0.154574,1.0,1,5,14,9,0.119081,3.0,3.0


In [66]:
predictions = model.predict(test.iloc[:, 1:])
predictions

array([0, 1, 0, ..., 1, 1, 1], dtype=int64)

In [29]:
submission = pd.DataFrame.from_dict({
    'id': test.id,
    'target': predictions
})
submission.to_csv('./submits/best_lgb.csv', index=False)

# CatBoost

### Preproc data

In [67]:
label_encoding_columns = []
object_cols = train.select_dtypes('object').columns
for col in object_cols:
    if col[0] != 'o':# and col not in target_encoder_columns:# special transform for ord columns
        label_encoding_columns.append(col)
        
target_encoder_columns = [f'nom_{i}' for i in range(4, 10)] + ['ord_5']
preproc_params = {
    'label_encoding_columns': label_encoding_columns,
    'target_encoder_columns': target_encoder_columns,
    'custom_transform': {
        'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
        'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
        'ord_3': lambda x: ord(x) - 97,
        'ord_4': lambda x: ord(x) - 65,
    },
}
preproc_pipepline = Pipeline([
    ('preprocessor', Preprocessor(**preproc_params)),
    ('nan_imputer', NanImputer(('fillna', -1))),
])

preproc_pipepline[0].isTrain = True
train_preproc = preproc_pipepline.fit_transform(train, train.target)


### Hyperparameters tuning using hyperopt

In [83]:
X_columns = [column for column in train_preproc.columns if column != 'target']
# cat_columns_ind = [ind for ind, col in enumerate(X_columns) if col in cat_columns]
X, y = train_preproc[X_columns].values, train_preproc.target.values

train_params = {'params': {
                    'depth': 6,
                    'num_leaves': 18,
                    'min_data_in_leaf': 10, 
                    'loss_function': 'Logloss',
                    'iterations': 1500,
                    'early_stopping_rounds': 50,
                    'l2_leaf_reg': 30,
                    'learning_rate': 0.05,
                    'bagging_temperature': 0.8,
                    'random_strength': 0.8,
                    'task_type': "GPU",
                    'grow_policy': 'Lossguide',
                    "random_seed": 123,
                    'thread_count': 1,
                    "eval_metric": 'AUC',
                    },
                'fit_params': {'use_best_model': True, 'verbose': 1000, 'init_model': None},
}
model = catboost.CatBoostClassifier(**train_params['params'])
# define cross_validation
cv_params = {
    'n_splits': 4,
    'test_size': 0.2,
    'random_state': 123,
}
cv = StratifiedShuffleSplit(**cv_params)

# hyperparameters tuning
search_space = {
    'num_leaves': hp.uniformint('num_leaves', 4, 32), 
    'min_data_in_leaf': hp.uniformint('min_data_in_leaf', 10, 1000),
    'random_strength': hp.uniform('random_strength', 0.1, 1.0),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.5, 1.0),
}
max_eval = 30
best_params, hp_tuning_results = hyperparameters_optimization(cv, X, y, model, search_space, max_eval, train_params, mode='')
best_params#, hp_tuning_results

0:	learn: 0.6755823	test: 0.6727723	best: 0.6727723 (0)	total: 37.4ms	remaining: 56s                                   

bestTest = 0.7860383093                                                                                                

bestIteration = 768                                                                                                    

Shrink model to first 769 iterations.                                                                                  
0:	learn: 0.6751086	test: 0.6745200	best: 0.6745200 (0)	total: 38.9ms	remaining: 58.3s                                 

bestTest = 0.7852219045                                                                                                

bestIteration = 752                                                                                                    

Shrink model to first 753 iterations.                                                                                  
0:	learn: 0.6756034	test: 0.671191

bestIteration = 651                                                                                                    

Shrink model to first 652 iterations.                                                                                  
0:	learn: 0.6848956	test: 0.6796118	best: 0.6796118 (0)	total: 44.7ms	remaining: 1m 7s                                 

bestTest = 0.7847363949                                                                                                

bestIteration = 639                                                                                                    

Shrink model to first 640 iterations.                                                                                  
roc_auc_score                                                                                                          
TRAIN: [0.79951 0.80139 0.80165 0.80133] (0.80097)                                                                     
VAL: [0.78561 0.78474 0.7845  0.7847

0:	learn: 0.6717586	test: 0.6695077	best: 0.6695077 (0)	total: 36ms	remaining: 54s                                     

bestTest = 0.7862560153                                                                                                

bestIteration = 821                                                                                                    

Shrink model to first 822 iterations.                                                                                  
0:	learn: 0.6709920	test: 0.6712902	best: 0.6712902 (0)	total: 42.7ms	remaining: 1m 3s                                 

bestTest = 0.7854088247                                                                                                

bestIteration = 920                                                                                                    

Shrink model to first 921 iterations.                                                                                  
0:	learn: 0.6708261	test: 0.667035

bestIteration = 517                                                                                                    

Shrink model to first 518 iterations.                                                                                  
0:	learn: 0.6882952	test: 0.6828304	best: 0.6828304 (0)	total: 83.2ms	remaining: 2m 4s                                 

bestTest = 0.7845372558                                                                                                

bestIteration = 556                                                                                                    

Shrink model to first 557 iterations.                                                                                  
roc_auc_score                                                                                                          
TRAIN: [0.80296 0.80429 0.80342 0.80423] (0.80372)                                                                     
VAL: [0.78514 0.78431 0.78432 0.7845

TRAIN: [0.79879 0.7979  0.79896 0.79836] (0.7985)                                                                      
VAL: [0.78596 0.78501 0.78493 0.78533] (0.78531)                                                                       
0:	learn: 0.6874891	test: 0.6836672	best: 0.6836672 (0)	total: 52.7ms	remaining: 1m 18s                                

bestTest = 0.7850084305                                                                                                

bestIteration = 504                                                                                                    

Shrink model to first 505 iterations.                                                                                  
0:	learn: 0.6871620	test: 0.6847185	best: 0.6847185 (0)	total: 51.4ms	remaining: 1m 17s                                

bestTest = 0.784311235                                                                                                 

bestIteration = 563                

0:	learn: 0.6762010	test: 0.6717362	best: 0.6717362 (0)	total: 37.4ms	remaining: 56.1s                                 

bestTest = 0.7848755121                                                                                                

bestIteration = 633                                                                                                    

Shrink model to first 634 iterations.                                                                                  
0:	learn: 0.6771770	test: 0.6733876	best: 0.6733876 (0)	total: 33.4ms	remaining: 50s                                   

bestTest = 0.7852179408                                                                                                

bestIteration = 749                                                                                                    

Shrink model to first 750 iterations.                                                                                  
roc_auc_score                     

1000:	learn: 0.7903517	test: 0.7858865	best: 0.7858876 (997)	total: 27.5s	remaining: 13.7s                             

bestTest = 0.786273092                                                                                                 

bestIteration = 1395                                                                                                   

Shrink model to first 1396 iterations.                                                                                 
roc_auc_score                                                                                                          
TRAIN: [0.79179 0.79178 0.79197 0.79154] (0.79177)                                                                     
VAL: [0.78682 0.7861  0.78577 0.78627] (0.78624)                                                                       
0:	learn: 0.6396469	test: 0.6382460	best: 0.6382460 (0)	total: 24.8ms	remaining: 37.1s                                 

1000:	learn: 0.7888333	test: 0.78621

0:	learn: 0.6583042	test: 0.6529661	best: 0.6529661 (0)	total: 39.1ms	remaining: 58.6s                                 

1000:	learn: 0.7914635	test: 0.7854551	best: 0.7854581 (993)	total: 31.2s	remaining: 15.5s                             

bestTest = 0.7856083214                                                                                                

bestIteration = 1138                                                                                                   

Shrink model to first 1139 iterations.                                                                                 
0:	learn: 0.6479369	test: 0.6449877	best: 0.6449877 (0)	total: 33.6ms	remaining: 50.4s                                 

1000:	learn: 0.7913817	test: 0.7859563	best: 0.7859565 (998)	total: 31.3s	remaining: 15.6s                             

bestTest = 0.7861487865                                                                                                

bestIteration = 1309             

bestTest = 0.785836041                                                                                                 

bestIteration = 1081                                                                                                   

Shrink model to first 1082 iterations.                                                                                 
0:	learn: 0.6654167	test: 0.6606744	best: 0.6606744 (0)	total: 36.3ms	remaining: 54.5s                                 

1000:	learn: 0.7944797	test: 0.7854291	best: 0.7854402 (976)	total: 31s	remaining: 15.4s                               

bestTest = 0.7854402065                                                                                                

bestIteration = 976                                                                                                    

Shrink model to first 977 iterations.                                                                                  
0:	learn: 0.6649493	test: 0.662938

1499:	learn: 0.7918373	test: 0.7858399	best: 0.7858411 (1496)	total: 41.8s	remaining: 0us                              

bestTest = 0.7858411074                                                                                                

bestIteration = 1496                                                                                                   

Shrink model to first 1497 iterations.                                                                                 
0:	learn: 0.6462713	test: 0.6430966	best: 0.6430966 (0)	total: 30.9ms	remaining: 46.4s                                 

1000:	learn: 0.7902882	test: 0.7857823	best: 0.7857823 (1000)	total: 28.5s	remaining: 14.2s                            

1499:	learn: 0.7917333	test: 0.7861885	best: 0.7861897 (1495)	total: 42.4s	remaining: 0us                              

bestTest = 0.7861897349                                                                                                

bestIteration = 1495             

{'bagging_temperature': 0.9326912190294238,
 'min_data_in_leaf': 985.0,
 'num_leaves': 4.0,
 'random_strength': 0.9441760357417693}

### Fit model with best hyperparameters 

In [85]:
X_columns = [column for column in train_preproc.columns if column != 'target']
# cat_columns_ind = [ind for ind, col in enumerate(X_columns) if col in cat_columns]
X, y = train_preproc[X_columns].values, train_preproc.target.values

param = {'params': {
                    'depth': 6,
                    'num_leaves': 4,
                    'min_data_in_leaf': 985, 
                    'loss_function': 'Logloss',
                    'iterations': 1500,
                    'early_stopping_rounds': 50,
                    'l2_leaf_reg': 30,
                    'learning_rate': 0.05,
                    'bagging_temperature': 0.93269,
                    'random_strength': 0.94417,
                    'task_type': "GPU",
                    'grow_policy': 'Lossguide',
                    "random_seed": 123,
                    'thread_count': 1,
                    "eval_metric": 'AUC',
                    },
         'fit_params': {'use_best_model': True, 'verbose': 1000, 'init_model': None},
}
param['params'].update({'bagging_temperature': 0.7497082074820156,
 'min_data_in_leaf': 67.0,
 'num_leaves': 4.0,
 'random_strength': 0.2017357950398055})

model = catboost.CatBoostClassifier(**param['params'])#)dtrain=train_dataset, eval_set=val_dataset, **param)
cv_params = {
    'n_splits': 1,
    'test_size': 0.15,
    'random_state': 234,
}
cv = StratifiedShuffleSplit(**cv_params)

model = cross_validation(cv, model, X, y, train_params=train_params)[1]

0:	learn: 0.6390886	test: 0.6409414	best: 0.6409414 (0)	total: 36.1ms	remaining: 54.1s
1000:	learn: 0.7887323	test: 0.7867146	best: 0.7867163 (999)	total: 27.9s	remaining: 13.9s
1499:	learn: 0.7900638	test: 0.7873889	best: 0.7873904 (1496)	total: 41.6s	remaining: 0us
bestTest = 0.7873904407
bestIteration = 1496
Shrink model to first 1497 iterations.
roc_auc_score
TRAIN: [0.79006] (0.79006)
VAL: [0.78739] (0.78739)


### Save model and it's params

In [86]:
with open('./best_models/catboost.params', 'w') as f:
    f.write(str(param))

model.save_model('./best_models/catboost.model')
with open('./best_models/catboost_preproc_pipeline.ppln', 'wb') as f:
    dill.dump(preproc_pipepline, f)

### Make submission

In [87]:
test = pd.read_csv('./data/test.csv')
preproc_pipepline['preprocessor'].isTrain = False
test = preproc_pipepline.transform(test)
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,600000,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,0.140327,0.157827,0.151261,0.125761,0.182075,0.146751,3.0,0,4,5,20,0.235641,3.0,9.0
1,600001,0.0,0.0,0.0,0.0,1.0,2.0,0.0,4.0,5.0,0.207375,0.155556,0.213166,0.175406,0.234127,0.191781,1.0,0,1,13,13,-1.0,2.0,8.0
2,600002,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.179573,0.210823,0.174699,0.251001,0.144341,0.162252,1.0,2,2,8,13,0.125748,2.0,6.0
3,600003,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,2.0,0.207375,0.241458,0.204947,0.184991,0.169082,0.173469,1.0,2,3,12,1,0.108688,1.0,6.0
4,600004,0.0,0.0,1.0,0.0,1.0,2.0,0.0,-1.0,3.0,0.179573,0.194495,0.153646,0.25029,0.121985,0.154574,1.0,1,5,14,9,0.119081,3.0,3.0


In [88]:
predictions = model.predict_proba(test.iloc[:, 1:]).T[1]
predictions

array([0.11995345, 0.26400647, 0.17419941, ..., 0.54297048, 0.24530804,
       0.20178994])

In [20]:
submission = pd.DataFrame.from_dict({
    'id': test.id,
    'target': predictions
})
submission.to_csv('./submits/best_cat.csv', index=False)

# Logreg

### Preproc data

In [89]:
label_encoding_columns = []
object_cols = train.select_dtypes('object').columns
for col in object_cols:
    if col[0] != 'o':# and col not in target_encoder_columns:# special transform for ord columns
        label_encoding_columns.append(col)

minmax_columns = [f'ord_{i}' for i in range(5)]
target_encoder_columns = [i for i in train.columns if i not in minmax_columns and i != 'target']
preproc_params = {
    'label_encoding_columns': label_encoding_columns,
    'target_encoder_columns': target_encoder_columns,
    'min_max_columns': minmax_columns,
    'custom_transform': {
        'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
        'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
        'ord_3': lambda x: ord(x) - 97,
        'ord_4': lambda x: ord(x) - 65,
    },
}
preproc_pipepline = Pipeline([
    ('preprocessor', Preprocessor(**preproc_params)),
    ('nan_imputer', NanImputer(('ohe', -1))),
])

preproc_pipepline[0].isTrain = True
train_preproc = preproc_pipepline.fit_transform(train, train.target)


### Cross validation

In [90]:
X_columns = [column for column in train_preproc.columns if column != 'target' and 'NaN' not in column]
X, y = train_preproc[X_columns].values, train_preproc.target.values

train_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
#     'penalty': 'l2',
#     'C': 1,
    'verbose': 0,
    'n_jobs': 1
}

# define cross_validation
cv_params = {
    'n_splits': 4,
    'shuffle': True,
    'random_state': 123,
}
cv = StratifiedKFold(**cv_params)

cross_validation(cv, LogisticRegression(**train_params), X, y)[0]

roc_auc_score
TRAIN: [0.78703 0.78752 0.78689 0.78727] (0.78718)
VAL: [0.78757 0.78611 0.78798 0.7868 ] (0.78712)


{'roc_auc_score': {'train': array([0.78703, 0.78752, 0.78689, 0.78727]),
  'val': array([0.78757, 0.78611, 0.78798, 0.7868 ]),
  'train_mean': 0.78718,
  'val_mean': 0.78712}}

### Fit model

In [91]:
X_columns = [column for column in train_preproc.columns if column != 'target' and 'NaN' not in column]
X, y = train_preproc[X_columns].values, train_preproc.target.values

train_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
#     'penalty': 'l2',
#     'C': 1,
    'verbose': 0,
    'n_jobs': 1
}

model = LogisticRegression(**train_params)
model.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2020,
                   multi_class='auto', n_jobs=1, penalty='l2', random_state=1,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

### Save model and it's params

In [92]:
with open('./best_models/logreg.params', 'w') as f:
    f.write(str(train_params))

with open('./best_models/logreg.model', 'wb') as f:
    pkl.dump(model, f)
with open('./best_models/logreg_preproc_pipeline.ppln', 'wb') as f:
    dill.dump(preproc_pipepline, f)

### Make submission

In [93]:
test = pd.read_csv('./data/test.csv')
preproc_pipepline['preprocessor'].isTrain = False
test = preproc_pipepline.transform(test)
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,600000,0.194674,0.19056,0.17113,0.186772,0.195305,0.195763,0.205417,0.201104,0.178947,0.140327,0.157827,0.151261,0.125761,0.182075,0.146751,1.0,0.0,0.8,0.357143,0.8,0.235641,0.16408,0.211688
1,600001,0.194674,0.19056,0.17113,0.186772,0.195305,0.183067,0.179694,0.209834,0.219534,0.207375,0.155556,0.213166,0.175406,0.234127,0.191781,0.0,0.0,0.2,0.928571,0.52,0.18745,0.19916,0.190094
2,600002,0.194674,0.19056,0.17113,0.186772,0.195305,0.195763,0.179694,0.201104,0.219534,0.179573,0.210823,0.174699,0.251001,0.144341,0.162252,0.0,0.5,0.4,0.571429,0.52,0.125748,0.19916,0.229951
3,600003,0.113365,0.19056,0.17113,0.186772,0.180234,0.183067,0.205417,0.201104,0.202836,0.207375,0.241458,0.204947,0.184991,0.169082,0.173469,0.0,0.5,0.6,0.857143,0.04,0.108688,0.213682,0.229951
4,600004,0.194674,0.19056,0.228917,0.186772,0.195305,0.183067,0.179694,0.187262,0.178947,0.179573,0.194495,0.153646,0.25029,0.121985,0.154574,0.0,0.25,1.0,1.0,0.36,0.119081,0.16408,0.146451


In [94]:
predictions = model.predict_proba(test.iloc[:, 1:]).T[1]
predictions

array([0.14000958, 0.2410715 , 0.16268397, ..., 0.52645846, 0.25565631,
       0.22060699])

In [27]:
submission = pd.DataFrame.from_dict({
    'id': test.id,
    'target': predictions
})
submission.to_csv('./submits/best_logreg.csv', index=False)

# Polynomial LogReg

### Since logreg with lbfgs solver takes a lot of CPU usage during training (I very care about my CPU xD) and we have a lot of data and relatively many features (therefore it takes a little bit more time for convergence) i decided to train on subset objects hoping the subset will reflect the same properties and polynomial logreg make a decision boundary aproximately as on the original full dataset. All preprocessing steps is the same as for LogReg.

In [28]:
compresed_xy = None
for tr_ind, val_ind in StratifiedKFold(10, shuffle=True, random_state=123).split(X, y):
    compresed_xy = (X.iloc[val_ind], y.iloc[val_ind])
    break
compresed_xy[0].shape, compresed_xy[1].shape

((60000, 23), (60000,))

### Cross validation

In [29]:
poly = PolynomialFeatures(2, interaction_only=True)
poly_X = poly.fit_transform(compresed_xy[0].reset_index(drop=True))
# poly_X = pd.DataFrame(poly.fit_transform(train_preproc[X_columns]))

train_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
    'penalty': 'l2',
    'C': 1,
    'verbose': 0,
    'n_jobs': 1
}

cv_params = {
    'n_splits': 5,
    'shuffle': True,
    'random_state': 123,
}
cv = StratifiedKFold(**cv_params)
cross_validation(cv, LogisticRegression(**train_params), poly_X, compresed_xy[1].values)


roc_auc_score
TRAIN: [0.78955 0.78845 0.78815 0.78897 0.78619] (0.78826)
VAL: [0.78115 0.78525 0.78669 0.78469 0.79427] (0.78641)


({'roc_auc_score': {'train': array([0.78955, 0.78845, 0.78815, 0.78897, 0.78619]),
   'val': array([0.78115, 0.78525, 0.78669, 0.78469, 0.79427]),
   'train_mean': 0.78826,
   'val_mean': 0.78641}},
 LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=2020,
                    multi_class='auto', n_jobs=1, penalty='l2', random_state=1,
                    solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))

### Fit model

In [30]:
poly = PolynomialFeatures(2, interaction_only=True)
poly_X = poly.fit_transform(compresed_xy[0].reset_index(drop=True))
train_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 500, 
    'penalty': 'l2',
    'C': 1,
    'verbose': 0,
    'n_jobs': 1
}

model = LogisticRegression(**train_params)
model.fit(poly_X, compresed_xy[1].values)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=1, penalty='l2', random_state=1,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

### Save model and it's params

In [31]:
with open('./best_models/poly_logreg.params', 'w') as f:
    f.write(str(train_params))

with open('./best_models/poly_logreg.model', 'wb') as f:
    pkl.dump(model, f)
with open('./best_models/poly_logreg_preproc_pipeline.ppln', 'wb') as f:
    dill.dump(preproc_pipepline, f)

### Make submission

In [32]:
test = pd.read_csv('./data/test.csv')
preproc_pipepline['preprocessor'].isTrain = False
test = preproc_pipepline.transform(test)
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,600000,0.194674,0.19056,0.17113,0.186772,0.195305,0.195763,0.205417,0.201104,0.178947,...,0.182075,0.146751,1.0,0.0,0.8,0.357143,0.8,0.235641,0.16408,0.211688
1,600001,0.194674,0.19056,0.17113,0.186772,0.195305,0.183067,0.179694,0.209834,0.219534,...,0.234127,0.191781,0.0,0.0,0.2,0.928571,0.52,0.18745,0.19916,0.190094
2,600002,0.194674,0.19056,0.17113,0.186772,0.195305,0.195763,0.179694,0.201104,0.219534,...,0.144341,0.162252,0.0,0.5,0.4,0.571429,0.52,0.125748,0.19916,0.229951
3,600003,0.113365,0.19056,0.17113,0.186772,0.180234,0.183067,0.205417,0.201104,0.202836,...,0.169082,0.173469,0.0,0.5,0.6,0.857143,0.04,0.108688,0.213682,0.229951
4,600004,0.194674,0.19056,0.228917,0.186772,0.195305,0.183067,0.179694,0.187262,0.178947,...,0.121985,0.154574,0.0,0.25,1.0,1.0,0.36,0.119081,0.16408,0.146451


In [33]:
predictions = model.predict_proba(poly.transform(test.iloc[:, 1:])).T[1]
predictions

array([0.15732676, 0.23835215, 0.15664765, ..., 0.55263458, 0.24580884,
       0.18908509])

In [34]:
submission = pd.DataFrame.from_dict({
    'id': test.id,
    'target': predictions
})
submission.to_csv('./submits/best_poly_logreg.csv', index=False)

## LogReg + KNN

### Preproc data

In [35]:
label_encoding_columns = []
object_cols = train.select_dtypes('object').columns
for col in object_cols:
    if col[0] != 'o':# and col not in target_encoder_columns:# special transform for ord columns
        label_encoding_columns.append(col)
        
minmax_columns = [f'ord_{i}' for i in range(5)]# + [f'nom_{i}' for i in range(4)] + ['day', 'month']
target_encoder_columns = [i for i in train.columns if i not in minmax_columns and i != 'target']
preproc_params = {
    'label_encoding_columns': label_encoding_columns,
    'target_encoder_columns': target_encoder_columns,
    'min_max_columns': minmax_columns,
    'custom_transform': {
        'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
        'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
        'ord_3': lambda x: ord(x) - 97,
        'ord_4': lambda x: ord(x) - 65,
    },
}
preproc_pipepline = Pipeline([
    ('preprocessor', Preprocessor(**preproc_params)),
    ('nan_imputer', NanImputer(('ohe', -1))),
])

preproc_pipepline[0].isTrain = True
train_preproc = preproc_pipepline.fit_transform(train, train.target)

X_columns = [column for column in train_preproc.columns if column != 'target' and 'NaN' not in column]
X, y = train_preproc[X_columns], train_preproc.target

train_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
    'verbose': 0,
    'n_jobs': 1
}
log_model = LogisticRegression(**train_params)
log_model.fit(X, y)
X *= abs(log_model.coef_[0])

In [36]:
compresed_xy = None
for tr_ind, val_ind in StratifiedKFold(10, shuffle=True, random_state=123).split(X, y):
    compresed_xy = (X.iloc[val_ind], y.iloc[val_ind])
    break
compresed_xy[0].shape, compresed_xy[1].shape

((60000, 23), (60000,))

In [37]:
train_params = {'n_neighbors': 188, 'p': 2, 'weights': 'uniform'}

cv_params = {
    'n_splits': 1,
    'test_size': 0.2,
    'random_state': 123,
}
cv = StratifiedShuffleSplit(**cv_params)

cross_validation(cv, KNeighborsClassifier(**train_params), 
                 compresed_xy[0].values, 
                 compresed_xy[1].values, verbose=True)#[0], verbose=True)#[0]


roc_auc_score
TRAIN: [0.77849] (0.77849)
VAL: [0.77885] (0.77885)


({'roc_auc_score': {'train': array([0.77849]),
   'val': array([0.77885]),
   'train_mean': 0.77849,
   'val_mean': 0.77885}},
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=188, p=2,
                      weights='uniform'))

### Since i've got relatively bad ROC-AUC score in training and 1 fold validation, i decided not to include that model in stack. Any changing n_neighbours parameter and other hyperparameters optimization didn't improve train and validation score sufficiently to add this model in stack.

# Stack LGB, CatBoost and LogReg and Polynomial LogReg

In [96]:
label_encoding_columns = []
object_cols = train.select_dtypes('object').columns
for col in object_cols:
    if col[0] != 'o':# and col not in target_encoder_columns:# special transform for ord columns
        label_encoding_columns.append(col)

X_columns = [column for column in train.columns if column != 'target']
test = pd.read_csv('./data/test.csv')

# lgb preproc
print('lgb preproc')
if os.path.exists('./best_models/lgb_preproc_pipeline.ppln'):
    print('load existsing preproc_pipeline...')
    with open('./best_models/lgb_preproc_pipeline.ppln', 'rb') as f:
        lgb_preproc_pipepline = dill.load(f)
    lgb_preproc_pipepline[0].isTrain = True
    lgb_train_preproc = lgb_preproc_pipepline.transform(train)
else:
    target_encoder_columns = [f'nom_{i}' for i in range(4, 10)] + ['ord_5']
    preproc_params = {
        'label_encoding_columns': label_encoding_columns,
        'target_encoder_columns': target_encoder_columns,
        'custom_transform': {
            'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
            'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
            'ord_3': lambda x: ord(x) - 97,
            'ord_4': lambda x: ord(x) - 65,
        },
    }
    lgb_preproc_pipepline = Pipeline([
        ('preprocessor', Preprocessor(**preproc_params)),
        ('nan_imputer', NanImputer(('fillna', -1))),
    ])

    lgb_preproc_pipepline[0].isTrain = True
    lgb_train_preproc = lgb_preproc_pipepline.fit_transform(train, train.target)
    
lgb_params = {
    'params': {
        'num_leaves': 13,
        'min_data_in_leaf': 231, 
        'objective':'binary',
        'reg_alpha': 1,
        'reg_lambda': 1,
        'learning_rate': 0.1,
        "boosting": "gbdt",
        "feature_fraction": 0.11054,
        "bagging_freq": 1,
        "bagging_fraction": 0.87053 ,
        "seed": 123,
        'num_threads': 1,
        'is_unbalance': True,
        'boost_from_average': False,
        "metric": 'auc',
        'n_estimators': 3000, 
        },
    'fit_params': {'early_stopping_rounds': 50, 'verbose': 50, },
}
lgb_preproc_pipepline['preprocessor'].isTrain = False
lgb_test = lgb_preproc_pipepline.transform(test)


# catboost preproc
print('catboost preproc')
if os.path.exists('./best_models/catboost_preproc_pipeline.ppln'):
    print('load existsing preproc_pipeline...')
    with open('./best_models/catboost_preproc_pipeline.ppln', 'rb') as f:
        catboost_preproc_pipepline = dill.load(f)
    catboost_preproc_pipepline[0].isTrain = True
    catboost_train_preproc = catboost_preproc_pipepline.transform(train)
else:
    target_encoder_columns = [f'nom_{i}' for i in range(4, 10)] + ['ord_5']
    preproc_params = {
        'label_encoding_columns': label_encoding_columns,
        'target_encoder_columns': target_encoder_columns,
        'custom_transform': {
            'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
            'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
            'ord_3': lambda x: ord(x) - 97,
            'ord_4': lambda x: ord(x) - 65,
        },
    }
    catboost_preproc_pipepline = Pipeline([
        ('preprocessor', Preprocessor(**preproc_params)),
        ('nan_imputer', NanImputer(('fillna', -1))),
    ])

    catboost_preproc_pipepline[0].isTrain = True
    catboost_train_preproc = catboost_preproc_pipepline.fit_transform(train, train.target)
    
# cat_columns = [column for column in catboost_train_preproc.columns if ('nom' in column or 'ord' in column) and column not in target_encoder_columns] + ['day', 'month']
# catboost_train_preproc[cat_columns] = catboost_train_preproc[cat_columns].astype(int)
# cat_columns_ind = [ind for ind, col in enumerate(X_columns) if col in cat_columns]
catboost_params = {
    'params': {
            'depth': 6,
            'num_leaves': 4,
            'min_data_in_leaf': 985, 
            'loss_function': 'Logloss',
            'iterations': 1500,
            'early_stopping_rounds': 50,
            'l2_leaf_reg': 30,
            'learning_rate': 0.05,
            'bagging_temperature': 0.93269,
            'random_strength': 0.94417,
            'task_type': "GPU",
            'grow_policy': 'Lossguide',
            "random_seed": 123,
            'thread_count': 1,
            "eval_metric": 'AUC',
            },
     'fit_params': {'use_best_model': True, 'verbose': 1000, 'init_model': None},
}
catboost_preproc_pipepline['preprocessor'].isTrain = False
catboost_test = catboost_preproc_pipepline.transform(test)
catboost_test[cat_columns] = catboost_test[cat_columns].astype(int)


# log_reg preproc
print('log_reg preproc')
if os.path.exists('./best_models/logreg_preproc_pipeline.ppln'):
    print('load existsing preproc_pipeline...')
    with open('./best_models/logreg_preproc_pipeline.ppln', 'rb') as f:
        logreg_preproc_pipepline = dill.load(f)
    logreg_preproc_pipepline[0].isTrain = True
    logreg_train_preproc = logreg_preproc_pipepline.transform(train)
else:
    minmax_columns = [f'ord_{i}' for i in range(5)]
    target_encoder_columns = [i for i in train.columns if i not in minmax_columns and i != 'target']
    preproc_params = {
        'label_encoding_columns': label_encoding_columns,
        'target_encoder_columns': target_encoder_columns,
        'min_max_columns': minmax_columns,
        'custom_transform': {
            'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
            'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
            'ord_3': lambda x: ord(x) - 97,
            'ord_4': lambda x: ord(x) - 65,
        },
    }
    logreg_preproc_pipepline = Pipeline([
        ('preprocessor', Preprocessor(**preproc_params)),
        ('nan_imputer', NanImputer(('ohe', -1))),
    ])

    logreg_preproc_pipepline[0].isTrain = True
    logreg_train_preproc = logreg_preproc_pipepline.fit_transform(train, train.target)
    
logreg_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
    'verbose': 0,
    'n_jobs': 1
}
logreg_preproc_pipepline['preprocessor'].isTrain = False
logreg_test = logreg_preproc_pipepline.transform(test)


# polynimal logreg
polylogreg_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
    'penalty': 'l2',
    'C': 1,
    'verbose': 0,
    'n_jobs': 1
}


stack = pd.DataFrame(index=train.index)
for modelname in ['lgb', 'catboost', 'logreg', 'poly_logreg',]:
    stack[modelname] = 0.5
stack['target'] = logreg_train_preproc.target.values
test_pred = []
    
cv_params = {
    'n_splits': 5,
    'shuffle': True,
    'random_state': 123,
}
cv = StratifiedKFold(**cv_params)

n_fold = 1
for tr_ind, val_ind in cv.split(train, train.target):
    print(f'n_fold={n_fold}')
    n_fold += 1
    # lgb
    train_features, train_target = lgb_train_preproc.iloc[tr_ind][X_columns], lgb_train_preproc.iloc[tr_ind].target
    val_features, val_target = lgb_train_preproc.iloc[val_ind][X_columns], lgb_train_preproc.iloc[val_ind].target
    
    model = lgb.LGBMClassifier(**lgb_params['params'])
    model.fit(train_features, train_target, eval_set=[(val_features, val_target)], **lgb_params['fit_params'])
    
    stack.iloc[val_ind, 0] = model.predict_proba(val_features).T[1]
    # test prediction
    test_pred.append(model.predict(lgb_test.iloc[:, 1:]).tolist())
    
    # catboost
    train_features, train_target = catboost_train_preproc.iloc[tr_ind][X_columns], catboost_train_preproc.iloc[tr_ind].target
    val_features, val_target = catboost_train_preproc.iloc[val_ind][X_columns], catboost_train_preproc.iloc[val_ind].target

    model = catboost.CatBoostClassifier(**catboost_params['params'])#)dtrain=train_dataset, eval_set=val_dataset, **param)
    model.fit(train_features, train_target, eval_set=[(val_features, val_target)], **catboost_params['fit_params'])
    
    stack.iloc[val_ind, 1] = model.predict_proba(val_features).T[1]
    # test prediction
    test_pred.append(model.predict_proba(catboost_test.iloc[:, 1:]).T[1].tolist())
    
    # logreg
    train_features, train_target = logreg_train_preproc.iloc[tr_ind][X_columns], logreg_train_preproc.iloc[tr_ind].target
    val_features, val_target = logreg_train_preproc.iloc[val_ind][X_columns], logreg_train_preproc.iloc[val_ind].target

    model = LogisticRegression(**logreg_params)
    model.fit(train_features, train_target)
    
    stack.iloc[val_ind, 2] = model.predict_proba(val_features).T[1]
    # test prediction
    test_pred.append(model.predict_proba(logreg_test.iloc[:, 1:]).T[1].tolist())
    
    # polynomial logreg
    tm = time.time()
    train_features, train_target = logreg_train_preproc.iloc[tr_ind][X_columns], logreg_train_preproc.iloc[tr_ind].target
    val_features, val_target = logreg_train_preproc.iloc[val_ind][X_columns], logreg_train_preproc.iloc[val_ind].target
    compresed_xy = None
    for tr_ind1, val_ind1 in StratifiedShuffleSplit(1, test_size=0.2, random_state=123).split(train_features, train_target):
        compresed_xy = (train_features.iloc[val_ind1], train_target.iloc[val_ind1])
    poly = PolynomialFeatures(2, interaction_only=True)
    train_features = poly.fit_transform(compresed_xy[0].reset_index(drop=True))
    train_target = compresed_xy[1]

    model = LogisticRegression(**polylogreg_params)
    model.fit(train_features, train_target)
    
    stack.iloc[val_ind, 3] = model.predict_proba(poly.transform(val_features)).T[1]
    # test prediction
    test_pred.append(model.predict_proba(poly.transform(logreg_test.iloc[:, 1:])).T[1].tolist())
    

['bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
lgb preproc
load existsing preproc_pipeline...
catboost preproc
load existsing preproc_pipeline...
log_reg preproc
load existsing preproc_pipeline...
n_fold=1
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.747841
[100]	valid_0's auc: 0.776755
[150]	valid_0's auc: 0.784687
[200]	valid_0's auc: 0.786901
[250]	valid_0's auc: 0.787875
[300]	valid_0's auc: 0.788179
[350]	valid_0's auc: 0.788262
[400]	valid_0's auc: 0.788416
[450]	valid_0's auc: 0.788437
[500]	valid_0's auc: 0.788474
Early stopping, best iteration is:
[473]	valid_0's auc: 0.788504
0:	learn: 0.6389869	test: 0.6408865	best: 0.6408865 (0)	total: 26.7ms	remaining: 40.1s
1000:	learn: 0.7885101	test: 0.7873374	best: 0.7873397 (999)	total: 26.6s	remaining: 13.3s
1499:	learn: 0.7899117	test: 0.7881209	best: 0.7881258 (1487)	total: 39.7s	remaining: 0us
bestTest = 0.7881258428
bestIteration 

## Logreg as a highlevel agregate model

In [98]:
logreg_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
    'verbose': 0,
    'n_jobs': 1
}

cv_params = {
    'n_splits': 5,
    'shuffle': True,
    'random_state': 321,
}
cv = StratifiedKFold(**cv_params)
cross_validation(cv, LogisticRegression(**logreg_params), stack.iloc[:, :stack.shape[1]-1].values, stack.target.values)

roc_auc_score
TRAIN: [0.78772 0.78733 0.78814 0.78807 0.78745] (0.78774)
VAL: [0.78782 0.7894  0.78616 0.78644 0.78889] (0.78774)


({'roc_auc_score': {'train': array([0.78772, 0.78733, 0.78814, 0.78807, 0.78745]),
   'val': array([0.78782, 0.7894 , 0.78616, 0.78644, 0.78889]),
   'train_mean': 0.78774,
   'val_mean': 0.78774}},
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=2020,
                    multi_class='auto', n_jobs=1, penalty='l2', random_state=1,
                    solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))

### Fit model

In [101]:
logreg_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
    'verbose': 0,
    'n_jobs': 1
}

stack_logreg = LogisticRegression(**logreg_params)
stack_logreg.fit(stack.iloc[:, :stack.shape[1]-1], stack.target)
stack_logreg.coef_[0] # logreg coeficients

array([ 4.77367853, -0.50914572,  0.42169298,  0.37637993])

### Save model and it's params

In [102]:
with open('./best_models/agregate_logreg.params', 'w') as f:
    f.write(str(logreg_params))

with open('./best_models/agregate_logreg.model', 'wb') as f:
    pkl.dump(stack_logreg, f)

### Make submission

In [103]:
test_stack = pd.DataFrame(index=test.index)
for index, modelname in enumerate(['lgb', 'catboost', 'logreg', 'poly_logreg',]):
    test_stack[modelname] = np.mean(test_pred[index::4], axis=0) # where 4 - number of models
test_stack.describe()

Unnamed: 0,lgb,catboost,logreg,poly_logreg
count,400000.0,400000.0,400000.0,400000.0
mean,0.37632,0.18694,0.187021,0.187251
std,0.475955,0.159804,0.163298,0.162873
min,0.0,0.001219,0.000911,0.001281
25%,0.0,0.064675,0.063799,0.064526
50%,0.0,0.137475,0.13511,0.135559
75%,1.0,0.26444,0.262444,0.262069
max,1.0,0.937859,0.953451,0.958971


In [90]:
predictions = stack_logreg.predict_proba(test_stack).T[1]
predictions

array([0.11681178, 0.2778794 , 0.16745677, ..., 0.5424866 , 0.27959457,
       0.18709986])

In [91]:
submission = pd.DataFrame.from_dict({
    'id': test.id,
    'target': predictions
})
submission.to_csv('./submits/best_logreg_stack_with_poly.csv', index=False)