In [74]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
import dill
import category_encoders
import gc
import time

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures, OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import auc, precision_recall_curve, f1_score, recall_score, precision_score, accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

from lightgbm import LGBMClassifier, Dataset
import lightgbm as lgb
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import catboost

if not os.path.exists('./best_models'):
    os.mkdir('./best_models')
if not os.path.exists('./submits'):
    os.mkdir('./submits')

In [2]:
train = pd.read_csv('./data/train.csv')
train.drop('id', axis=1, inplace=True)
train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [3]:
label_encoding_features = []
object_cols = train.select_dtypes('object').columns
for col in object_cols:
    if col[0] != 'o':# and col not in target_encoder_features:# special transform for ord features
        label_encoding_features.append(col)
label_encoding_features

['bin_3',
 'bin_4',
 'nom_0',
 'nom_1',
 'nom_2',
 'nom_3',
 'nom_4',
 'nom_5',
 'nom_6',
 'nom_7',
 'nom_8',
 'nom_9']

# Define some helper functions for preprocessing, cross-validation and hyperparameters-tuning

In [4]:
class Preprocessor():
    
    def __init__(self, label_encoding_features=[], target_encoder_features=[], ohe_features=[], min_max_features=[], custom_transform={}, drop_columns=[]):
        self.label_encoding_features = label_encoding_features
        self.target_encoder_features = target_encoder_features
        self.ohe_features = ohe_features
        self.min_max_features = min_max_features
        self.custom_transform = custom_transform
        self.drop_columns = drop_columns
        self.isTrain = True
        self.y = None
        
        self.le = []
        self.ohe = []
        self.mm = []
        self.te = []
        
    def fit(self, X, y=None):
        X = X.copy()
        
        for col in self.label_encoding_features:
            self.le.append(OrdinalEncoder())
            X.loc[~X[col].isna(), col] = self.le[-1].fit_transform(X.loc[~X[col].isna(), col].values.reshape(-1, 1))
                
        for col in self.custom_transform:
            if type(self.custom_transform[col]) is dict:
                X.loc[~X[col].isna(), col] = X.loc[~X[col].isna(), col].replace(self.custom_transform[col])
            elif type(self.custom_transform[col]) is list:
                for sub_col, func in self.custom_transform[col]:
                    X[sub_col] = -1
                    X.loc[~X[col].isna(), sub_col] = X.loc[~X[col].isna(), col].apply(func)
            else:
                X.loc[~X[col].isna(), col] = X.loc[~X[col].isna(), col].apply(self.custom_transform[col])
                
        for column in self.min_max_features:
            self.mm.append(MinMaxScaler())
            X.loc[~X[column].isna(), column] = self.mm[-1].fit_transform(X.loc[~X[column].isna(), column].values.reshape(-1, 1))
                
        if self.target_encoder_features:
            self.y = y
            for train_ind, val_ind in StratifiedKFold(shuffle=True, random_state=123).split(X, y):
                self.te.append(category_encoders.TargetEncoder(cols=self.target_encoder_features, handle_missing='return_nan'))
                self.te[-1].fit(X.loc[train_ind, self.target_encoder_features], X.loc[train_ind, 'target'].values.reshape(-1, 1))

            self.te.append(category_encoders.TargetEncoder(cols=self.target_encoder_features, handle_missing='return_nan'))#, smoothing=0.25))
            self.te[-1].fit(X[self.target_encoder_features], y)
        
        return self
    
    def transform(self, X):
        X = X.copy()
        
        for ind, col in enumerate(self.label_encoding_features):
            X.loc[~X[col].isin(list(self.le[ind].categories_[0])), col] = np.nan
            X.loc[~X[col].isna(), col] = self.le[ind].transform(X.loc[~X[col].isna(), col].values.reshape(-1, 1))#.astype(int)
                
        for col in self.custom_transform:
            if type(self.custom_transform[col]) is dict:
                X.loc[~X[col].isna(), col] = X.loc[~X[col].isna(), col].replace(self.custom_transform[col])
            elif type(self.custom_transform[col]) is list:
                for sub_col, func in self.custom_transform[col]:
                    X[sub_col] = -1
                    X.loc[~X[col].isna(), sub_col] = X.loc[~X[col].isna(), col].apply(func)
            else:
                X.loc[~X[col].isna(), col] = X.loc[~X[col].isna(), col].apply(self.custom_transform[col])
                
        for ind, column in enumerate(self.min_max_features):
            X.loc[~X[column].isna(), column] = self.mm[ind].transform(X.loc[~X[column].isna(), column].values.reshape(-1, 1))
                
        if self.target_encoder_features:
            if self.isTrain: #train-val
                for ind, (train_ind, val_ind) in enumerate(StratifiedKFold(shuffle=True, random_state=123).split(X, self.y)):
                    X.loc[val_ind, self.target_encoder_features] = self.te[ind].transform(X.loc[val_ind, self.target_encoder_features])
            else: # test
                X[self.target_encoder_features] = self.te[-1].transform(X[self.target_encoder_features])
            
        if self.drop_columns:
            X = X.drop(self.drop_columns, axis=1)
            
        return X

In [5]:
class NanImputer():
    
    def __init__(self, mode):
        self.mode = mode # ('ohe',) or ('fillna', -1)
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        X = X.copy()
        if self.mode[0] == 'fillna':
            X.fillna(self.mode[1], inplace=True)
        elif self.mode[0] == 'ohe':
            nan_columns = X.isna().sum()
            nan_columns = nan_columns[nan_columns > 0].index.values
            for column in nan_columns:
#                 X[f'{column}_isNaN'] = X[column].isna() * 1
#                 X.loc[X[column].isna(), column] = X.loc[~X[column].isna(), column].value_counts().index.values.mean()
                X.loc[X[column].isna(), column] = X.loc[~X[column].isna(), column].values.mean()
                X[column] = X[column].astype(float)
        return X
        


In [6]:
def cross_validation(cv, model, X, y, metrics=[roc_auc_score], verbose=True, train_params={}):
    
    scores = {}
    for metric in metrics:
        scores[metric.__name__] = {'train': [], 'val': []}
    modeltype = train_params.pop('modeltype', None)
    cat_features = train_params.pop('cat_features', None)
        
    for train_index, val_index in cv.split(X, y):
        X_train, X_val, y_train, y_val = X.loc[train_index], X.loc[val_index], y.loc[train_index], y.loc[val_index]
        
        if modeltype == 'lgb':
            train_dataset = Dataset(X_train, y_train, free_raw_data=False)
            val_dataset = Dataset(X_val, y_val, free_raw_data=False)

            model = lgb.train(train_set=train_dataset, valid_sets=[val_dataset], **train_params)

            train_predictions_proba = model.predict(X_train)
            val_predictions_proba = model.predict(X_val)

        elif modeltype == 'catboost':
            train_dataset = catboost.Pool(X_train, y_train, cat_features=cat_features, feature_names=list(X_train.columns), thread_count=1)
            val_dataset = catboost.Pool(X_val, y_val, cat_features=cat_features, feature_names=list(X_train.columns), thread_count=1)

            model = catboost.CatBoostClassifier(**train_params['params'])
            model.fit(train_dataset, eval_set=val_dataset, **train_params['fit_params'])

            train_predictions_proba = model.predict_proba(X_train).T[1]
            val_predictions_proba = model.predict_proba(X_val).T[1]
        else:
            model.fit(X_train, y_train)
            
            train_predictions_proba = model.predict_proba(X_train).T[1]
            val_predictions_proba = model.predict_proba(X_val).T[1]

        train_predictions = np.round(train_predictions_proba)
        val_predictions = np.round(val_predictions_proba)

        # metric calculation
        for index, metric in enumerate(metrics):
            if metric.__name__ in ['precision_recall_curve', 'roc_curve']:
                train_score = auc(*metric(y_train, train_predictions_proba)[:2][::-1])
                val_score = auc(*metric(y_val, val_predictions_proba)[:2][::-1])
            elif metric.__name__ == 'roc_auc_score':
                train_score = metric(y_train, train_predictions_proba)
                val_score = metric(y_val, val_predictions_proba)
            else:
                train_score = metric(y_train, train_predictions)
                val_score = metric(y_val, val_predictions)

            scores[metric.__name__]['train'].append(train_score)
            scores[metric.__name__]['val'].append(val_score)
            
    for metric in metrics:
        if verbose:
            print(metric.__name__)
        for key in ['train', 'val']:
            scores[metric.__name__][key] = np.round(scores[metric.__name__][key], 5)
            scores[metric.__name__][f'{key}_mean'] = round(np.mean(scores[metric.__name__][key]), 5)
            if verbose:
                print(f"{key.upper()}: {scores[metric.__name__][key]} ({scores[metric.__name__][key+'_mean']})")
    
    return scores, model
    

In [7]:
def hyperparameters_optimization(X, y, model, space_search, max_evals, base_params={}, loss=''):

    modeltype = base_params.get('modeltype', None)
    
    def objective(space_search):
        if model is not None:
            model.set_params(**space_search)
        else:
            if 'params' in base_params:
                base_params['params'].update(space_search)
            else:
                base_params.update(space_search)
            base_params['modeltype'] = modeltype
#         print(space_search, model)
        scores = cross_validation(cv, model, X, y, verbose=True, train_params=base_params)[0];
        if loss == 'overfit':
            return {'loss': -scores['roc_auc_score']['val_mean'] + max(0, (scores['roc_auc_score']['train_mean'] - scores['roc_auc_score']['val_mean'])), 
                    'status': STATUS_OK, 'scores': scores, 'params': space_search}
        return {'loss': -scores['roc_auc_score']['val_mean'], 'status': STATUS_OK, 'scores': scores, 'params': space_search}
    
    trials = Trials()
    best = fmin(fn=objective,
                space=space_search,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials)
    
    return best, sorted(trials.results, key=lambda x: x['loss'])

# LGBoost

### Preproc data

In [8]:
target_encoder_features = [f'nom_{i}' for i in range(4, 10)] + ['ord_5']
preproc_params = {
    'label_encoding_features': label_encoding_features,
    'target_encoder_features': target_encoder_features,
    'custom_transform': {
        'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
        'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
        'ord_3': lambda x: ord(x) - 97,
        'ord_4': lambda x: ord(x) - 65,
    },
}
preproc_pipepline = Pipeline([
    ('preprocessor', Preprocessor(**preproc_params)),
    ('nan_imputer', NanImputer(('fillna', -1))),
])

preproc_pipepline[0].isTrain = True
train_preproc = preproc_pipepline.fit_transform(train, train.target)


### Hyperparameters tuning using hyperopt

In [47]:
X_columns = [column for column in train_preproc.columns if column != 'target']
X, y = train_preproc[X_columns], train_preproc.target

train_params = {'params': {
                    'num_leaves': 18,
                    'min_data_in_leaf': 10, 
                    'objective':'binary',
                    'reg_alpha': 1,
                    'reg_lambda': 1,
                    'learning_rate': 0.1,
                    "boosting": "gbdt",
                    "feature_fraction": 0.85,
                    "bagging_freq": 1,
                    "bagging_fraction": 0.95 ,
                    "seed": 123,
                    'num_threads': 1,
                    'is_unbalance': True,
                    'boost_from_average': False,
                    "metric": 'auc',
                    "verbosity": -1
                    },
                'num_boost_round': 3000,
                'verbose_eval': 1000,
                'early_stopping_rounds': 50,
                'modeltype': 'lgb',
}

# define cross_validation
cv_params = {
    'n_splits': 4,
    'shuffle': True,
    'random_state': 234,
}
cv = StratifiedKFold(**cv_params)

# hyperparameters tuning
search_space = {
    'num_leaves': hp.uniformint('num_leaves', 6, 32), 
    'min_data_in_leaf': hp.uniformint('min_data_in_leaf', 10, 1000),
    'feature_fraction': hp.uniform('feature_fraction', 0.05, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.6, 1.0),
}
max_eval = 30
best_params, hp_tuning_results = hyperparameters_optimization(X, y, None, search_space, max_eval, train_params, loss='')
best_params, hp_tuning_results

{'bagging_fraction': 0.7780696809503734, 'feature_fraction': 0.2701366409833747, 'min_data_in_leaf': 640, 'num_leaves': 23}
None                                                                                                                   
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[317]	valid_0's auc: 0.788323
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[282]	valid_0's auc: 0.78405
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                               

Early stopping, best iteration is:                                                                                     
[444]	valid_0's auc: 0.786846
roc_auc_score                                                                                                          
TRAIN: [0.79177 0.79339 0.79236 0.79252] (0.79251)                                                                     
VAL: [0.78885 0.78413 0.78708 0.78685] (0.78673)                                                                       
{'bagging_fraction': 0.9734262360446999, 'feature_fraction': 0.858146347885475, 'min_data_in_leaf': 100, 'num_leaves': 26}
None                                                                                                                   
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[261]	v

Early stopping, best iteration is:                                                                                     
[411]	valid_0's auc: 0.784004
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[400]	valid_0's auc: 0.786599
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[462]	valid_0's auc: 0.786459
roc_auc_score                                                                                                          
TRAIN: [0.79269 0.79345 0.79241 0.79315] (0.79292)                                                                     
VAL: [0.78833 0.784   0.7866  0.78646] (0.78635)                      

None                                                                                                                   
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[426]	valid_0's auc: 0.789556
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[391]	valid_0's auc: 0.784949
Training until validation scores don't improve for 50 rounds.                                                          
Early stopping, best iteration is:                                                                                     
[281]	valid_0's auc: 0.788049
Training until validation scores don't improve for 50 rounds.         

### Fit model with best hyperparameters 

In [9]:
X_columns = [column for column in train_preproc.columns if column != 'target']
rs = 234
test_size = 0.15
train_X, val_X, train_y, val_y = train_test_split(train_preproc.loc[:, X_columns], train_preproc.target, 
                                                  test_size=test_size, stratify=train_preproc.target, 
                                                  random_state=rs)
print(train_X.shape, val_X.shape)

(510000, 23) (90000, 23)


In [10]:
train_dataset = Dataset(train_X, train_y, free_raw_data=False)#, categorical_feature=categorical_features)
val_dataset = Dataset(val_X, val_y, free_raw_data=False)#, categorical_feature=categorical_features)
param = {
                'learning_rate': 0.1,
                'num_leaves': 11,
                'min_data_in_leaf': 141, 
                'objective':'binary',
                'reg_alpha': 1,
                'reg_lambda': 1,
                "boosting": "gbdt",
                "feature_fraction": 0.11159440461908189,
                "bagging_fraction": 0.7092434829167672,
                "seed": 123,
                'num_threads': 1,
                'is_unbalance': True,
                "metric": 'auc',
                "verbosity": -1
}

clf = lgb.train(param, train_dataset, num_boost_round=500, 
                valid_sets=[val_dataset], #[val_dataset, train_dataset], 
                verbose_eval=50, 
                early_stopping_rounds=50
               )

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.748183
[100]	valid_0's auc: 0.77586
[150]	valid_0's auc: 0.783284
[200]	valid_0's auc: 0.785456
[250]	valid_0's auc: 0.786561
[300]	valid_0's auc: 0.787314
[350]	valid_0's auc: 0.787483
[400]	valid_0's auc: 0.787544
[450]	valid_0's auc: 0.787556
[500]	valid_0's auc: 0.787629
Did not meet early stopping. Best iteration is:
[495]	valid_0's auc: 0.787641


### Save model and it's params

In [13]:
with open('./best_models/lgb.params', 'w') as f:
    f.write(str(param))

clf.save_model('./best_models/lgb.model')
with open('./best_models/lgb_preproc_pipeline.ppln', 'wb') as f:
    dill.dump(preproc_pipepline, f)

### Make submission

In [11]:
test = pd.read_csv('./data/test.csv')
preproc_pipepline['preprocessor'].isTrain = False
test = preproc_pipepline.transform(test)
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,600000,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,...,0.182075,0.146751,3.0,0,4,5,20,0.235641,3.0,9.0
1,600001,0.0,0.0,0.0,0.0,1.0,2.0,0.0,4.0,5.0,...,0.234127,0.191781,1.0,0,1,13,13,-1.0,2.0,8.0
2,600002,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,...,0.144341,0.162252,1.0,2,2,8,13,0.125748,2.0,6.0
3,600003,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,2.0,...,0.169082,0.173469,1.0,2,3,12,1,0.108688,1.0,6.0
4,600004,0.0,0.0,1.0,0.0,1.0,2.0,0.0,-1.0,3.0,...,0.121985,0.154574,1.0,1,5,14,9,0.119081,3.0,3.0


In [28]:
predictions = clf.predict(test.iloc[:, 1:])
predictions

array([0.39272547, 0.61695022, 0.4931543 , ..., 0.83724117, 0.61287631,
       0.51048698])

In [29]:
submission = pd.DataFrame.from_dict({
    'id': test.id,
    'target': predictions
})
submission.to_csv('./submits/best_lgb.csv', index=False)

# CatBoost

### Preproc data

In [14]:
target_encoder_features = [f'nom_{i}' for i in range(4, 10)] + ['ord_5']
preproc_params = {
    'label_encoding_features': label_encoding_features,
    'target_encoder_features': target_encoder_features,
    'custom_transform': {
        'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
        'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
        'ord_3': lambda x: ord(x) - 97,
        'ord_4': lambda x: ord(x) - 65,
    },
}
preproc_pipepline = Pipeline([
    ('preprocessor', Preprocessor(**preproc_params)),
    ('nan_imputer', NanImputer(('fillna', -1))),
])

preproc_pipepline[0].isTrain = True
train_preproc = preproc_pipepline.fit_transform(train, train.target)

cat_features = [column for column in train_preproc.columns if ('nom' in column or 'ord' in column) and column not in target_encoder_features] + ['day', 'month']
train_preproc[cat_features] = train_preproc[cat_features].astype(int)

### Hyperparameters tuning using hyperopt

In [36]:
X_columns = [column for column in train_preproc.columns if column != 'target']
cat_features_ind = [ind for ind, col in enumerate(X_columns) if col in cat_features]
X, y = train_preproc[X_columns], train_preproc.target

train_params = {'params': {
                    'depth': 6,
                    'num_leaves': 18,
                    'min_data_in_leaf': 10, 
                    'loss_function': 'Logloss',
                    'iterations': 1500,
                    'early_stopping_rounds': 50,
                    'l2_leaf_reg': 30,
                    'learning_rate': 0.05,
                    'bagging_temperature': 0.8,
                    'random_strength': 0.8,
                    'task_type': "GPU",
                    'grow_policy': 'Lossguide',
                    "random_seed": 123,
                    'thread_count': 1,
                    "eval_metric": 'AUC',
                    "verbose": False,
                    'use_best_model': True
                    },
                'fit_params': {'verbose_eval': 1000, 'use_best_model': True},
                'modeltype': 'catboost',
                'cat_features': cat_features_ind,
}

# define cross_validation
cv_params = {
    'n_splits': 4,
    'test_size': 0.2,
    'random_state': 123,
}
cv = StratifiedShuffleSplit(**cv_params)

# hyperparameters tuning
search_space = {
    'num_leaves': hp.uniformint('num_leaves', 4, 32), 
    'min_data_in_leaf': hp.uniformint('min_data_in_leaf', 10, 1000),
    'random_strength': hp.uniform('random_strength', 0.1, 1.0),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.5, 1.0),
}
max_eval = 30
best_params, hp_tuning_results = hyperparameters_optimization(X, y, None, search_space, max_eval, train_params, loss='overfit')
best_params

0:	learn: 0.6843467	test: 0.6818747	best: 0.6818747 (0)	total: 74.1ms	remaining: 1m 51s                                

bestTest = 0.7853022218                                                                                                

bestIteration = 613                                                                                                    

Shrink model to first 614 iterations.                                                                                  
0:	learn: 0.6843456	test: 0.6832845	best: 0.6832845 (0)	total: 78ms	remaining: 1m 56s                                  

bestTest = 0.7846719027                                                                                                

bestIteration = 503                                                                                                    

Shrink model to first 504 iterations.                                                                                  
0:	learn: 0.6857425	test: 0.681146

bestIteration = 664                                                                                                    

Shrink model to first 665 iterations.                                                                                  
0:	learn: 0.6726301	test: 0.6731028	best: 0.6731028 (0)	total: 41.8ms	remaining: 1m 2s                                 

bestTest = 0.7851429582                                                                                                

bestIteration = 647                                                                                                    

Shrink model to first 648 iterations.                                                                                  
0:	learn: 0.6737686	test: 0.6695391	best: 0.6695391 (0)	total: 38.8ms	remaining: 58.1s                                 

bestTest = 0.7850256562                                                                                                

bestIteration = 838               

0:	learn: 0.6901482	test: 0.6839978	best: 0.6839978 (0)	total: 45.2ms	remaining: 1m 7s                                 

bestTest = 0.7842530012                                                                                                

bestIteration = 536                                                                                                    

Shrink model to first 537 iterations.                                                                                  
0:	learn: 0.6875057	test: 0.6822776	best: 0.6822776 (0)	total: 43.9ms	remaining: 1m 5s                                 

bestTest = 0.7843005359                                                                                                

bestIteration = 582                                                                                                    

Shrink model to first 583 iterations.                                                                                  
roc_auc_score                     

0:	learn: 0.6877369	test: 0.6821122	best: 0.6821122 (0)	total: 42.5ms	remaining: 1m 3s                                 

bestTest = 0.7845821977                                                                                                

bestIteration = 524                                                                                                    

Shrink model to first 525 iterations.                                                                                  
roc_auc_score                                                                                                          
TRAIN: [0.80255 0.8017  0.80268 0.80247] (0.80235)                                                                     
VAL: [0.78543 0.78446 0.78422 0.78458] (0.78467)                                                                       
0:	learn: 0.6901563	test: 0.6860806	best: 0.6860806 (0)	total: 44.2ms	remaining: 1m 6s                                 

bestTest = 0.7852395177             

Shrink model to first 809 iterations.                                                                                  
0:	learn: 0.6720725	test: 0.6719469	best: 0.6719469 (0)	total: 39.9ms	remaining: 59.8s                                 

bestTest = 0.7853429317                                                                                                

bestIteration = 808                                                                                                    

Shrink model to first 809 iterations.                                                                                  
0:	learn: 0.6721596	test: 0.6677929	best: 0.6677929 (0)	total: 36.4ms	remaining: 54.5s                                 

bestTest = 0.7851946652                                                                                                

bestIteration = 810                                                                                                    

Shrink model to first 811 iteratio

bestTest = 0.784527123                                                                                                 

bestIteration = 537                                                                                                    

Shrink model to first 538 iterations.                                                                                  
roc_auc_score                                                                                                          
TRAIN: [0.7995  0.8013  0.80127 0.79969] (0.80044)                                                                     
VAL: [0.78534 0.78466 0.78476 0.78453] (0.78482)                                                                       
0:	learn: 0.6735485	test: 0.6706885	best: 0.6706885 (0)	total: 53.3ms	remaining: 1m 19s                                

bestTest = 0.7861081064                                                                                                

bestIteration = 768                 

1000:	learn: 0.7902733	test: 0.7865040	best: 0.7865040 (1000)	total: 42.7s	remaining: 21.3s                            

bestTest = 0.786837846                                                                                                 

bestIteration = 1425                                                                                                   

Shrink model to first 1426 iterations.                                                                                 
0:	learn: 0.6458558	test: 0.6447700	best: 0.6447700 (0)	total: 47.5ms	remaining: 1m 11s                                

1000:	learn: 0.7903604	test: 0.7856933	best: 0.7856967 (997)	total: 43.1s	remaining: 21.5s                             

1499:	learn: 0.7918732	test: 0.7861501	best: 0.7861522 (1494)	total: 1m 3s	remaining: 0us                              

bestTest = 0.786152184                                                                                                 

bestIteration = 1494             

TRAIN: [0.79293 0.79269 0.79422 0.79336] (0.7933)                                                                      
VAL: [0.78671 0.78587 0.78561 0.78598] (0.78604)                                                                       
0:	learn: 0.6396469	test: 0.6382460	best: 0.6382460 (0)	total: 38.1ms	remaining: 57.1s                                 

1000:	learn: 0.7887666	test: 0.7861646	best: 0.7861646 (1000)	total: 39.9s	remaining: 19.9s                            

1499:	learn: 0.7901885	test: 0.7869021	best: 0.7869021 (1499)	total: 1m	remaining: 0us                                 

bestTest = 0.7869020998                                                                                                

bestIteration = 1499                                                                                                   

0:	learn: 0.6395811	test: 0.6385097	best: 0.6385097 (0)	total: 43.6ms	remaining: 1m 5s                                 

1000:	learn: 0.7889059	test: 0.785

1000:	learn: 0.7935722	test: 0.7866394	best: 0.7866400 (999)	total: 32.3s	remaining: 16.1s                             

bestTest = 0.7866607308                                                                                                

bestIteration = 1084                                                                                                   

Shrink model to first 1085 iterations.                                                                                 
0:	learn: 0.6610653	test: 0.6598341	best: 0.6598341 (0)	total: 36.6ms	remaining: 54.9s                                 

1000:	learn: 0.7935505	test: 0.7855909	best: 0.7855923 (993)	total: 31.9s	remaining: 15.9s                             

bestTest = 0.7856547832                                                                                                

bestIteration = 1095                                                                                                   

Shrink model to first 1096 iterat

bestTest = 0.786268115                                                                                                 

bestIteration = 746                                                                                                    

Shrink model to first 747 iterations.                                                                                  
0:	learn: 0.6709920	test: 0.6712902	best: 0.6712902 (0)	total: 36.4ms	remaining: 54.6s                                 

bestTest = 0.7853473127                                                                                                

bestIteration = 760                                                                                                    

Shrink model to first 761 iterations.                                                                                  
0:	learn: 0.6708261	test: 0.6670359	best: 0.6670359 (0)	total: 34.8ms	remaining: 52.2s                                 

bestTest = 0.785484314            

({'bagging_temperature': 0.7497082074820156,
  'min_data_in_leaf': 67.0,
  'num_leaves': 4.0,
  'random_strength': 0.2017357950398055},
 [{'loss': -0.7822600000000001,
   'status': 'ok',
   'scores': {'roc_auc_score': {'train': array([0.79025, 0.79038, 0.79053, 0.79038]),
     'val': array([0.78695, 0.78609, 0.78591, 0.78631]),
     'train_mean': 0.79038,
     'val_mean': 0.78632}},
   'params': {'bagging_temperature': 0.7497082074820156,
    'min_data_in_leaf': 67,
    'num_leaves': 4,
    'random_strength': 0.2017357950398055}},
  {'loss': -0.78224,
   'status': 'ok',
   'scores': {'roc_auc_score': {'train': array([0.79019, 0.79028, 0.79039, 0.7902 ]),
     'val': array([0.7869 , 0.78601, 0.78583, 0.78625]),
     'train_mean': 0.79026,
     'val_mean': 0.78625}},
   'params': {'bagging_temperature': 0.9161779844929899,
    'min_data_in_leaf': 116,
    'num_leaves': 4,
    'random_strength': 0.10244767481208092}},
  {'loss': -0.78086,
   'status': 'ok',
   'scores': {'roc_auc_score': 

### Fit model with best hyperparameters 

In [15]:
X_columns = [column for column in train_preproc.columns if column != 'target']
cat_features_ind = [ind for ind, col in enumerate(X_columns) if col in cat_features]
rs = 123
test_size = 0.2
train_X, val_X, train_y, val_y = train_test_split(train_preproc.loc[:, X_columns], train_preproc.target, 
                                                  test_size=test_size, stratify=train_preproc.target, 
                                                  random_state=rs)
print(train_X.shape, val_X.shape)

(480000, 23) (120000, 23)


In [16]:
train_dataset = catboost.Pool(train_X, train_y, cat_features=cat_features_ind, feature_names=list(train_X.columns), thread_count=1)
val_dataset = catboost.Pool(val_X, val_y, cat_features=cat_features_ind, feature_names=list(train_X.columns), thread_count=1)
param = {'params': {
                    'depth': 6,
                    'num_leaves': 18,
                    'min_data_in_leaf': 10, 
                    'l2_leaf_reg': 30,
                    'learning_rate': 0.05,
                    'bagging_temperature': 0.8,
                    'random_strength': 0.8,
                    'task_type': "GPU",
                    'grow_policy': 'Lossguide',
                    'iterations': 1500,
                    'early_stopping_rounds': 50,
                    "random_seed": 123,
                    'thread_count': 1,
                    "eval_metric": 'AUC',
                    "verbose": False,
                    'use_best_model': True
                    },
         'fit_params': {'verbose_eval': 100,},
}
param['params'].update({'bagging_temperature': 0.7497082074820156,
 'min_data_in_leaf': 67.0,
 'num_leaves': 4.0,
 'random_strength': 0.2017357950398055})

clf = catboost.CatBoostClassifier(**param['params'])#)dtrain=train_dataset, eval_set=val_dataset, **param)
clf.fit(train_dataset, eval_set=val_dataset, **param['fit_params'])

0:	learn: 0.6393258	test: 0.6382460	best: 0.6382460 (0)	total: 56.5ms	remaining: 1m 24s
100:	learn: 0.7572019	test: 0.7557452	best: 0.7557452 (100)	total: 5.6s	remaining: 1m 17s
200:	learn: 0.7745472	test: 0.7732451	best: 0.7732451 (200)	total: 11s	remaining: 1m 11s
300:	learn: 0.7803753	test: 0.7789677	best: 0.7789677 (300)	total: 16.4s	remaining: 1m 5s
400:	learn: 0.7832001	test: 0.7816364	best: 0.7816364 (400)	total: 21.9s	remaining: 1m
500:	learn: 0.7851328	test: 0.7834378	best: 0.7834378 (500)	total: 27.4s	remaining: 54.7s
600:	learn: 0.7863027	test: 0.7844165	best: 0.7844165 (600)	total: 33s	remaining: 49.4s
700:	learn: 0.7871187	test: 0.7850879	best: 0.7850879 (700)	total: 38.5s	remaining: 43.9s
800:	learn: 0.7876382	test: 0.7854389	best: 0.7854389 (800)	total: 43.9s	remaining: 38.3s
900:	learn: 0.7880720	test: 0.7857410	best: 0.7857410 (900)	total: 49.3s	remaining: 32.8s
1000:	learn: 0.7883600	test: 0.7859113	best: 0.7859113 (1000)	total: 54.8s	remaining: 27.3s
1100:	learn: 0.7

<catboost.core.CatBoostClassifier at 0x190c9aae6d8>

### Save model and it's params

In [19]:
with open('./best_models/catboost.params', 'w') as f:
    f.write(str(param))

clf.save_model('./best_models/catboost.model')
with open('./best_models/catboost_preproc_pipeline.ppln', 'wb') as f:
    dill.dump(preproc_pipepline, f)

### Make submission

In [17]:
test = pd.read_csv('./data/test.csv')
preproc_pipepline['preprocessor'].isTrain = False
test = preproc_pipepline.transform(test)
test[cat_features] = test[cat_features].astype(int)
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,600000,0.0,0.0,0.0,0.0,1.0,0,1,0,3,...,0.182075,0.146751,3,0,4,5,20,0.235641,3,9
1,600001,0.0,0.0,0.0,0.0,1.0,2,0,4,5,...,0.234127,0.191781,1,0,1,13,13,-1.0,2,8
2,600002,0.0,0.0,0.0,0.0,1.0,0,0,0,5,...,0.144341,0.162252,1,2,2,8,13,0.125748,2,6
3,600003,1.0,0.0,0.0,0.0,0.0,2,1,0,2,...,0.169082,0.173469,1,2,3,12,1,0.108688,1,6
4,600004,0.0,0.0,1.0,0.0,1.0,2,0,-1,3,...,0.121985,0.154574,1,1,5,14,9,0.119081,3,3


In [18]:
predictions = clf.predict_proba(catboost.Pool(test.iloc[:, 1:], cat_features=cat_features_ind, feature_names=list(test.columns[1:]), thread_count=1)).T[1]
predictions

array([0.11482207, 0.25555373, 0.18102427, ..., 0.49895107, 0.25595565,
       0.21795591])

In [20]:
submission = pd.DataFrame.from_dict({
    'id': test.id,
    'target': predictions
})
submission.to_csv('./submits/best_cat.csv', index=False)

# Logreg

### Preproc data

In [21]:
minmax_features = [f'ord_{i}' for i in range(5)]
target_encoder_features = [i for i in train.columns if i not in minmax_features and i != 'target']
preproc_params = {
    'label_encoding_features': label_encoding_features,
    'target_encoder_features': target_encoder_features,
    'min_max_features': minmax_features,
    'custom_transform': {
        'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
        'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
        'ord_3': lambda x: ord(x) - 97,
        'ord_4': lambda x: ord(x) - 65,
    },
}
preproc_pipepline = Pipeline([
    ('preprocessor', Preprocessor(**preproc_params)),
    ('nan_imputer', NanImputer(('ohe', -1))),
])

preproc_pipepline[0].isTrain = True
train_preproc = preproc_pipepline.fit_transform(train, train.target)


### Cross validation

In [22]:
X_columns = [column for column in train_preproc.columns if column != 'target' and 'NaN' not in column]
X, y = train_preproc[X_columns], train_preproc.target

train_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
#     'penalty': 'l2',
#     'C': 1,
    'verbose': 0,
    'n_jobs': 1
}

# define cross_validation
cv_params = {
    'n_splits': 4,
    'shuffle': True,
    'random_state': 123,
}
cv = StratifiedKFold(**cv_params)

cross_validation(cv, LogisticRegression(**train_params), X, y, verbose=True)

roc_auc_score
TRAIN: [0.78703 0.78752 0.78689 0.78727] (0.78718)
VAL: [0.78757 0.78611 0.78798 0.7868 ] (0.78712)


({'roc_auc_score': {'train': array([0.78703, 0.78752, 0.78689, 0.78727]),
   'val': array([0.78757, 0.78611, 0.78798, 0.7868 ]),
   'train_mean': 0.78718,
   'val_mean': 0.78712}},
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=2020,
                    multi_class='auto', n_jobs=1, penalty='l2', random_state=1,
                    solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))

### Fit model

In [23]:
X_columns = [column for column in train_preproc.columns if column != 'target' and 'NaN' not in column]
X, y = train_preproc[X_columns], train_preproc.target

train_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
#     'penalty': 'l2',
#     'C': 1,
    'verbose': 0,
    'n_jobs': 1
}

clf = LogisticRegression(**train_params)
clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2020,
                   multi_class='auto', n_jobs=1, penalty='l2', random_state=1,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

### Save model and it's params

In [24]:
with open('./best_models/logreg.params', 'w') as f:
    f.write(str(train_params))

with open('./best_models/logreg.model', 'wb') as f:
    pkl.dump(clf, f)
with open('./best_models/logreg_preproc_pipeline.ppln', 'wb') as f:
    dill.dump(preproc_pipepline, f)

### Make submission

In [25]:
test = pd.read_csv('./data/test.csv')
preproc_pipepline['preprocessor'].isTrain = False
test = preproc_pipepline.transform(test)
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,600000,0.194674,0.19056,0.17113,0.186772,0.195305,0.195763,0.205417,0.201104,0.178947,...,0.182075,0.146751,1.0,0.0,0.8,0.357143,0.8,0.235641,0.16408,0.211688
1,600001,0.194674,0.19056,0.17113,0.186772,0.195305,0.183067,0.179694,0.209834,0.219534,...,0.234127,0.191781,0.0,0.0,0.2,0.928571,0.52,0.18745,0.19916,0.190094
2,600002,0.194674,0.19056,0.17113,0.186772,0.195305,0.195763,0.179694,0.201104,0.219534,...,0.144341,0.162252,0.0,0.5,0.4,0.571429,0.52,0.125748,0.19916,0.229951
3,600003,0.113365,0.19056,0.17113,0.186772,0.180234,0.183067,0.205417,0.201104,0.202836,...,0.169082,0.173469,0.0,0.5,0.6,0.857143,0.04,0.108688,0.213682,0.229951
4,600004,0.194674,0.19056,0.228917,0.186772,0.195305,0.183067,0.179694,0.187262,0.178947,...,0.121985,0.154574,0.0,0.25,1.0,1.0,0.36,0.119081,0.16408,0.146451


In [26]:
predictions = clf.predict_proba(test.iloc[:, 1:]).T[1]
predictions

array([0.14000958, 0.2410715 , 0.16268397, ..., 0.52645846, 0.25565631,
       0.22060699])

In [27]:
submission = pd.DataFrame.from_dict({
    'id': test.id,
    'target': predictions
})
submission.to_csv('./submits/best_logreg.csv', index=False)

# Polynomial LogReg

### Since logreg with lbfgs solver takes a lot of CPU usage during training (I very care about my CPU xD) and we have a lot of data and relatively many features (therefore it takes a little bit more time for convergence) i decided to train on subset objects hoping the subset will reflect the same properties and polynomial logreg make a decision boundary aproximately as on the original full dataset. All preprocessing steps is the same as for LogReg.

In [28]:
compresed_xy = None
for tr_ind, val_ind in StratifiedKFold(10, shuffle=True, random_state=123).split(X, y):
    compresed_xy = (X.iloc[val_ind], y.iloc[val_ind])
    break
compresed_xy[0].shape, compresed_xy[1].shape

((60000, 23), (60000,))

### Cross validation

In [29]:
poly = PolynomialFeatures(2, interaction_only=True)
poly_X = pd.DataFrame(poly.fit_transform(compresed_xy[0].reset_index(drop=True)))
# poly_X = pd.DataFrame(poly.fit_transform(train_preproc[X_columns]))

train_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
    'penalty': 'l2',
    'C': 1,
    'verbose': 0,
    'n_jobs': 1
}

cv_params = {
    'n_splits': 5,
    'shuffle': True,
    'random_state': 123,
}
cv = StratifiedKFold(**cv_params)
cross_validation(cv, LogisticRegression(**train_params), 
                 poly_X, 
                 compresed_xy[1].reset_index(drop=True), verbose=True)


roc_auc_score
TRAIN: [0.78955 0.78845 0.78815 0.78897 0.78619] (0.78826)
VAL: [0.78115 0.78525 0.78669 0.78469 0.79427] (0.78641)


({'roc_auc_score': {'train': array([0.78955, 0.78845, 0.78815, 0.78897, 0.78619]),
   'val': array([0.78115, 0.78525, 0.78669, 0.78469, 0.79427]),
   'train_mean': 0.78826,
   'val_mean': 0.78641}},
 LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=2020,
                    multi_class='auto', n_jobs=1, penalty='l2', random_state=1,
                    solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))

### Fit model

In [30]:
poly = PolynomialFeatures(2, interaction_only=True)
poly_X = pd.DataFrame(poly.fit_transform(compresed_xy[0].reset_index(drop=True)))
train_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 500, 
    'penalty': 'l2',
    'C': 1,
    'verbose': 0,
    'n_jobs': 1
}

clf = LogisticRegression(**train_params)
clf.fit(poly_X, compresed_xy[1].reset_index(drop=True))

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=1, penalty='l2', random_state=1,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

### Save model and it's params

In [31]:
with open('./best_models/poly_logreg.params', 'w') as f:
    f.write(str(train_params))

with open('./best_models/poly_logreg.model', 'wb') as f:
    pkl.dump(clf, f)
with open('./best_models/poly_logreg_preproc_pipeline.ppln', 'wb') as f:
    dill.dump(preproc_pipepline, f)

### Make submission

In [32]:
test = pd.read_csv('./data/test.csv')
preproc_pipepline['preprocessor'].isTrain = False
test = preproc_pipepline.transform(test)
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,600000,0.194674,0.19056,0.17113,0.186772,0.195305,0.195763,0.205417,0.201104,0.178947,...,0.182075,0.146751,1.0,0.0,0.8,0.357143,0.8,0.235641,0.16408,0.211688
1,600001,0.194674,0.19056,0.17113,0.186772,0.195305,0.183067,0.179694,0.209834,0.219534,...,0.234127,0.191781,0.0,0.0,0.2,0.928571,0.52,0.18745,0.19916,0.190094
2,600002,0.194674,0.19056,0.17113,0.186772,0.195305,0.195763,0.179694,0.201104,0.219534,...,0.144341,0.162252,0.0,0.5,0.4,0.571429,0.52,0.125748,0.19916,0.229951
3,600003,0.113365,0.19056,0.17113,0.186772,0.180234,0.183067,0.205417,0.201104,0.202836,...,0.169082,0.173469,0.0,0.5,0.6,0.857143,0.04,0.108688,0.213682,0.229951
4,600004,0.194674,0.19056,0.228917,0.186772,0.195305,0.183067,0.179694,0.187262,0.178947,...,0.121985,0.154574,0.0,0.25,1.0,1.0,0.36,0.119081,0.16408,0.146451


In [33]:
predictions = clf.predict_proba(poly.transform(test.iloc[:, 1:])).T[1]
predictions

array([0.15732676, 0.23835215, 0.15664765, ..., 0.55263458, 0.24580884,
       0.18908509])

In [34]:
submission = pd.DataFrame.from_dict({
    'id': test.id,
    'target': predictions
})
submission.to_csv('./submits/best_poly_logreg.csv', index=False)

## LogReg + KNN

### Preproc data

In [35]:
minmax_features = [f'ord_{i}' for i in range(5)]# + [f'nom_{i}' for i in range(4)] + ['day', 'month']
target_encoder_features = [i for i in train.columns if i not in minmax_features and i != 'target']
preproc_params = {
    'label_encoding_features': label_encoding_features,
    'target_encoder_features': target_encoder_features,
    'min_max_features': minmax_features,
    'custom_transform': {
        'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
        'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
        'ord_3': lambda x: ord(x) - 97,
        'ord_4': lambda x: ord(x) - 65,
    },
}
preproc_pipepline = Pipeline([
    ('preprocessor', Preprocessor(**preproc_params)),
    ('nan_imputer', NanImputer(('ohe', -1))),
])

preproc_pipepline[0].isTrain = True
train_preproc = preproc_pipepline.fit_transform(train, train.target)

X_columns = [column for column in train_preproc.columns if column != 'target' and 'NaN' not in column]
X, y = train_preproc[X_columns], train_preproc.target

train_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
    'verbose': 0,
    'n_jobs': 1
}
log_model = LogisticRegression(**train_params)
log_model.fit(X, y)
X *= abs(log_model.coef_[0])

In [36]:
compresed_xy = None
for tr_ind, val_ind in StratifiedKFold(10, shuffle=True, random_state=123).split(X, y):
    compresed_xy = (X.iloc[val_ind], y.iloc[val_ind])
    break
compresed_xy[0].shape, compresed_xy[1].shape

((60000, 23), (60000,))

In [37]:
train_params = {'n_neighbors': 188, 'p': 2, 'weights': 'uniform'}

cv_params = {
    'n_splits': 1,
    'test_size': 0.2,
    'random_state': 123,
}
cv = StratifiedShuffleSplit(**cv_params)

cross_validation(cv, KNeighborsClassifier(**train_params), 
                 compresed_xy[0].reset_index(drop=True), 
                 compresed_xy[1].reset_index(drop=True), verbose=True)#[0], verbose=True)#[0]


roc_auc_score
TRAIN: [0.77849] (0.77849)
VAL: [0.77885] (0.77885)


({'roc_auc_score': {'train': array([0.77849]),
   'val': array([0.77885]),
   'train_mean': 0.77849,
   'val_mean': 0.77885}},
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=188, p=2,
                      weights='uniform'))

### Since i've got relatively bad ROC-AUC score in training and 1 fold validation, i decided not to include that model in stack. Any changing n_neighbours parameter and other hyperparameters optimization didn't improve train and validation score sufficiently to add this model in stack.

# Stack LGB, CatBoost and LogReg and Polynomial LogReg

In [83]:
label_encoding_features = []
object_cols = train.select_dtypes('object').columns
for col in object_cols:
    if col[0] != 'o':# and col not in target_encoder_features:# special transform for ord features
        label_encoding_features.append(col)
print(label_encoding_features)
X_columns = [column for column in train.columns if column != 'target']
test = pd.read_csv('./data/test.csv')

# lgb preproc
print('lgb preproc')
if os.path.exists('./best_models/lgb_preproc_pipeline.ppln'):
    print('load existsing preproc_pipeline...')
    with open('./best_models/lgb_preproc_pipeline.ppln', 'rb') as f:
        lgb_preproc_pipepline = dill.load(f)
    lgb_preproc_pipepline[0].isTrain = True
    lgb_train_preproc = lgb_preproc_pipepline.transform(train)
else:
    target_encoder_features = [f'nom_{i}' for i in range(4, 10)] + ['ord_5']
    preproc_params = {
        'label_encoding_features': label_encoding_features,
        'target_encoder_features': target_encoder_features,
        'custom_transform': {
            'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
            'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
            'ord_3': lambda x: ord(x) - 97,
            'ord_4': lambda x: ord(x) - 65,
        },
    }
    lgb_preproc_pipepline = Pipeline([
        ('preprocessor', Preprocessor(**preproc_params)),
        ('nan_imputer', NanImputer(('fillna', -1))),
    ])

    lgb_preproc_pipepline[0].isTrain = True
    lgb_train_preproc = lgb_preproc_pipepline.fit_transform(train, train.target)
    
lgb_params = {
    'learning_rate': 0.1,
    'num_leaves': 11,
    'min_data_in_leaf': 141, 
    'objective':'binary',
    'reg_alpha': 1,
    'reg_lambda': 1,
    "boosting": "gbdt",
    "feature_fraction": 0.11159440461908189,
    "bagging_fraction": 0.7092434829167672,
    "seed": 123,
    'num_threads': 1,
    'is_unbalance': True,
    "metric": 'auc',
    "verbosity": -1
}
lgb_preproc_pipepline['preprocessor'].isTrain = False
lgb_test = lgb_preproc_pipepline.transform(test)


# catboost preproc
print('catboost preproc')
if os.path.exists('./best_models/catboost_preproc_pipeline.ppln'):
    print('load existsing preproc_pipeline...')
    with open('./best_models/catboost_preproc_pipeline.ppln', 'rb') as f:
        catboost_preproc_pipepline = dill.load(f)
    catboost_preproc_pipepline[0].isTrain = True
    catboost_train_preproc = catboost_preproc_pipepline.transform(train)
else:
    target_encoder_features = [f'nom_{i}' for i in range(4, 10)] + ['ord_5']
    preproc_params = {
        'label_encoding_features': label_encoding_features,
        'target_encoder_features': target_encoder_features,
        'custom_transform': {
            'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
            'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
            'ord_3': lambda x: ord(x) - 97,
            'ord_4': lambda x: ord(x) - 65,
        },
    }
    catboost_preproc_pipepline = Pipeline([
        ('preprocessor', Preprocessor(**preproc_params)),
        ('nan_imputer', NanImputer(('fillna', -1))),
    ])

    catboost_preproc_pipepline[0].isTrain = True
    catboost_train_preproc = catboost_preproc_pipepline.fit_transform(train, train.target)
    
cat_features = [column for column in catboost_train_preproc.columns if ('nom' in column or 'ord' in column) and column not in target_encoder_features] + ['day', 'month']
catboost_train_preproc[cat_features] = catboost_train_preproc[cat_features].astype(int)
cat_features_ind = [ind for ind, col in enumerate(X_columns) if col in cat_features]
catboost_params = {
    'params': {
        'depth': 6,
        'num_leaves': 4,
        'min_data_in_leaf': 67, 
        'l2_leaf_reg': 30,
        'learning_rate': 0.05,
        'bagging_temperature': 0.7497082074820156,
        'random_strength': 0.2017357950398055,
        'task_type': "GPU",
        'grow_policy': 'Lossguide',
        'iterations': 1500,
        'early_stopping_rounds': 50,
        "random_seed": 123,
        'thread_count': 1,
        "eval_metric": 'AUC',
        "verbose": False,
        'use_best_model': True
    },
    'fit_params': {'verbose_eval': 100,},
}
catboost_preproc_pipepline['preprocessor'].isTrain = False
catboost_test = catboost_preproc_pipepline.transform(test)
catboost_test[cat_features] = catboost_test[cat_features].astype(int)


# log_reg preproc
print('log_reg preproc')
if os.path.exists('./best_models/logreg_preproc_pipeline.ppln'):
    print('load existsing preproc_pipeline...')
    with open('./best_models/logreg_preproc_pipeline.ppln', 'rb') as f:
        logreg_preproc_pipepline = dill.load(f)
    logreg_preproc_pipepline[0].isTrain = True
    logreg_train_preproc = logreg_preproc_pipepline.transform(train)
else:
    minmax_features = [f'ord_{i}' for i in range(5)]
    target_encoder_features = [i for i in train.columns if i not in minmax_features and i != 'target']
    preproc_params = {
        'label_encoding_features': label_encoding_features,
        'target_encoder_features': target_encoder_features,
        'min_max_features': minmax_features,
        'custom_transform': {
            'ord_1': {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4},
            'ord_2': {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5},
            'ord_3': lambda x: ord(x) - 97,
            'ord_4': lambda x: ord(x) - 65,
        },
    }
    logreg_preproc_pipepline = Pipeline([
        ('preprocessor', Preprocessor(**preproc_params)),
        ('nan_imputer', NanImputer(('ohe', -1))),
    ])

    logreg_preproc_pipepline[0].isTrain = True
    logreg_train_preproc = logreg_preproc_pipepline.fit_transform(train, train.target)
    
logreg_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
    'verbose': 0,
    'n_jobs': 1
}
logreg_preproc_pipepline['preprocessor'].isTrain = False
logreg_test = logreg_preproc_pipepline.transform(test)


# polynimal logreg
polylogreg_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
    'penalty': 'l2',
    'C': 1,
    'verbose': 0,
    'n_jobs': 1
}


stack = pd.DataFrame(index=train.index)
for modelname in ['lgb', 'catboost', 'logreg', 'poly_logreg',]:
    stack[modelname] = 0.5
stack['target'] = logreg_train_preproc.target.values
test_pred = []
    
cv_params = {
    'n_splits': 5,
    'shuffle': True,
    'random_state': 123,
}
cv = StratifiedKFold(**cv_params)

n_fold = 1
for tr_ind, val_ind in cv.split(train, train.target):
    print(f'n_fold={n_fold}')
    n_fold += 1
    # lgb
    train_X, train_y = lgb_train_preproc.iloc[tr_ind][X_columns], lgb_train_preproc.iloc[tr_ind].target
    val_X, val_y = lgb_train_preproc.iloc[val_ind][X_columns], lgb_train_preproc.iloc[val_ind].target
    
    train_dataset = Dataset(train_X, train_y, free_raw_data=False)
    val_dataset = Dataset(val_X, val_y, free_raw_data=False)

    clf = lgb.train(lgb_params, train_dataset, num_boost_round=500, 
                    valid_sets=[val_dataset],
                    verbose_eval=50, 
                    early_stopping_rounds=50
                   )
    
    stack.iloc[val_ind, 0] = clf.predict(val_X)
    # test prediction
    test_pred.append(clf.predict(lgb_test.iloc[:, 1:]).tolist())
    
    # catboost
    train_X, train_y = catboost_train_preproc.iloc[tr_ind][X_columns], catboost_train_preproc.iloc[tr_ind].target
    val_X, val_y = catboost_train_preproc.iloc[val_ind][X_columns], catboost_train_preproc.iloc[val_ind].target
    train_dataset = catboost.Pool(train_X, train_y, cat_features=cat_features_ind, feature_names=list(train_X.columns), thread_count=1)
    val_dataset = catboost.Pool(val_X, val_y, cat_features=cat_features_ind, feature_names=list(train_X.columns), thread_count=1)

    clf = catboost.CatBoostClassifier(**catboost_params['params'])#)dtrain=train_dataset, eval_set=val_dataset, **param)
    clf.fit(train_dataset, eval_set=val_dataset, **catboost_params['fit_params'])
    
    stack.iloc[val_ind, 1] = clf.predict_proba(val_X).T[1]
    # test prediction
    test_pred.append(clf.predict_proba(catboost.Pool(catboost_test.iloc[:, 1:], 
                                                     cat_features=cat_features_ind, 
                                                     feature_names=list(catboost_test.columns[1:]), 
                                                     thread_count=1)
                                      ).T[1].tolist())
    
    # logreg
    train_X, train_y = logreg_train_preproc.iloc[tr_ind][X_columns], logreg_train_preproc.iloc[tr_ind].target
    val_X, val_y = logreg_train_preproc.iloc[val_ind][X_columns], logreg_train_preproc.iloc[val_ind].target

    clf = LogisticRegression(**logreg_params)
    clf.fit(train_X, train_y)
    
    stack.iloc[val_ind, 2] = clf.predict_proba(val_X).T[1]
    # test prediction
    test_pred.append(clf.predict_proba(logreg_test.iloc[:, 1:]).T[1].tolist())
    
    # polynomial logreg
    tm = time.time()
    train_X, train_y = logreg_train_preproc.iloc[tr_ind][X_columns], logreg_train_preproc.iloc[tr_ind].target
    val_X, val_y = logreg_train_preproc.iloc[val_ind][X_columns], logreg_train_preproc.iloc[val_ind].target
    compresed_xy = None
    for tr_ind1, val_ind1 in StratifiedShuffleSplit(1, test_size=0.2, random_state=123).split(train_X, train_y):
        compresed_xy = (train_X.iloc[val_ind1], train_y.iloc[val_ind1])
    poly = PolynomialFeatures(2, interaction_only=True)
    train_X = pd.DataFrame(poly.fit_transform(compresed_xy[0].reset_index(drop=True)))
    train_y = compresed_xy[1]

    clf = LogisticRegression(**polylogreg_params)
    clf.fit(train_X, train_y)
    
    stack.iloc[val_ind, 3] = clf.predict_proba(poly.transform(val_X)).T[1]
    # test prediction
    test_pred.append(clf.predict_proba(poly.transform(logreg_test.iloc[:, 1:])).T[1].tolist())
    

['bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
lgb preproc
load existsing preproc_pipeline...
catboost preproc
load existsing preproc_pipeline...
log_reg preproc
load existsing preproc_pipeline...
n_fold=1
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.748124
[100]	valid_0's auc: 0.776565
[150]	valid_0's auc: 0.784365
[200]	valid_0's auc: 0.786809
[250]	valid_0's auc: 0.78784
[300]	valid_0's auc: 0.788184
[350]	valid_0's auc: 0.788262
[400]	valid_0's auc: 0.788477
[450]	valid_0's auc: 0.788502
[500]	valid_0's auc: 0.788552
Did not meet early stopping. Best iteration is:
[493]	valid_0's auc: 0.788556
0:	learn: 0.6386760	test: 0.6408865	best: 0.6408865 (0)	total: 64.1ms	remaining: 1m 36s
100:	learn: 0.7545951	test: 0.7537982	best: 0.7538504 (99)	total: 6.37s	remaining: 1m 28s
200:	learn: 0.7713061	test: 0.7706527	best: 0.7706527 (200)	total: 12.6s	remaining: 1m 21s
300:	learn: 0.7778513	tes

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.748072
[100]	valid_0's auc: 0.775928
[150]	valid_0's auc: 0.783181
[200]	valid_0's auc: 0.7855
[250]	valid_0's auc: 0.786524
[300]	valid_0's auc: 0.78726
[350]	valid_0's auc: 0.787493
[400]	valid_0's auc: 0.787636
[450]	valid_0's auc: 0.787725
[500]	valid_0's auc: 0.787772
Did not meet early stopping. Best iteration is:
[495]	valid_0's auc: 0.78779
0:	learn: 0.6389370	test: 0.6388341	best: 0.6388341 (0)	total: 61.6ms	remaining: 1m 32s
100:	learn: 0.7536778	test: 0.7531361	best: 0.7531361 (100)	total: 7.36s	remaining: 1m 41s
200:	learn: 0.7713929	test: 0.7710189	best: 0.7710512 (199)	total: 15.1s	remaining: 1m 37s
300:	learn: 0.7778854	test: 0.7772645	best: 0.7772645 (300)	total: 21.1s	remaining: 1m 23s
400:	learn: 0.7815905	test: 0.7807650	best: 0.7807822 (399)	total: 27.2s	remaining: 1m 14s
500:	learn: 0.7837973	test: 0.7827641	best: 0.7827641 (500)	total: 33.4s	remaining: 1m 6s
600:	learn: 0.7853378	t

## Logreg as a highlevel agregate model

In [84]:
logreg_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
    'verbose': 0,
    'n_jobs': 1
}

cv_params = {
    'n_splits': 5,
    'shuffle': True,
    'random_state': 321,
}
cv = StratifiedKFold(**cv_params)
cross_validation(cv, LogisticRegression(**logreg_params), stack.iloc[:, :stack.shape[1]-1], stack.target, verbose=True)

roc_auc_score
TRAIN: [0.78773 0.78735 0.78815 0.78807 0.78747] (0.78775)
VAL: [0.78785 0.78937 0.78618 0.78649 0.78889] (0.78776)


({'roc_auc_score': {'train': array([0.78773, 0.78735, 0.78815, 0.78807, 0.78747]),
   'val': array([0.78785, 0.78937, 0.78618, 0.78649, 0.78889]),
   'train_mean': 0.78775,
   'val_mean': 0.78776}},
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=2020,
                    multi_class='auto', n_jobs=1, penalty='l2', random_state=1,
                    solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))

## LGB as a highlevel agregate model

In [85]:
train_params = {'params': {
                    'num_leaves': 18,
                    'min_data_in_leaf': 10, 
                    'objective':'binary',
                    'learning_rate': 0.1,
                    "boosting": "gbdt",
                    "seed": 123,
                    'num_threads': 1,
                    'is_unbalance': True,
                    'boost_from_average': False,
                    "metric": 'auc',
                    "verbosity": -1
                    },
                'num_boost_round': 3000,
                'verbose_eval': 50,
                'early_stopping_rounds': 50,
                'modeltype': 'lgb',
}

# define cross_validation
cv_params = {
    'n_splits': 5,
    'shuffle': True,
    'random_state': 321,
}
cv = StratifiedKFold(**cv_params)
cross_validation(cv, None, stack.iloc[:, :stack.shape[1]-1], stack.target, verbose=True, train_params=train_params)

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.787962
Early stopping, best iteration is:
[24]	valid_0's auc: 0.788043
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.789377
Early stopping, best iteration is:
[17]	valid_0's auc: 0.789414
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.786115
Early stopping, best iteration is:
[23]	valid_0's auc: 0.786175
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.786576
Early stopping, best iteration is:
[29]	valid_0's auc: 0.786637
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.789009
Early stopping, best iteration is:
[15]	valid_0's auc: 0.789024
roc_auc_score
TRAIN: [0.7883  0.78793 0.7888  0.78875 0.78798] (0.78835)
VAL: [0.78804 0.78941 0.78618 0.78664 0.78902] (0.78786)


({'roc_auc_score': {'train': array([0.7883 , 0.78793, 0.7888 , 0.78875, 0.78798]),
   'val': array([0.78804, 0.78941, 0.78618, 0.78664, 0.78902]),
   'train_mean': 0.78835,
   'val_mean': 0.78786}},
 <lightgbm.basic.Booster at 0x1908b297cf8>)

### Since the difference between LGB and Logreg models as agregation models is miserable, i decided to choose the simplest model - Logreg

### Fit model

In [87]:
logreg_params = {
    'random_state': 1, 
    'solver': 'lbfgs', 
    'max_iter': 2020, 
    'verbose': 0,
    'n_jobs': 1
}

stack_logreg = LogisticRegression(**logreg_params)
stack_logreg.fit(stack.iloc[:, :stack.shape[1]-1], stack.target)
stack_logreg.coef_[0] # logreg coeficients

array([ 4.74451283, -0.38442193,  0.38007895,  0.31898184])

### Save model and it's params

In [88]:
with open('./best_models/agregate_logreg.params', 'w') as f:
    f.write(str(logreg_params))

with open('./best_models/agregate_logreg.model', 'wb') as f:
    pkl.dump(stack_logreg, f)

### Make submission

In [89]:
test_stack = pd.DataFrame(index=test.index)
for index, modelname in enumerate(['lgb', 'catboost', 'logreg', 'poly_logreg',]):
    test_stack[modelname] = np.mean(test_pred[index::4], axis=0) # where 4 - number of models
test_stack.describe()

Unnamed: 0,lgb,catboost,logreg,poly_logreg
count,400000.0,400000.0,400000.0,400000.0
mean,0.424287,0.186969,0.187021,0.187251
std,0.232041,0.160053,0.163298,0.162873
min,0.003644,0.001049,0.000911,0.001281
25%,0.22936,0.064466,0.063799,0.064526
50%,0.405955,0.137499,0.13511,0.135559
75%,0.605518,0.264683,0.262444,0.262069
max,0.987482,0.940318,0.953451,0.958971


In [90]:
predictions = stack_logreg.predict_proba(test_stack).T[1]
predictions

array([0.11681178, 0.2778794 , 0.16745677, ..., 0.5424866 , 0.27959457,
       0.18709986])

In [91]:
submission = pd.DataFrame.from_dict({
    'id': test.id,
    'target': predictions
})
submission.to_csv('./submits/best_logreg_stack_with_poly.csv', index=False)