# Find Best Ensemble!

Combining the predictions of several models into an ensemble usually yields a higher AUC score than any single model achieves on its own.

Enjoy the process of training your model, and aim for the highest score!



In [16]:
import copy
import warnings
from abc import abstractmethod

import catboost
import lightgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from catboost import CatBoostRegressor, CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LinearRegression, RidgeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

warnings.filterwarnings('ignore')


In [17]:
# Kaggle kernel paths (swap these in when running on Kaggle):
# df_train = pd.read_csv('/kaggle/input/playground-series-s3e23/train.csv', index_col='id')
# df_test = pd.read_csv('/kaggle/input/playground-series-s3e23/test.csv', index_col='id')

# Local copy of the competition data; both frames use the `id` column as index.
df_train, df_test = (
    pd.read_csv("./playground-series-s3e23/train.csv", index_col="id"),
    pd.read_csv("./playground-series-s3e23/test.csv", index_col="id"),
)

In [18]:
# Replace feature `l` by its complement (1 - l) in both frames.
# NOTE(review): the notebook does not say why `l` is flipped — confirm the intent.
# This cell is not idempotent: running it a second time restores the original values.
df_train['l'], df_test['l'] = 1.0 - df_train['l'], 1.0 - df_test['l']

In [19]:
# Name of the label column; it is cast to integer so classifiers see 0/1 labels.
target = 'defects'
df_train[target] = df_train[target].astype(int)

# Every column except the target, in the file's own column order.
# The list used to be built through a `set`, whose iteration order for strings
# changes between interpreter sessions (hash randomisation), so the feature
# order — and therefore any column-subsampling model — was not reproducible.
feat_list = [col for col in df_train.columns if col != target]

## Using a Quantile Transformer to handle skewed features

**What is a Quantile Transformer?**


A Quantile Transformer is a method that transforms features to follow a uniform or a normal distribution. By doing so, it can smooth out unusual distributions and mitigate the effects of outliers. This transformation is particularly useful for handling features that follow a long-tail distribution.

In [20]:
# quantile transform
import numpy as np
from sklearn.preprocessing import QuantileTransformer

# Feature columns in the frame's own order. Deriving the list through a `set`
# made the order vary between interpreter sessions (string hash randomisation).
feat_list = [col for col in df_train.columns if col != target]

# Fitted on the training features only, then applied to both train and test.
qt = QuantileTransformer(n_quantiles=1000, output_distribution='uniform', random_state=0)
qt.fit(df_train[feat_list])

# Keep the original `id` index on the rebuilt frames. A frame built from the bare
# array gets a fresh 0..n-1 index, and `df_tmp_train[target] = df_train[target]`
# aligns on index labels — any id outside that range would silently become NaN.
df_tmp_train = pd.DataFrame(
    qt.transform(df_train[feat_list]),
    columns=feat_list,
    index=df_train.index,
)
df_tmp_train[target] = df_train[target]
df_train = df_tmp_train
df_test = pd.DataFrame(
    qt.transform(df_test[feat_list]),
    columns=feat_list,
    index=df_test.index,
)

In [21]:
# device: device ordinal 0 when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device(0)
else:
    device = torch.device("cpu")

In [22]:
def hill_climbing(x, y, x_test):
    """Greedy ("hill climbing") blend of model predictions, scored by ROC-AUC.

    The search starts from the single model with the best ROC-AUC on ``x``.
    In every round each remaining model is tried with every weight on a fixed
    grid (-0.5 up to about 0.70 in steps of 0.01, so negative weights are
    allowed); the model/weight pair that improves the score the most is
    blended in as ``(1 - w) * current + w * model`` and that model is removed
    from the pool. The search stops when no pair improves the score or when
    the pool is empty. The same weights are applied to ``x_test``.

    Parameters
    ----------
    x : pandas.DataFrame
        Validation predictions, one column per model.
    y : array-like
        Ground-truth labels for the rows of ``x``.
    x_test : pandas.DataFrame
        Test predictions with the same column names as ``x``.

    Returns
    -------
    list
        ``[blended validation predictions, blended test predictions]``.

    Raises
    ------
    ValueError
        If ``x`` has no columns.
    """
    if x.shape[1] == 0:
        raise ValueError("hill_climbing needs at least one column of model predictions")

    # Stand-alone score of every model; sorting is stable, so ties keep column order.
    scores = {col: roc_auc_score(y, x[col]) for col in x.columns}
    ranked = sorted(scores, key=scores.get, reverse=True)

    # Seed the blend with the best single model; the rest form the candidate pool.
    # The pool is a plain list of column names, so `x` itself is never mutated.
    best_name, candidates = ranked[0], ranked[1:]
    current_best_ensemble = x[best_name]
    current_best_test_preds = x_test[best_name]
    current_score = scores[best_name]

    weight_range = np.arange(-0.5, 0.71, 0.01)

    while candidates:
        best_round_score = current_score
        k_best, wgt_best = None, None
        for k in candidates:
            for wgt in weight_range:
                potential_ensemble = (1 - wgt) * current_best_ensemble + wgt * x[k]
                cv_score = roc_auc_score(y, potential_ensemble)
                # Strict improvement only: on ties the earlier candidate/weight wins.
                if cv_score > best_round_score:
                    best_round_score = cv_score
                    k_best, wgt_best = k, wgt

        if k_best is None:
            # No candidate improves the blend any further.
            break

        current_best_ensemble = (1 - wgt_best) * current_best_ensemble + wgt_best * x[k_best]
        current_best_test_preds = (1 - wgt_best) * current_best_test_preds + wgt_best * x_test[k_best]
        current_score = best_round_score
        candidates.remove(k_best)

    return [current_best_ensemble, current_best_test_preds]

**Hyperparameter sets**

In [60]:
# Seed shared by the parameter sets below that reference it.
random_state = 43

# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` (bagging_freq) is non-zero; none of the
# LightGBM sets below defines it — confirm whether row subsampling was intended.

# DART booster, depth-limited trees. No `random_state` is set for this one.
lgb_params0 = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'dart',
    'n_estimators': 1000,
    'max_depth': 7,
    'learning_rate': 0.03,
    'num_leaves': 50,
    'reg_alpha': 3,
    'reg_lambda': 3,
    'subsample': 0.7,
    'device': 'gpu' if torch.cuda.is_available() else 'cpu',
    'colsample_bytree': 0.7,
    'verbose': -1,
}

# DART booster, small trees. This dict used to list 'random_state' twice (42, then
# `random_state`); the later entry silently won, so only that one is kept.
lgb_params1 = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'dart',
    'random_state': random_state,
    'colsample_bytree': 0.50,
    'subsample': 0.70,
    'learning_rate': 0.0625,
    'max_depth': -1,
    'n_estimators': 1000,
    'num_leaves': 20,
    'reg_alpha': 0.0001,
    'reg_lambda': 2.0,
    'verbose': -1,
    'device': 'gpu' if torch.cuda.is_available() else 'cpu',
}

# Same as lgb_params1 but with the GBDT booster (same duplicate-key clean-up).
lgb_params2 = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'random_state': random_state,
    'colsample_bytree': 0.50,
    'subsample': 0.70,
    'learning_rate': 0.0625,
    'max_depth': -1,
    'n_estimators': 1000,
    'num_leaves': 20,
    'reg_alpha': 0.0001,
    'reg_lambda': 2.0,
    'verbose': -1,
    'device': 'gpu' if torch.cuda.is_available() else 'cpu',
}

# NOTE(review): 20000 boosting rounds with no early-stopping setting in this dict;
# presumably every fold trains all rounds — confirm this is intended.
lgb_params3 = {
    'n_estimators': 20000,
    'learning_rate': 0.07,
    'objective': 'binary',
    'boosting_type': 'gbdt',

    'subsample': 1.0,
    'num_leaves': 23,
    'max_bin': 1023,
    'n_jobs': -1,

    'reg_alpha': 0.65,
    'reg_lambda': 3.1,
    'colsample_bytree': 0.568,
    'min_child_samples': 864,
    'random_state': 1920,
    'verbose': -1,
}

# XGBoost parameter sets.
# NOTE(review): the XGBoost docs list `eta` as an alias of `learning_rate`;
# xgb_optuna0, xgb_params0 and xgb_params1 set both to different values, so which
# one the booster uses is unclear from this notebook — confirm and keep only one.
# Both keys are left untouched here so behaviour does not change.

xgb_optuna0 = {
    'n_estimators': 10000,
    'learning_rate': 0.01752354328845971,
    'booster': 'gbtree',
    'lambda': 0.08159630121074074,
    'alpha': 0.07564858712175693,
    'subsample': 0.5065979400270813,
    'colsample_bytree': 0.6187340851873067,
    'max_depth': 4,
    'min_child_weight': 5,
    'eta': 0.2603059902806757,
    'gamma': 0.6567360773618207,
    'early_stopping_rounds': 100,
    'tree_method': 'hist',
    'random_state': random_state,
}

xgb_params0 = {
    'n_estimators': 10000,
    'learning_rate': 0.09641232707445854,
    'booster': 'gbtree',
    'lambda': 4.666002223704784,
    'alpha': 3.708175990751336,
    'subsample': 0.6100174145229473,
    'colsample_bytree': 0.5506821152321051,
    'max_depth': 7,
    'min_child_weight': 3,
    # NOTE(review): the docs give the range of `eta` as [0, 1]; this value is above it.
    'eta': 1.740374368661041,
    'gamma': 0.007427363662926455,
    'grow_policy': 'depthwise',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'verbosity': 0,
    'random_state': random_state,
    'early_stopping_rounds': 100,
    'tree_method': 'hist',
}

xgb_params1 = {
    'n_estimators': 10000,
    'learning_rate': 0.012208383405206188,
    'booster': 'gbtree',
    'lambda': 0.009968756668882757,
    'alpha': 0.02666266827121168,
    'subsample': 0.7097814108897231,
    'colsample_bytree': 0.7946945784285216,
    'max_depth': 3,
    'min_child_weight': 4,
    'eta': 0.5480204506554545,
    'gamma': 0.8788654128774149,
    'scale_pos_weight': 4.71,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'early_stopping_rounds': 100,
    'verbosity': 0,
    'random_state': random_state,
    'tree_method': 'hist',
}

# This dict used to list 'random_state' twice (811996, then `random_state`);
# the later entry silently won, so only that value is kept.
xgb_params2 = {
    'n_estimators': 10000,
    'colsample_bytree': 0.5646751146007976,
    'gamma': 7.788727238356553e-06,
    'learning_rate': 0.1419865761603358,
    'max_bin': 824,
    'min_child_weight': 1,
    'random_state': random_state,
    'reg_alpha': 1.6259583347890365e-07,
    'reg_lambda': 2.110691851528507e-08,
    'subsample': 0.879020578464637,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 3,
    'early_stopping_rounds': 100,
    'n_jobs': -1,
    'verbosity': 0,
    'tree_method': 'hist',
    # 'scale_pos_weight': scale_pos_weight
}

# This dict used to list 'n_estimators' twice (10000, then 550); the later entry
# silently won, so 550 is the value that was actually in effect and is kept.
# NOTE(review): every other early-stopped set uses 10000 — confirm which was meant.
xgb_params3 = {
    'n_estimators': 550,
    'random_state': random_state,
    'colsample_bytree': 0.4836462317215041,
    'eta': 0.05976752607337169,
    'gamma': 1,
    'lambda': 0.2976432557733288,
    'max_depth': 6,
    'min_child_weight': 1,
    'early_stopping_rounds': 100,
    'objective': 'binary:logistic',
    'scale_pos_weight': 4.260162886376033,
    'subsample': 0.7119282378433924,
    'tree_method': 'hist',
}

xgb_params4 = {
    'n_estimators': 10000,
    'colsample_bytree': 0.8757972257439255,
    'gamma': 0.11135738771999848,
    'max_depth': 7,
    'min_child_weight': 3,
    'reg_alpha': 0.4833998914998038,
    'reg_lambda': 0.006223568555619563,
    'scale_pos_weight': 8,
    'subsample': 0.7056434340275685,
    'random_state': random_state,
    'tree_method': 'hist',
    'early_stopping_rounds': 100,
}

xgb_params5 = {
    'n_estimators': 10000,
    'max_depth': 5,
    'min_child_weight': 2.934487833919741,
    'learning_rate': 0.11341944575807082,
    'subsample': 0.9045063514419968,
    'gamma': 0.4329153382843715,
    'colsample_bytree': 0.38872702868412506,
    'colsample_bylevel': 0.8321880031718571,
    'colsample_bynode': 0.802355707802605,
    'random_state': random_state,
    'tree_method': 'hist',
    'early_stopping_rounds': 100,
}

# Minimal baseline: library defaults apart from the round count and the seed.
xgb_base = {
    'n_estimators': 1000,
    'verbosity': 0,
    'random_state': random_state,
}

xgb_params6 = {
    'objective': 'binary:logistic',
    'colsample_bytree': 0.7,
    'gamma': 2,
    'learning_rate': 0.01,
    'max_depth': 7,
    'min_child_weight': 10,
    'n_estimators': 10000,
    'subsample': 0.7,
    'random_state': random_state,
    'tree_method': 'hist',
    'early_stopping_rounds': 100,
}

# CatBoost settings: up to 10000 iterations with early stopping after 100
# rounds, AUC as evaluation metric, and balanced automatic class weights.
cat_params0 = dict(
    iterations=10000,
    eval_metric='AUC',
    loss_function='Logloss',
    task_type='GPU' if torch.cuda.is_available() else 'CPU',
    early_stopping_rounds=100,
    auto_class_weights='Balanced',
)

In [61]:
from abc import ABCMeta
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

class BaseModel(metaclass=ABCMeta):
    """Common interface of every model wrapper used by the ensemble.

    Subclasses must implement ``train`` and ``predict``. Both are declared
    abstract so that a wrapper missing one of them fails at construction
    time instead of silently returning ``None`` later on.
    """

    def __init__(self):
        pass

    @abstractmethod
    def train(self, trn_x, trn_y, val_x, val_y):
        """Fit the model on the training fold; the validation fold is available for monitoring."""

    @abstractmethod
    def predict(self, X):
        """Return one prediction score per row of ``X``."""
    
    
class RandomForestModel(BaseModel):
    """Random-forest classifier behind the ``BaseModel`` interface.

    Keyword arguments override the default hyper-parameters, mirroring the
    ``**config`` constructors of the boosted-tree wrappers, e.g.
    ``RandomForestModel(random_state=43, n_jobs=-1)``. Called without
    arguments the model is configured exactly as before.
    """

    # Defaults used when the caller does not override them.
    DEFAULT_PARAMS = {
        'n_estimators': 1000,
        'class_weight': 'balanced',
        'max_depth': 7,
        'min_samples_split': 15,
        'min_samples_leaf': 10,
    }

    def __init__(self, **config):
        super().__init__()
        self.config = {**self.DEFAULT_PARAMS, **config}
        self.model = None

    def train(self, trn_x, trn_y, val_x, val_y):
        """Fit a fresh forest on the training fold; the validation fold is not used."""
        self.model = RandomForestClassifier(**self.config)
        self.model.fit(trn_x, trn_y)

    def predict(self, X):
        """Return the second column of ``predict_proba`` (label 1 for 0/1 targets)."""
        return self.model.predict_proba(X)[:, 1]

class HistGradientBoostingModel(BaseModel):
    """Histogram gradient-boosting classifier behind the ``BaseModel`` interface.

    Keyword arguments override the default hyper-parameters, mirroring the
    ``**config`` constructors of the boosted-tree wrappers, e.g.
    ``HistGradientBoostingModel(random_state=43)``. Called without arguments
    the model is configured exactly as before.
    """

    # Defaults used when the caller does not override them.
    DEFAULT_PARAMS = {
        'l2_regularization': 0.01,
        'early_stopping': True,
        'learning_rate': 0.01,
        'max_iter': 1000,
        'max_depth': 5,
        'max_bins': 255,
        'min_samples_leaf': 15,
        'max_leaf_nodes': 10,
    }

    def __init__(self, **config):
        super().__init__()
        self.config = {**self.DEFAULT_PARAMS, **config}
        self.model = None

    def train(self, trn_x, trn_y, val_x, val_y):
        """Fit a fresh estimator on the training fold.

        ``val_x`` / ``val_y`` are not passed to the estimator.
        """
        self.model = HistGradientBoostingClassifier(**self.config)
        self.model.fit(trn_x, trn_y)

    def predict(self, X):
        """Return the second column of ``predict_proba`` (label 1 for 0/1 targets)."""
        return self.model.predict_proba(X)[:, 1]

class LGBMModel(BaseModel):
    """LightGBM classifier behind the ``BaseModel`` interface.

    The keyword arguments are stored in ``self.config`` and forwarded
    unchanged to ``LGBMClassifier``; the estimator is built once, at
    construction time.
    """

    def __init__(self, **config):
        super().__init__()
        self.config = config
        self.model = LGBMClassifier(**config)

    def train(self, trn_x, trn_y, val_x, val_y):
        """Fit on the training fold, passing the validation fold as ``eval_set``."""
        validation_fold = (val_x, val_y)
        self.model.fit(trn_x, trn_y, eval_set=validation_fold)

    def predict(self, X):
        """Return the second column of ``predict_proba`` (label 1 for 0/1 targets)."""
        class_probabilities = self.model.predict_proba(X)
        return class_probabilities[:, 1]

    
class XGBModel(BaseModel):
    """XGBoost classifier behind the ``BaseModel`` interface.

    The keyword arguments are stored in ``self.config`` and forwarded
    unchanged to ``XGBClassifier``; the estimator is built once, at
    construction time.
    """

    def __init__(self, **config):
        super().__init__()
        self.config = config
        self.model = XGBClassifier(**config)

    def train(self, trn_x, trn_y, val_x, val_y):
        """Fit on the training fold with the validation fold as the only ``eval_set`` entry."""
        validation_sets = [(val_x, val_y)]
        self.model.fit(trn_x, trn_y, eval_set=validation_sets, verbose=0)

    def predict(self, X):
        """Return the second column of ``predict_proba`` (label 1 for 0/1 targets)."""
        class_probabilities = self.model.predict_proba(X)
        return class_probabilities[:, 1]

class CatBoostModel(BaseModel):
    """CatBoost classifier behind the ``BaseModel`` interface.

    The keyword arguments are stored in ``self.config`` and forwarded
    unchanged to ``CatBoostClassifier``; the estimator is built once, at
    construction time.
    """

    def __init__(self, **config):
        super().__init__()
        self.config = config
        self.model = CatBoostClassifier(**config)

    def train(self, trn_x, trn_y, val_x, val_y):
        """Fit on the training fold, passing the validation fold as ``eval_set``."""
        validation_fold = (val_x, val_y)
        self.model.fit(trn_x, trn_y, eval_set=validation_fold, verbose=0)

    def predict(self, X):
        """Return the second column of ``predict_proba`` (label 1 for 0/1 targets)."""
        class_probabilities = self.model.predict_proba(X)
        return class_probabilities[:, 1]


In [62]:
from sklearn.model_selection import RepeatedStratifiedKFold

def train(X_train, y_train, X_test, models, n_folds=10):
    """Cross-validate every model and hill-climb a blend of them in each fold.

    For each stratified fold, every model is trained on the training part and
    predicts both the validation part and ``X_test``. The plain average of the
    validation predictions is scored as a baseline, then ``hill_climbing``
    searches blend weights on the validation predictions and applies them to
    the test predictions.

    Note that the validation fold is also handed to ``model.train`` (the
    boosted wrappers pass it on as ``eval_set``) and is the data the blend
    weights are tuned on, so the printed fold scores are likely optimistic.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training labels, positionally aligned with ``X_train``.
    X_test : pandas.DataFrame
        Test features.
    models : sequence
        Objects exposing ``train(trn_x, trn_y, val_x, val_y)`` and ``predict(X)``.
        The same objects are re-trained in every fold.
    n_folds : int, default 10
        Number of stratified folds.

    Returns
    -------
    list
        One blended test prediction per fold.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("`models` must contain at least one model")

    hill_ens_preds = []

    sk = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=1, random_state=42)
    for i, (trn_idx, val_idx) in enumerate(sk.split(X_train, y_train)):

        trn_x, trn_y = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

        print('----------------------------------------------------------')

        # Predictions of this fold only, keyed by the model's position in `models`.
        valid_preds, test_preds = {}, {}
        for idx, model in enumerate(models):
            print("model", model)
            model.train(trn_x, trn_y, val_x, val_y)

            model_valid_pred = model.predict(val_x)
            model_test_pred = model.predict(X_test)

            print(f"ROC: {roc_auc_score(val_y, model_valid_pred)}")

            valid_preds[f'model_{idx}'] = model_valid_pred
            test_preds[f'model_{idx}'] = model_test_pred

        # Baseline: unweighted mean of all models on the validation fold.
        mean_valid_pred = np.mean(list(valid_preds.values()), axis=0)
        ens_score_fold = roc_auc_score(val_y, mean_valid_pred)
        print('Fold', i, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)

        ############################
        ## Hill Climbing Ensemble ##
        ############################

        hill_results = hill_climbing(pd.DataFrame(valid_preds), val_y, pd.DataFrame(test_preds))
        hill_ens_score_fold = roc_auc_score(val_y, hill_results[0])

        # Keep the blended test predictions of this fold.
        hill_ens_preds.append(hill_results[1])

        print('Fold', i, '==> Hill Climbing Ensemble oof ROC-AUC score is ==>', hill_ens_score_fold)

    return hill_ens_preds

In [63]:
# Feature matrices for train and test, plus the training labels.
X_train = df_train[feat_list]
X_test = df_test[feat_list]
y_train = df_train[target]

In [64]:
# Log 
# X_train = X_train.apply(lambda x: np.log1p(x))
# X_test = X_test.apply(lambda x: np.log1p(x))

## LGBM Bayesian Optimization

In [65]:
def lgbm_cl_bo(min_child_samples, colsample_bytree, learning_rate, num_leaves, reg_alpha, reg_lambda):
    """Objective for Bayesian optimisation of LightGBM hyper-parameters.

    Builds an ``LGBMClassifier`` from the sampled values and returns its mean
    5-fold cross-validated ROC-AUC on the notebook-level ``X_train`` /
    ``y_train``.

    Parameters
    ----------
    min_child_samples, num_leaves : float
        Rounded to the nearest integer before use.
    colsample_bytree, learning_rate, reg_alpha, reg_lambda : float
        Passed through unchanged.

    Returns
    -------
    float
        Mean ROC-AUC over the 5 folds.
    """
    params_lgbm = {
        # sampled values
        'min_child_samples': round(min_child_samples),
        'colsample_bytree': colsample_bytree,
        'learning_rate': learning_rate,
        'num_leaves': round(num_leaves),
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        # fixed settings
        'boosting_type': 'gbdt',   # Manual optimization
        'objective': 'binary',     # Manual optimization
        'subsample': 1.0,
        'max_bin': 1023,
        'n_jobs': -1,
        'verbose': -1,
    }

    fold_scores = cross_val_score(
        LGBMClassifier(**params_lgbm, random_state=2920),
        X_train,
        y_train,
        scoring='roc_auc',
        cv=5,
    )
    return fold_scores.mean()

In [66]:
from bayes_opt import BayesianOptimization 

# Search bounds (low, high) for each argument of `lgbm_cl_bo`.
params_lgbm = dict(
    min_child_samples=(0, 8),
    colsample_bytree=(0.3, 1.0),
    learning_rate=(0.005, 0.1),
    num_leaves=(20, 60),
    reg_alpha=(0.0, 10.0),
    reg_lambda=(0.0, 5.0),
)

# lgbm_bo = BayesianOptimization(lgbm_cl_bo, params_lgbm, random_state=2920)
# lgbm_bo.maximize(n_iter=30, init_points=20)

## XGBoost Bayesian Optimization

In [67]:
# from sklearn.model_selection import cross_val_score
# 
# def xgb_cl_bo(n_estimators, learning_rate, max_depth, min_child_weight, subsample, colsample_bytree, reg_alpha, reg_lambda):
#     
#     params_xgb = {}
#     params_xgb['n_estimators'] = n_estimators
#     params_xgb['learning_rate'] = learning_rate
#     params_xgb['max_depth'] = round(max_depth)
#     params_xgb['min_child_weight'] = round(min_child_weight)
#     params_xgb['subsample'] = subsample
#     params_xgb['colsample_bytree'] = colsample_bytree
#     params_xgb['alpha'] = reg_alpha
#     params_xgb['lambda'] = reg_lambda    
#        
#     params_xgb['boosting_type'] ='gbdt'   # Manual optimization
#     params_xgb['objective'] ='binary'     # Manual optimization
#     params_xgb['max_bin'] = 1023
#     params_xgb['n_jobs'] = -1
#     params_xgb['verbose'] = -1
# 
#     scores = cross_val_score(XGBClassifier(**params_xgb, random_state=2920), X_train, y_train, scoring='roc_auc', cv=5).mean()
#     score = scores.mean()
#     return score

In [68]:
# from bayes_opt import BayesianOptimization 
# 
# params_xgb = {
#     'n_estimators': (1000, 20000),
#     'learning_rate':(0.005, 0.12),
#     'max_depth': (3,10),
#     'min_child_weight': (0, 8),
#     'subsample': (0.5, 1.0),
#     'colsample_bytree':(0.3, 1.0),
#     'reg_alpha':(0.0, 10.0),
#     'reg_lambda':(0.0, 5.0)
# }
# 
# xgb_bo = BayesianOptimization(xgb_cl_bo, params_xgb, random_state=2920)
# xgb_bo.maximize(n_iter=30, init_points=20)

## All you need to do is find the best ensemble

In [70]:
# Ensemble members, in order: four XGBoost configurations, four LightGBM
# configurations, one histogram gradient-boosting model and one CatBoost model.
models_v1 = (
    [XGBModel(**params) for params in (xgb_optuna0, xgb_params1, xgb_params3, xgb_params6)]
    + [LGBMModel(**params) for params in (lgb_params0, lgb_params1, lgb_params2, lgb_params3)]
    + [HistGradientBoostingModel(), CatBoostModel(**cat_params0)]
)

In [None]:
# One hill-climbed test prediction per fold (10 stratified folds).
hill_ens_preds = train(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    models=models_v1,
    n_folds=10,
)

----------------------------------------------------------
model <__main__.XGBModel object at 0x2ac473d90>
ROC: 0.7917085947469967
model <__main__.XGBModel object at 0x2ac4731c0>
ROC: 0.7871215388335264
model <__main__.XGBModel object at 0x2ac473430>
ROC: 0.7857408175438655
model <__main__.XGBModel object at 0x2ac473190>
ROC: 0.791705923467002
model <__main__.LGBMModel object at 0x2ac473d60>
ROC: 0.7911633782383762
model <__main__.LGBMModel object at 0x2ac473340>
ROC: 0.7925496348608099
model <__main__.LGBMModel object at 0x2ac473b50>
ROC: 0.7912036126721118
model <__main__.LGBMModel object at 0x2ac473220>
[LightGBM] [Info] Number of positive: 20757, number of negative: 70829
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8446
[LightGBM] [Info] Number of data points in the train set: 91586, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.226639 -> initsc

In [17]:
# hill climbing
# For submission on Kaggle:
# submission = pd.read_csv('/kaggle/input/playground-series-s3e23/sample_submission.csv')

# Local
submission = pd.read_csv('./playground-series-s3e23/sample_submission.csv')

# Final prediction: element-wise mean of the per-fold hill-climbing test predictions.
# Assumes the sample submission rows are in the same order as the test set.
submission['defects'] = np.asarray(hill_ens_preds).mean(axis=0)
submission.to_csv('submission.csv', index=False)


In [18]:
# Display the submission frame as the cell output.
submission

Unnamed: 0,id,defects
0,101763,0.253385
1,101764,0.207265
2,101765,0.671884
3,101766,0.488249
4,101767,0.159772
...,...,...
67837,169600,0.296455
67838,169601,0.133013
67839,169602,0.200931
67840,169603,0.114811
