In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import catboost
import lightgbm
import warnings
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, QuantileTransformer
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.linear_model import LinearRegression, RidgeClassifier
from sklearn.svm import SVC

from lightgbm import LGBMClassifier
import copy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F

warnings.filterwarnings("ignore")


# Load Data

In [2]:
# 캐글 실행용
# df_train = pd.read_csv('../input/playground-series-s3e23/train.csv', index_col='id')
# df_test = pd.read_csv('../input/playground-series-s3e23/test.csv', index_col='id')
# df_original = pd.read_csv('/kaggle/input/software-defect-prediction/jm1.csv')

# 로컬
df_train = pd.read_csv("./playground-series-s3e23/train.csv", index_col="id")
df_test = pd.read_csv("./playground-series-s3e23/test.csv", index_col="id")
df_original = pd.read_csv("./software-defect-prediction/jm1.csv")


# Feature

## Original 과  train 을 합치는 코드 

In [3]:
columns = ["uniq_Op", "uniq_Opnd", "total_Op", "total_Opnd", "branchCount"]

for col in columns:
    df_original[col][df_original[col] == "?"] = np.nan
    df_original[col] = df_original[col].astype(float)


df_train = pd.concat([df_train, df_original], ignore_index=True)
df_train = df_train.drop_duplicates()


## Quantile Transformer 
+ 여기서 train['defects']에 nan이 생김 -> 위치 바꿔줌
+ scale 해주는 함수

In [4]:
feat_list = list(set(df_train.columns) - set(['defects']))
target = 'defects'

In [5]:
qt = QuantileTransformer(n_quantiles=1000, output_distribution='uniform', random_state=0)
qt.fit(df_train[feat_list])

tmp_train = pd.DataFrame(qt.transform(df_train[feat_list]))
tmp_train.columns = feat_list
tmp_train[target] = df_train[target]
df_train = tmp_train
df_test = pd.DataFrame(qt.transform(df_test[feat_list]))
df_test.columns = feat_list

feat_list = list(set(df_train.columns.tolist()) - set([target]))


In [6]:
device = torch.device(0) if torch.cuda.is_available() else torch.device("cpu")


## drop N/A

In [7]:
df_train.dropna(axis=0,inplace=True)
df_train

Unnamed: 0,uniq_Opnd,lOCode,branchCount,lOBlank,total_Op,loc,v,v(g),e,i,...,d,total_Opnd,b,locCodeAndComment,uniq_Op,l,t,ev(g),iv(g),defects
0,0.368368,0.580581,0.448949,0.278278,0.576577,0.484484,0.553554,0.449449,0.647648,0.212212,...,0.772773,0.531532,0.535536,0.000000,0.758258,0.359860,0.639708,0.0,0.410911,False
1,0.460460,0.416917,0.262763,0.278278,0.301301,0.288789,0.362362,0.267267,0.335013,0.434434,...,0.334334,0.363864,0.355856,0.000000,0.462462,0.732733,0.327028,0.0,0.410911,False
2,0.460460,0.296797,0.262763,0.278278,0.463463,0.196697,0.443443,0.267267,0.452452,0.481481,...,0.402803,0.432432,0.457958,0.000000,0.537538,0.612613,0.446446,0.0,0.410911,False
3,0.210711,0.141642,0.000000,0.436436,0.263764,0.103604,0.242242,0.000000,0.227544,0.335318,...,0.230230,0.175676,0.236737,0.000000,0.262763,0.843844,0.225744,0.0,0.000000,True
4,0.210711,0.258759,0.262763,0.436436,0.150651,0.196697,0.157157,0.267267,0.201201,0.153153,...,0.252252,0.265766,0.162162,0.000000,0.207708,0.827327,0.197197,0.0,0.410911,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110669,0.750250,0.602603,0.262763,0.687688,0.664164,0.526527,0.693010,0.267267,0.655852,0.806484,...,0.589590,0.743744,0.689189,0.000000,0.462462,0.471471,0.647595,0.0,0.410911,False
110671,0.321822,0.178178,0.262763,0.436436,0.319319,0.132132,0.321822,0.267267,0.362362,0.260260,...,0.412913,0.293293,0.301301,0.000000,0.537538,0.655155,0.353563,0.0,0.410911,False
110672,0.614615,0.779279,0.578078,0.916917,0.708709,0.737237,0.715465,0.574074,0.797786,0.390390,...,0.873654,0.743744,0.705706,0.000000,0.835836,0.215215,0.793514,0.0,0.410911,False
110673,0.321822,0.217217,0.000000,0.436436,0.356857,0.162162,0.353353,0.000000,0.384384,0.321468,...,0.425425,0.388388,0.355856,0.000000,0.324324,0.655155,0.377978,0.0,0.000000,False


## iqr outlier detection

In [8]:
def iqr_outlier_detection(data):

    q1, q3 = np.percentile(data, [25, 75])
 
    iqr = q3 - q1

    threshold = iqr * 1.5
    outliers = [x for x in data if x < q1 - threshold or x > q3 + threshold]

    data_filtered = data[~np.isin(data, outliers)]

    return data_filtered


In [9]:
for col in df_train.columns:
  iqr_outlier_detection(df_train[col])


## Feature Engineering

In [10]:
df_train['defects'] = df_train.defects.astype(int)


In [11]:
df_test['l']  = 1.0 - df_test['l']
df_train['l'] = 1.0 - df_train['l']


In [12]:
def add_feat(X):
    df = X.copy()
    df["mean_bnv"] = (df["n"] + df["v"] + df["b"]) / 3
    df["mean_uniqOpOpend"] = (df["uniq_Op"] + df["uniq_Opnd"]) / 2
    df["mean_totOpOpend"] = (df["total_Op"] + df["total_Opnd"]) / 2
    df["mean_brcntvg"] = (df["branchCount"] + df["v(g)"]) / 2
    return df


df_train = add_feat(df_train)
df_test = add_feat(df_test)

### HERE ###############
df_train.dropna(axis=0,inplace=True)
### HERE ###############


# Modeling

In [13]:
X_train = df_train.drop(['defects'], axis=1)
y_train = df_train['defects']
X_test = df_test

X_train


Unnamed: 0,uniq_Opnd,lOCode,branchCount,lOBlank,total_Op,loc,v,v(g),e,i,...,locCodeAndComment,uniq_Op,l,t,ev(g),iv(g),mean_bnv,mean_uniqOpOpend,mean_totOpOpend,mean_brcntvg
0,0.368368,0.580581,0.448949,0.278278,0.576577,0.484484,0.553554,0.449449,0.647648,0.212212,...,0.000000,0.758258,0.640140,0.639708,0.0,0.410911,0.545379,0.563313,0.554054,0.449199
1,0.460460,0.416917,0.262763,0.278278,0.301301,0.288789,0.362362,0.267267,0.335013,0.434434,...,0.000000,0.462462,0.267267,0.327028,0.0,0.410911,0.347347,0.461461,0.332583,0.265015
2,0.460460,0.296797,0.262763,0.278278,0.463463,0.196697,0.443443,0.267267,0.452452,0.481481,...,0.000000,0.537538,0.387387,0.446446,0.0,0.410911,0.450117,0.498999,0.447948,0.265015
3,0.210711,0.141642,0.000000,0.436436,0.263764,0.103604,0.242242,0.000000,0.227544,0.335318,...,0.000000,0.262763,0.156156,0.225744,0.0,0.000000,0.236904,0.236737,0.219720,0.000000
4,0.210711,0.258759,0.262763,0.436436,0.150651,0.196697,0.157157,0.267267,0.201201,0.153153,...,0.000000,0.207708,0.172673,0.197197,0.0,0.410911,0.159826,0.209209,0.208208,0.265015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110669,0.750250,0.602603,0.262763,0.687688,0.664164,0.526527,0.693010,0.267267,0.655852,0.806484,...,0.000000,0.462462,0.528529,0.647595,0.0,0.410911,0.689629,0.606356,0.703954,0.265015
110671,0.321822,0.178178,0.262763,0.436436,0.319319,0.132132,0.321822,0.267267,0.362362,0.260260,...,0.000000,0.537538,0.344845,0.353563,0.0,0.410911,0.310477,0.429680,0.306306,0.265015
110672,0.614615,0.779279,0.578078,0.916917,0.708709,0.737237,0.715465,0.574074,0.797786,0.390390,...,0.000000,0.835836,0.784785,0.793514,0.0,0.410911,0.710961,0.725225,0.726226,0.576076
110673,0.321822,0.217217,0.000000,0.436436,0.356857,0.162162,0.353353,0.000000,0.384384,0.321468,...,0.000000,0.324324,0.344845,0.377978,0.0,0.000000,0.357858,0.323073,0.372623,0.000000


## Evaluation Metric (AUC)

In [14]:
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import cross_val_score

def roc_auc(true_list, pred_list, figlen):
    
    fpr, tpr, _ = roc_curve(true_list, pred_list)    
    roc_auc = auc(fpr, tpr)
    print(f'\nROC_AUC: %0.6f\n' %roc_auc)
    
    if (figlen > 0):
        sns.set()
        plt.style.use('seaborn-whitegrid')
        plt.figure(figsize=(figlen, figlen), facecolor='lightyellow')
        plt.gca().set_facecolor('lightgray')
        plt.fill_between(fpr, tpr, color='r', alpha=0.1)
        plt.plot(fpr, tpr, color='red', lw=2, label='ROC curve')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([-0.01, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('The area under the ROC curve\n', fontsize=16, c='darkred')
        plt.legend(loc="lower right")
        plt.show()

## HyperParameter Tuning

In [15]:
from bayes_opt import BayesianOptimization


def lgbm_cl_bo(min_child_samples, colsample_bytree, learning_rate, num_leaves, reg_alpha, reg_lambda):
    
    params_lgbm = {}
    params_lgbm['min_child_samples'] = round(min_child_samples)
    params_lgbm['colsample_bytree'] = colsample_bytree
    params_lgbm['learning_rate'] = learning_rate
    params_lgbm['num_leaves'] = round(num_leaves)
    params_lgbm['reg_alpha'] = reg_alpha
    params_lgbm['reg_lambda'] = reg_lambda    
       
    params_lgbm['boosting_type'] ='gbdt'   # Manual optimization
    params_lgbm['objective'] ='binary'     # Manual optimization
    params_lgbm['subsample'] = 1.0
    params_lgbm['max_bin'] = 1023
    params_lgbm['n_jobs'] = -1
    params_lgbm['verbose'] = -1

    scores = cross_val_score(LGBMClassifier(**params_lgbm, random_state=2920), X_train, y_train, scoring='roc_auc', cv=5).mean()
    score = scores.mean()
    return score

In [16]:
params_lgbm ={'min_child_samples':(800, 1200),
              'colsample_bytree':(0.3, 1.0),
              'learning_rate':(0.005, 0.1),
              'num_leaves':(20, 60),
              'reg_alpha':(0.0, 10.0),
              'reg_lambda':(0.0, 5.0)}

# lgbm_bo = BayesianOptimization(lgbm_cl_bo, params_lgbm, random_state=2920)
# import warnings
# warnings.filterwarnings('ignore')
# lgbm_bo.maximize(n_iter=30, init_points=20)

## Hill Climbing Ensemble

In [17]:
def hill_climbing(x, y, x_test):

    # Evaluating oof predictions
    scores = {}
    for col in x.columns:
        scores[col] = roc_auc_score(y, x[col])

    # Sorting the model scores
    scores = {k: v for k, v in sorted(scores.items(), key = lambda item: item[1], reverse = True)}

    # Sort oof_df and test_preds
    x = x[list(scores.keys())]
    x_test = x_test[list(scores.keys())]

    STOP = False
    current_best_ensemble = x.iloc[:,0]
    current_best_test_preds = x_test.iloc[:,0]
    MODELS = x.iloc[:,1:]
    weight_range = np.arange(-0.5, 0.71, 0.01)
    history = [roc_auc_score(y, current_best_ensemble)]
    j = 0
    while not STOP:
        j += 1
        potential_new_best_cv_score = roc_auc_score(y, current_best_ensemble)
        k_best, wgt_best = None, None
        for k in MODELS:
            for wgt in weight_range:
                potential_ensemble = (1 - wgt) * current_best_ensemble + wgt * MODELS[k]
                cv_score = roc_auc_score(y, potential_ensemble)
                if cv_score > potential_new_best_cv_score:
                    potential_new_best_cv_score = cv_score
                    k_best, wgt_best = k, wgt

        if k_best is not None:
            current_best_ensemble = (1 - wgt_best) * current_best_ensemble + wgt_best * MODELS[k_best]
            current_best_test_preds = (1 - wgt_best) * current_best_test_preds + wgt_best * x_test[k_best]
            MODELS.drop(k_best, axis = 1, inplace = True)
            if MODELS.shape[1] == 0:
                STOP = True
            history.append(potential_new_best_cv_score)
        else:
            STOP = True

    hill_ens_pred_1 = current_best_ensemble
    hill_ens_pred_2 = current_best_test_preds

    return [hill_ens_pred_1, hill_ens_pred_2]


## hyperparameter sets

In [26]:
random_state = 43

lgb_params0 = {
    "objective": 'binary',
    'metric': 'auc',
    'boosting_type': 'dart',
     "n_estimators": 100,
     "max_depth": 7,
     "learning_rate":0.03,
     "num_leaves": 50,
     "reg_alpha":3,
     "reg_lambda": 3,
     "subsample": 0.7,
     'device': 'gpu' if torch.cuda.is_available() else 'cpu',
     "colsample_bytree": 0.7,
     'verbose': -1,
    'min_child_samples': 864,
}

lgb_params1 = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'dart',
    'random_state': 42,
    'colsample_bytree': 0.50,
    'subsample': 0.70,
    'learning_rate': 0.0625,
    'max_depth': -1,
    'n_estimators': 100,
    'num_leaves': 20,
    'reg_alpha': 0.0001,
    'reg_lambda': 2.0,
    'device': 'gpu' if torch.cuda.is_available() else 'cpu',
    'random_state': random_state,
    'verbose': -1,  
    'min_child_samples': 864,
}

lgb_params2 = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'random_state': 42,
    'colsample_bytree': 0.50,
    'subsample': 0.70,
    'learning_rate': 0.0625,
    'max_depth': -1,
    'n_estimators': 100,
    'num_leaves': 20,
    'reg_alpha': 0.0001,
    'reg_lambda': 2.0,
    'device': 'gpu' if torch.cuda.is_available() else 'cpu',
    'random_state': random_state,
    'verbose': -1,
    'min_child_samples': 864,
}
lgb_params3 = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'random_state': 42,
    
    'colsample_bytree': 0.5688,
    'learning_rate': 0.07712,
    'min_child_samples': 864,
    'num_leaves': 23,
    'reg_alpha': 0.6515,
    'reg_lambda': 3.118,
    
    'subsample': 0.70, 
    'max_depth': -1,
    'n_estimators': 100,
    'device': 'gpu' if torch.cuda.is_available() else 'cpu',
    'random_state': random_state,
    'verbose': -1,
}

xgb_optuna0 = {
    'n_estimators': 10000,
    'learning_rate': 0.01752354328845971,
    'booster': 'gbtree',
    'lambda': 0.08159630121074074,
    'alpha': 0.07564858712175693,
    'subsample': 0.5065979400270813,
    'colsample_bytree': 0.6187340851873067,
    'max_depth': 4,
    'min_child_weight': 5,
    'eta': 0.2603059902806757,
    'gamma': 0.6567360773618207,
    'early_stopping_rounds': 100,
    'tree_method': 'hist',
    'random_state': random_state
}

xgb_params0 = {
    'n_estimators': 10000,
    'learning_rate': 0.09641232707445854,
    'booster': 'gbtree',
    'lambda': 4.666002223704784,
    'alpha': 3.708175990751336,
    'subsample': 0.6100174145229473,
    'colsample_bytree': 0.5506821152321051,
    'max_depth': 7,
    'min_child_weight': 3,
    'eta': 1.740374368661041,
    'gamma': 0.007427363662926455,
    'grow_policy': 'depthwise',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'verbosity': 0,
    'random_state': random_state,
    'early_stopping_rounds': 100,
    'tree_method': 'hist',
}

xgb_params1 = {
    'n_estimators': 10000,
    'learning_rate': 0.012208383405206188,
    'booster': 'gbtree',
    'lambda': 0.009968756668882757,
    'alpha': 0.02666266827121168,
    'subsample': 0.7097814108897231,
    'colsample_bytree': 0.7946945784285216,
    'max_depth': 3,
    'min_child_weight': 4,
    'eta': 0.5480204506554545,
    'gamma': 0.8788654128774149,
    'scale_pos_weight': 4.71,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'early_stopping_rounds': 100,
    'verbosity': 0,
    'random_state': random_state,
    'tree_method': 'hist',
}



xgb_params2 = {
    'n_estimators': 10000,
    'colsample_bytree': 0.5646751146007976,
    'gamma': 7.788727238356553e-06,
    'learning_rate': 0.1419865761603358,
    'max_bin': 824,
    'min_child_weight': 1,
    'random_state': 811996,
    'reg_alpha': 1.6259583347890365e-07,
    'reg_lambda': 2.110691851528507e-08,
    'subsample': 0.879020578464637,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 3,
    'early_stopping_rounds': 100,
    'n_jobs': -1,
    'verbosity': 0,
    'random_state': random_state,
    'tree_method': 'hist',
   # 'scale_pos_weight': scale_pos_weight
}

xgb_params3 = {
    'n_estimators': 10000,
    'random_state': random_state,
    'colsample_bytree': 0.4836462317215041,
    'eta': 0.05976752607337169,
    'gamma': 1,
    'lambda': 0.2976432557733288,
    'max_depth': 6,
    'min_child_weight': 1,
    'n_estimators': 550,
    'early_stopping_rounds': 100,
    'objective': 'binary:logistic',
    'scale_pos_weight': 4.260162886376033,
    'subsample': 0.7119282378433924,
    'tree_method': 'hist',
}

xgb_params4 = {
    'n_estimators': 10000,
    'colsample_bytree': 0.8757972257439255,
    'gamma': 0.11135738771999848,
    'max_depth': 7,
    'min_child_weight': 3,
    'reg_alpha': 0.4833998914998038,
    'reg_lambda': 0.006223568555619563,
    'scale_pos_weight': 8,
    'subsample': 0.7056434340275685,
    'random_state': random_state,
    'tree_method': 'hist',
    'early_stopping_rounds': 100,
}

xgb_params5 = {
    'n_estimators': 10000,
    'max_depth': 5,
    'min_child_weight': 2.934487833919741,
    'learning_rate': 0.11341944575807082,
    'subsample': 0.9045063514419968,
    'gamma': 0.4329153382843715,
    'colsample_bytree': 0.38872702868412506,
    'colsample_bylevel': 0.8321880031718571,
    'colsample_bynode': 0.802355707802605,
    'random_state': random_state,
    'tree_method': 'hist',
    'early_stopping_rounds': 100,
}

xgb_base = {
    'n_estimators': 1000,
    'verbosity': 0,
    'random_state': random_state,
}

xgb_params6 = {
    'objective': 'binary:logistic',
    'colsample_bytree': 0.7,
    'gamma': 2,
    'learning_rate': 0.01,
    'max_depth': 7,
    'min_child_weight': 10,
    'n_estimators': 10000,
    'subsample':0.7,
    'random_state': random_state,
    'tree_method': 'hist',
    'early_stopping_rounds': 100,
}

cat_params0 = {
    'iterations': 10000,
    'eval_metric': 'AUC',
    'loss_function': 'Logloss',
    'task_type': 'GPU' if torch.cuda.is_available() else 'CPU',
    'early_stopping_rounds': 100,
    'auto_class_weights': 'Balanced'
}


## class initialization

In [19]:
from abc import ABCMeta
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

class BaseModel(metaclass=ABCMeta):
    def __init__(self):
        pass

    def train(self, trn_x, trn_y, val_x, val_y):
        pass

    def predict(self, X):
        pass


class RandomForestModel(BaseModel):
    def __init__(self):
        super().__init__()
        self.model = None

    def train(self, trn_x, trn_y, val_x, val_y):
        self.model = RandomForestClassifier(n_estimators = 1000,
                                            class_weight='balanced',
                                           max_depth = 7,
                                           min_samples_split = 15,
                                           min_samples_leaf = 10)
        self.model.fit(trn_x, trn_y)

    def predict(self, X):
        return self.model.predict_proba(X)[:, 1]

class HistGradientBoostingModel(BaseModel):
    def __init__(self):
        super().__init__()
        self.model = None

    def train(self, trn_x, trn_y, val_x, val_y):
        self.model = HistGradientBoostingClassifier(l2_regularization = 0.01,
                                             early_stopping = True,
                                             learning_rate = 0.01,
                                             max_iter = 1000,
                                             max_depth = 5,
                                             max_bins = 255,
                                             min_samples_leaf = 15,
                                             max_leaf_nodes = 10)
        self.model.fit(trn_x, trn_y)

    def predict(self, X):
        return self.model.predict_proba(X)[:, 1]

class LGBMModel(BaseModel):
    def __init__(self, **config):
        super().__init__()
        self.config = config
        self.model = LGBMClassifier(**self.config)

    def train(self, trn_x, trn_y, val_x, val_y):
        self.model.fit(trn_x, trn_y, eval_set=[(val_x, val_y)])

    def predict(self, X):
        return self.model.predict_proba(X)[:, 1]


class XGBModel(BaseModel):
    def __init__(self, **config):
        super().__init__()
        self.config = config
        self.model = XGBClassifier(**self.config)

    def train(self, trn_x, trn_y, val_x, val_y):
        self.model.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=0)


    def predict(self, X):
        return self.model.predict_proba(X)[:, 1]

class CatBoostModel(BaseModel):
    def __init__(self, **config):
        super().__init__()
        self.config = config
        self.model = CatBoostClassifier(**self.config)

    def train(self, trn_x, trn_y, val_x, val_y):
        self.model.fit(trn_x, trn_y, eval_set=(val_x, val_y), verbose=0)

    def predict(self, X):
        return self.model.predict_proba(X)[:, 1]


In [20]:
from sklearn.model_selection import RepeatedStratifiedKFold

def train(X_train, y_train, X_test, models, n_folds=10):

    valid_preds, test_preds = dict(), dict()
    hill_ens_preds =  list()

    sk = RepeatedStratifiedKFold(n_splits = n_folds, n_repeats = 1, random_state = 42)
    for i, (trn_idx, val_idx) in enumerate(sk.split(X_train, y_train)):

        trn_x, trn_y = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

        print('----------------------------------------------------------')
        ens_pred_1 = np.zeros((val_x.shape[0], ))
        ens_pred_2 = np.zeros((X_test.shape[0], ))

        for idx, model in enumerate(models):
            print("model", model)
            model.train(trn_x, trn_y, val_x, val_y)

            ens_pred_1 = model.predict(val_x)
            ens_pred_2 = model.predict(X_test)
            
            print(f"ROC: {roc_auc_score(val_y, ens_pred_1)}")
            
            valid_preds[f'model_{idx}'] = ens_pred_1
            test_preds[f'model_{idx}'] = ens_pred_2

        ens_pred_1 = np.mean(list(valid_preds.values()), axis=0)
        ens_pred_2 = np.mean(list(test_preds.values()), axis=0)


        ens_score_fold = roc_auc_score(val_y, ens_pred_1)
        print('Fold', i, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)

        ############################
        ## Hill Climbing Ensemble ##
        ############################

        x = pd.DataFrame(valid_preds)
        y = val_y
        x_test = pd.DataFrame(test_preds)

        hill_results = hill_climbing(x, y, x_test)
        hill_ens_score_fold = roc_auc_score(y, hill_results[0])

        # test preds
        hill_ens_preds.append(hill_results[1])

        print('Fold', i, '==> Hill Climbing Ensemble oof ROC-AUC score is ==>', hill_ens_score_fold)
        
    return hill_ens_preds

In [21]:
# X_train, X_test, y_train = df_train[feat_list], df_test[feat_list], df_train[target]


In [29]:
models_v1 = [
    LGBMModel(**lgb_params0), 
    LGBMModel(**lgb_params1),
    LGBMModel(**lgb_params2),
    XGBModel(**xgb_optuna0), 
    XGBModel(**xgb_params3),
    XGBModel(**xgb_params6),
    HistGradientBoostingModel(), 
    CatBoostModel(**cat_paraams0)
]

In [30]:
hill_ens_preds = train(X_train, y_train, X_test, models_v1, n_folds=10)


----------------------------------------------------------
model <__main__.LGBMModel object at 0x2bd2924c0>
ROC: 0.7768932181673762
model <__main__.LGBMModel object at 0x2bd2920d0>
ROC: 0.7773868839971034
model <__main__.LGBMModel object at 0x2bd292be0>
ROC: 0.7795238052607194
model <__main__.XGBModel object at 0x2bd292c40>
ROC: 0.7796107249304134
model <__main__.XGBModel object at 0x13efbadc0>
ROC: 0.7725619423434121
model <__main__.XGBModel object at 0x13efbaeb0>
ROC: 0.7798307951131913
model <__main__.HistGradientBoostingModel object at 0x13efba9a0>
ROC: 0.7788812865361623
model <__main__.CatBoostModel object at 0x2bd27dac0>
ROC: 0.7802912088403982
Fold 0 ==> Average Ensemble oof ROC-AUC score is ==> 0.7799328013849547
Fold 0 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.7807027628186737
----------------------------------------------------------
model <__main__.LGBMModel object at 0x2bd2924c0>
ROC: 0.7856755152785122
model <__main__.LGBMModel object at 0x2bd2920d0>
ROC: 0.78

In [24]:
# hill climbing
# 캐글 제출용
# submission = pd.read_csv('/kaggle/input/playground-series-s3e23/sample_submission.csv')

# 로컬
submission = pd.read_csv('./playground-series-s3e23/sample_submission.csv')

submission['defects'] = np.mean(hill_ens_preds, axis=0)
submission.to_csv('submission.csv', index=False)


NameError: name 'hill_ens_preds' is not defined