In [2]:
import numpy as np
import pandas as pd

import optuna
from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, ExtraTreesClassifier

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Competition files: drop the row identifier, it carries no signal.
train = pd.read_csv('/kaggle/input/playground-series-s3e23/train.csv').drop(columns='id')
test = pd.read_csv('/kaggle/input/playground-series-s3e23/test.csv').drop(columns='id')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s3e23/sample_submission.csv')

# Original JM1 data set; its boolean target is recoded to 0/1 so it lines up
# with the competition target before the two frames are stacked.
origin = pd.read_csv('/kaggle/input/software-defect-prediction/jm1.csv').assign(
    defects=lambda frame: frame['defects'].map({False: 0, True: 1})
)
train_total = pd.concat([train, origin], ignore_index=True)

# Displayed as the cell output: (combined, competition train, competition test).
train_total.shape, train.shape, test.shape

((112648, 22), (101763, 22), (67842, 21))

In [4]:
# Target column and feature matrix, both taken from the competition train set only.
Y = train['defects']
X = train.drop(columns='defects')

# Test features are used untransformed in this first experiment.
test_cv = test

In [5]:
def hill_climbing(x, y, x_test, weight_range=None):
    """Greedy forward blending of model predictions that maximises ROC-AUC.

    Starting from the single best column of ``x`` (by AUC against ``y``), each
    round tries every remaining model with every weight ``w`` in
    ``weight_range`` and keeps the blend ``(1 - w) * current + w * model`` that
    improves AUC the most. The chosen model is then removed from the candidate
    pool. The search ends when no candidate improves the score or no
    candidates are left.

    Parameters
    ----------
    x : pandas.DataFrame
        One column of validation-set scores per model.
    y : array-like
        True labels for the rows of ``x``; only passed to ``roc_auc_score``.
    x_test : pandas.DataFrame
        Test-set scores with the same column names as ``x``. The weights found
        on ``x`` are replayed on these columns.
    weight_range : array-like of float, optional
        Candidate blend weights. Defaults to ``np.arange(-0.5, 0.51, 0.01)``,
        i.e. negative weights are allowed.

    Returns
    -------
    list
        ``[blended_validation_scores, blended_test_scores]`` as pandas Series.

    Notes
    -----
    The weights are selected by maximising AUC on ``y`` itself, so the AUC of
    the returned validation blend on that same ``y`` is optimistically biased.
    Neither ``x`` nor ``x_test`` is modified.
    """
    if weight_range is None:
        weight_range = np.arange(-0.5, 0.51, 0.01)

    # Stand-alone AUC per model, then best-first ordering. ``sorted`` is stable,
    # so tied models keep their original column order.
    scores = {col: roc_auc_score(y, x[col]) for col in x.columns}
    ranked = sorted(scores, key=scores.get, reverse=True)

    # The best single model seeds the ensemble; the rest form the candidate pool.
    current_best_ensemble = x[ranked[0]]
    current_best_test_preds = x_test[ranked[0]]
    best_score = scores[ranked[0]]
    remaining = ranked[1:]

    while remaining:
        k_best, wgt_best = None, None

        # Exhaustive scan over (candidate model, weight) for this round.
        for k in remaining:
            for wgt in weight_range:
                potential_ensemble = (1 - wgt) * current_best_ensemble + wgt * x[k]
                cv_score = roc_auc_score(y, potential_ensemble)

                # Strict '>' means only genuine improvements are accepted.
                if cv_score > best_score:
                    best_score = cv_score
                    k_best, wgt_best = k, wgt

        if k_best is None:
            # No blend beats the current ensemble: converged.
            break

        # Apply the winning blend to both the validation and test predictions.
        current_best_ensemble = (1 - wgt_best) * current_best_ensemble + wgt_best * x[k_best]
        current_best_test_preds = (1 - wgt_best) * current_best_test_preds + wgt_best * x_test[k_best]

        # Each model may be added at most once. The pool is a plain list of
        # names, so nothing is dropped in place from the caller's frame.
        remaining.remove(k_best)

    return [current_best_ensemble, current_best_test_preds]


In [6]:
def rf_search(trial):
    """Optuna objective for ``RandomForestClassifier``.

    Samples a hyperparameter set from ``trial``, scores it with 10-fold
    stratified cross-validation on the notebook-level ``X`` / ``Y`` and returns
    the negated mean ROC-AUC. The value is negated so that a study using
    Optuna's default minimising direction ends up maximising AUC.

    The forest is seeded so that differences between trials come from the
    hyperparameters rather than from run-to-run noise. It is trained on all
    cores because a single trial fits ten forests of up to 1000 trees.
    """
    # Dict literals evaluate in order, so the suggest calls happen in the same
    # sequence as the parameter names are listed here.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000, step=50),
        "max_depth": trial.suggest_int("max_depth", 5, 100, step=2),
        "min_samples_split": trial.suggest_int("min_samples_split", 5, 100, step=2),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 5, 100, step=2),
    }
    sk = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)
    rf = RandomForestClassifier(random_state=42, n_jobs=-1, **params)
    val = cross_val_score(rf, X, Y, scoring='roc_auc', cv=sk).mean()
    return -val

def et_search(trial):
    """Optuna objective for ``ExtraTreesClassifier``.

    Samples a hyperparameter set from ``trial``, scores it with 10-fold
    stratified cross-validation on the notebook-level ``X`` / ``Y`` and returns
    the negated mean ROC-AUC. The value is negated so that a study using
    Optuna's default minimising direction ends up maximising AUC.

    The ensemble is seeded so that trials are comparable, and trained on all
    cores because each trial fits ten ensembles of up to 1000 trees.
    """
    # Dict literals evaluate in order, so the suggest calls happen in the same
    # sequence as the parameter names are listed here.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000, step=50),
        "max_depth": trial.suggest_int("max_depth", 5, 100, step=2),
        "min_samples_split": trial.suggest_int("min_samples_split", 5, 100, step=2),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 5, 100, step=2),
    }
    sk = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)
    et = ExtraTreesClassifier(random_state=42, n_jobs=-1, **params)
    val = cross_val_score(et, X, Y, scoring='roc_auc', cv=sk).mean()
    return -val

def hist_search(trial):
    """Optuna objective for ``HistGradientBoostingClassifier``.

    Draws one hyperparameter set from ``trial``, evaluates it with 10-fold
    stratified cross-validation on the notebook-level ``X`` / ``Y`` and returns
    the negated mean ROC-AUC.
    """
    # Sampled in this order; learning rate and L2 penalty are drawn on a log scale.
    sampled = {
        'l2_regularization': trial.suggest_float('l2_regularization', 0.001, 0.1, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_iter': trial.suggest_int('max_iter', 200, 1000, step=50),
        'max_depth': trial.suggest_int('max_depth', 5, 100, step=2),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 100, step=2),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 5, 100, step=2),
    }
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)
    # Early stopping is switched off so max_iter is the actual number of boosting rounds.
    model = HistGradientBoostingClassifier(early_stopping=False, max_bins=255, **sampled)
    fold_auc = cross_val_score(model, X, Y, scoring='roc_auc', cv=folds)
    return -fold_auc.mean()

def xgb_search(trial):
    """Optuna objective for ``XGBClassifier`` (histogram tree method).

    Draws one hyperparameter set from ``trial``, evaluates it with 10-fold
    stratified cross-validation on the notebook-level ``X`` / ``Y`` and returns
    the negated mean ROC-AUC.
    """
    # Sampled in this order.
    sampled = {
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.1),
        'gamma': trial.suggest_float('gamma', 1, 10, step=0.5),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 100, step=2),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 15, step=1),
        'n_estimators': trial.suggest_int('n_estimators', 200, 1000, step=50),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.1),
    }
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)
    model = XGBClassifier(objective='binary:logistic', tree_method='hist', **sampled)
    fold_auc = cross_val_score(model, X, Y, scoring='roc_auc', cv=folds)
    return -fold_auc.mean()

def cat_search(trial):
    """Optuna objective for ``CatBoostClassifier`` (CPU, Logloss).

    Draws one hyperparameter set from ``trial``, evaluates it with 10-fold
    stratified cross-validation on the notebook-level ``X`` / ``Y`` and returns
    the negated mean ROC-AUC.
    """
    # Sampled in this order.
    sampled = {
        'iterations': trial.suggest_int('iterations', 200, 1000, step=50),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'depth': trial.suggest_int('depth', 5, 16, step=2),
        'random_strength': trial.suggest_float('random_strength', 0.5, 1, step=0.1),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.5, 1, step=0.1),
        'border_count': trial.suggest_int('border_count', 1, 100, step=10),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10, step=1),
    }
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)
    model = CatBoostClassifier(loss_function='Logloss',
                               verbose=False,
                               task_type='CPU',
                               **sampled)
    fold_auc = cross_val_score(model, X, Y, scoring='roc_auc', cv=folds)
    return -fold_auc.mean()

In [7]:
def _run_study(name, objective):
    """Create an in-memory study called ``name`` and optimise ``objective`` for two hours."""
    study = optuna.create_study(study_name=name)
    study.optimize(objective, timeout=int(2 * 60 * 60))
    return study


# One study per model family, run back to back. Both the study and its best
# parameter set are kept so later cells can unpack ``*_params`` into the estimators.
rf_study = _run_study('rf', rf_search)
rf_params = rf_study.best_params

et_study = _run_study('et', et_search)
et_params = et_study.best_params

hist_study = _run_study('hist', hist_search)
hist_params = hist_study.best_params

xgb_study = _run_study('xgb', xgb_search)
xgb_params = xgb_study.best_params

cat_study = _run_study('cat', cat_search)
cat_params = cat_study.best_params

[I 2023-10-06 03:28:00,766] A new study created in memory with name: rf
[I 2023-10-06 03:34:35,235] Trial 0 finished with value: -0.7900455385600692 and parameters: {'n_estimators': 250, 'max_depth': 19, 'min_samples_split': 99, 'min_samples_leaf': 45}. Best is trial 0 with value: -0.7900455385600692.
[I 2023-10-06 04:04:47,573] Trial 1 finished with value: -0.7896404897373415 and parameters: {'n_estimators': 1000, 'max_depth': 73, 'min_samples_split': 89, 'min_samples_leaf': 15}. Best is trial 0 with value: -0.7900455385600692.
[I 2023-10-06 04:13:39,036] Trial 2 finished with value: -0.7893346613663202 and parameters: {'n_estimators': 300, 'max_depth': 57, 'min_samples_split': 71, 'min_samples_leaf': 25}. Best is trial 0 with value: -0.7900455385600692.
[I 2023-10-06 04:31:18,432] Trial 3 finished with value: -0.7898661237037052 and parameters: {'n_estimators': 950, 'max_depth': 9, 'min_samples_split': 91, 'min_samples_leaf': 85}. Best is trial 0 with value: -0.7900455385600692.
[I 2

In [8]:
# Per-fold AUC and per-fold test predictions for the plain-average ensemble ...
ens_cv_scores, ens_preds = [], []
# ... and for the hill-climbing (weighted) ensemble.
hill_ens_cv_scores, hill_ens_preds = [], []

# 10 stratified folds, a single repeat.
sk = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 1, random_state = 42)

for i, (train_idx, test_idx) in enumerate(sk.split(X, Y)):

    # Training part and held-out (validation) part of this fold.
    X_train, Y_train = X.iloc[train_idx], Y.iloc[train_idx]
    X_test, Y_test = X.iloc[test_idx], Y.iloc[test_idx]
    print('----------------------------------------------------------')

    # Base learners keyed by the column name used in the blending frames.
    # They are fitted in exactly this order. RF / ET / Hist / XGB / Cat use the
    # Optuna results; LightGBM was not tuned and keeps hand-set values.
    base_models = {
        'RF': RandomForestClassifier(**rf_params),
        'ET': ExtraTreesClassifier(**et_params),
        'Hist': HistGradientBoostingClassifier(early_stopping = False, max_bins = 255, **hist_params),
        'LGBM': LGBMClassifier(objective = 'binary',
                               n_estimators = 500,
                               max_depth = 7,
                               learning_rate = 0.01,
                               num_leaves = 20,
                               reg_alpha = 3,
                               reg_lambda = 3,
                               subsample = 0.7,
                               colsample_bytree = 0.7),
        'XGB': XGBClassifier(objective = 'binary:logistic',
                             tree_method = 'hist',
                             **xgb_params),
        'Cat': CatBoostClassifier(loss_function = 'Logloss',
                                  verbose = False,
                                  task_type = 'CPU',
                                  **cat_params),
    }

    # Positive-class probabilities on the validation part and on the real test set.
    fold_val_preds, fold_test_preds = {}, {}
    for model_name, model in base_models.items():
        model.fit(X_train, Y_train)
        fold_val_preds[model_name] = model.predict_proba(X_test)[:, 1]
        fold_test_preds[model_name] = model.predict_proba(test_cv)[:, 1]

    #--------------------------------------------------------------------------
    # Simple ensemble: unweighted mean of the six models.
    ens_pred_1 = sum(fold_val_preds.values()) / 6
    ens_pred_2 = sum(fold_test_preds.values()) / 6
    ens_score_fold = roc_auc_score(Y_test, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)
    print('Fold', i, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)

    #--------------------------------------------------------------------------
    # Weighted ensemble: hill climbing over the same six prediction columns.
    x = pd.DataFrame(fold_val_preds)
    y = Y_test
    x_test = pd.DataFrame(fold_test_preds)

    hill_results = hill_climbing(x, y, x_test)

    hill_ens_score_fold = roc_auc_score(y, hill_results[0])
    hill_ens_cv_scores.append(hill_ens_score_fold)
    hill_ens_preds.append(hill_results[1])

    print('Fold', i, '==> Hill Climbing Ensemble oof ROC-AUC score is ==>', hill_ens_score_fold)

----------------------------------------------------------
Fold 0 ==> Average Ensemble oof ROC-AUC score is ==> 0.7913420786083346
Fold 0 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.7933417933046157
----------------------------------------------------------
Fold 1 ==> Average Ensemble oof ROC-AUC score is ==> 0.8012767616816177
Fold 1 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.8025333097599758
----------------------------------------------------------
Fold 2 ==> Average Ensemble oof ROC-AUC score is ==> 0.7960881996068537
Fold 2 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.7979826052856093
----------------------------------------------------------
Fold 3 ==> Average Ensemble oof ROC-AUC score is ==> 0.7914849500391773
Fold 3 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.7920428008917679
----------------------------------------------------------
Fold 4 ==> Average Ensemble oof ROC-AUC score is ==> 0.788211736467819
Fold 4 ==> Hill Climbing Ensemble 

In [9]:
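# NOTE: this whole cell is disabled (every line is commented out). It is the same
# 10-fold ensemble loop as the cell above, but with hand-set hyperparameters
# instead of the Optuna-tuned ones.
# # Per-fold CV scores and test-set predictions for the average ensemble
# ens_cv_scores, ens_preds = list(), list()
# # Per-fold CV scores and test-set predictions for the hill-climbing ensemble
# hill_ens_cv_scores, hill_ens_preds =  list(), list()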

# # 重复k折交叉验证
# sk = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 1, random_state = 42)

# # 遍历每折数据
# for i, (train_idx, test_idx) in enumerate(sk.split(X, Y)):
    
#     # 划分训练测试集
#     X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#     Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
#     print('----------------------------------------------------------')
#     #----------------------------------------------------------------------------
#     # 训练随机森林模型
#     RF_md = RandomForestClassifier(n_estimators = 500, 
#                                    max_depth = 7,
#                                    min_samples_split = 15,
#                                    min_samples_leaf = 10).fit(X_train, Y_train)
#     # 得到验证集预测概率
#     RF_pred = RF_md.predict_proba(X_test)[:, 1]
#     # 计算AUC分数
#     RF_score = roc_auc_score(Y_test, RF_pred)
#     # 得到测试集预测概率
#     RF_pred_test = RF_md.predict_proba(test_cv)[:, 1]
#     #----------------------------------------------------------------------------
#     # 训练极端森林
#     ET_md = ExtraTreesClassifier(n_estimators = 500, 
#                                  max_depth = 7,
#                                  min_samples_split = 15,
#                                  min_samples_leaf = 10).fit(X_train, Y_train)
#     # 得到验证集预测概率
#     ET_pred = ET_md.predict_proba(X_test)[:, 1]
#     # 计算AUC分数
#     ET_score = roc_auc_score(Y_test, ET_pred)
#     # 得到测试集预测概率
#     ET_pred_test = ET_md.predict_proba(test_cv)[:, 1]
#     #----------------------------------------------------------------------------
#     # 训练HistGradientBoosting
#     hist_md = HistGradientBoostingClassifier(l2_regularization = 0.01,
#                                              early_stopping = False,
#                                              learning_rate = 0.01,
#                                              max_iter = 500,
#                                              max_depth = 5,
#                                              max_bins = 255,
#                                              min_samples_leaf = 15,
#                                              max_leaf_nodes = 10).fit(X_train, Y_train)
#     # 得到验证集预测概率
#     hist_pred = hist_md.predict_proba(X_test)[:, 1]
#     # 计算AUC分数
#     hist_score = roc_auc_score(Y_test, hist_pred)
#     # 得到测试集预测概率
#     hist_pred_test = hist_md.predict_proba(test_cv)[:, 1]
#     #----------------------------------------------------------------------------
#     # 训练LGBM模型
#     LGBM_md = LGBMClassifier(objective = 'binary',
#                              n_estimators = 500,
#                              max_depth = 7,
#                              learning_rate = 0.01,
#                              num_leaves = 20,
#                              reg_alpha = 3,
#                              reg_lambda = 3,
#                              subsample = 0.7,
#                              colsample_bytree = 0.7).fit(X_train, Y_train)
    
#     # 得到测试集预测概率
#     lgb_pred = LGBM_md.predict_proba(X_test)[:, 1]
#     # 计算AUC分数
#     lgb_score = roc_auc_score(Y_test, lgb_pred)
#     # 得到测试集预测概率
#     lgb_pred_test = LGBM_md.predict_proba(test_cv)[:, 1]
#     #----------------------------------------------------------------------------
#     # 训练XGB模型
#     XGB_md = XGBClassifier(objective = 'binary:logistic',
#                            tree_method = 'hist',
#                            colsample_bytree = 0.7, 
#                            gamma = 2, 
#                            learning_rate = 0.01, 
#                            max_depth = 7, 
#                            min_child_weight = 10, 
#                            n_estimators = 500, 
#                            subsample = 0.7).fit(X_train, Y_train)
#     # 得到验证集预测概率
#     xgb_pred = XGB_md.predict_proba(X_test)[:, 1]
#     # 计算AUC分数
#     xgb_score = roc_auc_score(Y_test, xgb_pred)
#     # 得到测试集预测概率
#     xgb_pred_test = XGB_md.predict_proba(test_cv)[:, 1]
#     #--------------------------------------------------------------------------
#     # 训练CatBoost模型
#     Cat_md = CatBoostClassifier(loss_function = 'Logloss',
#                                 iterations = 500,
#                                 learning_rate = 0.01,
#                                 depth = 7,
#                                 random_strength = 0.5,
#                                 bagging_temperature = 0.7,
#                                 border_count = 30,
#                                 l2_leaf_reg = 5,
#                                 verbose = False, 
#                                 task_type = 'CPU').fit(X_train, Y_train)
#     # 得到验证集预测概率
#     cat_pred = Cat_md.predict_proba(X_test)[:, 1]
#     # 计算AUC分数
#     cat_score = roc_auc_score(Y_test, cat_pred)
#     # 得到测试集预测概率
#     cat_pred_test = Cat_md.predict_proba(test_cv)[:, 1]   
#     #--------------------------------------------------------------------------
#     # 简单组合模型
#     # 各模型验证集预测概率均值
#     ens_pred_1 = (RF_pred + ET_pred + hist_pred + lgb_pred + xgb_pred + cat_pred) / 6
#     # 各模型测试集预测概率均值
#     ens_pred_2 = (RF_pred_test + ET_pred_test + hist_pred_test + lgb_pred_test + xgb_pred_test + cat_pred_test) / 6
#     # 计算AUC分数
#     ens_score_fold = roc_auc_score(Y_test, ens_pred_1)
#     # 记录组合模型验证集预测概率均值
#     ens_cv_scores.append(ens_score_fold)
#     # 记录组合模型测试集预测概率均值
#     ens_preds.append(ens_pred_2)
#     print('Fold', i, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)
#     #--------------------------------------------------------------------------
#     # 加权组合模型
#     x = pd.DataFrame({'RF': RF_pred,
#                       'ET': ET_pred, 
#                       'Hist': hist_pred, 
#                       'LGBM': lgb_pred,
#                       'XGB': xgb_pred,
#                       'Cat': cat_pred})
#     y = Y_test
        
#     x_test = pd.DataFrame({'RF': RF_pred_test,
#                            'ET': ET_pred_test, 
#                            'Hist': hist_pred_test, 
#                            'LGBM': lgb_pred_test,
#                            'XGB': xgb_pred_test,
#                            'Cat': cat_pred_test})
    
#     hill_results = hill_climbing(x, y, x_test)
    
#     hill_ens_score_fold = roc_auc_score(y, hill_results[0])
#     hill_ens_cv_scores.append(hill_ens_score_fold)
#     hill_ens_preds.append(hill_results[1])

#     print('Fold', i, '==> Hill Climbing Ensemble oof ROC-AUC score is ==>', hill_ens_score_fold)

In [10]:
# Mean out-of-fold AUC over the CV folds, one line per blending strategy.
for label, fold_scores in (('average', ens_cv_scores), ('hill climbing', hill_ens_cv_scores)):
    print(f'The {label} ensemble oof ROC-AUC score over the 10-folds is', np.mean(fold_scores))

The average ensemble oof ROC-AUC score over the 10-folds is 0.7926122618720736
The hill climbing ensemble oof ROC-AUC score over the 10-folds is 0.7937095315945933


In [11]:
# For each ensemble, average the per-fold test predictions (one row per fold,
# one column per test row) and write them out as a submission file.
for fold_preds, out_name in (
    (ens_preds, 'Avereage_Ensemble_Baseline_submission.csv'),
    (hill_ens_preds, 'Hill_Ensemble_Baseline_submission.csv'),
):
    ens_preds_test = pd.DataFrame(fold_preds).apply(np.mean, axis = 0)
    sample_submission['defects'] = ens_preds_test
    sample_submission.to_csv(out_name, index = False)

In [None]:
# Second experiment: identical columns, but each feature column is passed
# through log1p, for the training features and the test features alike.
X = train.drop(columns='defects').apply(lambda col: np.log1p(col))
test_cv = test.apply(lambda col: np.log1p(col))

In [None]:
# Per-fold AUC and per-fold test predictions for the plain-average ensemble ...
ens_cv_scores, ens_preds = [], []
# ... and for the hill-climbing (weighted) ensemble.
hill_ens_cv_scores, hill_ens_preds = [], []

# 25 stratified folds, a single repeat.
sk = RepeatedStratifiedKFold(n_splits = 25, n_repeats = 1, random_state = 42)

for i, (train_idx, test_idx) in enumerate(sk.split(X, Y)):

    # Training part and held-out (validation) part of this fold.
    X_train, Y_train = X.iloc[train_idx], Y.iloc[train_idx]
    X_test, Y_test = X.iloc[test_idx], Y.iloc[test_idx]
    print('----------------------------------------------------------')

    # (column name, unfitted estimator) pairs with hand-set hyperparameters.
    # They are fitted in exactly this order.
    estimators = [
        ('RF', RandomForestClassifier(n_estimators = 500,
                                      max_depth = 7,
                                      min_samples_split = 15,
                                      min_samples_leaf = 10)),
        ('ET', ExtraTreesClassifier(n_estimators = 500,
                                    max_depth = 7,
                                    min_samples_split = 15,
                                    min_samples_leaf = 10)),
        ('Hist', HistGradientBoostingClassifier(l2_regularization = 0.01,
                                                early_stopping = False,
                                                learning_rate = 0.01,
                                                max_iter = 500,
                                                max_depth = 5,
                                                max_bins = 255,
                                                min_samples_leaf = 15,
                                                max_leaf_nodes = 10)),
        ('LGBM', LGBMClassifier(objective = 'binary',
                                n_estimators = 500,
                                max_depth = 7,
                                learning_rate = 0.01,
                                num_leaves = 20,
                                reg_alpha = 3,
                                reg_lambda = 3,
                                subsample = 0.7,
                                colsample_bytree = 0.7)),
        ('XGB', XGBClassifier(objective = 'binary:logistic',
                              tree_method = 'hist',
                              colsample_bytree = 0.7,
                              gamma = 2,
                              learning_rate = 0.01,
                              max_depth = 7,
                              min_child_weight = 10,
                              n_estimators = 500,
                              subsample = 0.7)),
        ('Cat', CatBoostClassifier(loss_function = 'Logloss',
                                   iterations = 500,
                                   learning_rate = 0.01,
                                   depth = 7,
                                   random_strength = 0.5,
                                   bagging_temperature = 0.7,
                                   border_count = 30,
                                   l2_leaf_reg = 5,
                                   verbose = False,
                                   task_type = 'CPU')),
    ]

    # Positive-class probabilities on the validation part and on the real test set.
    val_by_model, test_by_model = {}, {}
    for col_name, estimator in estimators:
        estimator.fit(X_train, Y_train)
        val_by_model[col_name] = estimator.predict_proba(X_test)[:, 1]
        test_by_model[col_name] = estimator.predict_proba(test_cv)[:, 1]

    #--------------------------------------------------------------------------
    # Simple ensemble: unweighted mean of the six models.
    ens_pred_1 = sum(val_by_model.values()) / 6
    ens_pred_2 = sum(test_by_model.values()) / 6
    ens_score_fold = roc_auc_score(Y_test, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)
    print('Fold', i, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)

    #--------------------------------------------------------------------------
    # Weighted ensemble: hill climbing over the same six prediction columns.
    x = pd.DataFrame(val_by_model)
    y = Y_test
    x_test = pd.DataFrame(test_by_model)

    hill_results = hill_climbing(x, y, x_test)

    hill_ens_score_fold = roc_auc_score(y, hill_results[0])
    hill_ens_cv_scores.append(hill_ens_score_fold)
    hill_ens_preds.append(hill_results[1])

    print('Fold', i, '==> Hill Climbing Ensemble oof ROC-AUC score is ==>', hill_ens_score_fold)

In [None]:
# Mean out-of-fold AUC per blending strategy. The fold count in the message is
# taken from the score list itself: the loop that fills these lists runs 25
# splits, so the previously hard-coded "10-folds" text was wrong.
for label, fold_scores in (('average', ens_cv_scores), ('hill climbing', hill_ens_cv_scores)):
    print(f'The {label} ensemble oof ROC-AUC score over the {len(fold_scores)}-folds is', np.mean(fold_scores))

In [None]:
# For each ensemble, average the per-fold test predictions (one row per fold,
# one column per test row) and write them out as a submission file.
for fold_preds, out_name in (
    (ens_preds, 'Avereage_Ensemble_Baseline_submission_25_folds.csv'),
    (hill_ens_preds, 'Hill_Ensemble_Baseline_submission_25_folds.csv'),
):
    ens_preds_test = pd.DataFrame(fold_preds).apply(np.mean, axis = 0)
    sample_submission['defects'] = ens_preds_test
    sample_submission.to_csv(out_name, index = False)