In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer, RobustScaler
from sklearn.kernel_approximation import Nystroem

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier, HistGradientBoostingClassifier
from lightgbm import LGBMClassifier

from sklearn.preprocessing import StandardScaler, FunctionTransformer

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



In [2]:
from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# Read the competition's train and test tables, using the `id` column as the index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/playground-series-s3e23/train.csv',
        '/kaggle/input/playground-series-s3e23/test.csv',
    )
)

In [4]:
# Name of the label column; every other column of the training frame is a feature.
target = 'defects'

# Store the label as integers.
df_train[target] = df_train[target].astype(int)

# Keep the features in the frame's own column order. Building this list from a
# set difference made the order depend on Python's per-process string hashing,
# so the column order handed to the models could change between kernel restarts.
feat_list = [col for col in df_train.columns if col != target]

In [5]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
# Training features and labels; the test frame is cut down to the same feature columns.
X, Y = df_train[feat_list], df_train[target]
test_cv = df_test[feat_list]

In [7]:
# Shuffled, seeded 10-fold splitter. It is not referenced by any cell visible in
# this part of the notebook.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# `device` is resolved earlier in the notebook (CUDA when available, else CPU).
# Only request the GPU back-ends of XGBoost / LightGBM when a GPU was actually
# found, so the model definitions also work on a CPU-only kernel.
use_gpu = device.type == 'cuda'

# Logistic regression on an approximate kernel feature map:
# log1p -> Nystroem (400 components) -> standardise -> class-balanced LR.
lr = make_pipeline(FunctionTransformer(np.log1p),
                   Nystroem(n_components=400, random_state=10, n_jobs=-1),  # gamma=1/21
                   StandardScaler(),
                   LogisticRegression(dual=False, C=0.0024,
                                      class_weight='balanced',
                                      max_iter=1500,
                                      random_state=1,
                                      solver='newton-cholesky',
                                      n_jobs=-1)
                   )

# Linear SVM on the same log1p -> Nystroem -> standardise feature map.
svm = make_pipeline(FunctionTransformer(np.log1p),
                    Nystroem(n_components=400, random_state=10, n_jobs=-1),
                    StandardScaler(),
                    LinearSVC(dual=False, C=0.0024,
                              class_weight='balanced',
                              random_state=1,
                              max_iter=1500))

# Gradient-boosted trees (XGBoost) on the raw features.
# NOTE(review): newer XGBoost releases reportedly deprecate 'gpu_hist' in favour
# of tree_method='hist' plus a separate device parameter -- confirm against the
# installed version before upgrading.
xgb = XGBClassifier(objective = 'binary:logistic',
                    tree_method = 'gpu_hist' if use_gpu else 'hist',
                    colsample_bytree = 0.7,
                    gamma = 2,
                    learning_rate = 0.01,
                    max_depth = 7,
                    min_child_weight = 10,
                    n_estimators = 1000,
                    subsample = 0.7,
                    eval_metric = "auc"
                    )

# Extra-trees on a reduced feature set: the listed columns are dropped, the
# remaining ones are passed through and power-transformed before the forest.
et = make_pipeline(ColumnTransformer([('drop', 'drop',
                                      ['iv(g)', 't', 'b', 'n', 'lOCode', 'v',
                                       'branchCount', 'e', 'i', 'lOComment'])],
                                     remainder='passthrough'),
                   PowerTransformer(),
                   ExtraTreesClassifier(max_features=1.0, min_samples_leaf=100,
                                        n_estimators=500, random_state=1, bootstrap=True,
                                        n_jobs=-1))

# Gradient-boosted trees (LightGBM) on the raw features.
lgbm = LGBMClassifier(objective = 'binary',
                      metric = 'auc',
                      n_estimators = 1000,
                      max_depth = -1,
                      learning_rate = 0.01,
                      num_leaves = 20,
                      reg_alpha = 3,
                      reg_lambda = 3,
                      subsample = 0.7,
                      colsample_bytree = 0.7,
                      random_state=1,
                      n_jobs=-1,
                      device='gpu' if use_gpu else 'cpu')

# Histogram gradient boosting (scikit-learn) on log1p-transformed features,
# with class weighting and early stopping disabled.
hgb = make_pipeline(FunctionTransformer(np.log1p),
                    HistGradientBoostingClassifier(l2_regularization = 0.01,
                                                   early_stopping = False,
                                                   learning_rate = 0.01,
                                                   max_iter = 500,
                                                   max_depth = 5,
                                                   max_bins = 255,
                                                   min_samples_leaf = 15,
                                                   max_leaf_nodes = 10,
                                                   random_state=1,
                                                   class_weight='balanced'))

In [10]:
from sklearn.ensemble import StackingClassifier

# Named base learners that feed the stacking ensemble.
estimators = list({
    'lr': lr,
    'svm': svm,
    'xgb': xgb,
    'et': et,
    'lgbm': lgbm,
    'hgb': hgb,
}.items())

# Stack the base learners; cv=10 is the number of internal folds used by the stacker.
stack_model = StackingClassifier(estimators=estimators, cv=10, n_jobs=-1)

# Outer 5-fold stratified split used to score the whole stack.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Refit the stack on each training split and record the ROC AUC of its
# positive-class probabilities on the held-out split.
auc_scores = []
for fold, (idx_tr, idx_va) in enumerate(kf.split(X, Y)):
    X_tr, y_tr = X.iloc[idx_tr], Y.iloc[idx_tr]
    X_va, y_va = X.iloc[idx_va], Y.iloc[idx_va]

    stack_model.fit(X_tr, y_tr)
    y_va_pred = stack_model.predict_proba(X_va)[:, 1]

    auc = roc_auc_score(y_va, y_va_pred)
    auc_scores.append(auc)
    print(auc)

print("mean auc score : ", np.mean(auc_scores))



0.7961296193206813




0.7946557585036046




0.786506781150778




0.7946076392064898




0.7924987115380562
mean auc score :  0.792879701943922


In [11]:
# Show the stacking ensemble as this cell's output.
stack_model