In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import datetime as dt
from datetime import datetime, timedelta
import seaborn as sns
from matplotlib import pyplot as plt
import pickle
import re
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import KFold
import catboost
import lightgbm

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
from numpy import hstack
from numpy import vstack
from numpy import asarray
from sklearn.datasets import make_blobs
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Lasso
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [3]:
import keras
import tensorflow as tf
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.models import load_model
import tensorflow_hub as hub

### Data

In [4]:
train = pd.read_csv("./custom_data/train_feature.csv")
test = pd.read_csv("./custom_data/test_feature.csv")
sample_submission = pd.read_csv('./data/sample_submission.csv')

In [5]:
X = train.drop('problem', axis=1)
y = train['problem']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [7]:
# X_train = X
# y_train = y

### Scaler

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)

### Super Learner

In [9]:
cat_params = {"logging_level" : "Silent"}
lgb_params = {'bagging_fraction': 0.9504775535991318, 'feature_fraction': 0.5598972214137229, 'lambda_l1': 2.9110070331408933, 'lambda_l2': 0.4065002276790508, 'learning_rate': 0.0118663823237834, 'max_depth': 12, 'min_child_samples': 19, 'min_child_weight': 35.21849812233922, 'min_samples_split': 0, 'min_split_gain': 0.05204638001792851, 'n_estimators': 949, 'num_leaves': 642, 'subsample': 0.5490080562116625}
xgb_params = {'gamma': 8.712501813678685, 'max_depth': 22, 'min_child_weight': 9.863337491640031, 'n_estimators': 106, 'num_boost_round': 165.69852832297272, 'reg_alpha': 0.032974405578371495, 'reg_lambda': 0.0006919861676045414}

In [10]:
def get_models():
    models = []
    models.append(catboost.CatBoostClassifier(**cat_params))
    models.append(LGBMClassifier(**lgb_params))
    models.append(xgb.XGBClassifier(**xgb_params))
    models.append(RandomForestClassifier())
    models.append(ExtraTreesClassifier())
    models.append(KNeighborsClassifier())
    models.append(GaussianNB())
    models.append(GradientBoostingClassifier())
    models.append(LinearDiscriminantAnalysis())
    models.append(RidgeClassifier())
    models.append(Lasso())
    
    return models

In [11]:
# collect out of fold predictions form k-fold cross validation
def get_out_of_fold_predictions(X, X_scaled, y, models):
    meta_X, meta_y = list(), list()
    kfold = KFold(n_splits=10, shuffle=True)
    fscores = np.zeros((10, len(models)))
    for total_idx, (train_ix, test_ix) in tqdm(enumerate(kfold.split(X))):
        fold_yhats = list()
        train_X, test_X = X[train_ix], X[test_ix]
        train_X_scaled, test_X_scaled = X_scaled[train_ix], X_scaled[test_ix]
        train_y, test_y = y[train_ix], y[test_ix]
        meta_y.extend(test_y)
        
        scores = []
        for model_idx, model in enumerate(models):
            if model_idx < 5:
                model.fit(train_X, train_y)
                yhat = model.predict_proba(test_X)
                fold_yhats.append(yhat)
                score = roc_auc_score(test_y, yhat[:, 1], average='micro')
                fscores[total_idx, model_idx] = score
            else:
                model.fit(train_X_scaled, train_y)
                yhat = model.predict_proba(test_X_scaled)
                fold_yhats.append(yhat)
                score = roc_auc_score(test_y, yhat[:, 1], average='micro')
                fscores[total_idx, model_idx] = score                
        meta_X.append(hstack(fold_yhats))
    return vstack(meta_X), asarray(meta_y), fscores
 
# fit all base models on the training dataset
def fit_base_models(X, X_scaled, y, models):
    for model_idx, model in enumerate(models):
        if model_idx < 4:
            model.fit(X, y)
        else:
            model.fit(X_scaled, y)
 
# fit a meta model
def fit_meta_model(X, y):
    model = LogisticRegression(solver='liblinear')
    model.fit(X, y)
    return model
 
# evaluate a list of models on a dataset
def evaluate_models(X, X_scaled, y, models):
    for model_idx, model in enumerate(models):
        if model_idx < 4:     
            yhat = model.predict_proba(X)[:,1]
            auc = roc_auc_score(y, yhat)
            print('%s: %.3f' % (model.__class__.__name__, auc*100))
        else:
            yhat = model.predict_proba(X_scaled)[:,1]
            auc = roc_auc_score(y, yhat)
            print('%s: %.3f' % (model.__class__.__name__, auc*100))
# make predictions with stacked model
def super_learner_predictions(X, X_scaled, models, meta_model):
    meta_X = list()
    for model_idx, model in enumerate(models):
        if model_idx < 4:
            yhat = model.predict_proba(X)
            meta_X.append(yhat)
        else:
            yhat = model.predict_proba(X_scaled)
            meta_X.append(yhat)
    meta_X = hstack(meta_X)
   # predict
    return meta_model.predict_proba(meta_X)[:,1]

In [12]:
models = get_models()

NameError: name 'xgb' is not defined

In [None]:
meta_X, meta_y, fscores = get_out_of_fold_predictions(X_train.values, X_scaled_train, y_train.values, models)

In [None]:
meta_model = fit_meta_model(meta_X, meta_y)

status|cat|lgb|xgb|rf
:---|:---:|:---:|:---:|:---:
tuning X|0.83847134|0.83117312|0.83468733|0.82470269
tuning O|----|0.83677207|0.832428|0.81877891
tuning O, n = 500|----|----|0.82202676|0.81936112|
tuning X, n = 100|----|----|----|----|

### 전체 데이터 재학습

In [None]:
fit_base_models(X_train, X_scaled_train ,y_train.values, models)

### 성능 테스트

In [None]:
evaluate_models(X_test, X_scaled_test, y_test, models)

In [None]:
pd.DataFrame(fscores, columns=['cat', 'lgbm', 'xgb', 'rf', 'et', 'knn','NB', 'gbt', 'lda', 'ridge', 'lasso'])

In [None]:
np.mean(fscores, axis=0)

### 최대성능

In [None]:
yhat = super_learner_predictions(X_test, X_scaled_test, models, meta_model)
roc_auc_score(y_test, yhat)

### Submission

In [116]:
new_scaler = StandardScaler()
X_scaled = new_scaler.fit_transform(X)
test_scaled = scaler.transform(test)

In [117]:
yhat = super_learner_predictions(test, test_scaled, models, meta_model)

In [118]:
sample_submission['problem'] = yhat

In [119]:
sample_submission.to_csv('submission_0202_super_learner.csv', index=False)