## 0.패키지 임포트 

In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb

from ngboost import NGBClassifier
from ngboost.distns import k_categorical, Bernoulli
from ngboost.scores import LogScore, CRPScore

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import *
import warnings
warnings.filterwarnings(action = 'ignore')

## 

## 1. 데이터 불러오기 

In [2]:
train_x = pd.read_csv("./datasets/x_train.csv")
target = pd.read_csv("./datasets/y_train.csv")
test = pd.read_csv("./datasets/x_test.csv")

In [3]:
train_x = np.array(train_x)
target = np.array(target.target)
test = np.array(test)

## 

## 2. 모델링
* ngboost

In [147]:
## ngboost
def f_pr_auc(probas_pred, y_true):
    labels = y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score = auc(r, p)
    return "pr_auc", score, True

models = []
recalls = []
precisions =[]
auc_scores = []
threshold = 0.5
# params = {
#     'Dist' : k_categorical(2),
#     'random_state' : 42,
# }

k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x, target):
    X = train_x[train_idx]
    y = target[train_idx].astype(np.int)
    valid_x = train_x[val_idx]
    valid_y = target[val_idx].astype(np.int)

    model = NGBClassifier(
                            Dist = k_categorical(2),
                            n_estimators = 1000,
                            learning_rate = 0.02
                        )
    model.fit(X, y, X_val=valid_x, Y_val = valid_y, early_stopping_rounds = 3)
    valid_prob = model.predict_proba(valid_x)[:,1]
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    recall = recall_score(valid_y, valid_pred)
    precision = precision_score(valid_y, valid_pred)
    auc_score = roc_auc_score(valid_y, valid_prob)
    
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)
    
    print("=============================================")

[iter 0] loss=0.6365 val_loss=0.6266 scale=2.0000 norm=4.0000
[iter 100] loss=0.4624 val_loss=0.4733 scale=1.0000 norm=1.8385
== Early stopping achieved.
== Best iteration / VAL175 (val_loss=0.4671)
[iter 0] loss=0.6365 val_loss=0.6263 scale=2.0000 norm=4.0000
[iter 100] loss=0.4646 val_loss=0.4608 scale=2.0000 norm=3.6658
== Early stopping achieved.
== Best iteration / VAL162 (val_loss=0.4549)
[iter 0] loss=0.6365 val_loss=0.6271 scale=2.0000 norm=4.0000
[iter 100] loss=0.4604 val_loss=0.4802 scale=1.0000 norm=1.8321
[iter 200] loss=0.4456 val_loss=0.4720 scale=1.0000 norm=1.8577
== Early stopping achieved.
== Best iteration / VAL239 (val_loss=0.4712)
[iter 0] loss=0.6365 val_loss=0.6267 scale=2.0000 norm=4.0000
[iter 100] loss=0.4618 val_loss=0.4714 scale=1.0000 norm=1.8389
[iter 200] loss=0.4469 val_loss=0.4624 scale=1.0000 norm=1.8626
== Early stopping achieved.
== Best iteration / VAL274 (val_loss=0.4602)
[iter 0] loss=0.6365 val_loss=0.6273 scale=2.0000 norm=4.0000
[iter 100] los

In [148]:
np.mean(auc_scores)

0.8201921999999999

In [149]:
pred_y_list = []
for model in models:
    pred_y = model.predict_proba(test)[:,0]
    pred_y_list.append(pred_y.reshape(-1, 1))
    
pred_ensemble = np.mean(pred_y_list, axis=0)

In [150]:
pred_ensemble

array([[0.026541  ],
       [0.77323292],
       [0.5413317 ],
       ...,
       [0.36976752],
       [0.10665222],
       [0.73052152]])

In [151]:
sample_submission['problem'] = pred_ensemble.reshape(-1)

In [152]:
sample_submission

Unnamed: 0,user_id,problem
0,30000,0.026541
1,30001,0.773233
2,30002,0.541332
3,30003,0.192490
4,30004,0.054929
...,...,...
14994,44994,0.811991
14995,44995,0.715574
14996,44996,0.369768
14997,44997,0.106652


In [153]:
sample_submission.to_csv("submission_ngb_v1.csv", index=False)

* GradientBoostingClassifier

In [4]:
gbc = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=2584, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [5]:
## GBC
#-------------------------------------------------------------------------------------
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#-------------------------------------------------------------------------------------
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5
# 파라미터 설정
# params =      {
#                 'ccp_alpha'     : 0.0,
#                 'criterion'     : 'friedman_mse',
#                 'init'          : None,
#                 'learning_rate' : 0.1,
#                 'loss'          : 'deviance',
#                 'max_depth'     : 3,
#                 'max_feature'   : None,
#                 'max_leaf_nodes': None,
#                 'min_impurity_decrease' : 0.0,
#                 'min_impurity_split'    : None,
#                 'min_samples_leaf'      : 1,
#                 'min_samples_split'     : 2,
#                 'min_weight_fraction_leaf' : 0.0,
#                 'n_estimators'             : 100,
#                 'n_iter_no_change'         : None,
#                 'presort'                  : 'deprecated',
#                 'random_state'             : 2584,
#                 'subsample'                : 1.0,
#                 'tol'                      : 0.0001,
#                 'validation_fraction'      : 0.1,
#                 'verbose'                  : 0,
#                 'warm_start'               : False
#                 }
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x, target):

    # split train, validation set
    X = train_x[train_idx]
    y = target[train_idx]
    valid_x = train_x[val_idx]
    valid_y = target[val_idx]

#     d_train= lgb.Dataset(X, y)
#     d_val  = lgb.Dataset(valid_x, valid_y)
    
    
    #run traning
    model = gbc.fit(
                        X,
                        y
                       )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    print(valid_prob)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    print(auc_score)
    
    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

[1 0 0 ... 0 0 0]
0.7310000000000001
[0 1 0 ... 0 1 0]
0.733
[0 1 1 ... 0 0 0]
0.7169999999999999
[0 0 0 ... 0 0 0]
0.7209999999999999
[1 0 1 ... 0 0 0]
0.7160000000000001


In [6]:
np.mean(auc_score)

0.7160000000000001

In [7]:
pred_y_list = []
for model in models:
    pred_y = model.predict_proba(test)[:,1]
    pred_y_list.append(pred_y.reshape(-1, 1))
    
pred_ensemble = np.mean(pred_y_list, axis=0)

In [10]:
sample_submission = pd.read_csv("./datasets/sample_submission.csv")

In [12]:
sample_submission['problem'] = pred_ensemble.reshape(-1)

In [126]:
sample_submission

Unnamed: 0,user_id,problem
0,30000,0.049398
1,30001,0.749064
2,30002,0.502084
3,30003,0.250826
4,30004,0.107821
...,...,...
14994,44994,0.794622
14995,44995,0.668592
14996,44996,0.273312
14997,44997,0.124996


In [13]:
sample_submission.to_csv("submission_gbc_v2.csv", index=False)