<a href="https://colab.research.google.com/github/y001003/system_quality_project/blob/main/notebooks/Final_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 시스템 품질 변화로 인한 사용자 불편 예지 AI 경진대회

https://dacon.io/competitions/official/235687/overview/description

## 1. Library Import & Data 불러오기

In [1]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install lightgbm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import catboost as cb
import lightgbm as lgb
from tqdm import tqdm

In [4]:
PATH = '/content/drive/MyDrive/project/data/system_quality_customer_complain_data/'
train_err  = pd.read_csv(PATH+'train_err_data_.csv')
id_error = train_err[['user_id','errtype']].values
# 불만을 제기한 사람 타겟값 생성
train_prob = pd.read_csv(PATH+'train_problem_data_.csv')

## 2. 학습 데이터 생성

### 2_1 train_df 생성

In [5]:
#error 0으로 이루어진 데이터셋 생성
error = np.zeros((15000,42)) # 15000개 user_id, 41가지(maxnum 42) errtype 

for person_idx, err in tqdm(id_error):
    # (person_idx - 10000)에 해당하는 행에, 열 기준 errtype 값의 -1 에 해당하는 위치에 error값을 +1
    error[person_idx - 10000,err - 1] += 1
error 

100%|██████████| 16554663/16554663 [00:53<00:00, 307349.98it/s]


array([[  0.,   0.,   8., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ..., 113.,  56.,   1.],
       [  0.,   0.,   2., ...,   0.,   0.,   0.],
       ...,
       [  0.,   0.,   0., ...,  58.,   8.,   5.],
       [  0.,   0.,   0., ...,   6.,   0.,   0.],
       [  0.,   0.,   4., ...,   0.,   0.,   0.]])

In [6]:
train_df = pd.DataFrame(error)
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0.0,0.0,8.0,104.0,0.0,1.0,1.0,0.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,53.0,1.0,1.0,0.0,0.0,0.0,...,10.0,18.0,0.0,1.0,1.0,0.0,0.0,113.0,56.0,1.0
2,0.0,0.0,2.0,132.0,1.0,2.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,...,8.0,0.0,0.0,1.0,1.0,2.0,0.0,17.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,3.0,4.0,0.0,0.0,0.0,...,16.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.0,0.0,0.0,0.0,2.0,5.0,5.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,7.0,4.0
14996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14997,0.0,0.0,0.0,1.0,8.0,1.0,1.0,0.0,0.0,0.0,...,16.0,17.0,0.0,1.0,1.0,0.0,0.0,58.0,8.0,5.0
14998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,15.0,12.0,0.0,1.0,1.0,0.0,0.0,6.0,0.0,0.0


### 2_2 target 생성

In [7]:
# 0 으로 이루어진 열 생성
problem = np.zeros(15000)
# user_id 각각에 10000 빼서 저장
user_ids = train_prob['user_id'].unique() - 10000
# probelm[유저id번호] 에 1 저장
problem[user_ids] = 1

display(problem, user_ids, sum(problem))

array([0., 1., 0., ..., 1., 1., 0.])

array([ 9224, 13664,  5166, ...,  9114, 11505,  8822])

5000.0

In [8]:
target = pd.DataFrame(problem)
target

Unnamed: 0,0
0,0.0
1,1.0
2,0.0
3,0.0
4,1.0
...,...
14995,0.0
14996,0.0
14997,1.0
14998,1.0


## 3. Modeling


### 3.1 훈련세트 나누기

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score ,f1_score, roc_auc_score, classification_report

In [10]:
def get_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test,y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    AUC = roc_auc_score(y_test, y_pred)
    print('오차행렬:\n', confusion)
    print(f'\n정확도:: {accuracy:.4f}')
    print(f'정밀도: {precision:.4f}')
    print(f'재현율: {recall:.4f}')
    print(f'F1: {F1:.4f}')
    print(f'AUC: {AUC:.4f}')
    print(classification_report(y_test, y_pred))



In [11]:
# train validation 분리
X_train, X_val, y_train, y_val = train_test_split(train_df,target,test_size=0.2, random_state=42)
X_train.shape,X_val.shape,y_train.shape,y_val.shape

((12000, 42), (3000, 42), (12000, 1), (3000, 1))

In [12]:
y_train

Unnamed: 0,0
9839,0.0
9680,0.0
7093,0.0
11293,0.0
820,0.0
...,...
5191,1.0
13418,1.0
5390,1.0
860,1.0


### 3.2 모델별 학습

#### 3.2.1 랜덤 포레스트

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
model = RandomForestClassifier(n_estimators=10, max_depth=10, min_samples_leaf=8,
                           min_samples_split=20, random_state=0)

model.fit(X_train, y_train)

pred = model.predict(X_val)

  after removing the cwd from sys.path.


In [15]:
get_eval(y_val,pred)

오차행렬:
 [[1550   83]
 [ 808  559]]

정확도:: 0.7030
정밀도: 0.8707
재현율: 0.4089
F1: 0.5565
AUC: 0.6790
              precision    recall  f1-score   support

         0.0       0.66      0.95      0.78      1633
         1.0       0.87      0.41      0.56      1367

    accuracy                           0.70      3000
   macro avg       0.76      0.68      0.67      3000
weighted avg       0.75      0.70      0.68      3000



#### 3.2.2 GradientBoosting

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
gb_model2 = GradientBoostingClassifier(n_estimators = 41,
                                   learning_rate = 0.1483,
                                    #loss='log_loss', 
                                    # learning_rate=0.1, 
                                    # n_estimators=100, 
                                    subsample=0.95, 
                                    # criterion='friedman_mse', 
                                    # min_samples_split=2, 
                                    # min_samples_leaf=1, 
                                    # min_weight_fraction_leaf=0.0, 
                                    max_depth=3, 
                                    # min_impurity_decrease=0.0, 
                                    # init=None, 
                                    # random_state=None, 
                                    # max_features=None, 
                                    # verbose=0, 
                                    # max_leaf_nodes=None, 
                                    # warm_start=False, 
                                    # validation_fraction=0.1, 
                                    # n_iter_no_change=None, 
                                    # tol=0.0001, 
                                    # ccp_alpha=0.0
                                    )

model2 = gb_model2.fit(X_train,y_train)
# Validation 데이터셋 예상
pred2 = model2.predict(X_val)

  return f(**kwargs)


In [17]:
get_eval(y_val,pred2)

오차행렬:
 [[1553   80]
 [ 809  558]]

정확도:: 0.7037
정밀도: 0.8746
재현율: 0.4082
F1: 0.5566
AUC: 0.6796
              precision    recall  f1-score   support

         0.0       0.66      0.95      0.78      1633
         1.0       0.87      0.41      0.56      1367

    accuracy                           0.70      3000
   macro avg       0.77      0.68      0.67      3000
weighted avg       0.76      0.70      0.68      3000



#### 3.2.3 CatBoost

In [18]:
cb_dtrain = cb.Pool(data=X_train, label = y_train)
cb_deval = cb.Pool(data=X_val, label = y_val)
cb_param = {
    'max_depth':10, # 트리 깊이
    'learning_rate': 0.01, # 학습률
    'n_estimators': 100, # 트리 생성 갯수
    'eval_metric' : 'Accuracy', # 평가 척도
    'loss_function': 'MultiClass' # 손실 함수
}
cb_model3 = cb.CatBoostClassifier(iterations=30,
                           learning_rate=1,
                           depth=2,
                           loss_function='MultiClass')

model3 = cb_model3.fit(cb_dtrain)

0:	learn: 0.5251073	total: 52.7ms	remaining: 1.53s
1:	learn: 0.5034400	total: 58.3ms	remaining: 817ms
2:	learn: 0.4964993	total: 65.4ms	remaining: 589ms
3:	learn: 0.4904056	total: 71.1ms	remaining: 462ms
4:	learn: 0.4890739	total: 76.8ms	remaining: 384ms
5:	learn: 0.4814115	total: 82.3ms	remaining: 329ms
6:	learn: 0.4803820	total: 87.8ms	remaining: 288ms
7:	learn: 0.4797385	total: 94.3ms	remaining: 259ms
8:	learn: 0.4774377	total: 101ms	remaining: 235ms
9:	learn: 0.4757923	total: 106ms	remaining: 212ms
10:	learn: 0.4746199	total: 112ms	remaining: 194ms
11:	learn: 0.4741125	total: 123ms	remaining: 185ms
12:	learn: 0.4729151	total: 130ms	remaining: 170ms
13:	learn: 0.4725505	total: 135ms	remaining: 155ms
14:	learn: 0.4722845	total: 144ms	remaining: 144ms
15:	learn: 0.4717416	total: 151ms	remaining: 132ms
16:	learn: 0.4702004	total: 156ms	remaining: 120ms
17:	learn: 0.4694432	total: 162ms	remaining: 108ms
18:	learn: 0.4692262	total: 167ms	remaining: 96.9ms
19:	learn: 0.4686717	total: 173m

In [49]:
model3 = cb.CatBoostClassifier(iterations=2,
                           learning_rate=0.01,
                           max_depth=10,
                           early_stopping_rounds=3
                           )
cb_model3 = model3.fit(X_train,y_train)

0:	learn: 0.6886158	total: 89.7ms	remaining: 89.7ms
1:	learn: 0.6840796	total: 171ms	remaining: 0us


In [50]:
pred3 = model3.predict(X_val)
get_eval(y_val,pred3)

오차행렬:
 [[1564   69]
 [ 865  502]]

정확도:: 0.6887
정밀도: 0.8792
재현율: 0.3672
F1: 0.5181
AUC: 0.6625
              precision    recall  f1-score   support

         0.0       0.64      0.96      0.77      1633
         1.0       0.88      0.37      0.52      1367

    accuracy                           0.69      3000
   macro avg       0.76      0.66      0.64      3000
weighted avg       0.75      0.69      0.66      3000



In [51]:
pred3

array([0., 0., 1., ..., 0., 0., 0.])

#### 3.2.4 Xgboost

In [21]:
import xgboost as xgb
from xgboost import XGBClassifier

In [22]:
d_train = xgb.DMatrix(X_train,y_train)
d_val = xgb.DMatrix(X_val, y_val) 

params = {
    'max_depth':5,
    'eta': 0.01,
    'booster' : 'gbtree',
    'eval_metric' : 'logloss',
}
num_rounds = 500
wlist = [(d_train,'train'),(d_val,'eval')]
# model4 = xgb.train(params = params , dtrain = d_train, num_boost_round= num_rounds ,evals= wlist,early_stopping_rounds=10,verbose_eval= 0)
model4 = XGBClassifier(max_depth=5,learning_rate=0.01,booster='gbtree')
model4.fit(X_train,y_train)
pred_probs = model4.predict(X_val)
pred4 = np.where(pred_probs > 0.5 , 1, 0)

  return f(**kwargs)


In [23]:
get_eval(y_val,pred4)

오차행렬:
 [[1567   66]
 [ 851  516]]

정확도:: 0.6943
정밀도: 0.8866
재현율: 0.3775
F1: 0.5295
AUC: 0.6685
              precision    recall  f1-score   support

         0.0       0.65      0.96      0.77      1633
         1.0       0.89      0.38      0.53      1367

    accuracy                           0.69      3000
   macro avg       0.77      0.67      0.65      3000
weighted avg       0.76      0.69      0.66      3000



### 3.3 앙상블

In [24]:
from sklearn.ensemble import VotingClassifier

In [25]:
models = [
    ('gbc', GradientBoostingClassifier(n_estimators = 41,
                                   learning_rate = 0.1483,
                                    #loss='log_loss', 
                                    # learning_rate=0.1, 
                                    # n_estimators=100, 
                                    subsample=0.95, 
                                    # criterion='friedman_mse', 
                                    # min_samples_split=2, 
                                    # min_samples_leaf=1, 
                                    # min_weight_fraction_leaf=0.0, 
                                    max_depth=3, 
                                    # min_impurity_decrease=0.0, 
                                    # init=None, 
                                    # random_state=None, 
                                    # max_features=None, 
                                    # verbose=0, 
                                    # max_leaf_nodes=None, 
                                    # warm_start=False, 
                                    # validation_fraction=0.1, 
                                    # n_iter_no_change=None, 
                                    # tol=0.0001, 
                                    # ccp_alpha=0.0
                                    )),
    ('rfc', RandomForestClassifier(n_estimators=10, max_depth=10, min_samples_leaf=8,
                           min_samples_split=20, random_state=0)),
    ('xgb', XGBClassifier(max_depth=5,learning_rate=0.01,booster='gbtree')),
    ('cb', cb.CatBoostClassifier(iterations=2,
                           learning_rate=0.01,
                           max_depth=10,
                           early_stopping_rounds=3
                           )),
]

In [26]:
# hard vote
hard_vote  = VotingClassifier(models, voting='hard')
hard_vote.fit(X_train, y_train)

# soft vote
soft_vote  = VotingClassifier(models, voting='soft')
soft_vote.fit(X_train, y_train)

  return f(**kwargs)


0:	learn: 0.6886158	total: 70.6ms	remaining: 70.6ms
1:	learn: 0.6840796	total: 145ms	remaining: 0us


  return f(**kwargs)


0:	learn: 0.6886158	total: 73.2ms	remaining: 73.2ms
1:	learn: 0.6840796	total: 145ms	remaining: 0us


VotingClassifier(estimators=[('gbc',
                              GradientBoostingClassifier(learning_rate=0.1483,
                                                         n_estimators=41,
                                                         subsample=0.95)),
                             ('rfc',
                              RandomForestClassifier(max_depth=10,
                                                     min_samples_leaf=8,
                                                     min_samples_split=20,
                                                     n_estimators=10,
                                                     random_state=0)),
                             ('xgb',
                              XGBClassifier(learning_rate=0.01, max_depth=5)),
                             ('cb',
                              <catboost.core.CatBoostClassifier object at 0x7f6ac1261110>)],
                 voting='soft')

#### 3.3.1 HardVote

In [27]:
en_pred = hard_vote.predict(X_val)
get_eval(y_val,en_pred)

오차행렬:
 [[1569   64]
 [ 854  513]]

정확도:: 0.6940
정밀도: 0.8891
재현율: 0.3753
F1: 0.5278
AUC: 0.6680
              precision    recall  f1-score   support

         0.0       0.65      0.96      0.77      1633
         1.0       0.89      0.38      0.53      1367

    accuracy                           0.69      3000
   macro avg       0.77      0.67      0.65      3000
weighted avg       0.76      0.69      0.66      3000



#### 3.3.2 softvote

In [28]:
en_pred2 = soft_vote.predict(X_val)
get_eval(y_val,en_pred2)

오차행렬:
 [[1558   75]
 [ 822  545]]

정확도:: 0.7010
정밀도: 0.8790
재현율: 0.3987
F1: 0.5486
AUC: 0.6764
              precision    recall  f1-score   support

         0.0       0.65      0.95      0.78      1633
         1.0       0.88      0.40      0.55      1367

    accuracy                           0.70      3000
   macro avg       0.77      0.68      0.66      3000
weighted avg       0.76      0.70      0.67      3000



### 3.4 Threshold 조정

In [29]:
pred_proba = soft_vote.predict_proba(X_val)
pred_proba[:,1:]


array([[0.41372237],
       [0.2828118 ],
       [0.59233116],
       ...,
       [0.34024443],
       [0.24483493],
       [0.23966686]])

In [30]:
thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
for i in thresholds:
    print(i)
    temp_pred = np.where(pred_proba[:,1] >= i , 1, 0)
    get_eval(y_val,temp_pred)

0
오차행렬:
 [[   0 1633]
 [   0 1367]]

정확도:: 0.4557
정밀도: 0.4557
재현율: 1.0000
F1: 0.6261
AUC: 0.5000
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1633
         1.0       0.46      1.00      0.63      1367

    accuracy                           0.46      3000
   macro avg       0.23      0.50      0.31      3000
weighted avg       0.21      0.46      0.29      3000

0.1
오차행렬:
 [[   0 1633]
 [   0 1367]]

정확도:: 0.4557
정밀도: 0.4557
재현율: 1.0000
F1: 0.6261
AUC: 0.5000
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1633
         1.0       0.46      1.00      0.63      1367

    accuracy                           0.46      3000
   macro avg       0.23      0.50      0.31      3000
weighted avg       0.21      0.46      0.29      3000

0.2
오차행렬:
 [[   0 1633]
 [   0 1367]]

정확도:: 0.4557
정밀도: 0.4557
재현율: 1.0000
F1: 0.6261
AUC: 0.5000
              precision    recall  f1-score   suppo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.63      0.98      0.77      1633
         1.0       0.92      0.33      0.48      1367

    accuracy                           0.68      3000
   macro avg       0.78      0.65      0.63      3000
weighted avg       0.77      0.68      0.64      3000

0.7
오차행렬:
 [[1616   17]
 [1055  312]]

정확도:: 0.6427
정밀도: 0.9483
재현율: 0.2282
F1: 0.3679
AUC: 0.6089
              precision    recall  f1-score   support

         0.0       0.61      0.99      0.75      1633
         1.0       0.95      0.23      0.37      1367

    accuracy                           0.64      3000
   macro avg       0.78      0.61      0.56      3000
weighted avg       0.76      0.64      0.58      3000

0.8
오차행렬:
 [[1633    0]
 [1367    0]]

정확도:: 0.5443
정밀도: 0.0000
재현율: 0.0000
F1: 0.0000
AUC: 0.5000
              precision    recall  f1-score   support

         0.0       0.54      1.00      0.70      1633
         1.0       0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 3.5 AutoML

In [31]:
!pip install pycaret
!pip install jinja
!pip install markupsafe==2.0.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting markupsafe~=2.1.1
  Using cached MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25 kB)
Installing collected packages: markupsafe
  Attempting uninstall: markupsafe
    Found existing installation: MarkupSafe 2.0.1
    Uninstalling MarkupSafe-2.0.1:
      Successfully uninstalled MarkupSafe-2.0.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Successfully installed markupsafe-2.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jinja
  Using cached Jinja-1.2.tar.gz (252 kB)
  Using cached Jinja-1.1.tar.gz (237 kB)
  Using cached Jinja-1.0.tar.gz (120 kB)
  Using c

In [32]:
import warnings
warnings.filterwarnings(action='ignore')
from pycaret.classification import *

In [33]:
y_train.columns = ['target']
train = pd.concat([X_train,y_train],axis=1)
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,target
9839,0.0,0.0,4.0,388.0,10.0,5.0,4.0,0.0,0.0,0.0,15.0,19.0,4.0,9.0,207.0,209.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9680,0.0,0.0,0.0,477.0,3.0,0.0,0.0,0.0,0.0,0.0,10.0,10.0,0.0,0.0,59.0,59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7093,0.0,0.0,0.0,2.0,464.0,0.0,1.0,0.0,0.0,4.0,39.0,39.0,0.0,3.0,188.0,193.0,3.0,0.0,0.0,0.0,0.0,55.0,18.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,304.0,0.0,35.0,4.0,0.0,1.0,1.0,1.0,0.0,149.0,68.0,0.0,0.0
11293,0.0,0.0,0.0,0.0,41.0,0.0,0.0,0.0,0.0,0.0,15.0,15.0,0.0,0.0,157.0,127.0,0.0,0.0,0.0,0.0,0.0,183.0,169.0,11.0,1.0,35.0,0.0,0.0,0.0,0.0,250.0,0.0,14.0,253.0,1.0,1.0,1.0,0.0,0.0,148.0,8.0,3.0,0.0
820,0.0,0.0,0.0,0.0,53.0,0.0,0.0,0.0,0.0,0.0,14.0,14.0,0.0,0.0,152.0,113.0,0.0,0.0,0.0,0.0,0.0,39.0,24.0,0.0,0.0,68.0,0.0,0.0,0.0,0.0,204.0,0.0,14.0,0.0,0.0,1.0,1.0,0.0,0.0,94.0,1.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,0.0,0.0,0.0,0.0,0.0,7.0,8.0,0.0,0.0,1.0,39.0,42.0,3.0,16.0,64.0,30.0,4.0,1.0,1.0,1.0,1.0,1322.0,1588.0,0.0,0.0,65.0,0.0,0.0,0.0,0.0,213.0,0.0,39.0,1.0,4.0,0.0,0.0,0.0,0.0,43.0,1.0,1.0,1.0
13418,0.0,0.0,0.0,1.0,0.0,3.0,4.0,0.0,0.0,0.0,17.0,19.0,3.0,7.0,103.0,61.0,0.0,0.0,0.0,0.0,0.0,55.0,21.0,0.0,0.0,41.0,0.0,0.0,0.0,0.0,109.0,0.0,17.0,4.0,3.0,2.0,2.0,0.0,0.0,14.0,0.0,3.0,1.0
5390,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,33.0,33.0,0.0,0.0,74.0,73.0,0.0,1.0,0.0,1.0,0.0,170.0,125.0,0.0,0.0,52.0,0.0,0.0,0.0,0.0,192.0,4.0,32.0,0.0,1.0,2.0,2.0,0.0,0.0,14.0,0.0,0.0,1.0
860,0.0,0.0,0.0,20.0,124.0,0.0,1.0,0.0,0.0,0.0,17.0,17.0,0.0,3.0,223.0,93.0,0.0,0.0,0.0,0.0,0.0,34.0,15.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0,109.0,0.0,36.0,0.0,0.0,1.0,1.0,0.0,0.0,42.0,0.0,2.0,1.0


In [34]:
clf = setup(data = train, target='target')

Unnamed: 0,Description,Value
0,session_id,4455
1,Target,target
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(12000, 43)"
5,Missing Values,False
6,Numeric Features,42
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [35]:
best_5 = compare_models(sort='Accuracy' , n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7944,0.8042,0.4632,0.7698,0.578,0.4531,0.4788,1.336
rf,Random Forest Classifier,0.7938,0.8039,0.4527,0.778,0.5718,0.4482,0.477,1.763
catboost,CatBoost Classifier,0.7935,0.8054,0.4624,0.7676,0.5767,0.4512,0.4767,11.524
gbc,Gradient Boosting Classifier,0.7919,0.8014,0.4347,0.7868,0.5594,0.4377,0.471,1.725
lightgbm,Light Gradient Boosting Machine,0.788,0.7942,0.466,0.7429,0.572,0.4408,0.4624,0.316
ada,Ada Boost Classifier,0.7852,0.791,0.4413,0.7504,0.5555,0.4263,0.4525,0.486
lr,Logistic Regression,0.7727,0.751,0.3576,0.775,0.4885,0.3672,0.413,2.553
qda,Quadratic Discriminant Analysis,0.7706,0.7424,0.3736,0.7469,0.497,0.3694,0.407,0.054
lda,Linear Discriminant Analysis,0.7683,0.7429,0.3243,0.7913,0.4597,0.3438,0.3994,0.083
ridge,Ridge Classifier,0.7653,0.0,0.3083,0.7955,0.4441,0.3304,0.3902,0.036


In [37]:
top5 = [rank for rank in best_5]
top5_tune = [tune_model(i) for i in top5]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8024,0.8061,0.4745,0.7908,0.5931,0.4732,0.5001
1,0.7905,0.8087,0.4431,0.7687,0.5622,0.4373,0.4659
2,0.7833,0.7939,0.4392,0.7417,0.5517,0.421,0.4461
3,0.7976,0.8032,0.4531,0.7945,0.5771,0.4569,0.488
4,0.7857,0.797,0.4414,0.7533,0.5567,0.4278,0.4544
5,0.7893,0.8059,0.3984,0.816,0.5354,0.4193,0.4644
6,0.8012,0.7919,0.4766,0.7871,0.5937,0.4724,0.4985
7,0.7929,0.8092,0.4336,0.7929,0.5606,0.4399,0.4742
8,0.7976,0.8006,0.4727,0.7756,0.5874,0.4636,0.4885
9,0.7962,0.8146,0.4549,0.7838,0.5757,0.4537,0.4828


In [38]:
blended = blend_models(estimator_list= best_5 , fold= 5, method='soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7964,0.8104,0.4579,0.7826,0.5778,0.4555,0.4839
1,0.797,0.8106,0.4501,0.7931,0.5743,0.454,0.4854
2,0.7964,0.8024,0.4423,0.7986,0.5693,0.45,0.4837
3,0.7875,0.8079,0.4395,0.7627,0.5576,0.4308,0.4592
4,0.7933,0.815,0.4599,0.768,0.5753,0.4499,0.4757
Mean,0.7941,0.8093,0.4499,0.781,0.5708,0.448,0.4776
Std,0.0036,0.0041,0.0081,0.0139,0.0072,0.0089,0.0098


In [39]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7948,0.7974,0.4345,0.7826,0.5588,0.439,0.4713


In [40]:
final_model = finalize_model(blended)

In [41]:
final_model

VotingClassifier(estimators=[('et',
                              ExtraTreesClassifier(bootstrap=False,
                                                   ccp_alpha=0.0,
                                                   class_weight=None,
                                                   criterion='gini',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                 

In [42]:
predictions = predict_model(final_model,data = X_val)

In [43]:
predictions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,Label,Score
11499,0.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,161.0,4.0,0.0,0.0,0.6062
6475,1.0,0.0,1.0,175.0,2.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7882
13167,0.0,0.0,0.0,2.0,14.0,10.0,10.0,0.0,0.0,4.0,...,8.0,0.0,0.0,2.0,0.0,12.0,1.0,0.0,1.0,0.6647
862,0.0,0.0,0.0,2.0,9.0,1.0,1.0,0.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,56.0,0.0,0.0,0.0,0.6578
5970,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,41.0,0.0,0.0,0.0,0.7928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6216,0.0,0.0,4.0,30.0,6.0,4.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9521
3585,0.0,0.0,0.0,283.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8655
10023,0.0,0.0,0.0,0.0,9.0,2.0,2.0,0.0,0.0,0.0,...,0.0,2.0,2.0,0.0,0.0,4.0,0.0,0.0,0.0,0.7539
14044,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,38.0,0.0,5.0,0.0,0.9174


In [62]:
pred = predictions['Label'].astype('float')
get_eval(y_val,pred)

오차행렬:
 [[1543   90]
 [ 748  619]]

정확도:: 0.7207
정밀도: 0.8731
재현율: 0.4528
F1: 0.5963
AUC: 0.6989
              precision    recall  f1-score   support

         0.0       0.67      0.94      0.79      1633
         1.0       0.87      0.45      0.60      1367

    accuracy                           0.72      3000
   macro avg       0.77      0.70      0.69      3000
weighted avg       0.76      0.72      0.70      3000



#### 3.5.2 optimize_threshold


In [63]:
optimized = optimize_threshold(blended)

KeyboardInterrupt: ignored

In [None]:
final_model = finalize_model(optimized)

In [None]:
final_model

In [None]:
predictions = predict_model(final_model,data = X_val)
pred = predictions['Label'].astype('float')
get_eval(y_val,pred)