# XGBOOST, LIGHTGBM, CATBOOST 실습

In [42]:
#! pip install xgboost lightgbm catboost



# preprocessing

In [1]:
import numpy as np
import random 
import warnings
warnings.simplefilter("ignore", UserWarning)
np.random.seed(1)
random.seed(1)

In [2]:
# Load libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
import pandas as pd
from sklearn.metrics import classification_report

In [3]:
filename = '../data/ensemble/pima-indians-diabetes.data.csv'
dataframe = pd.read_csv(filename, header =None)
dataframe.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Class']
dataframe.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
X = dataframe.iloc[:, :-1]
y = dataframe.iloc[:, -1] 

In [5]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [6]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Class, dtype: int64

In [7]:
# 데이터 셋 분할하기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0) 

# xgboost 모델

- 파라미터 설명
    - `max_depth`: tree의 최대높이(클수록 overfitting 경향이 있음)
    - `n_estimators`: boosting에 사용되는 tree 개수(클수록 overfitting 경향이 있음)
    - `learing_rate`: 학습률 (작을수록 overfitting 경향이 있음)
    - `min_child_weight`: 노드의 샘플수를 고려하여 트리분개를 멈추게하는 임계치

In [8]:
import xgboost as xgb
# Parameter Tuning
model = xgb.XGBClassifier()
param_dist = {"max_depth": [8, 16],
              "n_estimators": [8],
              "min_child_weight": [1],
              "learning_rate": [0.01]}

grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3,scoring="accuracy", 
                                   verbose=10, n_jobs=-1)
grid_search.fit(X_train, y_train)



Fitting 3 folds for each of 2 candidates, totalling 6 fits


GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_ca...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                  

In [9]:
grid_search.best_estimator_.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': 0,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.01,
 'max_bin': 256,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0,
 'max_depth': 16,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 8,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'sampling_method': 'uniform',
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

In [10]:
# 1) model 선언
model = xgb.XGBClassifier(**grid_search.best_estimator_.get_params())
# 2) model 학습
model = model.fit(X_train,y_train)

In [11]:
# 3) test 예측
pred_y = model.predict(X_test)
pred_y

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [12]:
print(classification_report(y_test, pred_y))

              precision    recall  f1-score   support

           0       0.83      0.80      0.82       107
           1       0.58      0.62      0.60        47

    accuracy                           0.75       154
   macro avg       0.70      0.71      0.71       154
weighted avg       0.75      0.75      0.75       154



In [13]:
x_gbm_results = classification_report(y_test, pred_y, output_dict=True)

In [14]:
x_gbm_results['1']['f1-score']

0.5979381443298969

In [15]:
model.feature_importances_

array([0.068464  , 0.4327433 , 0.09295039, 0.01331423, 0.0303725 ,
       0.16869941, 0.05060662, 0.14284952], dtype=float32)

In [16]:
var_df = pd.Series(model.feature_importances_, index = dataframe.columns[:-1])
var_df.sort_values(ascending=False)

Glucose                     0.432743
BMI                         0.168699
Age                         0.142850
BloodPressure               0.092950
Pregnancies                 0.068464
DiabetesPedigreeFunction    0.050607
Insulin                     0.030372
SkinThickness               0.013314
dtype: float32

----

# LightGBM 모델

- 파라미터 설명
    - `objective`:목적함수 정의 e.g. binary
    - `max_depth`: tree의 최대높이(클수록 overfitting 경향이 있음)
    - `learing_rate`: 학습률 (작을수록 overfitting 경향이 있음)
    - `num_leaves`: 사용되는 말단노드들의 최대수
    - `n_estimators`: 부스팅에 사용되는 tree의 수
    - `min_data_in_leaf`: 말단노드의 샘플수를 고려하여 트리분개를 멈추게하는 임계치

In [17]:
import lightgbm as lgb

lg = lgb.LGBMClassifier(silent=True)
param_dict = { "objective":['binary'], # multiclass, regression
              "max_depth": [50, 75],
              "learning_rate" : [0.01],
              "num_leaves": [300,900],
              "n_estimators": [100, 200], 
              "min_data_in_leaf": [20]
             }
grid_search = GridSearchCV(lg, n_jobs=-1, param_grid=param_dict, cv = 3, scoring="accuracy")
grid_search.fit(X_train,y_train)



GridSearchCV(cv=3, estimator=LGBMClassifier(silent=True), n_jobs=-1,
             param_grid={'learning_rate': [0.01], 'max_depth': [50, 75],
                         'min_data_in_leaf': [20], 'n_estimators': [100, 200],
                         'num_leaves': [300, 900], 'objective': ['binary']},
             scoring='accuracy')

In [18]:
grid_search.best_estimator_.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.01,
 'max_depth': 50,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 200,
 'n_jobs': -1,
 'num_leaves': 300,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'min_data_in_leaf': 20}

In [19]:
# model 선언
model = lgb.LGBMClassifier( **grid_search.best_estimator_.get_params())
print(model)

LGBMClassifier(learning_rate=0.01, max_depth=50, min_data_in_leaf=20,
               n_estimators=200, num_leaves=300, objective='binary',
               silent=True)


In [20]:
# 모델 학습
model = model.fit(X_train,y_train)

In [21]:
# 예측
pred_y = model.predict(X_test)
pred_y

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [22]:
print(classification_report(y_test, pred_y))

              precision    recall  f1-score   support

           0       0.85      0.88      0.87       107
           1       0.70      0.66      0.68        47

    accuracy                           0.81       154
   macro avg       0.78      0.77      0.77       154
weighted avg       0.81      0.81      0.81       154



In [23]:
light_gbm_results = classification_report(y_test, pred_y, output_dict=True)

In [24]:
light_gbm_results['1']['f1-score']

0.6813186813186813

In [25]:
model.feature_importances_

array([256, 918, 359, 343, 299, 881, 777, 646], dtype=int32)

In [26]:
var_df = pd.Series(model.feature_importances_, index = dataframe.columns[:-1])
var_df.sort_values(ascending=False)

Glucose                     918
BMI                         881
DiabetesPedigreeFunction    777
Age                         646
BloodPressure               359
SkinThickness               343
Insulin                     299
Pregnancies                 256
dtype: int32

- LightGBM모델에서 입력데이터안에 있는 categorical variable처리 방법

In [27]:
# sample data
train_data = [[1, 0, 1, 4, 5, 6],
              [1, 0, 4, 5, 6, 7],
              [0, 1, 30, 40, 50, 60]]
train_labels = [1, 1, -1]
eval_data = [[1, 0, 2, 4, 6, 8],
             [1, 0, 1, 4, 50, 60]]


In [28]:

# Initialize CatBoostClassifier
model = lgb.LGBMClassifier()

# Fit model
model.fit(X = train_data,
          y = train_labels, 
          feature_name = ['c1','c2', 'c3', 'c4', 'c5', 'c6'],
          categorical_feature = ['c1','c2'])
# Get predicted classes
preds_class = model.predict(eval_data)
print('\npreds_class:\n', preds_class)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_data)
print('preds_proba:\n', preds_proba)




preds_class:
 [1 1]
preds_proba:
 [[0.33333333 0.66666667]
 [0.33333333 0.66666667]]


---

## catboost - mac M1 GPU 지원 x

# CatBoost 모델

- 파라미터 설명
    - `depth`: tree의 최대높이, 최대16으로 제한되어 있음(클수록 overfitting 경향이 있음)
    - `num_trees`: 부스팅에 사용되는 tree의 수
    - `learing_rate`: 학습률 (작을수록 overfitting 경향이 있음)
    - `l2_leaf_reg`: 말단노드의 샘플수를 고려하여 penealty를 부여

In [72]:
import catboost as cb

cbm = cb.CatBoostClassifier(silent=True)

params = {'depth': [8, 16],
          'num_trees':[32, 64],
          'learning_rate' : [0.01],
          'l2_leaf_reg': [1]}


grid_search = GridSearchCV(cbm, params, scoring="accuracy", cv = 3)
grid_search.fit(X_train, y_train)


GridSearchCV(cv=3,
             estimator=<catboost.core.CatBoostClassifier object at 0x7f40d9267b38>,
             param_grid={'depth': [8, 16], 'l2_leaf_reg': [1],
                         'learning_rate': [0.01], 'num_trees': [32, 64]},
             scoring='accuracy')

In [73]:
model = cb.CatBoostClassifier(**grid_search.best_estimator_.get_params())

In [74]:
# 모델 학습
model = model.fit(X_train,y_train)

In [75]:
# 3) test 예측
pred_y = model.predict(X_test)
pred_y

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [76]:
print(classification_report(y_test, pred_y))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       107
           1       0.72      0.55      0.63        47

    accuracy                           0.80       154
   macro avg       0.77      0.73      0.74       154
weighted avg       0.79      0.80      0.79       154



In [77]:
catboost_results = classification_report(y_test, pred_y, output_dict=True)

In [78]:
catboost_results['1']['f1-score']

0.6265060240963856

In [79]:
model.feature_importances_

array([ 7.36816535, 35.50668811,  4.1718062 ,  4.39524573,  6.95184403,
       16.53234897,  7.01223369, 18.06166794])

In [80]:
var_df = pd.Series(model.feature_importances_, index = dataframe.columns[:-1])
var_df.sort_values(ascending=False)

Glucose                     35.506688
Age                         18.061668
BMI                         16.532349
Pregnancies                  7.368165
DiabetesPedigreeFunction     7.012234
Insulin                      6.951844
SkinThickness                4.395246
BloodPressure                4.171806
dtype: float64

- Catboost모델에서 입력데이터안에 있는 categorical variable처리 방법

In [81]:
# sample data
cat_features = [0, 1]
train_data = [["a", "b", 1, 4, 5, 6],
              ["a", "b", 4, 5, 6, 7],
              ["c", "d", 30, 40, 50, 60]]
train_labels = [1, 1, -1]
eval_data = [["a", "b", 2, 4, 6, 8],
             ["a", "d", 1, 4, 50, 60]]

In [82]:


# Initialize CatBoostClassifier
model = cb.CatBoostClassifier(iterations=2,
                           learning_rate=1,
                           depth=2)
# Fit model
model.fit(train_data, train_labels, cat_features)
# Get predicted classes
preds_class = model.predict(eval_data)
print('\npreds_class:\n', preds_class)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_data)
print('preds_proba:\n', preds_proba)




0:	learn: 0.5800330	total: 598us	remaining: 598us
1:	learn: 0.4935379	total: 1.24ms	remaining: 0us

preds_class:
 [1 1]
preds_proba:
 [[0.37014499 0.62985501]
 [0.4641579  0.5358421 ]]


---

# Summary

In [83]:
pd.DataFrame({'Acc': [x_gbm_results['accuracy'], 
                      light_gbm_results['accuracy'],
                      catboost_results['accuracy']],
              'f1':[x_gbm_results['1']['f1-score'], 
                          light_gbm_results['1']['f1-score'], 
                          catboost_results['1']['f1-score']]},
             index =['Xgb', 'Lightgbm', 'Catboost'])

Unnamed: 0,Acc,f1
Xgb,0.746753,0.597938
Lightgbm,0.811688,0.681319
Catboost,0.798701,0.626506
