# Gradient Boosting - XGBoost, LightGBM, Catboost

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# pip install xgboost
# pip install lightgbm
# pip install catboost

In [3]:
# 데이터 불러오기
data = pd.read_csv("./data/otto_train.csv")
data.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [4]:
data = data.drop(['id'], axis = 1) # id 제거

In [5]:
nCar = data.shape[0]
nVar = data.shape[1]
print('nCar: %d' % nCar, 'nVar: %d' % nVar )

nCar: 61878 nVar: 94


## 타겟 변수의 문자열을 숫자로 변환

In [6]:
mapping_dict = {"Class_1": 1,
                "Class_2": 2,
                "Class_3": 3,
                "Class_4": 4,
                "Class_5": 5,
                "Class_6": 6,
                "Class_7": 7,
                "Class_8": 8,
                "Class_9": 9}
after_mapping_target = data['target'].apply(lambda x: mapping_dict[x])

## 설명변수와 타겟변수를 분리, 학습데이터와 평가데이터 분리

In [7]:
feature_columns = list(data.columns.difference(['target']))

X = data[feature_columns]
y = after_mapping_target

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(49502, 93) (12376, 93) (49502,) (12376,)


## 1. XGBoost

In [8]:
import xgboost as xgb
import time

start = time.time() # 시작 시간 지정

# 데이터를 XGBoost 모델에 맞게 변환
xgb_dtrain = xgb.DMatrix(data = train_x, label = train_y) 
xgb_dtest = xgb.DMatrix(data = test_x)

xgb_param = {'max_depth': 10,           # 트리 깊이
             'learning_rate': 0.01,
#              'n_estimators': 200,       # Number of trees, 트리 생성 개수
             'objective': 'multi:softmax', # 목적 함수
             'num_class': len(set(train_y)) + 1} # num_class보다 1 커야 한다.

xgb_model = xgb.train(params = xgb_param, dtrain = xgb_dtrain)
xgb_model_predict = xgb_model.predict(xgb_dtest)

print("Accuracy: %.2f" % (accuracy_score(test_y, xgb_model_predict) * 100), "%")
print("Time: %.2f" % (time.time() - start), "seconds")

Accuracy: 76.67 %
Time: 7.86 seconds


## 2. LightGBM

In [9]:
import lightgbm as lgb

start = time.time()

# 데이터를 LightGBM 모델에 맞게 변환
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) 

lgb_param = {'max_depth': 10,
            'learning_rate': 0.01,
            'n_estimators': 200,
            'objective': 'multiclass',
            'num_class': len(set(train_y)) + 1}

lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain)
lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis = 1) # Softmax의 결과값 중 가장 큰 값의 Label로 예측

print("Accuracy: %.2f" % (accuracy_score(test_y, lgb_model_predict) * 100), "%")
print("Time: %.2f" % (time.time() - start), "seconds")



Accuracy: 76.51 %
Time: 11.15 seconds


In [10]:
lgb_model.predict(test_x)

array([[1.00742333e-15, 2.01394798e-02, 3.34963168e-01, ...,
        3.41210093e-02, 5.26052587e-02, 3.34489234e-02],
       [1.18049982e-15, 6.08644262e-02, 1.92080339e-01, ...,
        3.33882112e-01, 9.32287492e-02, 6.27346305e-02],
       [5.48699431e-16, 8.24077811e-03, 4.87854516e-02, ...,
        1.18961727e-02, 2.86517838e-02, 1.83205951e-02],
       ...,
       [7.07722485e-16, 5.09288655e-02, 1.05914231e-01, ...,
        4.64739495e-02, 7.28664913e-02, 5.92429565e-01],
       [9.72786360e-16, 1.39586784e-02, 5.64948122e-01, ...,
        2.25339300e-02, 5.07965981e-02, 3.33459841e-02],
       [7.18285791e-16, 1.38911876e-02, 6.38635557e-02, ...,
        6.08088158e-01, 1.38520498e-01, 2.39829357e-02]])

## 3. Catboost

In [11]:
import catboost as cb

start = time.time()

# 데이터를 Catboost 모델에 맞게 변환
cb_dtrain = cb.Pool(data = train_x, label = train_y) 

cb_param = {'max_depth': 10,
            'learning_rate': 0.01,
            'n_estimators': 200,
            'verbose': False,
            'eval_metric': 'Accuracy',
            'loss_function': 'MultiClass'} # 손실 함수, 목적 함수

cb_model = cb.train(pool = cb_dtrain, params = cb_param)
cb_model_predict = np.argmax(cb_model.predict(test_x), axis = 1) + 1 # Softmax의 결과값 중 가장 큰 값의 Label로 예측, 인덱스의 순서를 맞추기 위해 +1

print("Accuracy: %.2f" % (accuracy_score(test_y, cb_model_predict) * 100), "%")
print("Time: %.2f" % (time.time() - start), "seconds")

Accuracy: 71.94 %
Time: 105.37 seconds


In [12]:
cb_model.predict(test_x)

array([[-0.48030792,  1.48684732,  0.59484977, ..., -0.20860974,
        -0.02078807, -0.35374442],
       [ 0.02423435,  0.54912819,  0.28776062, ...,  0.53319433,
         0.33994419,  0.29545539],
       [-0.47806849, -0.41467357, -0.40849988, ..., -0.3715223 ,
        -0.17932851, -0.51967478],
       ...,
       [ 0.22805019,  0.0258581 , -0.25227309, ..., -0.38291428,
         0.10353364,  2.36905607],
       [-0.88917216,  2.49396331,  1.58460429, ..., -0.49133902,
        -0.73437045, -0.5330829 ],
       [-0.48004436, -0.00543001, -0.25626999, ...,  1.60927572,
         1.29172517, -0.42076681]])