## catboost 공식 문서 참고
# https://catboost.ai/docs/concepts/python-usages-examples.html

# 회귀

In [1]:
from catboost import CatBoostRegressor

# Three training rows of four numeric features, with one target per row.
train_data = [
    [1, 4, 5, 6],
    [4, 5, 6, 7],
    [30, 40, 50, 60],
]
train_labels = [10, 20, 30]

# Rows to predict on once the model is fitted.
eval_data = [
    [2, 4, 6, 8],
    [1, 4, 50, 60],
]

# Deliberately tiny model: two boosting iterations of depth-2 trees.
model = CatBoostRegressor(iterations=2, learning_rate=1, depth=2)
model.fit(X=train_data, y=train_labels)

pred = model.predict(eval_data)

0:	learn: 6.1237244	total: 143ms	remaining: 143ms
1:	learn: 4.5927933	total: 144ms	remaining: 0us


# GPU,CPU에서 학습

In [2]:
from catboost import CatBoostClassifier

# Four rows of two numeric features with binary labels.
train_data = [
    [0, 3],
    [4, 1],
    [8, 1],
    [9, 1],
]
train_labels = [0, 0, 1, 1]

# NOTE(review): `devices` names GPU device IDs in the CatBoost docs;
# presumably it has no effect while task_type is "CPU" -- confirm, and switch
# task_type to "GPU" to actually train on devices 0 through 1.
model = CatBoostClassifier(
    iterations=1000,
    task_type="CPU",
    devices='0:1',
)

# Kept as the cell's last expression so the fitted model is echoed.
model.fit(X=train_data, y=train_labels, verbose=False)

<catboost.core.CatBoostClassifier at 0x16d7e9378b0>

# 이진 분류

In [3]:
from catboost import CatBoostClassifier

# Columns 0 and 1 hold string categories; the remaining columns are numeric.
cat_features = [0, 1]

train_data = [
    ["a", "b", 1, 4, 5, 6],
    ["a", "b", 4, 5, 6, 7],
    ["c", "d", 30, 40, 50, 60],
]
train_labels = [1, 1, 2]

eval_data = [
    ["a", "b", 2, 4, 6, 8],
    ["a", "d", 1, 4, 50, 60],
]

model = CatBoostClassifier(iterations=2, learning_rate=1, depth=2)
model.fit(train_data, train_labels, cat_features=cat_features)

# Three views of the same predictions: labels, probabilities, raw scores.
preds_class = model.predict(eval_data)
preds_proba = model.predict_proba(eval_data)
preds_raw = model.predict(eval_data, prediction_type='RawFormulaVal')

print(preds_proba)
# Last expression of the cell, so the notebook echoes the predicted labels.
preds_class

0:	learn: 0.5800330	total: 280us	remaining: 280us
1:	learn: 0.4935379	total: 741us	remaining: 0us
[[0.62985501 0.37014499]
 [0.5358421  0.4641579 ]]


array([1, 1], dtype=int64)

# Pool을 활용한 데이터 셋 구축

In [4]:
from catboost import CatBoostClassifier, Pool

# Bundle features, labels and per-row weights into a single Pool.
train_data = Pool(
    data=[
        [1, 4, 5, 6],
        [4, 5, 6, 7],
        [30, 40, 50, 60],
    ],
    label=[1, 1, -1],
    weight=[0.1, 0.2, 0.3],
)

model = CatBoostClassifier(iterations=10)

# The Pool already carries the labels, so no separate y is passed.
model.fit(X=train_data)
preds_class = model.predict(train_data)

# Last expression of the cell: the notebook echoes the Pool object.
train_data

Learning rate set to 0.058839
0:	learn: 0.6879920	total: 373us	remaining: 3.36ms
1:	learn: 0.6815428	total: 622us	remaining: 2.49ms
2:	learn: 0.6765119	total: 984us	remaining: 2.3ms
3:	learn: 0.6715373	total: 1.33ms	remaining: 2ms
4:	learn: 0.6653022	total: 1.73ms	remaining: 1.73ms
5:	learn: 0.6591482	total: 2.06ms	remaining: 1.37ms
6:	learn: 0.6543562	total: 2.55ms	remaining: 1.09ms
7:	learn: 0.6496176	total: 2.93ms	remaining: 732us
8:	learn: 0.6436669	total: 3.29ms	remaining: 365us
9:	learn: 0.6377932	total: 3.62ms	remaining: 0us


<catboost.core.Pool at 0x16d7e946520>

# 다중 분류

In [5]:
from catboost import Pool, CatBoostClassifier

# Column 0 (the season) is the only categorical feature.
cat_features = [0]

train_data = [
    ["summer", 1924, 44],
    ["summer", 1932, 37],
    ["winter", 1980, 37],
    ["summer", 2012, 204],
]
train_label = ["France", "USA", "USA", "UK"]

eval_data = [
    ["winter", 1996, 197],
    ["winter", 1968, 37],
    ["summer", 2002, 77],
    ["summer", 1948, 59],
]
eval_label = ["USA", "France", "USA", "UK"]

train_dataset = Pool(data=train_data, label=train_label, cat_features=cat_features)
eval_dataset = Pool(data=eval_data, label=eval_label, cat_features=cat_features)

# Labels take three distinct string values, hence the MultiClass loss.
model = CatBoostClassifier(
    iterations=10,
    learning_rate=1,
    depth=2,
    loss_function='MultiClass',
)
model.fit(X=train_dataset)

preds_class = model.predict(eval_dataset)
preds_proba = model.predict_proba(eval_dataset)
preds_raw = model.predict(eval_dataset, prediction_type='RawFormulaVal')

# One result per line block: labels, then probabilities, then raw scores.
print(preds_class, preds_proba, preds_raw, sep="\n")

0:	learn: 0.9417331	total: 183us	remaining: 1.65ms
1:	learn: 0.8421839	total: 466us	remaining: 1.86ms
2:	learn: 0.6597822	total: 616us	remaining: 1.44ms
3:	learn: 0.6028493	total: 811us	remaining: 1.22ms
4:	learn: 0.4900112	total: 1.03ms	remaining: 1.03ms
5:	learn: 0.4076408	total: 1.25ms	remaining: 834us
6:	learn: 0.3458205	total: 1.45ms	remaining: 623us
7:	learn: 0.2982687	total: 1.66ms	remaining: 415us
8:	learn: 0.2608927	total: 1.87ms	remaining: 207us
9:	learn: 0.2309514	total: 2.06ms	remaining: 0us
[['USA']
 ['USA']
 ['UK']
 ['USA']]
[[0.20060959 0.2862616  0.51312881]
 [0.07388963 0.06071726 0.86539311]
 [0.27590481 0.46474219 0.259353  ]
 [0.2580995  0.1213261  0.6205744 ]]
[[-0.43157053 -0.07602515  0.50759567]
 [-0.75475564 -0.95110009  1.70585572]
 [-0.15318701  0.36823989 -0.21505288]
 [-0.04081236 -0.7956756   0.83648797]]


# best_score 구하기

In [6]:
from catboost import CatBoostClassifier, Pool

train_data = [
    [0, 3],
    [4, 1],
    [8, 1],
    [9, 1],
]
train_labels = [0, 0, 1, 1]

eval_data = [
    [2, 1],
    [3, 1],
    [9, 0],
    [5, 3],
]
eval_labels = [0, 1, 1, 0]

# Validation data is wrapped in a Pool and handed to fit() as eval_set.
eval_dataset = Pool(data=eval_data, label=eval_labels)

# Track Logloss and AUC; the AUC hint asks for it on the training set too.
tracked_metrics = ['Logloss', 'AUC:hints=skip_train~false']
model = CatBoostClassifier(learning_rate=0.03, custom_metric=tracked_metrics)

model.fit(X=train_data, y=train_labels, eval_set=eval_dataset, verbose=False)

# Best value of each tracked metric, per dataset.
print(model.get_best_score())

{'learn': {'Logloss': 0.005758294697120604, 'AUC': 1.0}, 'validation': {'Logloss': 0.5366281810311608, 'AUC': 1.0}}


# 최적의 iteration 값 구하기

In [7]:
from catboost import CatBoostClassifier, Pool

train_data = [
    [0, 3],
    [4, 1],
    [8, 1],
    [9, 1],
]
train_labels = [0, 0, 1, 1]

eval_data = [
    [2, 1],
    [3, 1],
    [9, 0],
    [5, 3],
]
eval_labels = [0, 1, 1, 0]

# Validation data is wrapped in a Pool and handed to fit() as eval_set.
eval_dataset = Pool(data=eval_data, label=eval_labels)

# AUC is the evaluation metric for this model.
model = CatBoostClassifier(eval_metric='AUC')

model.fit(X=train_data, y=train_labels, eval_set=eval_dataset, verbose=False)

print(model.get_best_iteration())

8


# 범주형 데이터를 포함한 데이터 셋

In [8]:
from catboost import Pool

# Indices of the columns to treat as categorical (column 2 holds integers).
cat_features = [0, 1, 2]

data = [
    ["a", "b", 1, 4, 5, 6],
    ["a", "b", 4, 5, 6, 7],
    ["c", "d", 30, 40, 50, 60],
]
label = [1, 1, -1]

dataset = Pool(data=data, label=label, cat_features=cat_features)

# 범주형 데이터가 없는 데이터 셋

In [9]:
from catboost import Pool

# Purely numeric features, so no cat_features argument is given.
data = [
    [1, 4, 5, 6],
    [4, 5, 6, 7],
    [30, 40, 50, 60],
]
label = [1, 1, -1]

dataset = Pool(data=data, label=label)

# 레이블이 없는 데이터 셋(예측용)

In [10]:
from catboost import Pool

# Features only: a Pool built without labels.
data = [
    [1, 4, 5, 6],
    [4, 5, 6, 7],
    [30, 40, 50, 60],
]

dataset = Pool(data=data)

# pandas 활용한 데이터 셋 구축

In [11]:
import numpy as np
import pandas as pd

import catboost as cb

# Feature matrix as a DataFrame: five rows, columns 'a', 'b' and 'c'.
X = pd.DataFrame(
    data={
        'a': [1, 4, 0, 0, 1],
        'b': [0, 0, 1, 7, 8],
        'c': [30, 0, 0, 50, 0],
    }
)

# One binary label per DataFrame row.
y = np.array([0, 1, 0, 1, 1])

dataset = cb.Pool(data=X, label=y)

# Last expression of the cell: the notebook echoes the Pool object.
dataset

<catboost.core.Pool at 0x16d7e9754c0>

In [12]:
import numpy as np
import pandas as pd

import catboost as cb

# Same idea as a dense DataFrame, but every column is a pandas SparseArray.
# Column 'd' holds strings and uses '' as its fill value.
X = pd.DataFrame(
    data={
        'a': pd.arrays.SparseArray([1, 4, 0, 0, 1]),
        'b': pd.arrays.SparseArray([0.0, 0.0, 1.0, 7.0, 8.0]),
        'c': pd.arrays.SparseArray([30, 0, 0, 50, 0]),
        'd': pd.arrays.SparseArray(['a', 'b', '', 'c', ''], fill_value=''),
    }
)

# One binary label per DataFrame row.
y = np.array([0, 1, 0, 1, 1])

# The categorical column is referenced by name rather than by index.
dataset = cb.Pool(data=X, label=y, cat_features=['d'])

# Bare expression in the middle of the cell; it is not the last one.
dataset

# Last expression of the cell: the notebook renders the DataFrame.
X

Unnamed: 0,a,b,c,d
0,1,0.0,30,a
1,4,0.0,0,b
2,0,1.0,0,
3,0,7.0,50,c
4,1,8.0,0,


# Pool 타입 슬라이싱

In [13]:
from catboost import Pool

# Five rows of two numeric features.
data = [
    [1, 3],
    [0, 4],
    [1, 7],
    [6, 4],
    [5, 3],
]

dataset = Pool(data=data)
print(dataset.num_row())

# Take the first three rows by index into a new Pool.
dataset_part = dataset.slice(list(range(3)))
print(dataset_part.num_row())

5
3


# catboost 교차 검증 수행

In [14]:
from catboost import Pool, cv

# Six rows: a country name (categorical), then two numeric columns.
cv_data = [["France", 1924, 44],
           ["USA", 1932, 37],
           ["Switzerland", 1928, 25],
           ["Norway", 1952, 30],
           ["Japan", 1972, 35],
           ["Mexico", 1968, 112]]

# One binary label per row.
labels = [1, 1, 0, 0, 0, 1]

# Column 0 (the country name) is the only categorical feature.
cat_features = [0]

cv_dataset = Pool(data=cv_data,
                  label=labels,
                  cat_features=cat_features)

# Training parameters shared by every fold.
params = {"iterations": 100,
          "depth": 2,
          "loss_function": "Logloss",
          "verbose": False}

# `plot` is a boolean flag. The original passed the string "True", which only
# worked because any non-empty string is truthy ("False" would have enabled
# plotting as well), so pass a real bool instead.
scores = cv(cv_dataset,
            params,
            fold_count=2,
            plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))