In [1]:
!pip install catboost xgboost lightgbm



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing, load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, balanced_accuracy_score

from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier

In [3]:

data = fetch_california_housing()

In [4]:
x = pd.DataFrame(data['data'], columns=data['feature_names'])
y = data['target']

x_train, x_test, y_train, y_test = train_test_split(x, y)

x_train.shape, x_test.shape

((15480, 8), (5160, 8))

Дока по XGBoost
https://xgboost.readthedocs.io/en/stable/parameter.html

In [5]:
xgb = XGBRegressor(n_estimators=1000, eval_metric='rmse')
xgb.fit(x_train, y_train)
y_pred = xgb.predict(x_test)
r2_score(y_test, y_pred)

0.827838341518788

Дока по LGBM
https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html

In [6]:
lgbm = LGBMRegressor(n_estimators=1000)
lgbm.fit(x_train, y_train)
y_pred = lgbm.predict(x_test)
r2_score(y_test, y_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 15480, number of used features: 8
[LightGBM] [Info] Start training from score 2.063820


0.8496507690369028

Лекция про CatBoost
https://www.youtube.com/watch?v=UYDwhuyWYSo


Дока по катбусту в докстринге

In [7]:
cb = CatBoostRegressor(eval_metric='R2')
cb.fit(
    x_train, y_train,
    eval_set=(x_test, y_test),
    plot_file='model_fit.html',

)

Learning rate set to 0.078287
0:	learn: 0.0854330	test: 0.0848044	best: 0.0848044 (0)	total: 58.3ms	remaining: 58.2s
1:	learn: 0.1601609	test: 0.1582734	best: 0.1582734 (1)	total: 64.3ms	remaining: 32.1s
2:	learn: 0.2233984	test: 0.2212782	best: 0.2212782 (2)	total: 68.7ms	remaining: 22.8s
3:	learn: 0.2776539	test: 0.2740489	best: 0.2740489 (3)	total: 76ms	remaining: 18.9s
4:	learn: 0.3271087	test: 0.3216684	best: 0.3216684 (4)	total: 86.3ms	remaining: 17.2s
5:	learn: 0.3684623	test: 0.3624682	best: 0.3624682 (5)	total: 91.5ms	remaining: 15.2s
6:	learn: 0.4043690	test: 0.3974958	best: 0.3974958 (6)	total: 96.3ms	remaining: 13.7s
7:	learn: 0.4377984	test: 0.4306318	best: 0.4306318 (7)	total: 102ms	remaining: 12.7s
8:	learn: 0.4677645	test: 0.4607234	best: 0.4607234 (8)	total: 107ms	remaining: 11.7s
9:	learn: 0.4932692	test: 0.4858418	best: 0.4858418 (9)	total: 115ms	remaining: 11.4s
10:	learn: 0.5166820	test: 0.5085133	best: 0.5085133 (10)	total: 122ms	remaining: 10.9s
11:	learn: 0.5381

<catboost.core.CatBoostRegressor at 0x79935065d990>

In [8]:
digits = load_digits()
x = pd.DataFrame(digits['data'], columns=digits['feature_names'])
y = digits['target']

x_train, x_test, y_train, y_test = train_test_split(x, y)

x_train.shape, x_test.shape

((1347, 64), (450, 64))

In [9]:
cb = CatBoostClassifier(loss_function='MultiClass')
cb.fit(
    x_train, y_train,
    eval_set=(x_test, y_test),
    plot_file='model_fit.html',

)

Learning rate set to 0.108621
0:	learn: 2.0384180	test: 2.0570200	best: 2.0570200 (0)	total: 28.2ms	remaining: 28.1s
1:	learn: 1.8396402	test: 1.8769228	best: 1.8769228 (1)	total: 55.7ms	remaining: 27.8s
2:	learn: 1.6641574	test: 1.7196247	best: 1.7196247 (2)	total: 82.5ms	remaining: 27.4s
3:	learn: 1.5134865	test: 1.5703738	best: 1.5703738 (3)	total: 113ms	remaining: 28.3s
4:	learn: 1.4056813	test: 1.4726460	best: 1.4726460 (4)	total: 140ms	remaining: 27.9s
5:	learn: 1.2897556	test: 1.3637873	best: 1.3637873 (5)	total: 170ms	remaining: 28.1s
6:	learn: 1.2052785	test: 1.2811754	best: 1.2811754 (6)	total: 202ms	remaining: 28.7s
7:	learn: 1.1195872	test: 1.1999126	best: 1.1999126 (7)	total: 229ms	remaining: 28.4s
8:	learn: 1.0518426	test: 1.1385508	best: 1.1385508 (8)	total: 256ms	remaining: 28.2s
9:	learn: 0.9853233	test: 1.0772872	best: 1.0772872 (9)	total: 285ms	remaining: 28.3s
10:	learn: 0.9264013	test: 1.0183854	best: 1.0183854 (10)	total: 311ms	remaining: 28s
11:	learn: 0.8738498	

<catboost.core.CatBoostClassifier at 0x79935065ed40>

In [10]:
accuracy_score(y_test, cb.predict(x_test))

0.9911111111111112

In [11]:
xgb = XGBClassifier(n_estimators=1000)
xgb.fit(x_train, y_train)
y_pred = xgb.predict(x_test)
accuracy_score(y_test, y_pred)

0.9666666666666667

In [12]:
lgbm = LGBMClassifier(n_estimators=500)
lgbm.fit(x_train, y_train)
y_pred = lgbm.predict(x_test)
accuracy_score(y_test, y_pred)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000540 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 831
[LightGBM] [Info] Number of data points in the train set: 1347, number of used features: 53
[LightGBM] [Info] Start training from score -2.322833
[LightGBM] [Info] Start training from score -2.263993
[LightGBM] [Info] Start training from score -2.278381
[LightGBM] [Info] Start training from score -2.292980
[LightGBM] [Info] Start training from score -2.307795
[LightGBM] [Info] Start training from score -2.256875
[LightGBM] [Info] Start training from score -2.285654
[LightGBM] [Info] Start training from score -2.345823
[LightGBM] [Info] Start training from score -2.385354
[LightGBM] [Info] Start training from score -2.292980


0.9777777777777777

In [13]:
col_names =['buying price', 'maintenance cost', 'number of doors',
            'number of persons', 'lug_boot' ,'safety', 'decision']

df = pd.read_csv('car_evaluation.csv', header=None, names=col_names)
x = df.drop('decision', axis=1)
y = df['decision']
x_train, x_test, y_train, y_test = train_test_split(x, y)

y_mapper = {
    'unacc': 0,
    'acc': 1,
    'good': 2,
    'vgood': 3
}

x_train_oh = pd.get_dummies(x_train)
x_test_oh = pd.get_dummies(x_test)
y_train_processed = y_train.map(y_mapper)
y_test_processed = y_test.map(y_mapper)

x_train.shape, x_test.shape, x_train_oh.shape, x_test_oh.shape

((1296, 6), (432, 6), (1296, 21), (432, 21))

In [14]:
cb = CatBoostClassifier(n_estimators=1000, loss_function='MultiClassOneVsAll', one_hot_max_size=5)
cb.fit(
    x_train, y_train,
    cat_features=x.columns.tolist(),
    eval_set=(x_test, y_test),
    plot_file='model_fit.html',

)
balanced_accuracy_score(y_test, cb.predict(x_test))

0:	learn: 0.6763781	test: 0.6761132	best: 0.6761132 (0)	total: 1.13ms	remaining: 1.13s
1:	learn: 0.6574661	test: 0.6573999	best: 0.6573999 (1)	total: 2.79ms	remaining: 1.39s
2:	learn: 0.6421170	test: 0.6424372	best: 0.6424372 (2)	total: 4.14ms	remaining: 1.38s
3:	learn: 0.6281117	test: 0.6286862	best: 0.6286862 (3)	total: 5.36ms	remaining: 1.33s
4:	learn: 0.6114375	test: 0.6120706	best: 0.6120706 (4)	total: 6.38ms	remaining: 1.27s
5:	learn: 0.5972605	test: 0.5985274	best: 0.5985274 (5)	total: 7.48ms	remaining: 1.24s
6:	learn: 0.5849168	test: 0.5865088	best: 0.5865088 (6)	total: 8.46ms	remaining: 1.2s
7:	learn: 0.5712004	test: 0.5733995	best: 0.5733995 (7)	total: 9.46ms	remaining: 1.17s
8:	learn: 0.5571417	test: 0.5594428	best: 0.5594428 (8)	total: 10.4ms	remaining: 1.14s
9:	learn: 0.5442609	test: 0.5466698	best: 0.5466698 (9)	total: 11.4ms	remaining: 1.13s
10:	learn: 0.5318864	test: 0.5344302	best: 0.5344302 (10)	total: 12.5ms	remaining: 1.12s
11:	learn: 0.5212882	test: 0.5240765	best:

0.9824019429887025

In [15]:
xgb = XGBClassifier(n_estimators=100)
xgb.fit(x_train_oh, y_train_processed)
y_pred = xgb.predict(x_test_oh)
balanced_accuracy_score(y_test_processed, y_pred)

0.9832408691631993

In [16]:
lgbm = LGBMClassifier(n_estimators=100)
lgbm.fit(x_train_oh, y_train)
y_pred = lgbm.predict(x_test_oh)
balanced_accuracy_score(y_test, y_pred)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 42
[LightGBM] [Info] Number of data points in the train set: 1296, number of used features: 21
[LightGBM] [Info] Start training from score -1.528683
[LightGBM] [Info] Start training from score -3.295837
[LightGBM] [Info] Start training from score -0.351398
[LightGBM] [Info] Start training from score -3.159705


1.0