# Gradient Boosting

In [1]:
import pandas as pd

import lightgbm as lgb
import xgboost as xgb
import catboost
from catboost import CatBoost

from ydata_profiling import ProfileReport

In [2]:
print(f"lightgbm: {lgb.__version__}")
print(f"xgb: {xgb.__version__}")
print(f"catboost: {catboost.__version__}")

lightgbm: 4.5.0
xgb: 2.1.1
catboost: 1.2.5


## Impact of Null

In [3]:
data = pd.read_csv("~/data/german_credit_data.csv", index_col="Unnamed: 0")
data.columns = [
    "age",
    "sex",
    "job",
    "housing",
    "saving_accounts",
    "checking_account",
    "credit_amount",
    "duration",
    "purpose"
]

In [4]:
ProfileReport(data)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [5]:
numerical_features = [
    "age",
    "duration",
]

categorical_features = [
    "sex",
    "job",
    "housing",
    "saving_accounts", #Missing
    "checking_account", #Missing
    "purpose"
]

target = "credit_amount"

features = numerical_features + categorical_features

In [6]:
data[categorical_features] = data[categorical_features].astype('category')

In [7]:
from sklearn.model_selection import train_test_split

X = data[features]
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
categorical_columns = [X.columns.get_loc(c) for c in categorical_features]

train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_columns)
test_data = lgb.Dataset(X_test, label=y_test, categorical_feature=categorical_columns)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 100
}

lgb_model = lgb.train(params, train_data, valid_sets=[test_data])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 105
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 8
[LightGBM] [Info] Start training from score 3360.618750




In [9]:
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

# Definindo os parâmetros
params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 100
}

# Treinando o modelo
xgb_model = xgb.train(params, dtrain, evals=[(dtest, 'eval')], early_stopping_rounds=10)

[0]	eval-rmse:2444.25512
[1]	eval-rmse:2384.10360
[2]	eval-rmse:2325.20771
[3]	eval-rmse:2274.71491
[4]	eval-rmse:2223.98086
[5]	eval-rmse:2183.06771
[6]	eval-rmse:2144.75751
[7]	eval-rmse:2111.00714
[8]	eval-rmse:2080.52088
[9]	eval-rmse:2052.14169


Parameters: { "n_estimators" } are not used.



In [10]:

cat_features_indices = [X.columns.get_loc(col) for col in categorical_features]

catboost_model = CatBoost(
    params = {
        "learning_rate":0.05,
        "depth":6,
        "cat_features":cat_features_indices,
        "loss_function":'RMSE'
    }
)

catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10)

CatBoostError: Invalid type for cat_feature[object_idx=3,feature_idx=5]=NaN : cat_features must be integer or string, real number values and NaN values should be converted to string.

In [11]:
from sklearn.metrics import mean_squared_error
import numpy as np

# LightGBM
lgb_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
lgb_rmse = np.sqrt(mean_squared_error(y_test, lgb_pred))

# XGBoost
xgb_pred = xgb_model.predict(dtest)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))

# # CatBoost
# catboost_pred = catboost_model.predict(X_test)
# catboost_rmse = np.sqrt(mean_squared_error(y_test, catboost_pred))

print(f'LightGBM RMSE: {lgb_rmse}')
print(f'XGBoost RMSE: {xgb_rmse}')
# print(f'CatBoost RMSE: {catboost_rmse}')

LightGBM RMSE: 1953.8570889224552
XGBoost RMSE: 2052.1416957523224
