# 4.3 Build LightGBM and CatBoost Models - Code Brief

Condensed reference for LightGBM and CatBoost classifiers.

## Setup

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

## LightGBM Classifier

In [None]:
# Integer-encode categoricals for LightGBM.
# OrdinalEncoder is the feature-side encoder; LabelEncoder is intended for
# target labels and raises ValueError on test categories that were not seen
# during fit. Unseen test categories are mapped to -1 here, and LightGBM
# treats negative values in a categorical feature as missing.
X_train_lgb = X_train.copy()
X_test_lgb = X_test.copy()

# Skip encoding entirely when there are no categorical columns
# (OrdinalEncoder cannot be fitted on zero features).
if len(categorical_cols) > 0:
    lgb_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    train_codes = lgb_encoder.fit_transform(X_train_lgb[categorical_cols])
    test_codes = lgb_encoder.transform(X_test_lgb[categorical_cols])
    # Assign one column at a time so each column is replaced outright and
    # takes the numeric dtype of the encoded values.
    for i, col in enumerate(categorical_cols):
        X_train_lgb[col] = train_codes[:, i]
        X_test_lgb[col] = test_codes[:, i]

# Build LightGBM classifier
lgb_model = LGBMClassifier(
    n_estimators=100,
    max_depth=-1,          # no depth limit; tree size is governed by num_leaves
    num_leaves=31,
    learning_rate=0.1,
    subsample=0.8,
    subsample_freq=1,      # row subsampling is only applied when this is > 0
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

# Declare the encoded columns as categorical so LightGBM uses its native
# categorical splits instead of treating the codes as ordered numbers.
lgb_model.fit(X_train_lgb, y_train, categorical_feature=categorical_cols)
y_pred = lgb_model.predict(X_test_lgb)

## LightGBM Pipeline

In [None]:
# Preprocessing for LightGBM: scale numeric columns and integer-encode
# categorical columns. Categories unseen during fit are encoded as -1, which
# LightGBM treats as missing once the column is declared categorical.
preprocessor_lgb = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols)
])

# The transformer emits a plain array: numeric columns first, then the
# categorical ones. Column names are gone at that point, so the categorical
# columns are identified to LightGBM by position.
lgb_cat_feature_indices = list(
    range(len(numerical_cols), len(numerical_cols) + len(categorical_cols))
)

lgb_pipeline = Pipeline([
    ('preprocessor', preprocessor_lgb),
    ('classifier', LGBMClassifier(
        n_estimators=100,
        num_leaves=31,
        learning_rate=0.1,
        random_state=42,
        verbose=-1
    ))
])

# Route `categorical_feature` to the classifier's fit(); without it the
# ordinal codes would be split on as if they were ordered numeric values.
lgb_pipeline.fit(
    X_train,
    y_train,
    classifier__categorical_feature=lgb_cat_feature_indices
)

## CatBoost Classifier

In [None]:
# String-valued categorical columns are handed to CatBoost as-is;
# no manual encoding step is needed.
cat_model = CatBoostClassifier(
    cat_features=categorical_cols,
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_state=42,
    verbose=0,
)
cat_model.fit(X_train, y_train)

# Predicted class labels, followed by the probability column at index 1.
y_pred = cat_model.predict(X_test)
y_pred_proba = cat_model.predict_proba(X_test)[:, 1]

## CatBoost Pipeline

In [None]:
# Scale the numeric columns and forward the categorical columns unchanged.
preprocessor_cat = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', 'passthrough', categorical_cols),
])

# In the transformer output the numeric columns come first, so the
# categorical columns occupy the positions immediately after them.
n_numerical = len(numerical_cols)
cat_feature_indices = [
    n_numerical + offset for offset in range(len(categorical_cols))
]

cat_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_cat),
    ('classifier', CatBoostClassifier(
        cat_features=cat_feature_indices,
        iterations=100,
        learning_rate=0.1,
        depth=6,
        random_state=42,
        verbose=0,
    )),
])

cat_pipeline.fit(X_train, y_train)

## Library Comparison

| Aspect | XGBoost | LightGBM | CatBoost |
|:-------|:--------|:---------|:---------|
| Speed | Fast | Very Fast | Fast |
| Categorical | Requires encoding | Native (integer) | Native (strings OK) |
| Tree Growth | Level-wise | Leaf-wise | Symmetric |
| Best For | General purpose | Large datasets | Categorical features |

## Key Parameters

**LightGBM:**
- `num_leaves`: Max leaves per tree (default 31)
- `max_depth`: Maximum tree depth; -1 (the default) means no limit

**CatBoost:**
- `iterations`: Number of trees
- `depth`: Tree depth
- `cat_features`: List of categorical columns, given as column names or as positional indices