# 4.2 Build XGBoost Models - Code Brief

Condensed reference for building XGBoost classifiers.

## Setup

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier

## Basic XGBoost Classifier

In [None]:
# One-hot encode categorical features.
# The training set defines the feature space; drop_first removes one
# reference level per categorical column.
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)

# Encode the test set with every level kept, then align it to the training
# columns. Using drop_first here would drop whichever level comes first *in
# the test set*; when that differs from the training reference level, reindex
# would re-create the dropped column filled with 0 and silently encode those
# rows as the reference level. reindex also discards levels seen only in the
# test set and zero-fills levels that appear only in training.
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols)
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Build XGBoost classifier.
# NOTE: use_label_encoder is intentionally not passed. It is a deprecated
# XGBoost parameter; recent releases ignore it and only emit an
# unused-parameter warning.
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss',
)

# Fit on the one-hot encoded training features, then score the test set.
xgb_model.fit(X_train_encoded, y_train)
y_pred = xgb_model.predict(X_test_encoded)
# Keep only column 1 of the probability matrix (the second class, i.e. the
# positive class when labels are 0/1).
y_pred_proba = xgb_model.predict_proba(X_test_encoded)[:, 1]

## XGBoost with Class Weights

In [None]:
# Calculate class imbalance ratio: number of negatives (label 0) per
# positive (label 1).
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
if n_positive == 0:
    # Without this guard the division yields inf/nan and the failure shows
    # up later, far from its cause.
    raise ValueError(
        "y_train contains no positive (label 1) samples; "
        "cannot compute scale_pos_weight"
    )
scale_pos_weight = n_negative / n_positive

# Same hyperparameters as the basic model, plus scale_pos_weight to up-weight
# the positive class. use_label_encoder is intentionally not passed: it is a
# deprecated XGBoost parameter that recent releases ignore with a warning.
xgb_weighted = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss',
)

xgb_weighted.fit(X_train_encoded, y_train)

## XGBoost Pipeline

In [None]:
# Preprocessing: scale numerical columns, one-hot encode categorical columns.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
])

# Route each column group to its transformer. Columns not listed in
# numerical_cols or categorical_cols are not passed through (ColumnTransformer
# is left at its default `remainder` setting).
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Full pipeline: preprocessing followed by the classifier, so the encoder and
# scaler are fitted only on the data passed to fit().
# use_label_encoder is intentionally not passed: it is a deprecated XGBoost
# parameter that recent releases ignore with a warning.
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss',
    )),
])

# Raw (un-encoded) frames go in; the pipeline handles the preprocessing.
xgb_pipeline.fit(X_train, y_train)
y_pred = xgb_pipeline.predict(X_test)

## Cross-Validation

In [None]:
from sklearn.model_selection import cross_validate

# Stratified 5-fold split with a fixed seed so the folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate both metrics in a single pass: each fold's pipeline is fitted once
# and scored with every metric, instead of refitting all folds per metric.
cv_results = cross_validate(
    xgb_pipeline, X_train, y_train, cv=cv, scoring=['accuracy', 'roc_auc']
)
cv_accuracy = cv_results['test_accuracy']
cv_roc_auc = cv_results['test_roc_auc']

print(f"CV Accuracy: {cv_accuracy.mean():.3f} (+/- {cv_accuracy.std():.3f})")
print(f"CV ROC-AUC: {cv_roc_auc.mean():.3f} (+/- {cv_roc_auc.std():.3f})")

## Feature Importance

In [None]:
# Get feature importances (one value per column the model was fitted on).
importances = xgb_model.feature_importances_

# xgb_model was fitted on X_train_encoded, so its columns are the names that
# line up one-to-one with `importances`. Using the pre-encoding column list
# here would mismatch in length or mislabel the rows.
importance_df = pd.DataFrame({
    'Feature': X_train_encoded.columns,
    'Importance': importances
}).sort_values('Importance', ascending=False)

# Different importance types, read from the underlying booster.
# Each call returns a dict keyed by feature name.
# NOTE(review): features never used in a split appear to be omitted from these
# dicts, so they can be shorter than importance_df; confirm against XGBoost docs.
booster = xgb_model.get_booster()
weight_scores = booster.get_score(importance_type='weight')
gain_scores = booster.get_score(importance_type='gain')
cover_scores = booster.get_score(importance_type='cover')

## Key Parameters

| Parameter | Description | Typical Range |
|:----------|:------------|:--------------|
| n_estimators | Number of trees | 100-1000 |
| max_depth | Tree depth | 3-10 |
| learning_rate | Step size | 0.01-0.3 |
| subsample | Row sampling | 0.5-1.0 |
| colsample_bytree | Column sampling | 0.5-1.0 |
| scale_pos_weight | Weight applied to the positive class (for imbalanced data) | 1 (balanced) or n_negative / n_positive |