XGBoost for Classification

!pip install xgboost

In [1]:
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train classifier
classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

roc_auc_score_value = roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])
print(f"ROC AUC Score: {roc_auc_score_value:.4f}")

Accuracy: 0.9561
ROC AUC Score: 0.9912




XGBoost with Hyperparametric Tuning: 
RandomizedSearchCV to narrow the region, then GridSearchCV in that narrowed region for fine-tuning

In [2]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from scipy.stats import uniform, randint

# RandomizedSearchCV: Broad search
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

param_dist = {
    'n_estimators': randint(50, 300),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 7),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 0.5)
}

random_search = RandomizedSearchCV(
    xgb_clf, param_distributions=param_dist,
    n_iter=20, scoring='roc_auc', cv=3, verbose=1, random_state=42
)
random_search.fit(X_train, y_train)

print("Best params from RandomizedSearchCV:", random_search.best_params_)

# GridSearchCV: Fine-tune in narrowed region
narrowed_params = {
    'n_estimators': [random_search.best_params_['n_estimators'] - 20, 
                     random_search.best_params_['n_estimators'], 
                     random_search.best_params_['n_estimators'] + 20],
    'learning_rate': [random_search.best_params_['learning_rate'] * f for f in [0.8, 1.0, 1.2]],
    'max_depth': [max(1, random_search.best_params_['max_depth'] - 1), 
                 random_search.best_params_['max_depth'], 
                 random_search.best_params_['max_depth'] + 1],
}

grid_search = GridSearchCV(
    xgb_clf, param_grid=narrowed_params,
    scoring='roc_auc', cv=3, verbose=1
)
grid_search.fit(X_train, y_train)

print("Best params from GridSearchCV:", grid_search.best_params_)

# Evaluate on test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print(f"Final Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Final Test ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")



Fitting 3 folds for each of 20 candidates, totalling 60 fits




Best params from RandomizedSearchCV: {'colsample_bytree': 0.7693605922825478, 'gamma': 0.19744075908778486, 'learning_rate': 0.09804645241541143, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 292, 'subsample': 0.8827429375390468}
Fitting 3 folds for each of 27 candidates, totalling 81 fits




Best params from GridSearchCV: {'learning_rate': 0.11765574289849372, 'max_depth': 8, 'n_estimators': 312}
Final Test Accuracy: 0.9649
Final Test ROC AUC: 0.9934


Incase of High Dimensional data: Using PCA to reduce dimensions

In [3]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Simulate classification dataset: 12000 samples, 208 features, 2 classes
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=12000, n_features=208, n_informative=30,
                           n_redundant=10, n_classes=2, random_state=42)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build pipeline: Scale -> PCA -> Classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=50)),  # Reducing dimensions
    ('xgb', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])

# Grid Search for hyperparameter tuning
param_grid = {
    'pca__n_components': [30, 50, 80],
    'xgb__n_estimators': [50, 100],
    'xgb__max_depth': [3, 5],
    'xgb__learning_rate': [0.05, 0.1],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Evaluate best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Best Parameters:", grid_search.best_params_)




Fitting 3 folds for each of 24 candidates, totalling 72 fits




Test Accuracy: 0.8304
Best Parameters: {'pca__n_components': 30, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__n_estimators': 100}


Save the model

In [4]:
import joblib

# Save the model
joblib.dump(best_model, "xgb_pca_classifier_pipeline.pkl")
print("Model saved as 'xgb_pca_classifier_pipeline.pkl'")

Model saved as 'xgb_pca_classifier_pipeline.pkl'


KFold for Class Imbalance Problem:

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report

# Calculate class imbalance ratio
pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)
print(f"Calculated scale_pos_weight: {pos_weight:.2f}")

# Define base model for feature selection
base_selector_model = xgb.XGBClassifier(
    objective='binary:logistic', 
    eval_metric='logloss', 
    use_label_encoder=False,
    scale_pos_weight=pos_weight,
    random_state=42
)

# Build pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_select', SelectFromModel(base_selector_model, threshold="median")),
    ('xgb', xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        scale_pos_weight=pos_weight,
        random_state=42
    ))
])

# Stratified K-Fold
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid search parameters
param_grid = {
    'xgb__n_estimators': [100, 150],
    'xgb__max_depth': [3, 5],
    'xgb__learning_rate': [0.05, 0.1]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=stratified_cv, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Evaluate model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Best Params:", grid_search.best_params_)


Calculated scale_pos_weight: 1.02
Fitting 5 folds for each of 8 candidates, totalling 40 fits




Accuracy: 0.9254
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.93      0.92      1170
           1       0.93      0.92      0.93      1230

    accuracy                           0.93      2400
   macro avg       0.93      0.93      0.93      2400
weighted avg       0.93      0.93      0.93      2400

Best Params: {'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__n_estimators': 150}


SMOTE for Class Imbalance

In [8]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Define base model for feature selection
selector_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# Create pipeline: Scaling → SMOTE → Feature Selection → XGBoost
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('feature_select', SelectFromModel(selector_model, threshold="median")),
    ('xgb', xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    ))
])

# Set up cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define hyperparameter grid
param_grid = {
    'xgb__n_estimators': [100, 150],
    'xgb__max_depth': [3, 5],
    'xgb__learning_rate': [0.05, 0.1]
}

# Perform Grid Search with CV
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Evaluate best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Best Parameters:", grid_search.best_params_)

ImportError: cannot import name '_print_elapsed_time' from 'sklearn.utils' (c:\Users\varun\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\__init__.py)

Use the model in future

In [6]:
# Load model
loaded_model = joblib.load("xgb_pca_classifier_pipeline.pkl")

# Predict
preds = loaded_model.predict(X_test)

# Evaluate
from sklearn.metrics import classification_report
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.81      0.86      0.83      1170
           1       0.85      0.81      0.83      1230

    accuracy                           0.83      2400
   macro avg       0.83      0.83      0.83      2400
weighted avg       0.83      0.83      0.83      2400

