In [None]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import xgboost as xgb
# Dataset
from sklearn import datasets
# Data processing
import numpy as np
# Standardize the data
from sklearn.preprocessing import StandardScaler
# Model and performance evaluation
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support as score
# Hyperparameter tuning
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval
from sklearn.metrics import accuracy_score

In [None]:
X = pd.read_csv("x_train.csv")
y = pd.read_csv("y_train.csv")

#### XGBoost Classifier With No Hyperparameter Tuning

https://medium.com/grabngoinfo/hyperparameter-tuning-for-xgboost-91449869c57e
https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning


In [None]:
# Initiate XGBoost Classifier
xgboost = XGBClassifier()
# Print default setting
xgboost.get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [None]:
# Encode y to numeric
y_encoded = OrdinalEncoder().fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
print(f'The training dataset has {len(X_train)} records.')
print(f'The testing dataset has {len(X_test)} records.')

The training dataset has 4380 records.
The testing dataset has 1096 records.


In [None]:
# Train the model
xgboost = XGBClassifier(seed=0).fit(X_train,y_train)
# Make prediction
y_predict = xgboost.predict(X_test)
# Get predicted probability
y_predict_prob = xgboost.predict_proba(X_test)[:,1]

In [None]:
# Get performance metrics
from sklearn.metrics import f1_score
precision, recall, fscore, support = score(y_test, y_predict)
# Print result
print(f'The precision value for the baseline xgboost model is {precision[1]:.4f}')
print(f'The recall value for the baseline xgboost model is {recall[1]:.4f}')
print(f'The fscore value for the baseline xgboost model is {fscore[1]:.4f}')
print("The fscore macro value for the baseline xgboost model is:",f1_score(y_test, y_predict, average='macro'))
print("The fscore micro value for the baseline xgboost model is:",f1_score(y_test, y_predict, average='micro'))
print("The fscore weighted value for the baseline xgboost model is: ",f1_score(y_test, y_predict, average='weighted'))
print("The accuracy value for the baseline xgboost model is:",accuracy_score(y_test,y_predict))

The precision value for the baseline xgboost model is 0.7673
The recall value for the baseline xgboost model is 0.8133
The fscore value for the baseline xgboost model is 0.7896
The fscore macro value for the baseline xgboost model is: 0.8768596087544812
The fscore micro value for the baseline xgboost model is: 0.8713503649635036
The fscore weighted value for the baseline xgboost model is:  0.8722472588429243
The accuracy value for the baseline xgboost model is: 0.8713503649635036


#### Grid Search for XGBoost

In [None]:
# Define the search space
param_grid = { 
    # Percentage of columns to be randomly samples for each tree.
    "colsample_bytree": [ 0.3, 0.5 , 0.8 ],
    # reg_alpha provides l1 regularization to the weight, higher values result in more conservative models
    "reg_alpha": [0, 0.5, 1, 5],
    # reg_lambda provides l2 regularization to the weight, higher values result in more conservative models
    "reg_lambda": [0, 0.5, 1, 5]
    }
# Set up score
scoring = ['f1_macro','f1_micro','f1_weighted','accuracy']
# Set up the k-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# import sklearn
# sklearn.metrics.get_scorer_names()


In [None]:
# Define grid search
grid_search = GridSearchCV(estimator=xgboost, 
                           param_grid=param_grid, 
                           scoring=scoring, 
                           refit='f1_weighted', 
                           n_jobs=-1, 
                           cv=kfold, 
                           verbose=0)
# Fit grid search
grid_result = grid_search.fit(X, y_encoded)


In [None]:
grid_result.cv_results_

{'mean_fit_time': array([0.97302284, 0.95096145, 0.90263715, 0.942594  , 0.9280859 ,
        1.01424813, 0.95111475, 0.98748593, 0.94464722, 0.96357255,
        0.94874182, 0.9821763 , 0.76973925, 0.77890596, 0.84913855,
        0.79097552, 1.07795796, 0.97087922, 1.01748338, 1.14681249,
        1.13030481, 1.14381442, 1.11083765, 1.17370019, 1.14115863,
        1.1536119 , 1.17536573, 1.20857272, 0.86070657, 0.87144823,
        0.87964473, 0.89160762, 1.38508315, 1.55087523, 1.41353326,
        1.4424118 , 1.83906012, 1.45427256, 1.51258545, 1.52197227,
        1.43077302, 1.43145781, 1.45665107, 1.57004886, 1.10247684,
        1.04773469, 1.04141965, 1.12536039]),
 'std_fit_time': array([0.01284497, 0.04444879, 0.00762625, 0.03075958, 0.03090666,
        0.08676605, 0.0589317 , 0.00514409, 0.03264918, 0.02509839,
        0.02915311, 0.00323127, 0.02626663, 0.09501085, 0.0594179 ,
        0.01115296, 0.01348092, 0.03119276, 0.02891454, 0.07170761,
        0.03503956, 0.08517076, 0.034

In [None]:
# Print grid search summary
grid_result
# Print the best score and the corresponding hyperparameters
print(f'The best score is {grid_result.best_score_:.4f}')
print('The best score standard deviation is', round(grid_result.cv_results_['std_test_accuracy'][grid_result.best_index_], 4))
print(f'The best hyperparameters are {grid_result.best_params_}')

The best score is 0.8676
The best score standard deviation is 0.0048
The best hyperparameters are {'colsample_bytree': 0.8, 'reg_alpha': 0, 'reg_lambda': 5}


https://medium.com/sfu-cspmp/xgboost-a-deep-dive-into-boosting-f06c9c41349
https://www.datacamp.com/tutorial/xgboost-in-python

### XGBoost Native

The two most popular classification objectives are:

binary:logistic - binary classification (the target contains only two classes, i.e., cat or dog)
multi:softprob - multi-class classification (more than two classes in the target, i.e., apple/orange/banana)


XGBoost comes with its own class for storing datasets called DMatrix. It is a highly optimized class for memory and speed. That's why converting datasets into this format is a requirement for the native XGBoost API:

In [None]:

# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [None]:
params = {"objective": "multi:softprob", "tree_method": "hist", "num_class": 8}
n = 1000

results = xgb.cv(
   params, dtrain_clf,
   num_boost_round=n,
   nfold=5,
   metrics=["mlogloss", "auc", "merror"],
)




In [None]:
results.keys()

Index(['train-mlogloss-mean', 'train-mlogloss-std', 'train-auc-mean',
       'train-auc-std', 'train-merror-mean', 'train-merror-std',
       'test-mlogloss-mean', 'test-mlogloss-std', 'test-auc-mean',
       'test-auc-std', 'test-merror-mean', 'test-merror-std'],
      dtype='object')

In [None]:
results['test-auc-mean'].mean()

0.9825630796269936

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3a3c41eb-c99a-4899-842d-6de17d10af25' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>