In [206]:
import xgboost as xgb 

from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report

In [207]:
# Generate synthetic data

X,y = make_classification(
    n_samples= 1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=42
)

In [208]:
# Split the data

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [209]:
# Define parameters for the XGBoost model
params = {
    'objective': 'binary:logistic',      # Binary classification
    'eval_metric': 'logloss',            # Evaluation metric
    'num_boost_round' : 200,             # number of boostings (trees)
    'learning_rate' : 0.1,               # Step size shrinkage
    'max_depth' : 3,                     # Depth of each tree
    'subsample' : 0.8,                   # Fraction of samples used by tree
    'colsample_bytree' : 0.8,            # Fraction of features used by tree
    'random_state' : 42
}

In [210]:
# Convert to DMatrix

d_train = xgb.DMatrix(X_train, label=y_train)

d_test = xgb.DMatrix(X_test, label=y_test)

In [211]:
# Set up early stopping and validation

evals = [(d_test, 'eval'), (d_train, 'train')]

In [212]:
# Extract num_boost_round from params for clarity
num_boost_round = params.pop('num_boost_round')

# Train the model with early stopping
bst = xgb.train(
    params,
    d_train,
    num_boost_round = num_boost_round,  # Set a high number to allow early stopping
    evals=evals,
    early_stopping_rounds=10,  # Stop training if no improvement after 10 rounds
    verbose_eval=True
)

[0]	eval-logloss:0.66222	train-logloss:0.65630
[1]	eval-logloss:0.63606	train-logloss:0.62371
[2]	eval-logloss:0.61934	train-logloss:0.59552
[3]	eval-logloss:0.60349	train-logloss:0.57184
[4]	eval-logloss:0.58581	train-logloss:0.55054
[5]	eval-logloss:0.57287	train-logloss:0.53312
[6]	eval-logloss:0.55427	train-logloss:0.51397
[7]	eval-logloss:0.54133	train-logloss:0.49474
[8]	eval-logloss:0.53063	train-logloss:0.47797
[9]	eval-logloss:0.51758	train-logloss:0.46147
[10]	eval-logloss:0.50393	train-logloss:0.44618
[11]	eval-logloss:0.49440	train-logloss:0.43240
[12]	eval-logloss:0.48383	train-logloss:0.41847
[13]	eval-logloss:0.47076	train-logloss:0.40622
[14]	eval-logloss:0.46061	train-logloss:0.39685
[15]	eval-logloss:0.45235	train-logloss:0.38514
[16]	eval-logloss:0.44622	train-logloss:0.37545
[17]	eval-logloss:0.43829	train-logloss:0.36624
[18]	eval-logloss:0.43053	train-logloss:0.35843
[19]	eval-logloss:0.42350	train-logloss:0.35055
[20]	eval-logloss:0.41865	train-logloss:0.34475
[2

In [213]:
# Make predictions

y_pred = bst.predict(d_test)

In [214]:
# Evaluate the model

# Extract the true labels from DMatrix
y_true = d_test.get_label()


# Convert predicted probabilities to binary values (e.g., 1 if probability > 0.5)
y_pred_binary = (y_pred > 0.5).astype(int)


accuracy = accuracy_score(y_true,y_pred_binary)

report = classification_report(y_true,y_pred_binary)

print ( f"Accuracy Score: {accuracy}\n")

print ( f"Classification report:  \n")

print (f" {report}")


Accuracy Score: 0.9033333333333333

Classification report:  

               precision    recall  f1-score   support

         0.0       0.92      0.90      0.91       160
         1.0       0.89      0.91      0.90       140

    accuracy                           0.90       300
   macro avg       0.90      0.90      0.90       300
weighted avg       0.90      0.90      0.90       300

