# XGBoost Unveiled

## Building XGBoost models

In [3]:
import pandas as pd
import numpy as np
from sklearn import datasets
iris = datasets.load_iris()

In [4]:
df = pd.DataFrame(
    data=np.c_[iris["data"], iris["target"]], columns=iris["feature_names"] + ["target"]
)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris['data'], iris['target'], random_state=2)

In [6]:
# Import XGBClassifier and accuracy_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Initialize the XGBClassifier model with specific hyperparameters (to be discussed in the next chapter)
xgb = XGBClassifier(
    booster="gbtree",
    objective="multi:softprob",
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    random_state=2,
    n_jobs=-1,
)

In [7]:
# Fit the classifier to the data
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
score = accuracy_score(y_test, y_pred)
print('Score: ', score)

Score:  0.9736842105263158


In [8]:
# Import the diabetes dataset
X,y = datasets.load_diabetes(return_X_y=True)

In [10]:
# Import XGBRegressor and cross_val_score
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

# Initialize the XGBRegressor model
xgb = XGBRegressor(
    booster="gbtree",
    objective="reg:squarederror",
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    random_state=2,
    n_jobs=-1,
)

# Fit and score the regressor with cross_val_score
scores = cross_val_score(xgb, X, y, cv=5, scoring="neg_mean_squared_error")

# Display the results
rmse = np.sqrt(-scores)
print('RMSE: ', np.round(rmse, 3))
print('RMSE mean: ', np.round(rmse.mean(), 3))

RMSE:  [59.397 60.322 69.036 63.211 66.953]
RMSE mean:  63.784


In [11]:
# Convert the target column to a DataFrame and use the .describe() method to understand the results
pd.DataFrame(y).describe()

Unnamed: 0,0
count,442.0
mean,152.133484
std,77.093005
min,25.0
25%,87.0
50%,140.5
75%,211.5
max,346.0


## Finding the Higgs Boson with XGBoost

In [38]:
df = pd.read_csv('atlas-higgs-challenge-2014-v2.csv.gz', nrows=250000, compression='gzip')
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label,KaggleSet,KaggleWeight
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2.15,0.444,46.062,1.24,-2.475,113.497,0.000814,s,t,0.002653
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,0.725,1.158,-999.0,-999.0,-999.0,46.226,0.681042,b,t,2.233584
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,2.053,-2.028,-999.0,-999.0,-999.0,44.251,0.715742,b,t,2.347389
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,1.660654,b,t,5.446378
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,1.904263,b,t,6.245333


In [39]:
df = df.drop(columns=["Weight", "KaggleSet"])
df = df.rename(columns={"KaggleWeight": "Weight"})

In [40]:
# Move the 'Label' column to the end of the DataFrame
label_col = df['Label']
del df['Label']
df['Label'] = label_col

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 33 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   EventId                      250000 non-null  int64  
 1   DER_mass_MMC                 250000 non-null  float64
 2   DER_mass_transverse_met_lep  250000 non-null  float64
 3   DER_mass_vis                 250000 non-null  float64
 4   DER_pt_h                     250000 non-null  float64
 5   DER_deltaeta_jet_jet         250000 non-null  float64
 6   DER_mass_jet_jet             250000 non-null  float64
 7   DER_prodeta_jet_jet          250000 non-null  float64
 8   DER_deltar_tau_lep           250000 non-null  float64
 9   DER_pt_tot                   250000 non-null  float64
 10  DER_sum_pt                   250000 non-null  float64
 11  DER_pt_ratio_lep_tau         250000 non-null  float64
 12  DER_met_phi_centrality       250000 non-null  float64
 13 

In [42]:
# Relabel the 'Label' column
df['Label'] = df['Label'].map({"s": 1, "b": 0})

# Split the DataFrame into X and y
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [43]:
# Understanding the Weight column
df['test_Weight'] = df['Weight'] * 5500000 / len(y)

# Compute the scaling factor
s = np.sum(df[df['Label'] == 1]['test_Weight']) 
b = np.sum(df[df['Label'] == 0]['test_Weight'])
b/s

593.9401931492318

In [51]:
# Build the model
import xgboost as xgb

# Initialize the model as a DMatrix and specify values that denote missing values
xgb_clf = xgb.DMatrix(X, y, missing=-999.0, weight=df["test_Weight"])


# Set additional hyperparameters
param = {}
param["objective"] = "binary:logitraw"
param["scale_pos_weight"] = b / s
param["eta"] = 0.1
param["max_depth"] = 6
param["eval_metric"] = "auc"

# Create a list of parameters including the preceding items, the evaluation metric, and the AMS.
plst = list(param.items()) + [("eval_metric", "ams@0.15")]

# Create a watchlist to keep track of the performance on the test set
watchlist = [(xgb_clf, "train")]

# Set the number of boosting rounds to 120
num_round = 120

# Train and save the model
print("Loading data end, start to boost trees")
bst = xgb.train(plst, xgb_clf, num_round, watchlist)
bst.save_model("higgs.model")
print("Finish training")

Loading data end, start to boost trees
[0]	train-auc:0.99971	train-ams@0.15:128.20801
[1]	train-auc:0.99982	train-ams@0.15:162.31381
[2]	train-auc:0.99982	train-ams@0.15:162.58270
[3]	train-auc:0.99982	train-ams@0.15:163.14288




[4]	train-auc:0.99982	train-ams@0.15:163.40131
[5]	train-auc:0.99982	train-ams@0.15:164.05534
[6]	train-auc:0.99982	train-ams@0.15:164.47148
[7]	train-auc:0.99982	train-ams@0.15:164.59755
[8]	train-auc:0.99982	train-ams@0.15:164.47482
[9]	train-auc:0.99983	train-ams@0.15:165.05211
[10]	train-auc:0.99983	train-ams@0.15:164.49304
[11]	train-auc:0.99983	train-ams@0.15:164.92082
[12]	train-auc:0.99983	train-ams@0.15:165.23534
[13]	train-auc:0.99983	train-ams@0.15:165.23524
[14]	train-auc:0.99983	train-ams@0.15:165.53572
[15]	train-auc:0.99983	train-ams@0.15:165.66557
[16]	train-auc:0.99983	train-ams@0.15:167.02490
[17]	train-auc:0.99983	train-ams@0.15:166.95227
[18]	train-auc:0.99983	train-ams@0.15:166.21353
[19]	train-auc:0.99983	train-ams@0.15:166.50726
[20]	train-auc:0.99984	train-ams@0.15:166.57159
[21]	train-auc:0.99984	train-ams@0.15:166.63289
[22]	train-auc:0.99984	train-ams@0.15:167.58266
[23]	train-auc:0.99984	train-ams@0.15:166.88652
[24]	train-auc:0.99984	train-ams@0.15:167.7085

