# Source
Adapted from the DataCamp XGBoost tutorial:
https://www.datacamp.com/tutorial/xgboost-in-python

In [1]:
import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Make the local helper packages importable.
# The directory is machine-specific, so it can be overridden with the
# MY_PACKAGES_DIR environment variable; the default is the original path.
import os
import sys

# Locations used on other machines (set MY_PACKAGES_DIR to one of these there):
#   F:\MAScThesis\code\my_packages
#   /home/yasamanparhizkar/Documents/thesis/code/my_packages
MY_PACKAGES_DIR = os.environ.get(
    'MY_PACKAGES_DIR',
    '/home/yasamanparhizkar/Documents/yorku/01_thesis/code/my_packages',
)

# Re-running this cell must not keep appending the same entry to sys.path.
if MY_PACKAGES_DIR not in sys.path:
    sys.path.append(MY_PACKAGES_DIR)

import data_handler_01 as dh
# Silences every warning for the rest of the session, deprecation notices included.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# Load the data

In [2]:
# Fetch seaborn's "diamonds" example table and preview its first rows
diamonds = sns.load_dataset(name="diamonds")
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Check what kind of object the loader returned
type(diamonds)

pandas.core.frame.DataFrame

In [4]:
# Plain-text dump of the frame (pandas abbreviates the middle rows)
print(diamonds)

       carat        cut color clarity  depth  table  price     x     y     z
0       0.23      Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1       0.21    Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2       0.23       Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3       0.29    Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4       0.31       Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
...      ...        ...   ...     ...    ...    ...    ...   ...   ...   ...
53935   0.72      Ideal     D     SI1   60.8   57.0   2757  5.75  5.76  3.50
53936   0.72       Good     D     SI1   63.1   55.0   2757  5.69  5.75  3.61
53937   0.70  Very Good     D     SI1   62.8   60.0   2757  5.66  5.68  3.56
53938   0.86    Premium     H     SI2   61.0   58.0   2757  6.15  6.12  3.74
53939   0.75      Ideal     D     SI2   62.2   55.0   2757  5.83  5.87  3.64

[53940 rows x 10 columns]


In [5]:
# (rows, columns) of the dataset
diamonds.shape

(53940, 10)

In [6]:
# Summary statistics restricted to the non-numeric columns
diamonds.describe(exclude=np.number)

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


# Regression

In [7]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the regression target (price)
X = diamonds.drop(columns='price')
y = diamonds[['price']]

In [8]:
# Names of every non-numeric predictor column
cats = list(X.select_dtypes(exclude=np.number).columns)

# Cast all of them to the pandas category dtype in a single call
X = X.astype({name: 'category' for name in cats})

In [9]:
X.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
y           float64
z           float64
dtype: object

In [10]:
# Partition into train and test sets; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [11]:
import xgboost as xgb

# Wrap each partition in an XGBoost DMatrix with categorical support switched on
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [12]:
# Booster settings: squared-error regression objective, histogram tree method
params = dict(objective="reg:squarederror", tree_method="hist")
n = 100  # number of boosting rounds

model = xgb.train(params, dtrain_reg, num_boost_round=n)

In [13]:
# Evaluation
from sklearn.metrics import mean_squared_error

# Score the test DMatrix with the booster trained in the previous cell
preds = model.predict(dtest_reg)

In [14]:
# Root-mean-squared error on the test partition.
# The root is taken explicitly: the `squared=False` switch of
# mean_squared_error is deprecated and rejected by recent scikit-learn
# releases, while this form works on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
print('RMSE of the base model: {:.3f}'.format(rmse))

RMSE of the base model: 545.388


In [15]:
# Datasets scored during training, each paired with the label used in the log
evals = list(zip((dtrain_reg, dtest_reg), ("train", "validation")))
n = 100
model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=10,  # log every tenth round
)

[0]	train-rmse:3985.31595	validation-rmse:3930.87087
[10]	train-rmse:557.19710	validation-rmse:591.03042
[20]	train-rmse:495.31647	validation-rmse:550.76666
[30]	train-rmse:467.13670	validation-rmse:547.16647
[40]	train-rmse:447.26879	validation-rmse:544.10422
[50]	train-rmse:432.51681	validation-rmse:543.97371
[60]	train-rmse:420.72943	validation-rmse:544.77874
[70]	train-rmse:408.72053	validation-rmse:544.77491
[80]	train-rmse:395.88816	validation-rmse:544.33808
[90]	train-rmse:383.62262	validation-rmse:545.99682
[99]	train-rmse:378.37454	validation-rmse:545.38842


In [49]:
n = 10000  # upper bound on boosting rounds; early stopping is expected to end training sooner

# Same training call as before, logging every 50 rounds and with a
# 50-round early-stopping patience
model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=50,
    early_stopping_rounds=50,
)

[0]	train-rmse:3985.31595	validation-rmse:3930.87087
[50]	train-rmse:432.51681	validation-rmse:543.97371
[87]	train-rmse:386.38896	validation-rmse:545.45681


In [50]:
# Report the best score and the round it occurred at, as stored on the booster
print('Best val. loss: ', model.best_score)
print('Best val. iteration: ', model.best_iteration)

Best val. loss:  543.496479339044
Best val. iteration:  38


In [51]:
# Predict using only the boosting rounds up to and including the best one
best_round_span = (0, model.best_iteration + 1)
preds = model.predict(dtest_reg, iteration_range=best_round_span)

In [52]:
# Validation RMSE of the predictions from the best iteration.
# The root is taken explicitly: the `squared=False` switch of
# mean_squared_error is deprecated and rejected by recent scikit-learn
# releases. float() keeps the displayed value a plain Python number.
rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
rmse

543.496479441815

In [None]:
# Early stopping through an explicit callback, which names both the
# evaluation set and the metric to monitor.
early_stopping_rounds = 50
early_stop = xgb.callback.EarlyStopping(
    rounds=early_stopping_rounds,
    metric_name='rmse',        # the metric reported in the training log
    data_name='validation',    # label given to the test DMatrix in `evals`
)
n = 10000

model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=50,
    # The callback alone controls early stopping. Passing
    # early_stopping_rounds as well would configure it a second time.
    callbacks=[early_stop],
)

In [17]:
# Cross-validation
# Five folds, up to 1000 rounds, with a 20-round early-stopping patience
params = dict(objective="reg:squarederror", tree_method="hist")
n = 1000

results = xgb.cv(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    nfold=5,
    early_stopping_rounds=20,
)

In [18]:
# First rows of the cross-validation summary table
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,3985.648654,10.343596,3986.913623,41.642778
1,2848.365726,8.014086,2851.020437,28.028733
2,2063.401458,4.637773,2068.629977,19.969459
3,1521.493751,3.874078,1530.496272,13.59233
4,1156.827103,2.991735,1170.413316,11.695597


In [19]:
# Lowest mean test-fold RMSE found in the cross-validation table
best_rmse = results['test-rmse-mean'].min()
best_rmse

550.7196748119261

# Classification

In [20]:
from sklearn.preprocessing import OrdinalEncoder

# Classification task: predict the cut grade from the remaining columns
X = diamonds.drop(columns="cut")
y = diamonds[['cut']]

# Turn the cut labels into numeric codes
y_encoded = OrdinalEncoder().fit_transform(y)

# Names of every non-numeric predictor column
cats = list(X.select_dtypes(exclude=np.number).columns)

# Cast all of them to the pandas category dtype in a single call
X = X.astype({name: 'category' for name in cats})

# Stratify on the encoded target; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=1,
    stratify=y_encoded,
)

In [21]:
# Wrap each classification partition in an XGBoost DMatrix with categorical support switched on
dtrain_clf = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [22]:
# Five-class soft-probability objective, cross-validated on three metrics.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt,
# so `results` may still hold the regression output — re-run before relying on it.
params = dict(objective="multi:softprob", tree_method="hist", num_class=5)
n = 1000

results = xgb.cv(
    params=params,
    dtrain=dtrain_clf,
    num_boost_round=n,
    nfold=5,
    metrics=["mlogloss", "auc", "merror"],
)

KeyboardInterrupt: 

In [None]:
# List the columns available in the cross-validation output
results.keys()

In [None]:
# Highest mean test-fold AUC found in the cross-validation table
results['test-auc-mean'].max()

# XGBoost Native vs. XGBoost Sklearn

In [None]:
import xgboost as xgb

# Train a model using the scikit-learn API.
# The target (cut) has five classes, so a multiclass objective is declared
# instead of 'binary:logistic'. learning_rate is the scikit-learn-style
# name for the native `eta` parameter.
xgb_classifier = xgb.XGBClassifier(
    n_estimators=100,
    objective='multi:softprob',
    tree_method='hist',
    learning_rate=0.1,
    max_depth=3,
    enable_categorical=True,
)
xgb_classifier.fit(X_train, y_train)


# Convert the model to a native API model
model = xgb_classifier.get_booster()