# Import the packages

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Import the dataframe

In [17]:
df: pd.DataFrame = pd.read_csv('../Datasets/trainingdata.csv', sep=',')
df.head()

Unnamed: 0.1,Unnamed: 0,datum,startdatum,einddatum,leeftijd,dienstperiode,status_jaar,status_actief,status_beëindigd,reisafstand,uitdienst_reden_Niet van toepassing,uitdienst_reden_Ontslag,uitdienst_reden_Vertrek,aantal_geboortes
0,0,2018-12-31 05:06:00,2013-05-23 05:06:00,1905-01-01 05:06:00,29.0,,2018,1,0,53,True,False,False,78
1,1,2016-12-30 05:06:00,2010-04-24 05:06:00,1905-01-01 05:06:00,33.0,,2016,1,0,50,True,False,False,119
2,2,2018-12-31 05:06:00,2006-06-09 05:06:00,1905-01-01 05:06:00,42.0,,2018,1,0,53,True,False,False,109
3,3,2018-12-31 05:06:00,2018-04-28 05:06:00,1905-01-01 05:06:00,20.0,,2018,1,0,50,True,False,False,119
4,4,2013-12-31 05:06:00,2009-01-09 05:06:00,1905-01-01 05:06:00,32.0,,2013,1,0,115,True,False,False,102


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48849 entries, 0 to 48848
Data columns (total 14 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Unnamed: 0                           48849 non-null  int64  
 1   datum                                48849 non-null  object 
 2   startdatum                           48849 non-null  object 
 3   einddatum                            48849 non-null  object 
 4   leeftijd                             48849 non-null  float64
 5   dienstperiode                        0 non-null      float64
 6   status_jaar                          48849 non-null  int64  
 7   status_actief                        48849 non-null  int64  
 8   status_beëindigd                     48849 non-null  int64  
 9   reisafstand                          48849 non-null  int64  
 10  uitdienst_reden_Niet van toepassing  48849 non-null  bool   
 11  uitdienst_reden_Ontslag     

In [19]:
X, y = df.drop('reisafstand', axis=1), df[['reisafstand']]

In [20]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

In [21]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [23]:
# Define hyperparameters
params = {
    "objective": "reg:squarederror",
    "eta": 0.05,  # Learning rate - controls the step size during training
    "max_depth": 8,  # Maximum depth of a tree - controls the complexity of the trees
    "min_child_weight": 5,  # Minimum sum of instance weight needed in a child
    "gamma": 0.1,  # Minimum loss reduction required to make a further partition on a leaf node
    "subsample": 0.8,  # Subsample ratio of the training instances
    "colsample_bytree": 0.8,  # Subsample ratio of columns when constructing each tree
    "reg_alpha": 0.001,  # L1 regularization term on weights
    "reg_lambda": 1.0,  # L2 regularization term on weights
    "n_estimators": 1000,  # Number of boosting rounds or trees
    "early_stopping_rounds": 50,  # Stop training if no improvement in the validation metric after this number of rounds
    "eval_metric": "rmse"  # Evaluation metric to use
}

# Training the XGBoost model
num_boost_round = 1000
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=num_boost_round,
    evals=[(dtrain_reg, "Train"), (dtest_reg, "Validation")],  # Adding a validation set for monitoring
    early_stopping_rounds=params["early_stopping_rounds"],  # Early stopping based on the validation set
    verbose_eval=50  # Display evaluation results every 50 rounds
)

[0]	Train-rmse:47.23732	Validation-rmse:46.70953


Parameters: { "early_stopping_rounds", "n_estimators" } are not used.



[50]	Train-rmse:11.42334	Validation-rmse:12.36796
[100]	Train-rmse:7.81652	Validation-rmse:9.19074
[150]	Train-rmse:6.71416	Validation-rmse:8.64441
[200]	Train-rmse:6.08314	Validation-rmse:8.54472
[250]	Train-rmse:5.57302	Validation-rmse:8.51952
[293]	Train-rmse:5.16624	Validation-rmse:8.55337


In [24]:
from sklearn.metrics import mean_squared_error

preds = model.predict(dtest_reg)

In [25]:
rmse = mean_squared_error(y_test, preds, squared=False)

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 8.555
