# Import the packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Import the dataframe

In [2]:
df: pd.DataFrame = pd.read_csv('Datasets/trainingdata.csv', sep=',')
df.head()

Unnamed: 0.1,Unnamed: 0,datum,startdatum,einddatum,leeftijd,dienstperiode,status_jaar,status_actief,status_beëindigd,reisafstand,uitdienst_reden_Niet van toepassing,uitdienst_reden_Ontslag,uitdienst_reden_Vertrek,aantal_geboortes
0,0,2018-12-31 05:06:00,2013-05-23 05:06:00,1905-01-01 05:06:00,29.0,,2018,1,0,53,True,False,False,78
1,1,2016-12-30 05:06:00,2010-04-24 05:06:00,1905-01-01 05:06:00,33.0,,2016,1,0,50,True,False,False,119
2,2,2018-12-31 05:06:00,2006-06-09 05:06:00,1905-01-01 05:06:00,42.0,,2018,1,0,53,True,False,False,109
3,3,2018-12-31 05:06:00,2018-04-28 05:06:00,1905-01-01 05:06:00,20.0,,2018,1,0,50,True,False,False,119
4,4,2013-12-31 05:06:00,2009-01-09 05:06:00,1905-01-01 05:06:00,32.0,,2013,1,0,115,True,False,False,102


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48849 entries, 0 to 48848
Data columns (total 14 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Unnamed: 0                           48849 non-null  int64  
 1   datum                                48849 non-null  object 
 2   startdatum                           48849 non-null  object 
 3   einddatum                            48849 non-null  object 
 4   leeftijd                             48849 non-null  float64
 5   dienstperiode                        0 non-null      float64
 6   status_jaar                          48849 non-null  int64  
 7   status_actief                        48849 non-null  int64  
 8   status_beëindigd                     48849 non-null  int64  
 9   reisafstand                          48849 non-null  int64  
 10  uitdienst_reden_Niet van toepassing  48849 non-null  bool   
 11  uitdienst_reden_Ontslag     

In [4]:
X, y = df.drop('reisafstand', axis=1), df[['reisafstand']]

In [5]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

In [6]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [8]:
# Define hyperparameters
params = {"objective": "reg:squarederror"}

n = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

In [9]:
from sklearn.metrics import mean_squared_error

preds = model.predict(dtest_reg)

In [10]:
rmse = mean_squared_error(y_test, preds, squared=False)

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 8.438


In [11]:
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
)

[0]	train-rmse:35.93639	validation-rmse:35.71713
[1]	train-rmse:26.71746	validation-rmse:26.62477
[2]	train-rmse:20.41243	validation-rmse:20.50767
[3]	train-rmse:16.11431	validation-rmse:16.39603
[4]	train-rmse:13.22708	validation-rmse:13.66501
[5]	train-rmse:11.40807	validation-rmse:12.03329
[6]	train-rmse:10.16472	validation-rmse:10.98858
[7]	train-rmse:9.20774	validation-rmse:10.17936
[8]	train-rmse:8.64267	validation-rmse:9.72361
[9]	train-rmse:8.11072	validation-rmse:9.32167
[10]	train-rmse:7.77966	validation-rmse:9.05325
[11]	train-rmse:7.40199	validation-rmse:8.82076
[12]	train-rmse:7.03742	validation-rmse:8.59509
[13]	train-rmse:6.70820	validation-rmse:8.42276
[14]	train-rmse:6.48779	validation-rmse:8.26638
[15]	train-rmse:6.29428	validation-rmse:8.15998
[16]	train-rmse:6.12948	validation-rmse:8.08864
[17]	train-rmse:5.96593	validation-rmse:8.02692
[18]	train-rmse:5.86287	validation-rmse:7.98895
[19]	train-rmse:5.74626	validation-rmse:7.97294
[20]	train-rmse:5.66766	validation-