# Import the packages

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Import the dataframe

In [31]:
df: pd.DataFrame = pd.read_csv('Datasets/trainingdata.csv', sep=',')
df.head()

Unnamed: 0.1,Unnamed: 0,datum,startdatum,einddatum,leeftijd,dienstperiode,status_jaar,status_actief,status_beëindigd,reisafstand,uitdienst_reden_Niet van toepassing,uitdienst_reden_Ontslag,uitdienst_reden_Vertrek,aantal_geboortes
0,0,2018-12-31 05:06:00,2013-05-23 05:06:00,1905-01-01 05:06:00,29.0,,2018,1,0,53,True,False,False,78
1,1,2016-12-30 05:06:00,2010-04-24 05:06:00,1905-01-01 05:06:00,33.0,,2016,1,0,50,True,False,False,119
2,2,2018-12-31 05:06:00,2006-06-09 05:06:00,1905-01-01 05:06:00,42.0,,2018,1,0,53,True,False,False,109
3,3,2018-12-31 05:06:00,2018-04-28 05:06:00,1905-01-01 05:06:00,20.0,,2018,1,0,50,True,False,False,119
4,4,2013-12-31 05:06:00,2009-01-09 05:06:00,1905-01-01 05:06:00,32.0,,2013,1,0,115,True,False,False,102


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48849 entries, 0 to 48848
Data columns (total 14 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Unnamed: 0                           48849 non-null  int64  
 1   datum                                48849 non-null  object 
 2   startdatum                           48849 non-null  object 
 3   einddatum                            48849 non-null  object 
 4   leeftijd                             48849 non-null  float64
 5   dienstperiode                        0 non-null      float64
 6   status_jaar                          48849 non-null  int64  
 7   status_actief                        48849 non-null  int64  
 8   status_beëindigd                     48849 non-null  int64  
 9   reisafstand                          48849 non-null  int64  
 10  uitdienst_reden_Niet van toepassing  48849 non-null  bool   
 11  uitdienst_reden_Ontslag     

In [33]:
X, y = df.drop('uitdienst_reden_Vertrek', axis=1), df[['uitdienst_reden_Vertrek']]

In [34]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

In [35]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [37]:
# Define hyperparameters
params = {"objective": "reg:squarederror"}

n = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

In [38]:
from sklearn.metrics import mean_squared_error

preds = model.predict(dtest_reg)

In [39]:
rmse = mean_squared_error(y_test, preds, squared=False)

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 0.027


In [40]:
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
)

[0]	train-rmse:0.06193	validation-rmse:0.06522
[1]	train-rmse:0.04346	validation-rmse:0.04953
[2]	train-rmse:0.03051	validation-rmse:0.03960
[3]	train-rmse:0.02144	validation-rmse:0.03368
[4]	train-rmse:0.01508	validation-rmse:0.03035
[5]	train-rmse:0.01063	validation-rmse:0.02858
[6]	train-rmse:0.00750	validation-rmse:0.02767
[7]	train-rmse:0.00532	validation-rmse:0.02721
[8]	train-rmse:0.00378	validation-rmse:0.02698
[9]	train-rmse:0.00271	validation-rmse:0.02687
[10]	train-rmse:0.00196	validation-rmse:0.02682
[11]	train-rmse:0.00143	validation-rmse:0.02679
[12]	train-rmse:0.00106	validation-rmse:0.02678
[13]	train-rmse:0.00080	validation-rmse:0.02677
[14]	train-rmse:0.00061	validation-rmse:0.02677
[15]	train-rmse:0.00048	validation-rmse:0.02677
[16]	train-rmse:0.00038	validation-rmse:0.02677
[17]	train-rmse:0.00031	validation-rmse:0.02677
[18]	train-rmse:0.00025	validation-rmse:0.02677
[19]	train-rmse:0.00021	validation-rmse:0.02677
[20]	train-rmse:0.00017	validation-rmse:0.02677
[2