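# Decision Tree Regression

This notebook fits a `DecisionTreeRegressor` to predict the `On` column from the cleaned train/test CSVs: first a baseline tree using the Poisson criterion, then a tree whose hyperparameters are chosen by a 5-fold `GridSearchCV`. In the recorded run the baseline reaches a test R² of about 0.38 (train 0.99), while the tuned tree (`squared_error`, `max_depth=20`, `min_samples_leaf=7`, `min_samples_split=14`) reaches a test R² of about 0.64.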

## Import packages

In [1]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    root_mean_squared_error,
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    make_scorer,
)
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
# Relative folder the train/test CSVs are read from.
CLEAN_DATA_FOLDER = "clean_data"
# Relative folder the pickled models are written to.
MODELS_FOLDER = "models"

## Load the train and test dataframes

In [3]:
# Read both splits from the clean-data folder. Only the test split is sorted,
# by the key columns below; its original row index labels are kept as-is.
train_csv_path = os.path.join(CLEAN_DATA_FOLDER, "train.csv")
test_csv_path = os.path.join(CLEAN_DATA_FOLDER, "test.csv")
test_sort_keys = ["Day", "Line", "Service", "Direction Number", "Sequence"]

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path).sort_values(test_sort_keys)

## Split into X and y

In [4]:
# "On" is the regression target. "Off" is excluded from the features as well
# (presumably the companion count that would not be known at prediction time —
# TODO confirm).
non_feature_cols = ["On", "Off"]

# Derive the feature list once, from the training frame, and reuse it for the
# test frame. This guarantees both matrices expose the same columns in the same
# order; a missing test column fails here with a KeyError instead of surfacing
# later at predict time.
feature_cols = [col for col in train_df.columns if col not in non_feature_cols]

train_X = train_df[feature_cols]
train_y = train_df["On"]
test_X = test_df[feature_cols]
test_y = test_df["On"]

## Train the Decision Tree Regressor Model

In [5]:
# Baseline model: a tree using the Poisson split criterion with a fixed seed.
# `fit` returns the estimator itself, so construction and fitting are chained.
baseline_params = {"criterion": "poisson", "random_state": 42}
dt = DecisionTreeRegressor(**baseline_params).fit(train_X, train_y)

In [6]:
# Predict on both splits and truncate each prediction down to a whole number.
# NOTE(review): floor always rounds down; confirm that truncation (rather than
# rounding to nearest) is the intended way to turn predictions into counts.
train_y_pred, test_y_pred = (
    np.floor(dt.predict(features)).astype(int) for features in (train_X, test_X)
)

## Plot the Tree

In [7]:
# NOTE(review): disabled cell. Re-enabling it requires `plot_tree`
# (sklearn.tree) and `matplotlib.pyplot as plt`, neither of which is imported
# in the notebook's import cell.
# _, ax = plt.subplots(1, 1, figsize=(32, 14))
# _ = plot_tree(
#     dt,
#     max_depth=4,
#     feature_names=train_X.columns,
#     filled=True,
#     proportion=True,
#     rounded=True,
#     precision=2,
#     fontsize=9,
#     ax=ax,
# )

## Feature Importance

In [8]:
# NOTE(review): disabled cell. Re-enabling it requires `matplotlib.pyplot as plt`
# and `seaborn as sns`, neither of which is imported in the notebook's import cell.
# feat_imp = pd.DataFrame(
#     {
#         "Feature": [x for x in dt.feature_names_in_],
#         "Importance": [x for x in dt.feature_importances_],
#     }
# )
# _, ax = plt.subplots(1, 1, figsize=(16, 9))
# _ = sns.barplot(feat_imp, x="Feature", y="Importance")
# _ = plt.title("Feature Importance for Decision Tree Regressor")

## Visualize the Predictions

In [9]:
# NOTE(review): disabled cell. Re-enabling it requires `matplotlib.pyplot as plt`
# and `seaborn as sns`, neither of which is imported in the notebook's import cell.
# NOTE(review): `test_y` is a Series with its own index while `test_y_pred` is an
# array; passing index=test_df["Day"] likely re-aligns the Series by label and
# misplaces the "True" values — confirm (e.g. use test_y.to_numpy()) before re-enabling.
# line_fit = pd.DataFrame({"True": test_y, "Predicted": test_y_pred}, index=test_df["Day"])
# _, ax = plt.subplots(1, 1, figsize=(16,9))
# _ = sns.lineplot(line_fit, legend=True, ax=ax)

## Report Train and Test results

In [10]:
# Score the training predictions against the true "On" values.
for label, metric in (
    ("train rmse:", root_mean_squared_error),
    ("train mae:", mean_absolute_error),
    ("train r2 score:", r2_score),
):
    print(label, metric(train_y, train_y_pred))

train rmse: 4.00479883337299
train mae: 0.26889678665473526
train r2 score: 0.9934890118967664


In [11]:
# Score the held-out test predictions against the true "On" values.
for label, metric in (
    ("test rmse:", root_mean_squared_error),
    ("test mae:", mean_absolute_error),
    ("test r2 score:", r2_score),
):
    print(label, metric(test_y, test_y_pred))

test rmse: 38.07236030675256
test mae: 10.010802185075336
test r2 score: 0.3786645178275053


## Export Model

In [12]:
# Persist the fitted baseline tree.
# The output folder is created if it is missing, and the file is opened in a
# context manager so the handle is flushed and closed even if pickling fails
# (the previous bare `open(...)` was never closed).
os.makedirs(MODELS_FOLDER, exist_ok=True)
with open(os.path.join(MODELS_FOLDER, "base_decision_tree.pkl"), "wb") as model_file:
    pickle.dump(dt, model_file)

## Hyperparameter Tuning with GridSearchCV

### Declare base model and parameters

In [13]:
# Unfitted estimator that the grid search clones for every candidate.
base_dt = DecisionTreeRegressor(random_state=42)

# Both sub-grids sweep the same criteria and depths; they differ only in the
# (min_samples_split, min_samples_leaf) pair, which is varied jointly rather
# than as a full cross product.
criteria = ["squared_error", "friedman_mse", "poisson"]
depths = [20, 40, None]
split_leaf_pairs = [(2, 1), (14, 7)]

param_grid = [
    {
        "criterion": criteria,
        "max_depth": depths,
        "min_samples_split": [min_split],
        "min_samples_leaf": [min_leaf],
    }
    for min_split, min_leaf in split_leaf_pairs
]

### Declare the scorer and grid search

In [14]:
# greater_is_better=False makes the scorer report negated MSE, so the grid
# search (which maximises its score) ends up preferring the lowest error.
scorer = make_scorer(mean_squared_error, greater_is_better=False)
grid_search = GridSearchCV(
    estimator=base_dt,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

### Train the models

In [15]:
# Run the cross-validated search on the training split only.
# This is the expensive cell: the recorded run reports 90 fits at roughly
# 30-45 s each in the visible log lines.
grid_search.fit(train_X, train_y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END criterion=squared_error, max_depth=20, min_samples_leaf=1, min_samples_split=2; total time=  31.6s
[CV] END criterion=squared_error, max_depth=20, min_samples_leaf=1, min_samples_split=2; total time=  32.0s
[CV] END criterion=squared_error, max_depth=20, min_samples_leaf=1, min_samples_split=2; total time=  32.7s
[CV] END criterion=squared_error, max_depth=20, min_samples_leaf=1, min_samples_split=2; total time=  33.2s
[CV] END criterion=squared_error, max_depth=20, min_samples_leaf=1, min_samples_split=2; total time=  33.0s
[CV] END criterion=squared_error, max_depth=40, min_samples_leaf=1, min_samples_split=2; total time=  43.1s
[CV] END criterion=squared_error, max_depth=40, min_samples_leaf=1, min_samples_split=2; total time=  43.4s
[CV] END criterion=squared_error, max_depth=40, min_samples_leaf=1, min_samples_split=2; total time=  43.4s
[CV] END criterion=squared_error, max_depth=40, min_samples_leaf=1, min_sam

In [16]:
# Tabulate the per-candidate CV results. Scores are negated MSE (see the
# scorer), so values closer to zero are better.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,32.018134,0.536445,0.483637,0.08658,squared_error,20.0,1,2,"{'criterion': 'squared_error', 'max_depth': 20...",-849.746842,-897.279619,-922.608781,-900.47419,-910.906547,-896.203196,24.863774,12
1,41.51636,0.168588,1.853322,0.021912,squared_error,40.0,1,2,"{'criterion': 'squared_error', 'max_depth': 40...",-1076.826382,-1108.697054,-1149.469039,-1124.072361,-1130.517218,-1117.916411,24.354184,16
2,40.76448,0.538411,2.256078,0.087899,squared_error,,1,2,"{'criterion': 'squared_error', 'max_depth': No...",-1082.055098,-1115.886084,-1160.678629,-1118.041001,-1118.343889,-1119.00094,24.952277,18
3,30.843978,0.3972,0.536588,0.079155,friedman_mse,20.0,1,2,"{'criterion': 'friedman_mse', 'max_depth': 20,...",-856.303037,-891.641692,-919.099496,-899.299062,-911.612559,-895.591169,21.827645,11
4,39.615143,0.218012,2.097597,0.149558,friedman_mse,40.0,1,2,"{'criterion': 'friedman_mse', 'max_depth': 40,...",-1068.919467,-1116.436217,-1152.580107,-1120.96549,-1112.909827,-1114.362222,26.74197,13
5,39.298535,0.652641,2.387165,0.055826,friedman_mse,,1,2,"{'criterion': 'friedman_mse', 'max_depth': Non...",-1069.740935,-1116.462462,-1145.909478,-1138.94634,-1121.366826,-1118.485208,26.680135,17
6,32.240023,0.535084,0.436688,0.086919,poisson,20.0,1,2,"{'criterion': 'poisson', 'max_depth': 20, 'min...",-865.902555,-873.77959,-884.301041,-915.546982,-823.533831,-872.6128,29.782857,10
7,41.595801,0.540971,2.027079,0.103082,poisson,40.0,1,2,"{'criterion': 'poisson', 'max_depth': 40, 'min...",-1086.299834,-1116.589406,-1153.562669,-1156.578661,-1073.230896,-1117.252293,33.945203,14
8,40.216851,0.637977,2.113874,0.113727,poisson,,1,2,"{'criterion': 'poisson', 'max_depth': None, 'm...",-1082.95238,-1115.270305,-1142.736793,-1159.054469,-1089.392089,-1117.881207,29.500872,15
9,27.588221,0.359044,0.310576,0.024659,squared_error,20.0,7,14,"{'criterion': 'squared_error', 'max_depth': 20...",-717.600006,-728.163741,-735.43405,-715.6265,-712.030222,-721.770904,8.690004,1


In [17]:
# Show the parameter combination with the best mean CV score.
print(grid_search.best_params_)

{'criterion': 'squared_error', 'max_depth': 20, 'min_samples_leaf': 7, 'min_samples_split': 14}


### Extract the best model

In [18]:
# Estimator exposed by the search for the best parameter combination.
best_dt = grid_search.best_estimator_

In [26]:
# Re-predict both splits with the tuned tree, truncating down to whole numbers.
# This rebinds the prediction variables used by the reporting cells that follow.
# NOTE(review): floor always rounds down; confirm that truncation (rather than
# rounding to nearest) is the intended way to turn predictions into counts.
train_y_pred, test_y_pred = (
    np.floor(best_dt.predict(features)).astype(int) for features in (train_X, test_X)
)

### Plot the Tree

In [27]:
# NOTE(review): disabled cell. Re-enabling it requires `plot_tree`
# (sklearn.tree) and `matplotlib.pyplot as plt`, neither of which is imported
# in the notebook's import cell.
# _, ax = plt.subplots(1, 1, figsize=(32, 14))
# _ = plot_tree(
#     best_dt,
#     max_depth=4,
#     feature_names=train_X.columns,
#     filled=True,
#     proportion=True,
#     rounded=True,
#     precision=2,
#     fontsize=9,
#     ax=ax,
# )

### Feature Importance

In [28]:
# NOTE(review): disabled cell. Re-enabling it requires `matplotlib.pyplot as plt`
# and `seaborn as sns`, neither of which is imported in the notebook's import cell.
# feat_imp = pd.DataFrame(
#     {
#         "Feature": [x for x in best_dt.feature_names_in_],
#         "Importance": [x for x in best_dt.feature_importances_],
#     }
# )
# _, ax = plt.subplots(1, 1, figsize=(16, 9))
# _ = sns.barplot(feat_imp, x="Feature", y="Importance")
# _ = plt.title("Feature Importance for Best Decision Tree Regressor")

### Visualize the Predictions

In [29]:
# NOTE(review): disabled cell. Re-enabling it requires `matplotlib.pyplot as plt`
# and `seaborn as sns`, neither of which is imported in the notebook's import cell.
# NOTE(review): `test_y` is a Series with its own index while `test_y_pred` is an
# array; passing index=test_df["Day"] likely re-aligns the Series by label and
# misplaces the "True" values — confirm (e.g. use test_y.to_numpy()) before re-enabling.
# line_fit = pd.DataFrame({"True": test_y, "Predicted": test_y_pred}, index=test_df["Day"])
# _, ax = plt.subplots(1, 1, figsize=(16,9))
# _ = sns.lineplot(line_fit, legend=True, ax=ax)

### Report Train and Test results

In [30]:
# Score the training predictions against the true "On" values.
for label, metric in (
    ("train rmse:", root_mean_squared_error),
    ("train mae:", mean_absolute_error),
    ("train r2 score:", r2_score),
):
    print(label, metric(train_y, train_y_pred))

train rmse: 21.51692017420295
train mae: 6.321249553088095
train r2 score: 0.812048538264482


In [31]:
# Score the held-out test predictions against the true "On" values.
for label, metric in (
    ("test rmse:", root_mean_squared_error),
    ("test mae:", mean_absolute_error),
    ("test r2 score:", r2_score),
):
    print(label, metric(test_y, test_y_pred))

test rmse: 28.99371217484579
test mae: 7.75416613247825
test r2 score: 0.6396585696426409


### Export the tuned model

In [32]:
# Persist the tuned tree selected by the grid search.
# The output folder is created if it is missing, and the file is opened in a
# context manager so the handle is flushed and closed even if pickling fails
# (the previous bare `open(...)` was never closed).
os.makedirs(MODELS_FOLDER, exist_ok=True)
with open(os.path.join(MODELS_FOLDER, "tuned_decision_tree.pkl"), "wb") as model_file:
    pickle.dump(best_dt, model_file)