# Decision Tree Regression

## Import packages

In [1]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    root_mean_squared_error,
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    make_scorer,
)
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
# Folder locations used throughout the notebook (relative to the working directory).
MODELS_FOLDER = "models"  # pickled estimators are written here
CLEAN_DATA_FOLDER = "clean_data"  # train/test CSVs are read from here

## Load the dataframe

In [3]:
# Resolve both CSV locations up front so the read calls stay short.
train_path = os.path.join(CLEAN_DATA_FOLDER, "train_wo_weather.csv")
test_path = os.path.join(CLEAN_DATA_FOLDER, "test_wo_weather.csv")

# Only the test frame is re-ordered; the training frame keeps its file order.
test_sort_keys = ["Day", "Line", "Service", "Direction Number", "Sequence"]

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path).sort_values(by=test_sort_keys)

## Split into X and y

In [4]:
# "On" is the regression target; "Off" is kept out of the feature matrix as well.
excluded_cols = ["On", "Off"]

train_feature_cols = [col for col in train_df.columns if col not in excluded_cols]
test_feature_cols = [col for col in test_df.columns if col not in excluded_cols]

train_X = train_df.loc[:, train_feature_cols]
train_y = train_df.loc[:, "On"]
test_X = test_df.loc[:, test_feature_cols]
test_y = test_df.loc[:, "On"]

## Train the Decision Tree Regressor Model

In [5]:
# Baseline: an unconstrained tree split on the Poisson criterion.
# `fit` returns the estimator itself, so construction and fitting can be chained.
dt = DecisionTreeRegressor(criterion="poisson", random_state=42).fit(X=train_X, y=train_y)

In [6]:
# Convert the tree's float predictions to whole numbers by rounding down.
# NOTE(review): floor always rounds toward the lower integer, which may bias
# predictions low compared with np.rint — confirm this is intended.
train_y_pred, test_y_pred = (
    np.floor(dt.predict(features)).astype(int) for features in (train_X, test_X)
)

## Plot the Tree

In [7]:
# Disabled cell: draws the top 4 levels of the baseline tree `dt`.
# Re-enabling it requires `plt` and `plot_tree`, neither of which is imported
# in this notebook's import cell.
# _, ax = plt.subplots(1, 1, figsize=(32, 14))
# _ = plot_tree(
#     dt,
#     max_depth=4,
#     feature_names=train_X.columns,
#     filled=True,
#     proportion=True,
#     rounded=True,
#     precision=2,
#     fontsize=9,
#     ax=ax,
# )

## Feature Importance

In [8]:
# Disabled cell: bar chart of the baseline tree's feature importances.
# Re-enabling it requires `plt` and `sns`, neither of which is imported
# in this notebook's import cell.
# feat_imp = pd.DataFrame(
#     {
#         "Feature": [x for x in dt.feature_names_in_],
#         "Importance": [x for x in dt.feature_importances_],
#     }
# )
# _, ax = plt.subplots(1, 1, figsize=(16, 9))
# _ = sns.barplot(feat_imp, x="Feature", y="Importance")
# _ = plt.title("Feature Importance for Decision Tree Regressor")

## Visualize the Predictions

In [9]:
# Disabled cell: line plot of true vs. predicted test values indexed by "Day".
# Re-enabling it requires `plt` and `sns`, neither of which is imported
# in this notebook's import cell.
# line_fit = pd.DataFrame({"True": test_y, "Predicted": test_y_pred}, index=test_df["Day"])
# _, ax = plt.subplots(1, 1, figsize=(16,9))
# _ = sns.lineplot(line_fit, legend=True, ax=ax)

## Report Train and Test results

In [10]:
# Training-set metrics for the baseline tree, one line per metric.
train_report = (
    ("train rmse:", root_mean_squared_error),
    ("train mae:", mean_absolute_error),
    ("train r2 score:", r2_score),
)
for label, metric_fn in train_report:
    print(label, metric_fn(train_y, train_y_pred))

train rmse: 16.310750216199562
train mae: 3.386988032495105
train r2 score: 0.8919976104787393


In [11]:
# Test-set metrics for the baseline tree, one line per metric.
test_report = (
    ("test rmse:", root_mean_squared_error),
    ("test mae:", mean_absolute_error),
    ("test r2 score:", r2_score),
)
for label, metric_fn in test_report:
    print(label, metric_fn(test_y, test_y_pred))

test rmse: 37.58158037687529
test mae: 9.581289629916652
test r2 score: 0.39458018689686514


## Export Model

In [12]:
# Persist the fitted baseline tree.
# The output folder is created if missing so the write cannot fail on a fresh
# checkout, and the `with` block guarantees the file handle is flushed and
# closed (the previous bare `open(...)` was never closed explicitly).
os.makedirs(MODELS_FOLDER, exist_ok=True)
base_model_path = os.path.join(MODELS_FOLDER, "base_decision_tree_wo_weather.pkl")
with open(base_model_path, "wb") as model_file:
    pickle.dump(dt, model_file)

## Hyperparameter Tuning with GridSearchCV

### Declare base model and parameters

In [13]:
# Unfitted estimator that GridSearchCV clones for every candidate.
base_dt = DecisionTreeRegressor(random_state=42)

# Settings searched in both sub-grids.
shared_grid = {
    "criterion": ["squared_error", "friedman_mse", "poisson"],
    "max_depth": [20, 40, None],
}

# The split/leaf minimums are varied together as fixed pairs rather than as a
# full cross product: (2, 1) and (14, 7).
param_grid = [
    {**shared_grid, "min_samples_split": [min_split], "min_samples_leaf": [min_leaf]}
    for min_split, min_leaf in [(2, 1), (14, 7)]
]

### Declare the scorer and grid search

In [14]:
# MSE is a loss, so greater_is_better=False makes the scorer report its negative;
# GridSearchCV then maximises that value.
scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False)

# 5-fold cross-validated search using all available cores.
grid_search = GridSearchCV(
    estimator=base_dt,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

### Train the models

In [15]:
# Run the cross-validated search on the training data only.
grid_search.fit(X=train_X, y=train_y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END criterion=squared_error, max_depth=20, min_samples_leaf=1, min_samples_split=2; total time=  20.2s
[CV] END criterion=squared_error, max_depth=20, min_samples_leaf=1, min_samples_split=2; total time=  20.3s
[CV] END criterion=squared_error, max_depth=20, min_samples_leaf=1, min_samples_split=2; total time=  20.3s
[CV] END criterion=squared_error, max_depth=20, min_samples_leaf=1, min_samples_split=2; total time=  20.4s
[CV] END criterion=squared_error, max_depth=20, min_samples_leaf=1, min_samples_split=2; total time=  20.6s
[CV] END criterion=squared_error, max_depth=40, min_samples_leaf=1, min_samples_split=2; total time=  25.0s
[CV] END criterion=squared_error, max_depth=40, min_samples_leaf=1, min_samples_split=2; total time=  25.1s
[CV] END criterion=squared_error, max_depth=40, min_samples_leaf=1, min_samples_split=2; total time=  25.2s
[CV] END criterion=squared_error, max_depth=40, min_samples_leaf=1, min_sam

In [16]:
# Tabulate per-candidate timings and (negated-MSE) fold scores.
pd.DataFrame.from_dict(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,20.031031,0.156634,0.341155,0.010027,squared_error,20.0,1,2,"{'criterion': 'squared_error', 'max_depth': 20...",-936.242599,-955.959652,-951.361906,-956.829397,-855.919627,-931.262636,38.389977,12
1,23.697423,0.299798,1.692238,0.055357,squared_error,40.0,1,2,"{'criterion': 'squared_error', 'max_depth': 40...",-1179.887433,-1173.415986,-1204.132855,-1182.710038,-1107.566264,-1169.542515,32.658696,16
2,23.790865,0.38066,2.354581,0.119648,squared_error,,1,2,"{'criterion': 'squared_error', 'max_depth': No...",-1185.748948,-1178.086895,-1210.202582,-1185.028313,-1108.295386,-1173.472425,34.358975,18
3,20.525642,0.346721,0.481808,0.085528,friedman_mse,20.0,1,2,"{'criterion': 'friedman_mse', 'max_depth': 20,...",-936.198091,-954.598429,-951.362415,-956.328011,-855.871748,-930.871738,38.16861,11
4,24.469335,0.350441,1.863414,0.078923,friedman_mse,40.0,1,2,"{'criterion': 'friedman_mse', 'max_depth': 40,...",-1182.237252,-1171.736849,-1203.781881,-1181.855537,-1107.329213,-1169.388146,32.742303,15
5,24.778765,0.407131,2.092934,0.195631,friedman_mse,,1,2,"{'criterion': 'friedman_mse', 'max_depth': Non...",-1185.354476,-1175.642575,-1208.696665,-1185.331482,-1111.184757,-1173.241991,32.883932,17
6,23.701582,0.382179,0.508263,0.036682,poisson,20.0,1,2,"{'criterion': 'poisson', 'max_depth': 20, 'min...",-914.718278,-926.228416,-908.688007,-924.802808,-825.270401,-899.941582,37.893836,10
7,28.882256,0.237617,2.008212,0.07536,poisson,40.0,1,2,"{'criterion': 'poisson', 'max_depth': 40, 'min...",-1146.612102,-1145.642068,-1174.519421,-1152.998754,-1086.876735,-1141.329816,29.15253,13
8,27.319026,0.166226,1.698273,0.04966,poisson,,1,2,"{'criterion': 'poisson', 'max_depth': None, 'm...",-1152.233927,-1148.887224,-1176.998168,-1157.29446,-1086.958586,-1144.474473,30.364129,14
9,20.206816,0.239119,0.304821,0.017901,squared_error,20.0,7,14,"{'criterion': 'squared_error', 'max_depth': 20...",-727.207964,-737.529423,-742.555385,-722.182115,-696.417485,-725.178474,16.0922,2


In [17]:
# Hyperparameter combination with the best mean cross-validation score.
best_params = grid_search.best_params_
print(best_params)

{'criterion': 'friedman_mse', 'max_depth': 20, 'min_samples_leaf': 7, 'min_samples_split': 14}


### Extract the best model

In [18]:
# Estimator built with the winning hyperparameters. GridSearchCV was created
# without a `refit` argument, so (per scikit-learn's default) this estimator has
# been refit on the full training set.
best_dt = grid_search.best_estimator_

In [19]:
# Overwrite the baseline predictions with the tuned tree's, rounded down to
# whole numbers in the same way as for the baseline model.
train_y_pred, test_y_pred = (
    np.floor(best_dt.predict(features)).astype(int) for features in (train_X, test_X)
)

### Plot the Tree

In [20]:
# Disabled cell: draws the top 4 levels of the tuned tree `best_dt`.
# Re-enabling it requires `plt` and `plot_tree`, neither of which is imported
# in this notebook's import cell.
# _, ax = plt.subplots(1, 1, figsize=(32, 14))
# _ = plot_tree(
#     best_dt,
#     max_depth=4,
#     feature_names=train_X.columns,
#     filled=True,
#     proportion=True,
#     rounded=True,
#     precision=2,
#     fontsize=9,
#     ax=ax,
# )

### Feature Importance

In [21]:
# Disabled cell: bar chart of the tuned tree's feature importances.
# Re-enabling it requires `plt` and `sns`, neither of which is imported
# in this notebook's import cell.
# feat_imp = pd.DataFrame(
#     {
#         "Feature": [x for x in best_dt.feature_names_in_],
#         "Importance": [x for x in best_dt.feature_importances_],
#     }
# )
# _, ax = plt.subplots(1, 1, figsize=(16, 9))
# _ = sns.barplot(feat_imp, x="Feature", y="Importance")
# _ = plt.title("Feature Importance for Best Decision Tree Regressor")

### Visualize the Predictions

In [22]:
# Disabled cell: line plot of true vs. predicted test values indexed by "Day".
# Re-enabling it requires `plt` and `sns`, neither of which is imported
# in this notebook's import cell.
# line_fit = pd.DataFrame({"True": test_y, "Predicted": test_y_pred}, index=test_df["Day"])
# _, ax = plt.subplots(1, 1, figsize=(16,9))
# _ = sns.lineplot(line_fit, legend=True, ax=ax)

### Report Train and Test results

In [23]:
# Training-set metrics for the tuned tree, one line per metric.
train_report = (
    ("train rmse:", root_mean_squared_error),
    ("train mae:", mean_absolute_error),
    ("train r2 score:", r2_score),
)
for label, metric_fn in train_report:
    print(label, metric_fn(train_y, train_y_pred))

train rmse: 22.986859254300235
train mae: 6.575650933917046
train r2 score: 0.7854913727012474


In [24]:
# Test-set metrics for the tuned tree, one line per metric.
test_report = (
    ("test rmse:", root_mean_squared_error),
    ("test mae:", mean_absolute_error),
    ("test r2 score:", r2_score),
)
for label, metric_fn in test_report:
    print(label, metric_fn(test_y, test_y_pred))

test rmse: 28.71175364843212
test mae: 7.653364599172468
test r2 score: 0.6466329998859526


### Export Model

In [25]:
# Persist the tuned tree selected by the grid search.
# The output folder is created if missing so the write cannot fail on a fresh
# checkout, and the `with` block guarantees the file handle is flushed and
# closed (the previous bare `open(...)` was never closed explicitly).
os.makedirs(MODELS_FOLDER, exist_ok=True)
tuned_model_path = os.path.join(MODELS_FOLDER, "tuned_decision_tree_wo_weather.pkl")
with open(tuned_model_path, "wb") as model_file:
    pickle.dump(best_dt, model_file)