In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.preprocessing import StandardScaler

In [2]:
# Display configuration: widen the console/array rendering and cap the
# number of columns pandas shows for a frame.
pd.set_option('display.width', 500)
np.set_printoptions(linewidth=500)
pd.set_option('display.max_columns', 10)

# Load the baseline dataset; index_col=False tells pandas not to use the
# first CSV column as the index.
df = pd.read_csv(
    "../../dataset_building/suzhou_china_baseline_dataset.csv",
    index_col=False
)

# Only the final expression of a cell is rendered by the notebook, so a bare
# `df.shape` here would be silently discarded. Print it explicitly.
print(df.shape)

df.head()

Unnamed: 0,num,weekday,hour
0,0,5,17
1,1,5,17
2,1,5,17
3,2,5,17
4,4,5,17


In [3]:
# True if any cell anywhere in the frame is missing.
df.isna().to_numpy().any()

False

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of df.
df.describe()

Unnamed: 0,num,weekday,hour
count,45949.0,45949.0,45949.0
mean,6.996387,2.93384,11.495767
std,7.805887,2.061045,6.925131
min,0.0,0.0,0.0
25%,1.0,1.0,5.0
50%,4.0,3.0,11.0
75%,11.0,5.0,18.0
max,30.0,6.0,23.0


In [6]:
# Feature matrix: every column of df except the "num" target.
X = df.drop(columns="num")

In [7]:
# Regression target: the "num" column of df.
y = df["num"]

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Dimensions of the training feature matrix as (rows, columns).
X_train.shape

(36759, 2)

In [10]:
# Dimensions of the held-out feature matrix as (rows, columns).
X_test.shape

(9190, 2)

In [11]:
# Learn the standardization statistics from the training features only, then
# apply that same fitted transform to both partitions so the test rows never
# influence the scaling parameters.
scaler = StandardScaler().fit(X_train)
train_scaled = scaler.transform(X_train)
test_scaled = scaler.transform(X_test)

In [12]:
# Both estimators draw random numbers while fitting. Without a fixed
# random_state the fitted models (and every metric reported from them) can
# differ between runs, so seed them with the same value used for the
# train/test split (123).
tree_model = DecisionTreeRegressor(random_state=123)
rf_model = RandomForestRegressor(random_state=123)

In [13]:
# Fit both regressors on the standardized training features.
tree_model.fit(train_scaled, y_train)
# This call is the cell's final expression, so its return value is what the
# notebook renders as the cell output.
rf_model.fit(train_scaled, y_train)

RandomForestRegressor()

In [14]:
# Training-set error for both models.
# Predict once per model and reuse the result for both metrics; the original
# cell ran every predict() twice on the same inputs.
tree_train_pred = tree_model.predict(train_scaled)
rf_train_pred = rf_model.predict(train_scaled)

tree_mse = mean_squared_error(y_train, tree_train_pred)
tree_mae = mean_absolute_error(y_train, tree_train_pred)
rf_mse = mean_squared_error(y_train, rf_train_pred)
rf_mae = mean_absolute_error(y_train, rf_train_pred)

In [15]:
# One report line per model: MSE, MAE, and the RMSE derived from the MSE.
training_report = (
    ("Decision Tree training mse = ", tree_mse, tree_mae),
    ("Random Forest training mse = ", rf_mse, rf_mae),
)
for prefix, mse, mae in training_report:
    print(prefix, mse, " & mae = ", mae, " & rmse = ", sqrt(mse))

Decision Tree training mse =  33.335094494795555  & mae =  4.3163502691380895  & rmse =  5.773655210938349
Random Forest training mse =  33.336850473252376  & mae =  4.315326301385905  & rmse =  5.773807277113809


In [17]:
# Held-out error for both models.
# Predict once per model and reuse the result for both metrics; the original
# cell ran every predict() twice on the same inputs.
tree_test_pred = tree_model.predict(test_scaled)
rf_test_pred = rf_model.predict(test_scaled)

tree_test_mse = mean_squared_error(y_test, tree_test_pred)
tree_test_mae = mean_absolute_error(y_test, tree_test_pred)
rf_test_mse = mean_squared_error(y_test, rf_test_pred)
rf_test_mae = mean_absolute_error(y_test, rf_test_pred)

In [18]:
# One report line per model: MSE, MAE, and the RMSE derived from the MSE.
test_report = (
    ("Decision Tree test mse = ", tree_test_mse, tree_test_mae),
    ("Random Forest test mse = ", rf_test_mse, rf_test_mae),
)
for prefix, mse, mae in test_report:
    print(prefix, mse, " & mae = ", mae, " & rmse = ", sqrt(mse))

Decision Tree test mse =  33.17775685904789  & mae =  4.295474051565442  & rmse =  5.760013616220702
Random Forest test mse =  33.18779908677382  & mae =  4.295042727451679  & rmse =  5.760885269363887
