In [17]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.preprocessing import StandardScaler

In [18]:
# Widen console/array output so wide tables are not wrapped, and cap the
# number of DataFrame columns shown (extra columns are elided with "...").
pd.set_option('display.width', 500)
np.set_printoptions(linewidth=500)
pd.set_option('display.max_columns', 10)

# Load the dataset CSV. index_col=False tells pandas not to use the first
# CSV column as the row index.
df = pd.read_csv(
    "../../dataset_building/oslo_citybike_dataset_preprocessed.csv",
    index_col=False
)

# Only the last expression of a cell is echoed, so a bare `df.shape` here
# would be silently discarded; print it to actually show the dimensions.
print(df.shape)

# Preview the first rows (rendered as the cell's output).
df.head()

Unnamed: 0,station_lat,station_lon,is_installed,is_renting,is_returning,...,record_timestamp_second_34,record_timestamp_second_36,record_timestamp_second_43,record_timestamp_second_46,record_timestamp_second_59
0,0.599163,0.107574,0,0,0,...,1,0,0,0,0
1,0.599132,0.1075,0,0,0,...,1,0,0,0,0
2,0.599127,0.107272,0,0,0,...,1,0,0,0,0
3,0.599125,0.107509,0,0,0,...,1,0,0,0,0
4,0.599069,0.107603,0,0,0,...,1,0,0,0,0


In [19]:
# Sanity check: evaluates to True if any cell in the frame is missing.
df.isna().to_numpy().any()

False

In [20]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,station_lat,station_lon,is_installed,is_renting,is_returning,...,record_timestamp_second_34,record_timestamp_second_36,record_timestamp_second_43,record_timestamp_second_46,record_timestamp_second_59
count,2223.0,2223.0,2223.0,2223.0,2223.0,...,2223.0,2223.0,2223.0,2223.0,2223.0
mean,0.599223,0.107442,0.0,0.0,0.0,...,0.111111,0.111111,0.111111,0.111111,0.111111
std,0.000108,0.000276,0.0,0.0,0.0,...,0.31434,0.31434,0.31434,0.31434,0.31434
min,0.599032,0.106511,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
25%,0.599137,0.107253,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
50%,0.599209,0.10748,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
75%,0.599294,0.107622,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
max,0.599534,0.108143,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0


In [21]:
# Feature matrix: every column except the one used as the regression target.
X = df.drop(columns=["bike_availability_ratio"])

In [22]:
# Target vector: the column the regressors are trained to predict.
y = df.loc[:, "bike_availability_ratio"]

In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [24]:
# (rows, columns) of the training feature matrix.
X_train.shape

(1778, 28)

In [25]:
# (rows, columns) of the held-out test feature matrix.
X_test.shape

(445, 28)

In [26]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits so that no test-set statistics
# influence the features used for training.
scaler = StandardScaler().fit(X_train)
train_scaled = scaler.transform(X_train)
test_scaled = scaler.transform(X_test)

In [27]:
# Both estimators draw random numbers while fitting (feature permutation at
# each split; bootstrap sampling and feature sub-sampling in the forest).
# Seed them -- with the same seed used for the train/test split -- so the
# metrics reported below are reproducible on Restart & Run All.
tree_model = DecisionTreeRegressor(random_state=123)
rf_model = RandomForestRegressor(random_state=123)

In [28]:
# Train both regressors on the standardized training features.
tree_model.fit(X=train_scaled, y=y_train)
# The forest's fit stays as the final expression so the cell still echoes
# the fitted estimator.
rf_model.fit(X=train_scaled, y=y_train)

RandomForestRegressor()

In [29]:
# Training-set error for both models. Predict once per model and reuse the
# result for both metrics instead of re-running predict() for every metric.
tree_train_pred = tree_model.predict(train_scaled)
rf_train_pred = rf_model.predict(train_scaled)

tree_mse = mean_squared_error(y_train, tree_train_pred)
tree_mae = mean_absolute_error(y_train, tree_train_pred)
rf_mse = mean_squared_error(y_train, rf_train_pred)
rf_mae = mean_absolute_error(y_train, rf_train_pred)

In [30]:
# Report MSE, MAE and RMSE (square root of the MSE) on the training split,
# one line per model.
for label, mse, mae in (
    ("Decision Tree training mse = ", tree_mse, tree_mae),
    ("Random Forest training mse = ", rf_mse, rf_mae),
):
    print(label, mse, " & mae = ", mae, " & rmse = ", sqrt(mse))

Decision Tree training mse =  1.0803578763853565e-31  & mae =  6.593900528707341e-17  & rmse =  3.286879791512547e-16
Random Forest training mse =  0.6825211434528584  & mae =  0.4249628981849911  & rmse =  0.8261483785936147


In [31]:
# Held-out test-set error for both models. Predict once per model and reuse
# the result for both metrics instead of re-running predict() for every metric.
tree_test_pred = tree_model.predict(test_scaled)
rf_test_pred = rf_model.predict(test_scaled)

tree_test_mse = mean_squared_error(y_test, tree_test_pred)
tree_test_mae = mean_absolute_error(y_test, tree_test_pred)
rf_test_mse = mean_squared_error(y_test, rf_test_pred)
rf_test_mae = mean_absolute_error(y_test, rf_test_pred)

In [32]:
# Report MSE, MAE and RMSE (square root of the MSE) on the test split,
# one line per model.
for label, mse, mae in (
    ("Decision Tree test mse = ", tree_test_mse, tree_test_mae),
    ("Random Forest test mse = ", rf_test_mse, rf_test_mae),
):
    print(label, mse, " & mae = ", mae, " & rmse = ", sqrt(mse))

Decision Tree test mse =  6.156762519262882  & mae =  0.9023288683630204  & rmse =  2.4812824344001796
Random Forest test mse =  4.148980022878501  & mae =  1.0597111144805929  & rmse =  2.0369045198237696
