In [16]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.preprocessing import StandardScaler

In [17]:
pd.set_option('display.width', 500)
np.set_printoptions(linewidth=500)
pd.set_option('display.max_columns', 10)

df = pd.read_csv(
    "../../dataset_building/paris_velib_dataset_preprocessed.csv",
    index_col=False
)

df.shape

df.head()

Unnamed: 0,is_installed,is_renting,is_returning,longitude,latitude,...,record_timestamp_minute_46,record_timestamp_minute_50,record_timestamp_minute_51,record_timestamp_minute_59,record_timestamp_second_0
0,1,1,1,0.023661,0.48871,...,1,0,0,0,1
1,1,1,1,0.02336,0.488375,...,1,0,0,0,1
2,1,1,1,0.02352,0.488439,...,1,0,0,0,1
3,1,1,1,0.023851,0.489104,...,1,0,0,0,1
4,1,1,1,0.023768,0.488158,...,1,0,0,0,1


In [18]:
df.isnull().values.any()

False

In [19]:
df.describe()

Unnamed: 0,is_installed,is_renting,is_returning,longitude,latitude,...,record_timestamp_minute_46,record_timestamp_minute_50,record_timestamp_minute_51,record_timestamp_minute_59,record_timestamp_second_0
count,95642.0,95642.0,95642.0,95642.0,95642.0,...,95642.0,95642.0,95642.0,95642.0,95642.0
mean,0.98967,0.978169,0.978169,0.023417,0.48858,...,0.01491,0.01491,0.104368,0.029861,1.0
std,0.101112,0.146134,0.146134,0.000552,0.000296,...,0.121192,0.121192,0.305739,0.170206,0.0
min,0.0,0.0,0.0,0.021656,0.487646,...,0.0,0.0,0.0,0.0,1.0
25%,1.0,1.0,1.0,0.023044,0.488379,...,0.0,0.0,0.0,0.0,1.0
50%,1.0,1.0,1.0,0.023439,0.488585,...,0.0,0.0,0.0,0.0,1.0
75%,1.0,1.0,1.0,0.023785,0.488791,...,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,0.025382,0.48947,...,1.0,1.0,1.0,1.0,1.0


In [20]:
X = df.drop(["bike_availability_ratio"], axis=1)

In [21]:
y = df["bike_availability_ratio"]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [23]:
X_train.shape

(76513, 39)

In [24]:
X_test.shape

(19129, 39)

In [25]:
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)

In [26]:
tree_model = DecisionTreeRegressor()
rf_model = RandomForestRegressor()

In [27]:
tree_model.fit(train_scaled, y_train)
rf_model.fit(train_scaled, y_train)

RandomForestRegressor()

In [28]:
tree_mse = mean_squared_error(y_train, tree_model.predict(train_scaled))
tree_mae = mean_absolute_error(y_train, tree_model.predict(train_scaled))
rf_mse = mean_squared_error(y_train, rf_model.predict(train_scaled))
rf_mae = mean_absolute_error(y_train, rf_model.predict(train_scaled))

In [29]:
print("Decision Tree training mse = ",tree_mse," & mae = ",tree_mae," & rmse = ", sqrt(tree_mse))
print("Random Forest training mse = ",rf_mse," & mae = ",rf_mae," & rmse = ", sqrt(rf_mse))

Decision Tree training mse =  5.443429585343566e-05  & mae =  5.334560993657215e-05  & rmse =  0.007377960141762468
Random Forest training mse =  11.630675149769722  & mae =  2.2007538316009887  & rmse =  3.410377567040008


In [30]:
tree_test_mse = mean_squared_error(y_test, tree_model.predict(test_scaled))
tree_test_mae = mean_absolute_error(y_test, tree_model.predict(test_scaled))
rf_test_mse = mean_squared_error(y_test, rf_model.predict(test_scaled))
rf_test_mae = mean_absolute_error(y_test, rf_model.predict(test_scaled))

In [31]:
print("Decision Tree test mse = ",tree_test_mse," & mae = ",tree_test_mae," & rmse = ", sqrt(tree_test_mse))
print("Random Forest test mse = ",rf_test_mse," & mae = ",rf_test_mae," & rmse = ", sqrt(rf_test_mse))

Decision Tree test mse =  120.2329595283558  & mae =  5.983877695082856  & rmse =  10.965079093575012
Random Forest test mse =  82.30887355782913  & mae =  5.9477617040989585  & rmse =  9.07242379730076
