In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
import matplotlib.pyplot as plt

In [None]:
# Location of the integrated input table, relative to this notebook.
file_path = '../../data/input/integrated_data_dummy.csv'

# Only the first 50000 rows are read.
data = pd.read_csv(filepath_or_buffer=file_path, nrows=50000)

In [None]:
# Per-column flag: True where the column contains at least one missing value.
data.isna().any()

In [None]:
# Columns that must not be used as predictors: the regression target and the
# bus-bunching column.
target_col = ['headway']
bb_col = ['busBunching']

# Keep the remaining column names in their original DataFrame order. A set
# difference would return them in arbitrary order, and positional lookups
# against the fitted model's per-column outputs would then pair the wrong name
# with each value.
_non_feature_cols = set(target_col) | set(bb_col)
features = [col for col in data.columns if col not in _non_feature_cols]

In [None]:
# Keep the 'headway' column as the label vector, then strip it and the
# 'busBunching' column out of the feature frame (mutates `data` in place).
y = data['headway']
data.drop(columns='headway', inplace=True)
data.drop(columns='busBunching', inplace=True)

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The partition is pinned by random_state=7 below; random.seed only seeds
# Python's built-in `random` module.
random.seed(15)
train_X, test_X, train_Y, test_Y = train_test_split(
    data,
    y,
    test_size=0.20,
    random_state=7,
)

In [None]:
def rmse_cv(model, X_train, y_train):
    """Return the per-fold RMSE of `model` under 5-fold cross-validation.

    The scorer yields negated mean squared errors, so the sign is flipped
    before the square root is taken. The result holds one RMSE per fold.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        X_train,
        y_train,
        scoring="neg_mean_squared_error",
        cv=5,
    )
    fold_rmse = np.sqrt(-neg_mse_per_fold)
    return fold_rmse

def plot_rmse_param(series, param_name):
    """Draw validation RMSE against the candidate values of one parameter.

    `series` is plotted as-is (its index on the x axis), and `param_name` is
    used both in the title and as the x-axis label.
    """
    chart_title = "Validation Error vs " + param_name
    series.plot(title=chart_title)
    plt.xlabel(param_name)
    plt.ylabel("RMSE")
    
def best_rmse_param(series):
    """Return `(lowest RMSE, index label at which it occurs)` for `series`.

    The series is expected to hold one RMSE per candidate parameter value,
    indexed by that value.
    """
    return series.min(), series.idxmin()

### Random Forest

In [None]:
# Candidate forest sizes, compared by their mean 5-fold cross-validated RMSE.
n_estimators = [10, 50, 100]

# random_state is fixed so every candidate forest is built reproducibly; without
# it each run (and each candidate) draws a different seed, and the curve mixes
# seeding noise with the effect of n_estimators.
cv_rf_rmse = [
    rmse_cv(
        RandomForestRegressor(n_estimators=n, n_jobs=8, random_state=42),
        train_X,
        train_Y,
    ).mean()
    for n in n_estimators
]

# Index the scores by the candidate value so the best one can be looked up.
series = pd.Series(cv_rf_rmse, index=n_estimators)
plot_rmse_param(series, "n_estimators")
best_rmse_rf, best_estimator_rf = best_rmse_param(series)

In [None]:
# Candidate min_samples_split values, evaluated with the forest size chosen in
# the n_estimators search.
n_min_samples_split = [5, 10, 15, 20, 25]

# random_state is fixed so the candidates differ only in min_samples_split and
# the scores are the same on every run.
cv_rf_rmse = [
    rmse_cv(
        RandomForestRegressor(
            n_estimators=best_estimator_rf,
            min_samples_split=n,
            n_jobs=8,
            random_state=42,
        ),
        train_X,
        train_Y,
    ).mean()
    for n in n_min_samples_split
]

# Index the scores by the candidate value so the best one can be looked up.
series = pd.Series(cv_rf_rmse, index=n_min_samples_split)
plot_rmse_param(series, "n_min_samples_split")
best_rmse_rf, best_split_rf = best_rmse_param(series)

In [None]:
# random.seed only seeds Python's built-in `random` module, which does not
# drive the forest's bootstrapping or feature sampling. The call is kept for
# any code that relies on it, and the estimator is made reproducible explicitly
# through random_state.
random.seed(42)
rf = RandomForestRegressor(
    n_estimators=best_estimator_rf,
    min_samples_split=best_split_rf,
    n_jobs=8,
    random_state=42,
)
rf.fit(train_X, train_Y)

In [None]:
# Score the fitted forest on the held-out rows and report the test RMSE.
pred = rf.predict(X=test_X)
rmse_rf = np.sqrt(mean_squared_error(y_true=test_Y, y_pred=pred))
print(rmse_rf)

In [None]:
# Feature importances of the fitted forest, largest first.

# Pair each importance with its column name. The names come from train_X, the
# frame the forest was fitted on, because feature_importances_ is ordered like
# the training columns; a separately built name list may not share that order.
f_imps = dict(zip(train_X.columns, rf.feature_importances_))

# Rank name/value pairs together so they can never drift apart.
_ranked = sorted(f_imps.items(), key=lambda item: item[1], reverse=True)
sorted_feature_names = [name for name, _ in _ranked]
sorted_values = [value for _, value in _ranked]

# Show at most 20 rows, capped at the number of features so a narrow frame does
# not raise IndexError.
num_to_print = min(20, len(_ranked))
for i in range(num_to_print):
    print("%15s %4.3f" % (sorted_feature_names[i], sorted_values[i]))