In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import matplotlib.pyplot as plt
import time
import seaborn as sns
from sklearn.externals import joblib



In [2]:
# Relative location of the integrated input CSV.
file_path = '../../data/input/integrated_data_dummy.csv'

# Read the CSV and order the rows by "busCode", then by "busCodeSB".
# The original row labels are kept (the index is not reset after sorting).
data = pd.read_csv(file_path).sort_values(by=["busCode", "busCodeSB"])

In [3]:
# Column-name groups that are NOT model inputs.
target_col = ['headway']
bb_col = ['busBunching']
hd_threshold = ["headwayThreshold"]

# Every remaining column is a feature.  The list is built in the DataFrame's own
# column order (duplicates removed) instead of going through `set` arithmetic:
# iterating a set of strings yields an arbitrary order that can change from one
# kernel start to the next, which made `features` non-reproducible.
_non_feature_cols = set(target_col) | set(bb_col) | set(hd_threshold)
features = list(dict.fromkeys(col for col in data.columns if col not in _non_feature_cols))

In [4]:
# Keep the regression target and the per-row headway threshold as their own Series
# before they are removed from the feature frame.
y = data['headway']
y_threshold = data['headwayThreshold']

# Remove the target, the 'busBunching' column and the threshold from `data` (in place),
# so that only the model inputs remain.
# NOTE(review): this cell is not re-runnable on the same kernel; a second run fails
# because 'headway' is already gone from `data`.
data.drop(['headway', 'busBunching', 'headwayThreshold'], axis=1, inplace=True)

In [5]:
# Rescale every remaining column with a min-max scaler.
# NOTE(review): the scaler is fitted on all rows of `data`, i.e. before any
# train/test split is made.
min_max_scaler = preprocessing.MinMaxScaler().fit(data)
data_scale = min_max_scaler.transform(data)

In [6]:
# Making training and test data: 80% Training, 20% Test
# The split itself is made reproducible by `random_state=7`; seeding the stdlib
# `random` module does not influence train_test_split and is kept only so the
# interpreter's RNG state stays what later cells may expect.
random.seed(15)

# Split the *unscaled* rows first and fit the min-max scaler on the training part
# only.  Splitting features that were scaled with min/max computed over all rows
# lets information about the test rows leak into the training features.
# `train_Y` / `test_Y` are the same rows as before (same random_state, same length).
train_raw_X, test_raw_X, train_Y, test_Y = train_test_split(data, y, test_size=0.20, random_state=7)
split_scaler = preprocessing.MinMaxScaler().fit(train_raw_X)
train_X = split_scaler.transform(train_raw_X)
test_X = split_scaler.transform(test_raw_X)  # may fall slightly outside [0, 1]; that is expected

In [7]:
def rmse_cv(model, X_train, y_train):
    """Return the cross-validated RMSE of ``model``, one value per fold.

    Runs ``cross_val_score`` with 5 folds and the "neg_mean_squared_error"
    scorer, flips the sign of the scores and takes the square root of each.

    Parameters:
        model: estimator passed straight to ``cross_val_score``.
        X_train: training features.
        y_train: training targets.

    Returns:
        The element-wise square root of the negated fold scores.
    """
    neg_mse_per_fold = cross_val_score(
        model, X_train, y_train, scoring="neg_mean_squared_error", cv=5
    )
    mse_per_fold = -neg_mse_per_fold
    return np.sqrt(mse_per_fold)

def plot_rmse_param(series, param_name):
    """Plot validation RMSE against the values tried for one hyper-parameter.

    Parameters:
        series: object with a ``plot`` method (a pandas Series in this notebook)
            holding one RMSE per parameter value, indexed by that value.
        param_name: name of the parameter; used in the title and as x-axis label.
    """
    chart_title = "Validation Error vs " + param_name
    series.plot(title=chart_title)
    # Label the axes of the current matplotlib figure.
    plt.xlabel(param_name)
    plt.ylabel("RMSE")
    
def best_rmse_param(series):
    """Return ``(lowest RMSE, parameter value that produced it)``.

    Parameters:
        series: pandas Series of RMSE values indexed by the parameter values tried.

    Returns:
        A 2-tuple: the minimum of ``series`` and the index label of that minimum.
    """
    return (series.min(), series.idxmin())

### SVR

In [None]:
# Wall-clock reference; the final fit cell reports "Execution time" relative to it.
start = time.time()

# Candidate values for the SVR regularisation parameter C.
cs = [0.001, 0.01, 0.05, 1]

# Mean cross-validated RMSE (see rmse_cv) for each candidate, in the order of `cs`.
cv_srv_rmse = [
    rmse_cv(SVR(C=candidate_c), train_X, train_Y).mean()
    for candidate_c in cs
]

# Index the scores by the C value that produced them, plot them, and keep the best.
series = pd.Series(data=cv_srv_rmse, index=cs)
plot_rmse_param(series, param_name="cs")
best_rmse_srv, best_c_svr = best_rmse_param(series)



In [None]:
# Candidate values for the SVR epsilon parameter, searched with C fixed at best_c_svr.
epsilons = [0.001, 0.01, 0.05, 0.08]

# Mean cross-validated RMSE (see rmse_cv) for each candidate, in the order of `epsilons`.
cv_srv_rmse = [
    rmse_cv(SVR(C=best_c_svr, epsilon=candidate_eps), train_X, train_Y).mean()
    for candidate_eps in epsilons
]

# Index the scores by epsilon, plot them, and keep the best.
# Note: this overwrites `cv_srv_rmse`, `series` and `best_rmse_srv` from the C search.
series = pd.Series(data=cv_srv_rmse, index=epsilons)
plot_rmse_param(series, param_name="epsilons")
best_rmse_srv, best_epsilon_svr = best_rmse_param(series)

In [None]:
random.seed(42)

# `start` is normally set by the cell that begins the C search; if that cell was
# not run on this kernel, start the clock here instead.
# NOTE(review): when `start` does come from the search cell, the time printed below
# covers everything since then (both searches and this fit), not just this fit.
if "start" not in globals():
    start = time.time()

# Train the final model with the selected hyper-parameters.
svr = SVR(C=best_c_svr, epsilon=best_epsilon_svr).fit(train_X, train_Y)

end = time.time()
elapsed_minutes = (end - start)/60
print("Execution time: " + str(elapsed_minutes) + " min")

In [None]:
# Persist the fitted SVR to a pickle file in the current working directory.
# NOTE(review): `joblib` here is the copy imported from `sklearn.externals`; recent
# scikit-learn releases may no longer provide that module — confirm the environment.
# Only load this file back from a trusted location: unpickling can execute code.
joblib.dump(value=svr, filename='Saved_SVR_.pkl')

In [None]:
# Report the hyper-parameter values selected by the two searches.
print(f"best_c_svr: {best_c_svr!s}")
print(f"best_epsilon_svr: {best_epsilon_svr!s}")

In [None]:
# Predictions of the fitted SVR for the held-out test features.
pred_array = svr.predict(test_X)

In [None]:
# Copy the predictions into a plain Python list, one entry per test row.
# `list()` does this directly; the previous element-by-element append loop was
# not unwrapping anything.
pred = list(pred_array)

# Root-mean-squared error of the SVR predictions on the test set.
# NOTE(review): the `_rf` suffix is kept so any later reference to `rmse_rf` still
# resolves; the model being scored here is the SVR.
rmse_rf = np.sqrt(mean_squared_error(test_Y, pred))
print(rmse_rf)

In [None]:
# Smallest and largest predicted value, one per line.
print(min(pred), max(pred), sep="\n")

In [None]:
# Per-row headway threshold for exactly the rows that ended up in the test split,
# selected by row label so it lines up with test_Y.
test_row_labels = test_Y.index
alpha = y_threshold.loc[test_row_labels]
alpha

In [None]:
# A row is flagged when its headway is at or below that row's threshold `alpha`
# (presumably this is the bus-bunching condition — confirm against the data definition).
bb_pred = alpha >= pred       # flags derived from the predicted headway
bb_label = test_Y <= alpha    # flags derived from the observed headway

In [None]:
# Score the flags derived from the predictions against the flags derived from the
# observed values, using each metric in turn (same argument order for all four).
accuracy, precision, recall, f_measure = (
    metric(bb_label, bb_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the four classification scores computed above.
print(f"Accuracy: {accuracy!s}")
print(f"Precision: {precision!s}")
print(f"Recall: {recall!s}")
print(f"F-measure: {f_measure!s}")