In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.externals import joblib
from sklearn_rvm import EMRVR
import time



In [2]:
# Locations of the feature CSV (its file name says "normalized") and of the target CSV.
data_path = '../data/normalized_data_X_headway_5p.csv'
y_path = '../data/y_headway_5p.csv'

# Read both tables, report their dimensions on separate lines, then preview the features.
# NOTE(review): the recorded preview lists an "Unnamed: 0" column, which usually means a
# saved index column was read back as data — confirm it should be used as a feature.
data, y = pd.read_csv(data_path), pd.read_csv(y_path)
print(data.shape, y.shape, sep='\n')
data.head()

(69884, 420)
(69884, 2)


Unnamed: 0,route,tripNum,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,...,alertTypeSB_ACCIDENT,alertTypeSB_CHIT_CHAT,alertTypeSB_HAZARD,alertTypeSB_JAM,alertTypeSB_NORMAL,alertTypeSB_POLICE,alertTypeSB_ROAD_CLOSED,jamBlockTypeSB_-,jamBlockTypeSB_NORMAL,jamBlockTypeSB_ROAD_CLOSED_EVENT
0,0.118068,0.0,0.084945,0.264128,0.4394,0.010487,0.0,6e-06,0.265656,0.439493,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.118068,0.0,0.084965,0.264428,0.443488,0.014304,0.0,9e-06,0.265956,0.444051,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.118068,0.0,0.084972,0.263625,0.448606,0.01803,0.0,1e-05,0.264915,0.448957,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.118068,0.0,0.084979,0.261608,0.458297,0.025182,0.0,1.4e-05,0.263352,0.456961,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.118068,0.0,0.084985,0.260228,0.463266,0.028953,0.0,0.0,0.261792,0.46317,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# Keep only the first 20000 rows of the features and of the targets (same rows in both).
# NOTE(review): both names are overwritten, so rerunning this cell with a larger count
# only takes effect after the CSVs have been reloaded.
data, y = data.iloc[:20000], y.iloc[:20000]
print(data.shape, y.shape, sep='\n')

(20000, 420)
(20000, 2)


In [4]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The target is the `headway` column of y; the split itself is pinned by random_state=7,
# while random.seed(15) only seeds Python's built-in `random` module.
random.seed(15)
train_X, test_X, train_Y, test_Y = train_test_split(
    data,
    y['headway'],
    test_size=0.20,
    random_state=7,
)

In [5]:
def get_quality(pred, true_headway=None, thresholds=None):
    """Print RMSE and bus-bunching classification metrics for predicted headways.

    A sample is labelled as bus bunching when its headway is less than or equal
    to its threshold. Predictions and observations are both binarised with that
    rule and the resulting labels are compared.

    Parameters
    ----------
    pred : array-like
        Predicted headways, one per evaluated sample, in the same order as
        ``true_headway``.
    true_headway : pandas.Series, optional
        Observed headways. When omitted, the notebook-level ``test_Y`` is used,
        which reproduces the original single-argument behaviour.
    thresholds : array-like, optional
        Per-sample bunching thresholds. When omitted they are looked up as
        ``y.headwayThreshold[true_headway.index]`` from the notebook-level ``y``.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Raises
    ------
    ValueError
        If ``pred`` and ``true_headway`` differ in length, e.g. when ``pred``
        is left over from a run with a different subset size.
    """
    # Fall back to the notebook globals so existing `get_quality(pred_array)` calls still work.
    if true_headway is None:
        true_headway = test_Y
    if thresholds is None:
        thresholds = y.headwayThreshold[true_headway.index]

    # Fail early with a readable message instead of an opaque shape/broadcast error.
    if len(pred) != len(true_headway):
        raise ValueError(
            "pred has " + str(len(pred)) + " values but the evaluation set has "
            + str(len(true_headway)) + "; re-run the prediction cell for the current split."
        )

    # Binarise: True means headway <= threshold (bus bunching).
    bb_pred = np.less_equal(pred, thresholds)
    bb_label = np.less_equal(true_headway, thresholds)

    rmse = np.sqrt(mean_squared_error(true_headway, pred))
    print('RMSE: ' + str(rmse))

    # Bus Bunching
    print("Accuracy: " + str(accuracy_score(bb_label, bb_pred)))
    print("Precision: " + str(precision_score(bb_label, bb_pred)))
    print("Recall: " + str(recall_score(bb_label, bb_pred)))
    print("F-measure: " + str(f1_score(bb_label, bb_pred)))

### EMRVR

In [6]:
# Build the EMRVR estimator (imported from sklearn_rvm) and fit it on the training split.
# NOTE(review): the kernel is hard-coded here; the RBF / Linear / Poly result sections further
# down presumably come from editing this argument and re-running — confirm before comparing.
model = EMRVR(kernel='poly')
model.fit(train_X, train_Y)



In [7]:
# Predict the target (headway) for the held-out test rows; the bare name displays the array.
pred_array = model.predict(test_X)
pred_array

array([40.70787218, 24.39194826, 28.11163297, ..., 63.88202036,
       98.63967354, 35.06682969])

### RBF

In [15]:
# Recorded run under the RBF heading: metrics for the predictions currently in pred_array.
# The "5000" tag is presumably the number of rows kept by the subsetting cell — TODO confirm.
get_quality(pred_array) #5000

RMSE: 26.99151654920116
Accuracy: 0.934
Precision: 0.9166666666666666
Recall: 0.7475728155339806
F-measure: 0.8235294117647058


In [8]:
# Recorded run under the RBF heading; "10000" is presumably the number of rows kept — TODO confirm.
get_quality(pred_array) #10000

RMSE: 38.61351299825054
Accuracy: 0.826
Precision: 0.5278969957081545
Recall: 0.6577540106951871
F-measure: 0.5857142857142857


In [9]:
# Recorded run under the RBF heading; "15000" is presumably the number of rows kept — TODO confirm.
get_quality(pred_array) #15000

RMSE: 42.91724483387854
Accuracy: 0.865
Precision: 0.6616666666666666
Recall: 0.662771285475793
F-measure: 0.6622185154295246


In [None]:
# RBF heading, "20000" tag (presumably rows kept — TODO confirm); no output was recorded for this cell.
get_quality(pred_array) #20000

In [None]:
# Root-mean-squared error of pred_array against the held-out test_Y.
# Tag "rbf-30000": presumably RBF kernel with 30000 rows kept — TODO confirm. No output was recorded.
rmse = np.sqrt(mean_squared_error(test_Y, pred_array))
print('RMSE: ' + str(rmse)) #rbf-30000

In [None]:
# Tag "rbf-30000": presumably RBF kernel with 30000 rows kept — TODO confirm. No output was recorded.
get_quality(pred_array) #rbf-30000

In [7]:
# Root-mean-squared error of pred_array against the held-out test_Y.
# Tag "rbf-3000": presumably RBF kernel with 3000 rows kept — TODO confirm.
rmse = np.sqrt(mean_squared_error(test_Y, pred_array))
print('RMSE: ' + str(rmse)) #rbf-3000

RMSE: 25.08775719784502


In [9]:
# Tag "rbf-3000": presumably RBF kernel with 3000 rows kept — TODO confirm.
# NOTE(review): the recorded output has no RMSE line, so it presumably predates the
# current get_quality, which prints RMSE first.
get_quality(pred_array) #rbf-3000

Accuracy: 0.9533333333333334
Precision: 0.9848484848484849
Recall: 0.7065217391304348
F-measure: 0.8227848101265823


In [7]:
# Root-mean-squared error of pred_array against the held-out test_Y.
# Tag "rbf-1000": presumably RBF kernel with 1000 rows kept — TODO confirm.
rmse = np.sqrt(mean_squared_error(test_Y, pred_array))
print('RMSE: ' + str(rmse)) #rbf-1000

RMSE: 43.86635136944401


In [13]:
# Tag "rbf-1000": presumably RBF kernel with 1000 rows kept — TODO confirm.
# NOTE(review): the recorded output has no RMSE line, so it presumably predates the
# current get_quality, which prints RMSE first.
get_quality(pred_array) #rbf-1000

Accuracy: 0.94
Precision: 0.4
Recall: 0.18181818181818182
F-measure: 0.25000000000000006


### Linear

In [8]:
# Recorded run under the Linear heading; "1000" is presumably the number of rows kept — TODO confirm.
get_quality(pred_array) #1000

RMSE: 43.477460345852435
Accuracy: 0.94
Precision: 0.4
Recall: 0.18181818181818182
F-measure: 0.25000000000000006


In [12]:
# Recorded run under the Linear heading; "5000" is presumably the number of rows kept — TODO confirm.
get_quality(pred_array) #5000

RMSE: 27.051489268519177
Accuracy: 0.938
Precision: 0.9337349397590361
Recall: 0.7524271844660194
F-measure: 0.8333333333333334


In [11]:
# Recorded run under the Linear heading; "10000" is presumably the number of rows kept — TODO confirm.
get_quality(pred_array) #10000

RMSE: 42.63768314043779
Accuracy: 0.8255
Precision: 0.5273522975929978
Recall: 0.6443850267379679
F-measure: 0.5800240673886883


In [12]:
# Recorded run under the Linear heading; "15000" is presumably the number of rows kept — TODO confirm.
get_quality(pred_array) #15000

RMSE: 45.559858114008414
Accuracy: 0.864
Precision: 0.6583747927031509
Recall: 0.662771285475793
F-measure: 0.6605657237936773


In [None]:
# Linear heading, "20000" tag (presumably rows kept — TODO confirm); no output was recorded for this cell.
get_quality(pred_array) #20000

In [7]:
# Root-mean-squared error of pred_array against the held-out test_Y.
# Tag "linear-4000": presumably linear kernel with 4000 rows kept — TODO confirm.
rmse = np.sqrt(mean_squared_error(test_Y, pred_array))
print('RMSE: ' + str(rmse)) #linear-4000

RMSE: 21.693086312479963


In [7]:
# Root-mean-squared error of pred_array against the held-out test_Y.
# Tag "linear-30000": presumably linear kernel with 30000 rows kept — TODO confirm.
rmse = np.sqrt(mean_squared_error(test_Y, pred_array))
print('RMSE: ' + str(rmse)) #linear-30000

RMSE: 44.16072216916598


In [13]:
# Root-mean-squared error of pred_array against the held-out test_Y.
# Tag "linear-10000": presumably linear kernel with 10000 rows kept — TODO confirm.
# The recorded value matches the RMSE printed by the "#10000" get_quality cell under the Linear heading.
rmse = np.sqrt(mean_squared_error(test_Y, pred_array))
print('RMSE: ' + str(rmse)) #linear-10000

RMSE: 42.63768314043779


In [10]:
# Root-mean-squared error of pred_array against the held-out test_Y.
# NOTE(review): this cell sits under the Linear heading but is tagged "rbf-10000", and the
# recorded value matches the "#10000" run under the RBF heading — presumably an RBF result.
rmse = np.sqrt(mean_squared_error(test_Y, pred_array))
print('RMSE: ' + str(rmse)) #rbf-10000

RMSE: 38.61351299825054


### Poly

In [11]:
# Recorded run under the Poly heading; "1000" is presumably the number of rows kept — TODO confirm.
get_quality(pred_array) #1000

RMSE: 43.333844703118544
Accuracy: 0.945
Precision: 0.5
Recall: 0.09090909090909091
F-measure: 0.15384615384615385


In [9]:
# Recorded run under the Poly heading; "5000" is presumably the number of rows kept — TODO confirm.
get_quality(pred_array) #5000

RMSE: 26.030245432060145
Accuracy: 0.942
Precision: 0.9457831325301205
Recall: 0.7621359223300971
F-measure: 0.8440860215053764


In [8]:
# Recorded run under the Poly heading; "10000" is presumably the number of rows kept — TODO confirm.
get_quality(pred_array) #10000

RMSE: 34.9088053711807
Accuracy: 0.86
Precision: 0.6186868686868687
Recall: 0.6550802139037433
F-measure: 0.6363636363636364


In [16]:
# Recorded run under the Poly heading; "15000" is presumably the number of rows kept — TODO confirm.
get_quality(pred_array) #15000

RMSE: 40.08488186678014
Accuracy: 0.8786666666666667
Precision: 0.7124773960216998
Recall: 0.657762938230384
F-measure: 0.6840277777777778


In [8]:
# Recorded run under the Poly heading; "20000" is presumably the number of rows kept — TODO confirm.
get_quality(pred_array) #20000

RMSE: 41.57439986464653
Accuracy: 0.8305
Precision: 0.5213776722090261
Recall: 0.6148459383753502
F-measure: 0.56426735218509


In [7]:
# Root-mean-squared error of pred_array against the held-out test_Y, poly-kernel section.
# The trailing note was in Portuguese; translated: with the whole dataset and with 30000 rows
# the kernel died ("3ll0000" is presumably a typo for 30000 — TODO confirm).
# NOTE(review): the recorded value equals the "#10000" poly run, so it is presumably stale.
rmse = np.sqrt(mean_squared_error(test_Y, pred_array))
print('RMSE: ' + str(rmse)) #poly; whole dataset and 30000 rows - kernel died

RMSE: 34.9088053711807
