In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.externals import joblib
from sklearn_rvm import EMRVR
import time



In [2]:
city = 'CityA'

# Both CSV inputs are read from the same per-city output folder.
city_dir = '../../data/output/' + city
data_path = city_dir + '/normalized_data_X.csv'
y_path = city_dir + '/y_headway.csv'

data = pd.read_csv(data_path)
y = pd.read_csv(y_path)

# Show the shape of each table, feature table first.
for frame in (data, y):
    print(frame.shape)
data.head()

(1432633, 420)
(1432633, 2)


Unnamed: 0,route,tripNum,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,...,alertTypeSB_ACCIDENT,alertTypeSB_CHIT_CHAT,alertTypeSB_HAZARD,alertTypeSB_JAM,alertTypeSB_NORMAL,alertTypeSB_POLICE,alertTypeSB_ROAD_CLOSED,jamBlockTypeSB_-,jamBlockTypeSB_NORMAL,jamBlockTypeSB_ROAD_CLOSED_EVENT
0,0.118068,0.066667,0.085019,0.2646,0.49615,0.057584,0.0,0.0,0.266155,0.496047,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.118068,0.066667,0.085026,0.270218,0.501802,0.064231,0.0,0.000148,0.271812,0.501642,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.118068,0.066667,0.085033,0.273653,0.505296,0.068301,0.0,0.0,0.275187,0.505192,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.118068,0.066667,0.085039,0.277433,0.509404,0.0729,0.0,0.000149,0.280622,0.510132,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.118068,0.066667,0.085053,0.28005,0.513166,0.077983,0.0,0.000155,0.281878,0.513178,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [3]:
data_size = 25000 #[25000, 20000, 15000, 10000, 5000]

# Keep only the first `data_size` rows of both tables (positional slice).
# `data` and `y` are rebound here, so the load cell has to be re-run before
# trying a larger `data_size`.
data = data.iloc[:data_size]
y = y.iloc[:data_size]
print(data.shape, y.shape, sep='\n')

(25000, 420)
(25000, 2)


In [4]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# The split itself is made repeatable by `random_state`; `random.seed` only
# seeds Python's own `random` module.
random.seed(15)
train_X, test_X, train_Y, test_Y = train_test_split(
    data,
    y['headway'],
    test_size=0.20,
    random_state=7,
)

In [5]:
def get_quality(pred, y_true=None, thresholds=None):
    """Print RMSE and threshold-based classification metrics for predictions.

    A row is labelled positive when its value is less than or equal to its
    threshold. The rule is applied to both the predictions and the true
    values, and the two boolean labelings are then compared.

    Parameters
    ----------
    pred : array-like
        Predicted values, in the same row order as ``y_true``.
    y_true : pandas.Series, optional
        True values. Defaults to the notebook-level ``test_Y``. Must carry an
        index when ``thresholds`` is omitted, because the index labels are
        used for the threshold lookup.
    thresholds : array-like, optional
        One threshold per row, in the same row order as ``pred``. Defaults to
        ``y.headwayThreshold`` selected by the index labels of ``y_true``.

    Returns
    -------
    None
        All metrics are printed; nothing is returned.
    """
    # Fall back to the notebook globals so existing one-argument calls work.
    if y_true is None:
        y_true = test_Y
    if thresholds is None:
        # Explicit label-based lookup of the thresholds for the evaluated rows.
        thresholds = y.headwayThreshold.loc[y_true.index]

    # Compare plain arrays so every comparison is positional and cannot be
    # re-aligned by differing pandas indexes.
    pred_values = np.asarray(pred)
    true_values = np.asarray(y_true)
    threshold_values = np.asarray(thresholds)

    bb_pred = np.less_equal(pred_values, threshold_values)
    bb_label = np.less_equal(true_values, threshold_values)

    rmse = np.sqrt(mean_squared_error(true_values, pred_values))
    print('RMSE: ' + str(rmse))

    # Bus Bunching
    print("Accuracy: " + str(accuracy_score(bb_label, bb_pred)))
    print("Precision: " + str(precision_score(bb_label, bb_pred)))
    print("Recall: " + str(recall_score(bb_label, bb_pred)))
    print("F-measure: " + str(f1_score(bb_label, bb_pred)))

### EMRVR

In [None]:
# Fit the EMRVR model and time the fit only.
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# a system clock adjustment during a long fit (time.time() can be).
start = time.perf_counter()
model = EMRVR(kernel='poly')
model.fit(train_X, train_Y)
end = time.perf_counter()



In [None]:
# Predict on the held-out rows.
pred_array = model.predict(test_X)
print(pred_array)

# Look up the threshold of every test row, then flag each prediction that is
# less than or equal to its threshold.
threshold_col = y['headwayThreshold']
alpha = threshold_col[test_Y.index]
preds = np.less_equal(pred_array, alpha)
print(preds)

In [None]:
# Time between the `start` and `end` stamps, in minutes.
# Run label: citya - poly - 25000
elapsed_minutes = (end - start) / 60
print("Execution time: " + str(elapsed_minutes) + " min")

In [None]:
# Time between the `start` and `end` stamps, in minutes.
# Run label: citya - poly - 5000
elapsed_minutes = (end - start) / 60
print("Execution time: " + str(elapsed_minutes) + " min")

In [None]:
# Time between the `start` and `end` stamps, in minutes.
# Run label: citya - rbf - 25000
elapsed_minutes = (end - start) / 60
print("Execution time: " + str(elapsed_minutes) + " min")

In [None]:
# Time between the `start` and `end` stamps, in minutes.
# Run label: citya - rbf - 5000
elapsed_minutes = (end - start) / 60
print("Execution time: " + str(elapsed_minutes) + " min")

In [None]:
# Time between the `start` and `end` stamps, in minutes.
# Run label: citya - linear - 25000
elapsed_minutes = (end - start) / 60
print("Execution time: " + str(elapsed_minutes) + " min")

In [None]:
# Time between the `start` and `end` stamps, in minutes.
# Run label: citya - linear - 5000
elapsed_minutes = (end - start) / 60
print("Execution time: " + str(elapsed_minutes) + " min")

In [None]:
# Time between the `start` and `end` stamps, in minutes.
# Run label: curitiba - poly - 25000
elapsed_minutes = (end - start) / 60
print("Execution time: " + str(elapsed_minutes) + " min")

In [None]:
# Time between the `start` and `end` stamps, in minutes.
# Run label: curitiba - poly - 5000
elapsed_minutes = (end - start) / 60
print("Execution time: " + str(elapsed_minutes) + " min")

In [None]:
# Time between the `start` and `end` stamps, in minutes.
# Run label: curitiba - rbf - 25000
elapsed_minutes = (end - start) / 60
print("Execution time: " + str(elapsed_minutes) + " min")

In [None]:
# Time between the `start` and `end` stamps, in minutes.
# Run label: curitiba - rbf - 5000
elapsed_minutes = (end - start) / 60
print("Execution time: " + str(elapsed_minutes) + " min")

In [None]:
# Time between the `start` and `end` stamps, in minutes.
# Run label: curitiba - linear - 25000
elapsed_minutes = (end - start) / 60
print("Execution time: " + str(elapsed_minutes) + " min")

In [None]:
# Time between the `start` and `end` stamps, in minutes.
# Run label: curitiba - linear - 5000
elapsed_minutes = (end - start) / 60
print("Execution time: " + str(elapsed_minutes) + " min")

### RBF

#### Curitiba

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_rbf_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 25000

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_rbf_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 20000

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_rbf_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 15000

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_rbf_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 10000

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_rbf_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 5000

#### CityA

In [None]:
get_quality(pred_array) # run with data_size = 25000

In [15]:
get_quality(pred_array) # run with data_size = 5000

RMSE: 26.99151654920116
Accuracy: 0.934
Precision: 0.9166666666666666
Recall: 0.7475728155339806
F-measure: 0.8235294117647058


In [8]:
get_quality(pred_array) # run with data_size = 10000

RMSE: 38.61351299825054
Accuracy: 0.826
Precision: 0.5278969957081545
Recall: 0.6577540106951871
F-measure: 0.5857142857142857


In [9]:
get_quality(pred_array) # run with data_size = 15000

RMSE: 42.91724483387854
Accuracy: 0.865
Precision: 0.6616666666666666
Recall: 0.662771285475793
F-measure: 0.6622185154295246


In [9]:
get_quality(pred_array) # run with data_size = 20000

RMSE: 47.259825764333144
Accuracy: 0.81125
Precision: 0.47853403141361256
Recall: 0.6400560224089635
F-measure: 0.5476333133612941


In [13]:
get_quality(pred_array) # run with data_size = 25000

RMSE: 44.0995410484576
Accuracy: 0.8228
Precision: 0.42971887550200805
Recall: 0.5737265415549598
F-measure: 0.4913892078071183


In [None]:
# Root of the mean squared error on the held-out rows.
# Run label: rbf-30000
mse = mean_squared_error(test_Y, pred_array)
rmse = np.sqrt(mse)
print('RMSE: ' + str(rmse))

In [None]:
get_quality(pred_array) # run label: rbf, data_size = 30000

In [7]:
# Root of the mean squared error on the held-out rows.
# Run label: rbf-3000
mse = mean_squared_error(test_Y, pred_array)
rmse = np.sqrt(mse)
print('RMSE: ' + str(rmse))

RMSE: 25.08775719784502


In [9]:
get_quality(pred_array) # run label: rbf, data_size = 3000

Accuracy: 0.9533333333333334
Precision: 0.9848484848484849
Recall: 0.7065217391304348
F-measure: 0.8227848101265823


In [7]:
# Root of the mean squared error on the held-out rows.
# Run label: rbf-1000
mse = mean_squared_error(test_Y, pred_array)
rmse = np.sqrt(mse)
print('RMSE: ' + str(rmse))

RMSE: 43.86635136944401


In [13]:
get_quality(pred_array) # run label: rbf, data_size = 1000

Accuracy: 0.94
Precision: 0.4
Recall: 0.18181818181818182
F-measure: 0.25000000000000006


### Linear

#### Curitiba

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_linear_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 25000

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_linear_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 20000

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_linear_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 15000

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_linear_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 10000

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_linear_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 5000

#### CityA

In [8]:
get_quality(pred_array) # run with data_size = 1000

RMSE: 43.477460345852435
Accuracy: 0.94
Precision: 0.4
Recall: 0.18181818181818182
F-measure: 0.25000000000000006


In [12]:
get_quality(pred_array) # run with data_size = 5000

RMSE: 27.051489268519177
Accuracy: 0.938
Precision: 0.9337349397590361
Recall: 0.7524271844660194
F-measure: 0.8333333333333334


In [11]:
get_quality(pred_array) # run with data_size = 10000

RMSE: 42.63768314043779
Accuracy: 0.8255
Precision: 0.5273522975929978
Recall: 0.6443850267379679
F-measure: 0.5800240673886883


In [12]:
get_quality(pred_array) # run with data_size = 15000

RMSE: 45.559858114008414
Accuracy: 0.864
Precision: 0.6583747927031509
Recall: 0.662771285475793
F-measure: 0.6605657237936773


In [12]:
get_quality(pred_array) # run with data_size = 20000

RMSE: 51.55758430185139
Accuracy: 0.80075
Precision: 0.4573484069886948
Recall: 0.623249299719888
F-measure: 0.5275637225844695


In [9]:
get_quality(pred_array) # run with data_size = 25000

RMSE: 47.66864772541542
Accuracy: 0.8094
Precision: 0.39902439024390246
Recall: 0.5482573726541555
F-measure: 0.4618859401468097


In [9]:
get_quality(pred_array) # run with data_size = 30000

RMSE: 44.178654729052894
Accuracy: 0.8045
Precision: 0.39619520264681557
Recall: 0.5195227765726681
F-measure: 0.44955419990614737


In [7]:
# Root of the mean squared error on the held-out rows.
# Run label: linear-4000
mse = mean_squared_error(test_Y, pred_array)
rmse = np.sqrt(mse)
print('RMSE: ' + str(rmse))

RMSE: 21.693086312479963


In [7]:
# Root of the mean squared error on the held-out rows.
# Run label: linear-30000
mse = mean_squared_error(test_Y, pred_array)
rmse = np.sqrt(mse)
print('RMSE: ' + str(rmse))

RMSE: 44.16072216916598


In [13]:
# Root of the mean squared error on the held-out rows.
# Run label: linear-10000
mse = mean_squared_error(test_Y, pred_array)
rmse = np.sqrt(mse)
print('RMSE: ' + str(rmse))

RMSE: 42.63768314043779


In [10]:
# Root of the mean squared error on the held-out rows.
# Run label: rbf-10000
mse = mean_squared_error(test_Y, pred_array)
rmse = np.sqrt(mse)
print('RMSE: ' + str(rmse))

RMSE: 38.61351299825054


### Poly

#### Curitiba

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_poly_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 25000

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_poly_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 20000

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_poly_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 15000

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_poly_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 10000

In [None]:
# When `preds` is a named pandas Series (as np.less_equal returns for a Series
# of thresholds), pd.DataFrame(preds, columns=['pred']) selects by column name
# and finds no 'pred' column, so none of the values would be exported.
# Build the single-column frame from the raw values instead.
preds_pd = pd.DataFrame({'pred': np.asarray(preds)})
preds_pd.to_csv('outputs/q7/rvm_poly_pred_q7_' + str(data_size) + '_' + city + '.csv', index=False)

get_quality(pred_array) # run with data_size = 5000

#### CityA

In [11]:
get_quality(pred_array) # run with data_size = 1000

RMSE: 43.333844703118544
Accuracy: 0.945
Precision: 0.5
Recall: 0.09090909090909091
F-measure: 0.15384615384615385


In [9]:
get_quality(pred_array) # run with data_size = 5000

RMSE: 26.030245432060145
Accuracy: 0.942
Precision: 0.9457831325301205
Recall: 0.7621359223300971
F-measure: 0.8440860215053764


In [8]:
get_quality(pred_array) # run with data_size = 10000

RMSE: 34.9088053711807
Accuracy: 0.86
Precision: 0.6186868686868687
Recall: 0.6550802139037433
F-measure: 0.6363636363636364


In [16]:
get_quality(pred_array) # run with data_size = 15000

RMSE: 40.08488186678014
Accuracy: 0.8786666666666667
Precision: 0.7124773960216998
Recall: 0.657762938230384
F-measure: 0.6840277777777778


In [8]:
get_quality(pred_array) # run with data_size = 20000

RMSE: 41.57439986464653
Accuracy: 0.8305
Precision: 0.5213776722090261
Recall: 0.6148459383753502
F-measure: 0.56426735218509


In [9]:
get_quality(pred_array) # run with data_size = 25000

RMSE: 39.17801609109757
Accuracy: 0.8412
Precision: 0.4720930232558139
Recall: 0.5442359249329759
F-measure: 0.5056039850560398


In [None]:
get_quality(pred_array) # run with data_size = 30000

In [7]:
# Root of the mean squared error on the held-out rows.
# Run label: poly. The original note (Portuguese, partly garbled) appears to
# say that running on the whole dataset and on "3ll0000" rows killed the
# kernel -- TODO confirm the intended row count.
mse = mean_squared_error(test_Y, pred_array)
rmse = np.sqrt(mse)
print('RMSE: ' + str(rmse))

RMSE: 34.9088053711807
