In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.externals import joblib
import time
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm



In [2]:
city = 'CityA'

data_path = '../../data/output/' + city + '/normalized_data_X.csv'
y_path = '../../data/output/' + city + '/y_headway.csv'

data = pd.read_csv(data_path)
y = pd.read_csv(y_path)
print(data.shape)
print(y.shape)
data.head()

(1432633, 420)
(1432633, 2)


Unnamed: 0,route,tripNum,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,...,alertTypeSB_ACCIDENT,alertTypeSB_CHIT_CHAT,alertTypeSB_HAZARD,alertTypeSB_JAM,alertTypeSB_NORMAL,alertTypeSB_POLICE,alertTypeSB_ROAD_CLOSED,jamBlockTypeSB_-,jamBlockTypeSB_NORMAL,jamBlockTypeSB_ROAD_CLOSED_EVENT
0,0.118068,0.066667,0.085019,0.2646,0.49615,0.057584,0.0,0.0,0.266155,0.496047,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.118068,0.066667,0.085026,0.270218,0.501802,0.064231,0.0,0.000148,0.271812,0.501642,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.118068,0.066667,0.085033,0.273653,0.505296,0.068301,0.0,0.0,0.275187,0.505192,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.118068,0.066667,0.085039,0.277433,0.509404,0.0729,0.0,0.000149,0.280622,0.510132,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.118068,0.066667,0.085053,0.28005,0.513166,0.077983,0.0,0.000155,0.281878,0.513178,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [3]:
y.head()

Unnamed: 0,headway,headwayThreshold
0,85.0,5.0
1,90.0,5.0
2,99.0,5.0
3,116.0,5.0
4,116.0,5.0


In [4]:
# Making training and test data: 80% Training, 20% Test
random.seed(15) #to get always the same set
train_X, test_X, train_Y, test_Y = train_test_split(data, y.headway, test_size=0.20, random_state=7)

## Linear Regression

In [5]:
model = LinearRegression()
model.fit(train_X, train_Y)
pred_array = model.predict(test_X)

### Evaluating model

In [11]:
pred_array

array([35.796875, 13.703125, 16.953125, ..., 76.65625 , 18.453125,
       24.296875])

In [6]:
pred = []
for p in pred_array:
    pred.append(p)

In [12]:
pred

[35.796875,
 13.703125,
 16.953125,
 20.265625,
 49.078125,
 21.390625,
 39.015625,
 26.078125,
 28.203125,
 46.109375,
 48.734375,
 43.671875,
 24.484375,
 10.296875,
 28.453125,
 24.296875,
 37.296875,
 14.671875,
 28.859375,
 24.328125,
 19.4375,
 6.71875,
 19.125,
 15.34375,
 38.625,
 58.0,
 30.515625,
 23.296875,
 41.765625,
 29.75,
 39.515625,
 45.921875,
 60.171875,
 20.265625,
 29.296875,
 14.984375,
 43.015625,
 16.953125,
 2.46875,
 10.078125,
 7.703125,
 14.765625,
 26.640625,
 11.203125,
 21.109375,
 35.921875,
 42.515625,
 35.640625,
 30.984375,
 84.859375,
 32.984375,
 43.34375,
 14.3125,
 21.0625,
 17.15625,
 17.671875,
 48.765625,
 26.984375,
 24.953125,
 8.859375,
 45.953125,
 26.171875,
 15.453125,
 20.609375,
 83.203125,
 35.265625,
 37.234375,
 35.453125,
 23.890625,
 30.609375,
 15.171875,
 37.421875,
 35.328125,
 20.328125,
 10.109375,
 35.90625,
 21.671875,
 9.625,
 25.421875,
 24.546875,
 33.890625,
 21.546875,
 7.71875,
 18.359375,
 33.265625,
 30.921875,
 24.7

In [7]:
alpha = y.headwayThreshold[test_Y.index]
alpha

341586     5.0
573490     4.0
50737      5.0
840747     5.0
796060     5.0
          ... 
1421784    5.0
514588     6.0
1169290    5.0
1023921    5.0
988420     5.0
Name: headwayThreshold, Length: 286527, dtype: float64

In [13]:
test_Y

341586     84.0
573490      4.0
50737      12.0
840747     25.0
796060     30.0
           ... 
1421784    16.0
514588      3.0
1169290    82.0
1023921    12.0
988420     15.0
Name: headway, Length: 286527, dtype: float64

In [8]:
bb_pred = np.less_equal(pred, alpha)
bb_label = np.less_equal(test_Y, alpha)

In [14]:
bb_pred

341586     False
573490     False
50737      False
840747     False
796060     False
           ...  
1421784    False
514588     False
1169290    False
1023921    False
988420     False
Name: headwayThreshold, Length: 286527, dtype: bool

In [15]:
bb_label

341586     False
573490      True
50737      False
840747     False
796060     False
           ...  
1421784    False
514588      True
1169290    False
1023921    False
988420     False
Length: 286527, dtype: bool

In [9]:
# Headway
accuracy = accuracy_score(bb_label, bb_pred)
precision = precision_score(bb_label, bb_pred)
recall = recall_score(bb_label, bb_pred)
f_measure = f1_score(bb_label, bb_pred)

In [10]:
# Recife
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure))

Accuracy: 0.8683300352148314
Precision: 0.5445492662473794
Recall: 0.13466573346812868
F-measure: 0.21593199908556227


### Update to calculate p-value

In [None]:
X = sm.add_constant(train_X)
linear_model = sm.OLS(train_Y, X) # Linear Regression/Ordinary Least Squares
result = linear_model.fit()
print(result.summary())