In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.externals import joblib
import time
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

  import pandas.util.testing as tm


In [3]:
city = 'CityA'

data_path = '../../data/output/' + city + '/normalized_data_X.csv'
y_path = '../../data/output/' + city + '/y_headway.csv'

data = pd.read_csv(data_path)
y = pd.read_csv(y_path)
print(data.shape)
print(y.shape)
data.head()

(1432633, 420)
(1432633, 2)


Unnamed: 0,route,tripNum,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,...,alertTypeSB_ACCIDENT,alertTypeSB_CHIT_CHAT,alertTypeSB_HAZARD,alertTypeSB_JAM,alertTypeSB_NORMAL,alertTypeSB_POLICE,alertTypeSB_ROAD_CLOSED,jamBlockTypeSB_-,jamBlockTypeSB_NORMAL,jamBlockTypeSB_ROAD_CLOSED_EVENT
0,0.118068,0.066667,0.085019,0.2646,0.49615,0.057584,0.0,0.0,0.266155,0.496047,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.118068,0.066667,0.085026,0.270218,0.501802,0.064231,0.0,0.000148,0.271812,0.501642,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.118068,0.066667,0.085033,0.273653,0.505296,0.068301,0.0,0.0,0.275187,0.505192,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.118068,0.066667,0.085039,0.277433,0.509404,0.0729,0.0,0.000149,0.280622,0.510132,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.118068,0.066667,0.085053,0.28005,0.513166,0.077983,0.0,0.000155,0.281878,0.513178,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
y.head()

Unnamed: 0,headway,headwayThreshold
0,85.0,5.0
1,90.0,5.0
2,99.0,5.0
3,116.0,5.0
4,116.0,5.0


In [5]:
# Making training and test data: 80% Training, 20% Test
random.seed(15) #to get always the same set
train_X, test_X, train_Y, test_Y = train_test_split(data, y.headway, test_size=0.20, random_state=7)

## Linear Regression

In [18]:
random.seed(42)

start = time.time()

model = LinearRegression()
model.fit(train_X, train_Y)
pred_array = model.predict(test_X)

end = time.time()
print("Execution time: " + str((end - start)/60) + " min")

Execution time: 0.390319279829661 min


### Evaluating model

In [7]:
pred_array

array([35.71875, 13.5625 , 17.1875 , ..., 77.0625 , 18.8125 , 23.9375 ])

In [8]:
pred = []
for p in pred_array:
    pred.append(p)

In [9]:
pred

[35.71875,
 13.5625,
 17.1875,
 20.09375,
 49.21875,
 21.1875,
 39.4375,
 25.90625,
 28.40625,
 46.15625,
 48.78125,
 43.5,
 24.4375,
 10.0625,
 28.6875,
 24.75,
 37.15625,
 14.34375,
 28.78125,
 23.9375,
 19.75,
 6.59375,
 19.09375,
 15.3125,
 39.0,
 57.6875,
 30.84375,
 23.65625,
 41.5625,
 30.0625,
 39.5625,
 46.03125,
 60.71875,
 20.0625,
 29.375,
 14.84375,
 43.125,
 16.90625,
 2.40625,
 10.0,
 8.0,
 14.5625,
 26.71875,
 10.96875,
 21.0625,
 36.21875,
 42.375,
 35.5625,
 31.03125,
 85.3125,
 33.34375,
 43.0625,
 14.25,
 21.3125,
 17.1875,
 17.59375,
 48.6875,
 26.71875,
 25.21875,
 8.46875,
 45.6875,
 26.59375,
 15.25,
 20.46875,
 83.40625,
 35.4375,
 37.59375,
 35.6875,
 24.03125,
 30.375,
 15.3125,
 37.46875,
 35.5625,
 20.84375,
 10.1875,
 35.84375,
 21.8125,
 9.625,
 25.84375,
 25.03125,
 33.46875,
 21.71875,
 7.5,
 18.375,
 33.0625,
 31.1875,
 24.46875,
 14.0,
 82.9375,
 11.9375,
 6.53125,
 26.65625,
 18.4375,
 21.28125,
 27.875,
 20.59375,
 23.3125,
 25.5625,
 5.1875,
 45.65

In [10]:
alpha = y.headwayThreshold[test_Y.index]
alpha

341586     5.0
573490     4.0
50737      5.0
840747     5.0
796060     5.0
          ... 
1421784    5.0
514588     6.0
1169290    5.0
1023921    5.0
988420     5.0
Name: headwayThreshold, Length: 286527, dtype: float64

In [11]:
test_Y

341586     84.0
573490      4.0
50737      12.0
840747     25.0
796060     30.0
           ... 
1421784    16.0
514588      3.0
1169290    82.0
1023921    12.0
988420     15.0
Name: headway, Length: 286527, dtype: float64

In [12]:
bb_pred = np.less_equal(pred, alpha)
bb_label = np.less_equal(test_Y, alpha)

In [13]:
bb_pred

341586     False
573490     False
50737      False
840747     False
796060     False
           ...  
1421784    False
514588     False
1169290    False
1023921    False
988420     False
Name: headwayThreshold, Length: 286527, dtype: bool

In [14]:
bb_label

341586     False
573490      True
50737      False
840747     False
796060     False
           ...  
1421784    False
514588      True
1169290    False
1023921    False
988420     False
Length: 286527, dtype: bool

In [15]:
# Headway
accuracy = accuracy_score(bb_label, bb_pred)
precision = precision_score(bb_label, bb_pred)
recall = recall_score(bb_label, bb_pred)
f_measure = f1_score(bb_label, bb_pred)

In [10]:
# CityA
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure))

Accuracy: 0.8683300352148314
Precision: 0.5445492662473794
Recall: 0.13466573346812868
F-measure: 0.21593199908556227


### Update to calculate p-value

In [17]:
X = sm.add_constant(train_X)
linear_model = sm.OLS(train_Y, X) # Linear Regression/Ordinary Least Squares
result = linear_model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                headway   R-squared:                       0.227
Model:                            OLS   Adj. R-squared:                  0.227
Method:                 Least Squares   F-statistic:                     910.2
Date:                Sun, 25 Oct 2020   Prob (F-statistic):               0.00
Time:                        22:46:23   Log-Likelihood:            -5.6540e+06
No. Observations:             1146106   AIC:                         1.131e+07
Df Residuals:                 1145735   BIC:                         1.131e+07
Df Model:                         370                                         
Covariance Type:            nonrobust                                         
                                                        coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------