In [10]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.externals import joblib
from skrvm import RVR
import time
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
city = 'CityA'

data_path = '../../data/output/' + city + '/normalized_data_X_5p.csv'
y_path = '../../data/output/' + city + '/y_5p.csv'

data = pd.read_csv(data_path)
y = pd.read_csv(y_path)
print(data.shape)
print(y.shape)
data.head()

(69884, 420)
(69884, 1)


Unnamed: 0,route,tripNum,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,...,alertTypeSB_ACCIDENT,alertTypeSB_CHIT_CHAT,alertTypeSB_HAZARD,alertTypeSB_JAM,alertTypeSB_NORMAL,alertTypeSB_POLICE,alertTypeSB_ROAD_CLOSED,jamBlockTypeSB_-,jamBlockTypeSB_NORMAL,jamBlockTypeSB_ROAD_CLOSED_EVENT
0,0.118068,0.0,0.084945,0.264128,0.4394,0.010487,0.0,6e-06,0.265656,0.439493,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.118068,0.0,0.084965,0.264428,0.443488,0.014304,0.0,9e-06,0.265956,0.444051,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.118068,0.0,0.084972,0.263625,0.448606,0.01803,0.0,1e-05,0.264915,0.448957,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.118068,0.0,0.084979,0.261608,0.458297,0.025182,0.0,1.4e-05,0.263352,0.456961,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.118068,0.0,0.084985,0.260228,0.463266,0.028953,0.0,0.0,0.261792,0.46317,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# proportion of unbalaced data
y.busBunching[y.busBunching == 1].count() / len(y)

0.13544362024328632

In [7]:
data['busBunching'] = y.busBunching
data.busBunching

0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
          ... 
1432628    0.0
1432629    0.0
1432630    0.0
1432631    0.0
1432632    0.0
Name: busBunching, Length: 1432633, dtype: float64

In [8]:
# Class count
count_class_0, count_class_1 = data.busBunching.value_counts()

# Divide by class
df_class_0 = data[data['busBunching'] == 0]
df_class_1 = data[data['busBunching'] == 1]

df_class_0_under = df_class_0.sample(count_class_1)
df_data_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_data_under.busBunching.value_counts())

Random under-sampling:
1.0    194041
0.0    194041
Name: busBunching, dtype: int64


In [9]:
y_data_under = df_data_under.busBunching
y_data_under

718522     0.0
961078     0.0
88902      0.0
1275620    0.0
1362240    0.0
          ... 
1431026    1.0
1432512    1.0
1432513    1.0
1432522    1.0
1432626    1.0
Name: busBunching, Length: 388082, dtype: float64

In [3]:
# Making training and test data: 80% Training, 20% Test
random.seed(15) #to get always the same set
train_X, test_X, train_Y, test_Y = train_test_split(data, y, test_size=0.20, random_state=7)

In [12]:
# proportion of balaced data -  undersample
y_data_under[y_data_under == 1].count() / len(y_data_under)
train_X, test_X, train_Y, test_Y = train_test_split(df_data_under, y_data_under, test_size=0.20, random_state=7)

0.5

### LR

In [4]:
start = time.time()

model = LogisticRegression(random_state=47, max_iter = 10000, multi_class='ovr', n_jobs=-1)
model.fit(train_X, train_Y)

end = time.time()
print("Prediction execution time: " + str((end - start)/60) + " min")

  y = column_or_1d(y, warn=True)


Prediction execution time: 2.624621562163035 min


In [5]:
# y_test_path = '../../data/output/test_Y.csv'
# x_test_path = '../../data/output/test_X.csv'

# test_X = pd.read_csv(x_test_path)
# test_Y = pd.read_csv(y_test_path)

pred_array = model.predict(test_X)

In [5]:
#prediction time

one_test = test_X.iloc[[0], :]

start = time.time() 
pred_array = model.predict(one_test)
end = time.time()
print("Prediction execution time: " + str(end - start) + " sec")

Prediction execution time: 0.0010368824005126953 sec


### Evalating Model

#### City A

In [6]:
def get_quality(bb_pred):
    bb_label = test_Y
    
    # Bus Bunching
    print("Accuracy: " + str(accuracy_score(bb_label, bb_pred)))
    print("Precision: " + str(precision_score(bb_label, bb_pred)))
    print("Recall: " + str(recall_score(bb_label, bb_pred)))
    print("F-measure: " + str(f1_score(bb_label, bb_pred)))

In [7]:
get_quality(pred_array) # 5% 1 day

Accuracy: 0.9229448379480575
Precision: 0.873015873015873
Recall: 0.24282560706401765
F-measure: 0.379965457685665


In [7]:
get_quality(pred_array) # 100%

Accuracy: 0.8684975586942941
Precision: 0.6425396825396825
Recall: 0.05246649558026804
F-measure: 0.09701152730845736


In [17]:
# balanced with under
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F-measure: 1.0


In [12]:
# unbalanced
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure))

Accuracy: 0.8684975586942941
Precision: 0.6425396825396825
Recall: 0.05246649558026804
F-measure: 0.09701152730845736


### Update to calculate p-value

In [16]:
X = sm.add_constant(train_X)
logit_model = sm.Logit(train_Y, X) # Logistic Regression/Binary variable
result = logit_model.fit(method='lbfgs')
print(result.summary())



                           Logit Regression Results                           
Dep. Variable:            busBunching   No. Observations:              1146106
Model:                          Logit   Df Residuals:                  1145735
Method:                           MLE   Df Model:                          370
Date:                Wed, 28 Oct 2020   Pseudo R-squ.:                  0.1158
Time:                        19:10:22   Log-Likelihood:            -4.0229e+05
converged:                      False   LL-Null:                   -4.5498e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                        coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------------
const                                                -0.3952        nan        nan        nan         nan         nan
route         

In [8]:
#5%

X = sm.add_constant(train_X)
logit_model = sm.Logit(train_Y, X) # Logistic Regression/Binary variable
result = logit_model.fit(method='lbfgs')
print(result.summary())



                           Logit Regression Results                           
Dep. Variable:            busBunching   No. Observations:                55907
Model:                          Logit   Df Residuals:                    55670
Method:                           MLE   Df Model:                          236
Date:                Sun, 15 Nov 2020   Pseudo R-squ.:                  0.2601
Time:                        16:44:06   Log-Likelihood:                -13231.
converged:                      False   LL-Null:                       -17881.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                        coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------------
route                                                -1.6554        nan        nan        nan         nan         nan
tripNum       

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


In [9]:
var_values = pd.DataFrame(data=result.pvalues.sort_values(), columns=['p-value'])
var_values.reset_index(inplace=True)
var_values = var_values.rename(columns = {'index':'variable'})
var_values.head(20)

Unnamed: 0,variable,p-value
0,route,
1,tripNum,
2,shapeSequence,
3,shapeLat,
4,shapeLon,
5,distanceTraveledShape,
6,busCode,
7,gpsPointId,
8,gpsLat,
9,gpsLon,
