In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.externals import joblib
from skrvm import RVR
import time
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm



In [2]:
city = 'Recife'

data_path = '../../data/output/' + city + '/normalized_data_X.csv'
y_path = '../../data/output/' + city + '/y.csv'

data = pd.read_csv(data_path)
y = pd.read_csv(y_path)
print(data.shape)
print(y.shape)
data.head()

(1432633, 420)
(1432633, 1)


Unnamed: 0,route,tripNum,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,...,alertTypeSB_ACCIDENT,alertTypeSB_CHIT_CHAT,alertTypeSB_HAZARD,alertTypeSB_JAM,alertTypeSB_NORMAL,alertTypeSB_POLICE,alertTypeSB_ROAD_CLOSED,jamBlockTypeSB_-,jamBlockTypeSB_NORMAL,jamBlockTypeSB_ROAD_CLOSED_EVENT
0,0.118068,0.066667,0.085019,0.2646,0.49615,0.057584,0.0,0.0,0.266155,0.496047,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.118068,0.066667,0.085026,0.270218,0.501802,0.064231,0.0,0.000148,0.271812,0.501642,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.118068,0.066667,0.085033,0.273653,0.505296,0.068301,0.0,0.0,0.275187,0.505192,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.118068,0.066667,0.085039,0.277433,0.509404,0.0729,0.0,0.000149,0.280622,0.510132,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.118068,0.066667,0.085053,0.28005,0.513166,0.077983,0.0,0.000155,0.281878,0.513178,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [14]:
# proportion of unbalaced data
y.busBunching[y.busBunching == 1].count() / len(y)

0.13544362024328632

In [7]:
data['busBunching'] = y.busBunching
data.busBunching

0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
          ... 
1432628    0.0
1432629    0.0
1432630    0.0
1432631    0.0
1432632    0.0
Name: busBunching, Length: 1432633, dtype: float64

In [8]:
# Class count
count_class_0, count_class_1 = data.busBunching.value_counts()

# Divide by class
df_class_0 = data[data['busBunching'] == 0]
df_class_1 = data[data['busBunching'] == 1]

df_class_0_under = df_class_0.sample(count_class_1)
df_data_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_data_under.busBunching.value_counts())

Random under-sampling:
1.0    194041
0.0    194041
Name: busBunching, dtype: int64


In [9]:
y_data_under = df_data_under.busBunching
y_data_under

718522     0.0
961078     0.0
88902      0.0
1275620    0.0
1362240    0.0
          ... 
1431026    1.0
1432512    1.0
1432513    1.0
1432522    1.0
1432626    1.0
Name: busBunching, Length: 388082, dtype: float64

In [12]:
# proportion of balaced data -  undersample
y_data_under[y_data_under == 1].count() / len(y_data_under)

0.5

In [13]:
# Making training and test data: 80% Training, 20% Test
random.seed(15) #to get always the same set
train_X, test_X, train_Y, test_Y = train_test_split(df_data_under, y_data_under, test_size=0.20, random_state=7)

### LR

In [14]:
start = time.time()

model = LogisticRegression(random_state=47, max_iter = 10000, multi_class='ovr', n_jobs=-1)
model.fit(train_X, train_Y)

end = time.time()
print("Prediction execution time: " + str((end - start)/60) + " min")

Prediction execution time: 2.766302053133647 min


In [15]:
# y_test_path = '../../data/output/test_Y.csv'
# x_test_path = '../../data/output/test_X.csv'

# test_X = pd.read_csv(x_test_path)
# test_Y = pd.read_csv(y_test_path)

pred_array = model.predict(test_X)

### Evalating Model

#### City A

In [None]:
def get_quality(bb_pred):
    bb_label = test_Y
    
    # Bus Bunching
    print("Accuracy: " + str(accuracy_score(bb_label, bb_pred)))
    print("Precision: " + str(precision_score(bb_label, bb_pred)))
    print("Recall: " + str(recall_score(bb_label, bb_pred)))
    print("F-measure: " + str(f1_score(bb_label, bb_pred)))

In [None]:
get_quality(pred_array) # 100%

In [17]:
# balanced with under
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F-measure: 1.0


In [12]:
# unbalanced
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure))

Accuracy: 0.8684975586942941
Precision: 0.6425396825396825
Recall: 0.05246649558026804
F-measure: 0.09701152730845736


### Update to calculate p-value

In [None]:
X = sm.add_constant(train_X)
logit_model = sm.Logit(train_Y, X) # Logistic Regression/Binary variable
result = logit_model.fit()
print(result.summary())