In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
import matplotlib.pyplot as plt
import time
import seaborn as sns
from sklearn.externals import joblib



In [2]:
# Location of the integrated input dataset (relative to the notebook).
file_path = '../../data/input/integrated_data_dummy.csv'

# Read the CSV and report (rows, columns) before reordering.
data = pd.read_csv(file_path)
print(data.shape)

# Order rows by primary bus code, then by the secondary bus code.
sort_keys = ["busCode", "busCodeSB"]
data = data.sort_values(by=sort_keys)

(1432633, 423)


In [3]:
# Shape of the subset of rows whose busBunching flag is set.
data.loc[data["busBunching"] == True].shape

(194041, 423)

In [4]:
# Columns that must not be fed to the model as inputs.
target_col = ['headway']
bb_col = ['busBunching']
hd_threshold = ["headwayThreshold"]
non_feature_cols = target_col + bb_col + hd_threshold

# Ordered feature list: a set difference would yield an arbitrary column
# order that can change between kernel sessions.
features = [col for col in data.columns if col not in non_feature_cols]

# Label column for the bus-bunching classifier.
y = data['busBunching']
# y_threshold = data['headwayThreshold']

# Making training and test data: 80% Training, 20% Test.
# shuffle=False keeps the current row order (first 80% of rows -> train,
# last 20% -> test), so the split is deterministic and random_state would
# have no effect. `data` itself is not mutated, so this cell can be re-run.
random.seed(15)  # seeds only Python's `random`; scikit-learn does not read it
train_raw, test_raw, train_Y, test_Y = train_test_split(
    data[features], y, test_size=0.20, shuffle=False)

# Normalize data: fit the scaler on the training rows ONLY and reuse those
# min/max statistics for the test rows. Fitting on the full dataset would
# leak test-set information into training. Test values may therefore fall
# slightly outside [0, 1].
min_max_scaler = preprocessing.MinMaxScaler()
train_X = min_max_scaler.fit_transform(train_raw)
test_X = min_max_scaler.transform(test_raw)

# The unscaled splits are large intermediates; release them right away.
del train_raw, test_raw

In [5]:
# Release the full frame and the label series; only the train/test splits
# are used from here on.
del data, y

### Gradient Boosting

In [None]:
# Hyper-parameter grid: 2 * 3 * 4 * 4 * 4 = 384 combinations, each of which
# is fitted once per fold by the 5-fold cross-validation below.
parameters = {'learning_rate': [0.001, 0.01],
              'n_estimators': [10, 25, 50],
              'max_depth': [5, 10, 15, 25],
              'min_samples_split': [5, 25, 50, 100],
              'min_samples_leaf': [5, 25, 50, 100]
             }

# Fixed seed so repeated searches rank the candidates identically.
model = GradientBoostingClassifier(random_state=42)

# scoring='f1': the positive (bunching) class is a small minority of the
# rows, so the default accuracy score would favour models that never predict
# bunching. F1 matches the f1_score reported in the evaluation cells.
# n_jobs=-1: evaluate candidates on all available cores.
gridSearch = GridSearchCV(estimator=model,
                          param_grid=parameters,
                          scoring='f1',
                          cv=5,
                          n_jobs=-1,
                          return_train_score=True).fit(train_X, train_Y)

print('Grid Search Best score', gridSearch.best_score_)
print('Grid Search Best Parameters', gridSearch.best_params_)
# refit_time_ is only the time spent refitting the best candidate on the
# whole training set, not the duration of the full search.
print('Refit time (s)', gridSearch.refit_time_)

In [18]:
random.seed(42)

# Wall-clock timer; the measured span covers both fitting and predicting.
start = time.time()

# Hyper-parameters of the final gradient-boosting classifier.
gb_params = dict(
    learning_rate=0.01,
    min_samples_split=1000,
    min_samples_leaf=1000,
    max_depth=50,
    random_state=42,
)
model = GradientBoostingClassifier(**gb_params)

# fit() returns the estimator itself, so prediction can be chained onto it.
pred_array = model.fit(train_X, train_Y).predict(test_X)

end = time.time()
print(f"Execution time: {(end - start)/60!s} min")

Execution time: 1147.2348720391592 min


In [19]:
pred_array

array([0, 0, 0, ..., 1, 1, 0])

In [20]:
# Bus Bunching: score the hold-out predictions with each metric in turn.
accuracy, precision, recall, f_measure = (
    metric(test_Y, pred_array)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [21]:
# Report each hold-out metric on its own line as "<name>: <value>".
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F-measure", f_measure),
):
    print(f"{label}: {value!s}")

Accuracy: 0.8853546088152251
Precision: 0.05242424242424242
Recall: 0.0057869208897809
F-measure: 0.010423256514535321
