In [6]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from catboost import CatBoostClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
data_path = '../../data/output/normalized_data_X_5p.csv'
y_path = '../../data/output/y_5p.csv'

data = pd.read_csv(data_path)
y = pd.read_csv(y_path)
print(data.shape)
print(y.shape)
data.head()

(69884, 420)
(69884, 1)


Unnamed: 0,route,tripNum,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,...,alertTypeSB_ACCIDENT,alertTypeSB_CHIT_CHAT,alertTypeSB_HAZARD,alertTypeSB_JAM,alertTypeSB_NORMAL,alertTypeSB_POLICE,alertTypeSB_ROAD_CLOSED,jamBlockTypeSB_-,jamBlockTypeSB_NORMAL,jamBlockTypeSB_ROAD_CLOSED_EVENT
0,0.118068,0.0,0.084945,0.264128,0.4394,0.010487,0.0,6e-06,0.265656,0.439493,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.118068,0.0,0.084965,0.264428,0.443488,0.014304,0.0,9e-06,0.265956,0.444051,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.118068,0.0,0.084972,0.263625,0.448606,0.01803,0.0,1e-05,0.264915,0.448957,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.118068,0.0,0.084979,0.261608,0.458297,0.025182,0.0,1.4e-05,0.263352,0.456961,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.118068,0.0,0.084985,0.260228,0.463266,0.028953,0.0,0.0,0.261792,0.46317,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# Making training and test data: 80% Training, 20% Test
random.seed(15) #to get always the same set
train_X, test_X, train_Y, test_Y = train_test_split(data, y, test_size=0.20, random_state=7)
train_X_1, train_X_2, train_Y_1, train_Y_2 = train_test_split(train_X, train_Y, test_size=0.50, random_state=7) 

### Incremental CatBoost

In [6]:
model_1 = CatBoostClassifier(
    custom_loss=['Accuracy', 'Precision', 'Recall', 'F1'],
    iterations=10000,
    learning_rate=0.9,
    random_seed=42,
    depth=8,
    l2_leaf_reg=5,
    logging_level='Silent'
)
model_1.fit(train_X_1, train_Y_1)

model_1.save_model('model_1.model')

In [9]:
model_2_v1 = CatBoostClassifier(
    custom_loss=['Accuracy', 'Precision', 'Recall', 'F1'],
    iterations=10000,
    learning_rate=0.9,
    random_seed=42,
    depth=8,
    l2_leaf_reg=5,
    logging_level='Silent'
)
model_2_v1.fit(train_X_2, train_Y_2)


<catboost.core.CatBoostClassifier at 0x7ff9542b4eb8>

In [13]:
model_2_v2 = CatBoostClassifier(
    custom_loss=['Accuracy', 'Precision', 'Recall', 'F1'],
    iterations=10000,
    learning_rate=0.9,
    random_seed=42,
    depth=8,
    l2_leaf_reg=5,
    logging_level='Silent'
)
model_2_v2.fit(train_X_2, train_Y_2, init_model='model_1.model')

<catboost.core.CatBoostClassifier at 0x7ff956221b70>

In [14]:
pred_array_v1 = model_2_v1.predict(test_X)
pred_array_v2 = model_2_v2.predict(test_X)

In [17]:
# Bus Bunching
accuracy = accuracy_score(test_Y, pred_array_v2)
precision = precision_score(test_Y, pred_array_v2)
recall = recall_score(test_Y, pred_array_v2)
f_measure = f1_score(test_Y, pred_array_v2)

In [16]:
# v1: without incremental
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure))

Accuracy: 0.9600057236889175
Precision: 0.8571428571428571
Recall: 0.7064017660044151
F-measure: 0.7745058491327147


In [18]:
# v2: incremental learning
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure)) #0.79

Accuracy: 0.9594333547971667
Precision: 0.8322147651006712
Recall: 0.7299484915378955
F-measure: 0.777734221873775


### Incremental XGBoost

In [4]:
xgb_1 = XGBClassifier(
     learning_rate =0.1,
     n_estimators=120, 
     max_depth=50,
     min_child_weight=1,
     gamma=0,
     subsample=0.8,
     colsample_bytree=0.8,
     objective= 'binary:logistic',
     nthread=-1,
     scale_pos_weight=1,
     seed=23)
xgb_1.fit(train_X_1, train_Y_1)

xgb_1.save_model('xgb_1.model')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [5]:
xgb_2_v1 = XGBClassifier(
     learning_rate =0.1,
     n_estimators=120, 
     max_depth=50,
     min_child_weight=1,
     gamma=0,
     subsample=0.8,
     colsample_bytree=0.8,
     objective= 'binary:logistic',
     nthread=-1,
     scale_pos_weight=1,
     seed=23)
xgb_2_v1.fit(train_X_2, train_Y_2)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=None, n_estimators=120, n_jobs=1,
              nthread=-1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=23,
              silent=None, subsample=0.8, verbosity=1)

In [6]:
xgb_2_v2 = XGBClassifier(
     learning_rate =0.1,
     n_estimators=120, 
     max_depth=50,
     min_child_weight=1,
     gamma=0,
     subsample=0.8,
     colsample_bytree=0.8,
     objective= 'binary:logistic',
     nthread=-1,
     scale_pos_weight=1,
     seed=23)
xgb_2_v2.fit(train_X_2, train_Y_2, xgb_model='xgb_1.model')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=None, n_estimators=120, n_jobs=1,
              nthread=-1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=23,
              silent=None, subsample=0.8, verbosity=1)

In [7]:
pred_array_v1 = xgb_2_v1.predict(test_X)
pred_array_v2 = xgb_2_v2.predict(test_X)

In [11]:
# Bus Bunching
accuracy = accuracy_score(test_Y, pred_array_v1)
precision = precision_score(test_Y, pred_array_v1)
recall = recall_score(test_Y, pred_array_v1)
f_measure = f1_score(test_Y, pred_array_v1)

In [12]:
# v1: without incremental
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure))

Accuracy: 0.9568576947842885
Precision: 0.9064516129032258
Recall: 0.6203090507726269
F-measure: 0.7365661861074704


In [10]:
# v2: incremental learning
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure)) #0.79

Accuracy: 0.9595764470201045
Precision: 0.8832046332046332
Recall: 0.673289183222958
F-measure: 0.7640918580375783


### Random Forest

In [8]:
random.seed(42)
rf_1 = RandomForestClassifier(n_estimators=100, min_samples_split=5,
                           max_features=0.8, n_jobs=-1, warm_start=True)
rf_1.fit(train_X_1, train_Y_1)

  after removing the cwd from sys.path.


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=0.8,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=True)

In [None]:
random.seed(42)
rf_2_v1 = RandomForestClassifier(n_estimators=100, min_samples_split=5,
                           max_features=0.8, n_jobs=-1)
rf_2_v1.fit(train_X_2, train_Y_2)

In [15]:
random.seed(42)
rf_1.fit(train_X_2, train_Y_2)

  
  warn("Warm-start fitting without increasing n_estimators does not "


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=0.8,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=True)

In [17]:
pred_array_v1 = rf_2_v1.predict(test_X)
pred_array_v2 = rf_1.predict(test_X)

In [18]:
# Bus Bunching
accuracy = accuracy_score(test_Y, pred_array_v2)
precision = precision_score(test_Y, pred_array_v2)
recall = recall_score(test_Y, pred_array_v2)
f_measure = f1_score(test_Y, pred_array_v2)

In [14]:
# v1: without incremental
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure))

Accuracy: 0.9580024325677899
Precision: 0.9054621848739496
Recall: 0.6342899190581309
F-measure: 0.7459974037213328


In [19]:
# v2: incremental learning
print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-measure: " + str(f_measure))

Accuracy: 0.9571438792301639
Precision: 0.9103671706263499
Recall: 0.6203090507726269
F-measure: 0.737855579868709
