In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw trip records (one row per trip) from the August 2017 CSV export.
raw_df = pd.read_csv("data/2017/OD_2017-08.csv")
# Print the (rows, columns) shape, then show the first rows as the cell's output.
print(raw_df.shape)
raw_df.head()

(859471, 6)


Unnamed: 0,start_date,start_station_code,end_date,end_station_code,duration_sec,is_member
0,2017-08-01 00:00,6381,2017-08-01 00:14,6113,875,1
1,2017-08-01 00:00,6128,2017-08-01 00:04,6119,229,1
2,2017-08-01 00:00,6141,2017-08-01 00:05,6177,302,1
3,2017-08-01 00:00,6224,2017-08-01 00:11,6912,638,1
4,2017-08-01 00:01,6113,2017-08-01 00:20,7076,1139,1


In [3]:
# Split every "YYYY-MM-DD HH:MM" start timestamp into its five components.
# The components are kept as zero-padded strings (e.g. "08"), which is what the
# later one-hot encoding turns into column names such as "start_month_08".
#
# This is a vectorised replacement for a Python loop that called
# `raw_df.loc[i, "start_date"]` once per row; scalar `.loc` lookups are very slow
# on a frame with hundreds of thousands of rows.
start_date_expanded_df = raw_df["start_date"].str.extract(
    r"^(\d+)-(\d+)-(\d+) (\d+):(\d+)$", expand=True
)
start_date_expanded_df.columns = ["start_year", "start_month", "start_day", "start_hour", "start_min"]

# `str.extract` yields NaN for values that do not match; fail loudly instead of
# letting malformed timestamps flow silently into the feature matrix.
if start_date_expanded_df.isnull().values.any():
    raise ValueError("start_date contains values that are not in 'YYYY-MM-DD HH:MM' format")

In [4]:
# Split every "YYYY-MM-DD HH:MM" end timestamp into its five components.
# The components are kept as zero-padded strings (e.g. "08"), which is what the
# later one-hot encoding turns into column names such as "end_month_08".
#
# This is a vectorised replacement for a Python loop that called
# `raw_df.loc[i, "end_date"]` once per row; scalar `.loc` lookups are very slow
# on a frame with hundreds of thousands of rows.
end_date_expanded_df = raw_df["end_date"].str.extract(
    r"^(\d+)-(\d+)-(\d+) (\d+):(\d+)$", expand=True
)
end_date_expanded_df.columns = ["end_year", "end_month", "end_day", "end_hour", "end_min"]

# `str.extract` yields NaN for values that do not match; fail loudly instead of
# letting malformed timestamps flow silently into the feature matrix.
if end_date_expanded_df.isnull().values.any():
    raise ValueError("end_date contains values that are not in 'YYYY-MM-DD HH:MM' format")

In [5]:
# Assemble the modelling table column-wise: the expanded start/end timestamp
# parts followed by the station codes, trip duration and membership flag taken
# unchanged from the raw data.
parsed_df = pd.concat(
    [
        start_date_expanded_df,
        end_date_expanded_df,
        raw_df[["start_station_code", "end_station_code", "duration_sec", "is_member"]],
    ],
    axis=1,
)

print(parsed_df.shape)

# Persist the parsed table (without the index column).
parsed_df.to_csv("parsed_data.csv", index=False)

(859471, 14)


In [6]:
# Work on a 50,000-row random sample of the parsed trips.
# The sample is seeded so that re-running the notebook reproduces the same
# rows, and therefore the same feature columns and scores.
sampled_df = parsed_df.sample(n=50000, axis=0, random_state=1)
print("Sampled shape: ", sampled_df.shape)

# One-hot encode every categorical feature; `duration_sec` stays numeric.
# `axis=1` is passed by keyword because recent pandas versions no longer accept
# the axis as a positional argument to `drop`.
X_df = pd.get_dummies(sampled_df.drop("is_member", axis=1),
                      columns=["start_year", "start_month", "start_day", "start_hour", "start_min",
                               "end_year", "end_month", "end_day", "end_hour", "end_min",
                               "start_station_code", "end_station_code"
                              ])

# Target: the membership flag.
y_df = sampled_df["is_member"]


print("X Shape:", X_df.shape)
print("y Shape:", y_df.shape)

Sampled shape:  (50000, 14)
X Shape: (50000, 1324)
y Shape: (50000,)


In [7]:
# Peek at four windows of the one-hot encoded column index to see how the
# generated feature names look in different regions of the matrix.
for first, last in ((0, 20), (500, 515), (1000, 1015), (1285, 1300)):
    print(X_df.columns[first:last])

Index(['duration_sec', 'start_year_2017', 'start_month_08', 'start_day_01',
       'start_day_02', 'start_day_03', 'start_day_04', 'start_day_05',
       'start_day_06', 'start_day_07', 'start_day_08', 'start_day_09',
       'start_day_10', 'start_day_11', 'start_day_12', 'start_day_13',
       'start_day_14', 'start_day_15', 'start_day_16', 'start_day_17'],
      dtype='object')
Index(['start_station_code_6278', 'start_station_code_6279',
       'start_station_code_6280', 'start_station_code_6281',
       'start_station_code_6301', 'start_station_code_6302',
       'start_station_code_6303', 'start_station_code_6304',
       'start_station_code_6305', 'start_station_code_6306',
       'start_station_code_6307', 'start_station_code_6309',
       'start_station_code_6310', 'start_station_code_6311',
       'start_station_code_6312'],
      dtype='object')
Index(['end_station_code_6229', 'end_station_code_6230',
       'end_station_code_6231', 'end_station_code_6232',
       'end_station

In [8]:
# Project the one-hot feature matrix onto its first 5 principal components.
# `random_state` is fixed because, for a matrix of this size, PCA's default
# solver selection uses a randomized SVD, so unseeded runs are not reproducible.
pca = PCA(n_components=5, random_state=1)

# PCA is unsupervised: `fit_transform` ignores any target argument, so only the
# features are passed.
# NOTE(review): the features are not standardised before PCA, so `duration_sec`
# (the only non-binary column) likely dominates the components -- confirm intent.
X_reduced = pca.fit_transform(X_df)

X_reduced.shape

(50000, 5)

In [9]:
logit = LogisticRegression()

# Accuracy on the same data the model was fitted on (an optimistic estimate).
logit.fit(X_df, y_df)
acc_score = accuracy_score(y_df, logit.predict(X_df))

print("Logit: Accuracy Score:", acc_score)


# Compute all three metrics from a single 3-fold cross-validation run instead
# of refitting the model once per metric. `cv=3` is explicit so the "3-Fold"
# labels below stay true: newer scikit-learn releases default to 5 folds.
cv_metrics = cross_validate(logit, X_df, y_df, cv=3, n_jobs=-1,
                            scoring=["accuracy", "f1", "precision"],
                            return_train_score=False)
acc_score_cv = cv_metrics["test_accuracy"]
f1_score_cv = cv_metrics["test_f1"]
prec_score_cv = cv_metrics["test_precision"]

print("\nLogit: Accuracy Score of 3-Fold Cross Validation:", acc_score_cv)
print("Logit: F1 Score of 3-Fold Cross Validation:", f1_score_cv)
print("Logit: Precision Score of 3-Fold Cross Validation:", prec_score_cv)

Logit: Accuracy Score: 0.81494

Logit: Accuracy Score of 3-Fold Cross Validation: [ 0.81047516  0.80805232  0.80901236]
Logit: F1 Score of 3-Fold Cross Validation: [ 0.88767201  0.88572143  0.88660896]
Logit: Precision Score of 3-Fold Cross Validation: [ 0.82645832  0.8279017   0.82673399]


In [11]:
# AdaBoost over depth-4 decision trees, evaluated on the full one-hot features.
b_dtr = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4))

# This ensemble is expensive to fit on ~1,300 features, so all three metrics are
# taken from one 3-fold cross-validation run rather than three separate runs.
# `cv=3` is explicit so the "3-Fold" labels below stay true: newer scikit-learn
# releases default to 5 folds.
cv_metrics = cross_validate(b_dtr, X_df, y_df, cv=3, n_jobs=-1,
                            scoring=["accuracy", "f1", "precision"],
                            return_train_score=False)
cv_res = cv_metrics["test_accuracy"]
f1_score_cv = cv_metrics["test_f1"]
prec_score_cv = cv_metrics["test_precision"]

print("B-DTR_4: Accuracy Score of 3-Fold Cross Validation:", cv_res)
print("B-DTR_4: F1 Score of 3-Fold Cross Validation:", f1_score_cv)
print("B-DTR_4: Precision Score of 3-Fold Cross Validation:", prec_score_cv)

KeyboardInterrupt: 

In [10]:
# Logistic regression on the 5 PCA components instead of the full feature set.
logit = LogisticRegression()

# One 3-fold cross-validation run yields all three metrics (no refit per
# metric). `cv=3` is explicit so the "3-Fold" labels below stay true: newer
# scikit-learn releases default to 5 folds.
# NOTE(review): the PCA was fitted on all rows before cross-validation, so the
# held-out folds influenced the projection -- consider a Pipeline if this matters.
cv_metrics = cross_validate(logit, X_reduced, y_df, cv=3, n_jobs=-1,
                            scoring=["accuracy", "f1", "precision"],
                            return_train_score=False)
cv_res = cv_metrics["test_accuracy"]
f1_score_cv = cv_metrics["test_f1"]
prec_score_cv = cv_metrics["test_precision"]

print("Logit + PCA: Accuracy Score of 3-Fold Cross Validation:", cv_res)
print("Logit + PCA: F1 Score of 3-Fold Cross Validation:", f1_score_cv)
print("Logit + PCA: Precision Score of 3-Fold Cross Validation:", prec_score_cv)

Logit + PCA: Accuracy Score of 3-Fold Cross Validation: [ 0.7862971   0.78231129  0.78459138]
Logit + PCA: F1 Score of 3-Fold Cross Validation: [ 0.87683264  0.87433322  0.87580433]
Logit + PCA: Precision Score of 3-Fold Cross Validation: [ 0.79742138  0.79622737  0.79675206]


In [11]:
# Multi-layer perceptron with one hidden layer of 1323 units, fitted on the
# whole sample; `fit` returns the estimator itself, so it can be chained.
neural_net = MLPClassifier(hidden_layer_sizes=(1323,)).fit(X_df, y_df)

# Score on the training data only (the cross-validated metrics are disabled below).
acc_score = accuracy_score(y_true=y_df, y_pred=neural_net.predict(X_df))

# cv_res = cross_val_score(neural_net, X_df, y_df, n_jobs=2, scoring="accuracy")
# f1_score_cv = cross_val_score(neural_net, X_df, y_df, n_jobs=-1, scoring="f1")
# prec_score_cv = cross_val_score(neural_net, X_df, y_df, n_jobs=-1, scoring="precision")

print("1-layer Neural Nets: Accuracy Score on Traning Data", acc_score)
# print(f"1-layer Neural Nets: Accuracy Score of 3-Fold Cross Validation:", cv_res)
# print(f"1-layer Neural Nets: F1 Score of 3-Fold Cross Validation:", f1_score_cv)
# print(f"1-layer Neural Nets: Precision Score of 3-Fold Cross Validation:", prec_score_cv)

1-layer Neural Nets: Accuracy Score on Traning Data 0.80912


In [None]:
# Multi-layer perceptron with two hidden layers (1323 and 1150 units),
# trained with mini-batches of 64 samples.
neural_net = MLPClassifier(hidden_layer_sizes=(1323,1150), batch_size=64)
neural_net.fit(X_df, y_df)

# Accuracy on the same data the network was fitted on (an optimistic estimate).
acc_score = accuracy_score(y_df, neural_net.predict(X_df))

# Cross-validate the neural network itself. This previously passed `b_dtr`
# (the AdaBoost model), so the number printed under the "2-layer Neural Nets"
# label belonged to a different estimator. `cross_val_score` clones the
# estimator, so the fit above is not reused. `cv=3` keeps the "3-Fold" label true.
cv_res = cross_val_score(neural_net, X_df, y_df, cv=3, n_jobs=2, scoring="accuracy")
# f1_score_cv = cross_val_score(neural_net, X_df, y_df, cv=3, n_jobs=-1, scoring="f1")
# prec_score_cv = cross_val_score(neural_net, X_df, y_df, cv=3, n_jobs=-1, scoring="precision")

print("2-layer Neural Nets: Accuracy Score on Traning Data", acc_score)
print("2-layer Neural Nets: Accuracy Score of 3-Fold Cross Validation:", cv_res)
# print("2-layer Neural Nets: F1 Score of 3-Fold Cross Validation:", f1_score_cv)
# print("2-layer Neural Nets: Precision Score of 3-Fold Cross Validation:", prec_score_cv)

In [33]:
# Take a new, larger sample.
# The sample is seeded as well as the split: a seeded `train_test_split` on an
# unseeded sample would still produce different train/test rows on every run.
sampled_df = parsed_df.sample(n=100000, axis=0, random_state=1)
print("Sampled shape: ", sampled_df.shape)

# One-hot encode the categorical features before splitting so that the train
# and test frames share exactly the same columns. `is_member` and
# `duration_sec` are not listed and therefore stay as they are.
data_dum = pd.get_dummies(sampled_df,
                      columns=["start_year", "start_month", "start_day", "start_hour", "start_min",
                               "end_year", "end_month", "end_day", "end_hour", "end_min",
                               "start_station_code", "end_station_code"
                              ])

# Split the encoded rows 50/50 into train and test sets.
train_data, test_data = train_test_split(data_dum, test_size=0.5, random_state=1)

print("train_data shape: ", train_data.shape)
print("test_data shape: ", test_data.shape)

Sampled shape:  (100000, 14)
train_data shape:  (50000, 1325)
test_data shape:  (50000, 1325)


In [34]:
# Split the test data by class: one subset of non-members (0), one of members (1).
test_data_zeros = test_data[test_data["is_member"] == 0]
test_data_ones = test_data[test_data["is_member"] == 1]

print("test_data_zeros shape:", test_data_zeros.shape)
print("test_data_ones shape:", test_data_ones.shape)

# Take at most 10000 rows from each class.
# `sample` raises when asked for more rows than exist, and the non-member class
# sits only slightly above 10000 rows, so the request is capped at the number
# available. Seeded so the evaluation subsets are reproducible.
test_data_zeros = test_data_zeros.sample(n=min(10000, len(test_data_zeros)), axis=0, random_state=1)
test_data_ones = test_data_ones.sample(n=min(10000, len(test_data_ones)), axis=0, random_state=1)

print("\nnew test_data_zeros shape:", test_data_zeros.shape)
print("new test_data_ones shape:", test_data_ones.shape)

test_data_zeros shape: (10843, 1325)
test_data_ones shape: (39157, 1325)

new test_data_zeros shape: (10000, 1325)
new test_data_ones shape: (10000, 1325)


In [41]:
# Separate features (X) from the membership target (y) for the training set and
# for each single-class test subset.
# `axis=1` is passed by keyword because recent pandas versions no longer accept
# the axis as a positional argument to `drop`.
X_train = train_data.drop("is_member", axis=1)
y_train = train_data["is_member"]

X_test_zeros = test_data_zeros.drop("is_member", axis=1)
y_test_zeros = test_data_zeros["is_member"]

X_test_ones = test_data_ones.drop("is_member", axis=1)
y_test_ones = test_data_ones["is_member"]

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("\nX_test_zeros shape:", X_test_zeros.shape)
print("y_test_zeros shape:", y_test_zeros.shape)
print("\nX_test_ones shape:", X_test_ones.shape)
print("y_test_ones shape:", y_test_ones.shape)

X_train shape: (50000, 1324)
y_train shape: (50000,)

X_test_zeros shape: (10000, 1324)
y_test_zeros shape: (10000,)

X_test_ones shape: (10000, 1324)
y_test_ones shape: (10000,)


In [42]:
# Evaluate Logit: cross-validated accuracy on the training split, then accuracy
# on each single-class test subset.
# `cv=3` is explicit so the "3-Fold" label below stays true: newer scikit-learn
# releases default to 5 folds.
acc_score_cv = cross_val_score(logit, X_train, y_train, cv=3, n_jobs=-1, scoring="accuracy")

print("Logit: Average Accuracy Score of 3-Fold Cross Validation:", np.average(acc_score_cv))

# Refit on the whole training split before scoring the held-out subsets.
logit.fit(X_train, y_train)

# Each subset contains a single class, so its accuracy is the fraction of that
# class the model identifies correctly.
acc_score_zeros = accuracy_score(y_test_zeros, logit.predict(X_test_zeros))
acc_score_ones = accuracy_score(y_test_ones, logit.predict(X_test_ones))

print("Logit: Accuracy Score, on y_test_zeros:", acc_score_zeros)
print("Logit: Accuracy Score, on y_test_ones:", acc_score_ones)

Logit: Average Accuracy Score of 3-Fold Cross Validation: 0.809459965847
Logit: Accuracy Score, on y_test_zeros: 0.2897
Logit: Accuracy Score, on y_test_ones: 0.9569


Conclusion: Logit performs very poorly at predicting non-members but very well at predicting members, so its overall accuracy of about 81% mostly reflects the fact that members are the majority class.

In [44]:
# AdaBoost over depth-4 decision trees: cross-validated accuracy on the training
# split, then accuracy on each single-class test subset.
b_dtr = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4))

# `cv=3` is explicit so the "3-Fold" label below stays true: newer scikit-learn
# releases default to 5 folds.
acc_score_cv = cross_val_score(b_dtr, X_train, y_train, cv=3, n_jobs=-1, scoring="accuracy")

print("b_dtr: Average Accuracy Score of 3-Fold Cross Validation:", np.average(acc_score_cv))

# Refit on the whole training split before scoring the held-out subsets.
b_dtr.fit(X_train, y_train)

# Each subset contains a single class, so its accuracy is the fraction of that
# class the model identifies correctly.
acc_score_zeros = accuracy_score(y_test_zeros, b_dtr.predict(X_test_zeros))
acc_score_ones = accuracy_score(y_test_ones, b_dtr.predict(X_test_ones))

print("b_dtr: Accuracy Score, on y_test_zeros:", acc_score_zeros)
print("b_dtr: Accuracy Score, on y_test_ones:", acc_score_ones)

b_dtr: Average Accuracy Score of 3-Fold Cross Validation: 0.802019977841
b_dtr: Accuracy Score, on y_test_zeros: 0.3015
b_dtr: Accuracy Score, on y_test_ones: 0.9419


Conclusion: The boosted decision tree of depth 4 also performs very poorly at predicting non-members but very well at predicting members.

In [51]:
# Sweep the base tree's maximum depth from 1 to 14 and report, for each depth,
# the boosted model's accuracy on the non-member (all-zeros) test subset.
for n in range(1, 15):
    base_tree = DecisionTreeClassifier(max_depth=n)
    # `fit` returns the fitted estimator, so construction and training chain.
    b_dtr = AdaBoostClassifier(base_tree).fit(X_train, y_train)
    acc_score_zeros = accuracy_score(y_true=y_test_zeros, y_pred=b_dtr.predict(X_test_zeros))

    print(f"b_dtr_{n}: Accuracy Score, on y_test_zeros:", acc_score_zeros)

b_dtr_1: Accuracy Score, on y_test_zeros: 0.2271
b_dtr_2: Accuracy Score, on y_test_zeros: 0.2621
b_dtr_3: Accuracy Score, on y_test_zeros: 0.2804
b_dtr_4: Accuracy Score, on y_test_zeros: 0.2995
b_dtr_5: Accuracy Score, on y_test_zeros: 0.3107
b_dtr_6: Accuracy Score, on y_test_zeros: 0.3335
b_dtr_7: Accuracy Score, on y_test_zeros: 0.3382
b_dtr_8: Accuracy Score, on y_test_zeros: 0.3565
b_dtr_9: Accuracy Score, on y_test_zeros: 0.358
b_dtr_10: Accuracy Score, on y_test_zeros: 0.369
b_dtr_11: Accuracy Score, on y_test_zeros: 0.3637
b_dtr_12: Accuracy Score, on y_test_zeros: 0.3729
b_dtr_13: Accuracy Score, on y_test_zeros: 0.3707
b_dtr_14: Accuracy Score, on y_test_zeros: 0.3801


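Conclusion: Among maximum depths 1 to 14, the boosted decision tree of depth 14 is the best at predicting 0s (non-members), although even it classifies only about 38% of them correctly.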