In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [3]:
raw_df = pd.read_csv("OD_2017-08.csv")
print(raw_df.shape)
raw_df.head

FileNotFoundError: File b'OD_2017-08.csv' does not exist

In [54]:
start_date_expanded = []

for i in range(raw_df.shape[0]):
    start_date = raw_df.loc[i, "start_date"]
    date, time = start_date.split(" ")
    year, month, day = date.split("-")
    hour, minute = time.split(":")
    
    start_date_expanded.append([year, month, day, hour, minute])    

start_date_expanded_df = pd.DataFrame(start_date_expanded, columns=["start_year", "start_month", "start_day", "start_hour", "start_min"])

In [57]:
end_date_expanded = []

for i in range(raw_df.shape[0]):
    end_date = raw_df.loc[i, "end_date"]
    date, time = end_date.split(" ")
    year, month, day = date.split("-")
    hour, minute = time.split(":")
    
    end_date_expanded.append([year, month, day, hour, minute])    

end_date_expanded_df = pd.DataFrame(end_date_expanded, columns=["end_year", "end_month", "end_day", "end_hour", "end_min"])

In [60]:
parsed_df = pd.concat([start_date_expanded_df, 
                       end_date_expanded_df,
                       raw_df.loc[:,("start_station_code", "end_station_code", "duration_sec", "is_member")]],
                     axis=1)

print(parsed_df.shape)
parsed_df.to_csv("parsed_data.csv", index=False)

(859471, 14)


In [132]:
sampled_df = parsed_df.sample(n=50000, axis=0)
print("Sampled shape: ", sampled_df.shape)

X_df = pd.get_dummies(sampled_df.drop("is_member", 1), 
                      columns=["start_year", "start_month", "start_day", "start_hour", "start_min",
                               "end_year", "end_month", "end_day", "end_hour", "end_min",
                               "start_station_code", "end_station_code"
                              ])

y_df = sampled_df["is_member"]


print("X Shape:", X_df.shape)
print("y Shape:", y_df.shape)

Sampled shape:  (50000, 14)
X Shape: (50000, 1324)
y Shape: (50000,)


In [133]:
print(sampled_dummies_df.columns[0:50])
print(sampled_dummies_df.columns[1000:1025])
print(sampled_dummies_df.columns[1285:])

Index(['duration_sec', 'is_member', 'start_year_2017', 'start_month_08',
       'start_day_01', 'start_day_02', 'start_day_03', 'start_day_04',
       'start_day_05', 'start_day_06', 'start_day_07', 'start_day_08',
       'start_day_09', 'start_day_10', 'start_day_11', 'start_day_12',
       'start_day_13', 'start_day_14', 'start_day_15', 'start_day_16',
       'start_day_17', 'start_day_18', 'start_day_19', 'start_day_20',
       'start_day_21', 'start_day_22', 'start_day_23', 'start_day_24',
       'start_day_25', 'start_day_26', 'start_day_27', 'start_day_28',
       'start_day_29', 'start_day_30', 'start_day_31', 'start_hour_00',
       'start_hour_01', 'start_hour_02', 'start_hour_03', 'start_hour_04',
       'start_hour_05', 'start_hour_06', 'start_hour_07', 'start_hour_08',
       'start_hour_09', 'start_hour_10', 'start_hour_11', 'start_hour_12',
       'start_hour_13', 'start_hour_14'],
      dtype='object')
Index(['end_station_code_6234', 'end_station_code_6235',
       'end_

In [134]:
pca = PCA(n_components=5)
X_reduced = pca.fit_transform(X_df, y_df)

X_reduced.shape

(50000, 5)

In [144]:
logit = LogisticRegression()

logit.fit(X_df, y_df)
acc_score = accuracy_score(y_df, logit.predict(X_df))
cv_res = cross_val_score(logit, X_df, y_df, n_jobs=-1, scoring="accuracy")

print("Logit - Accuracy Score:", acc_score)
print("Logit - Score of 3-Fold Cross Validation:", cv_res)

Neural Net - Accuracy Score: 0.81984
Logit: Score of 3-Fold Cross Validation: [ 0.81149508  0.80913237  0.81117245]


In [136]:
b_dtr = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4))

cv_res = cross_val_score(b_dtr, X_df, y_df, n_jobs=-1, scoring="accuracy")

print(f"B-DTR4: Score of 3-Fold Cross Validation:", cv_res)

B-DTR4: Score of 3-Fold Cross Validation: [ 0.79895608  0.80013201  0.80187207]


In [137]:
logit = LogisticRegression()

cv_res = cross_val_score(logit, X_reduced, y_df, n_jobs=-1, scoring="accuracy")

print("Logit with reduced data: Score of 3-Fold Cross Validation:", cv_res)

Logit with reduced data: Score of 3-Fold Cross Validation: [ 0.78695704  0.78459138  0.78531141]


In [147]:
neural_net_sizes = [2, 5, 10, 25, 50, 75, 100, 500]

for size in neural_net_sizes:
    neural_net = MLPClassifier(hidden_layer_sizes=(size,))

    neural_net.fit(X_df, y_df)
    acc_score = accuracy_score(y_df, neural_net.predict(X_df))

    cv_res = cross_val_score(neural_net, X_df, y_df, n_jobs=-1, scoring="accuracy")

    print("Neural Net Size {size}- Accuracy Score:", acc_score)
    print("Neural Net Size {size}- Score of 3-Fold Cross Validation:", cv_res)

Neural Net Size {size}- Accuracy Score: 0.78056
Neural Net Size {size}- Score of 3-Fold Cross Validation: [ 0.79547636  0.78057122  0.78057122]


KeyboardInterrupt: 