## <center>This Model will Attempt to Predict the Type of Bicycle Used from Trip Data</center>

In [26]:
# Eliminating Warnings
import warnings
warnings.filterwarnings('ignore')

In [27]:
# Importing Libraries
import os
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [28]:
# Relative directory that holds the input data file
data_path = "data"

In [29]:
# Read the complete trip log from CSV into a dataframe
all_data_file = 'all_data.csv'
all_data_csv = os.path.join(data_path, all_data_file)
all_df = pd.read_csv(all_data_csv)
all_df

Unnamed: 0,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,10/17/16,Work to Orwasher's,2.09,,18.0,0.30,6.97,Citibike
1,10/13/19,Down ES Up UWS,21.72,2.0,52.0,2.87,7.58,Specialized
2,9/14/16,Citibike to Work,1.09,,11.0,0.18,5.95,Citibike
3,8/12/19,CP BL,6.79,,48.0,0.80,8.49,Specialized
4,8/3/20,To Kalustiyans,1.96,,16.0,0.27,7.35,Citibike
...,...,...,...,...,...,...,...,...
348,8/25/16,Citibike to Work,1.18,,10.0,0.17,7.08,Citibike
349,6/4/19,To Essex Market,3.51,,35.0,0.58,6.02,Citibike
350,7/18/20,CP ML,6.47,,36.0,0.60,10.78,Specialized
351,6/29/16,Citibike to Work,1.16,,10.0,0.17,6.96,Citibike


In [30]:
# Replace missing Hours with 0 - NaN only occurs in the Hours column, so this is accurate
all_df = all_df.fillna({'Hours': 0})
all_df

Unnamed: 0,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,10/17/16,Work to Orwasher's,2.09,0.0,18.0,0.30,6.97,Citibike
1,10/13/19,Down ES Up UWS,21.72,2.0,52.0,2.87,7.58,Specialized
2,9/14/16,Citibike to Work,1.09,0.0,11.0,0.18,5.95,Citibike
3,8/12/19,CP BL,6.79,0.0,48.0,0.80,8.49,Specialized
4,8/3/20,To Kalustiyans,1.96,0.0,16.0,0.27,7.35,Citibike
...,...,...,...,...,...,...,...,...
348,8/25/16,Citibike to Work,1.18,0.0,10.0,0.17,7.08,Citibike
349,6/4/19,To Essex Market,3.51,0.0,35.0,0.58,6.02,Citibike
350,7/18/20,CP ML,6.47,0.0,36.0,0.60,10.78,Specialized
351,6/29/16,Citibike to Work,1.16,0.0,10.0,0.17,6.96,Citibike


In [31]:
# Expose the row labels as an 'Orig_Index' column; the error-detail merge further
# down joins on this column to recover the full trip record.
# Guarded and non-mutating so that re-running this cell cannot insert a second,
# duplicate index column.
if 'Orig_Index' not in all_df.columns:
    all_df = all_df.rename_axis('Orig_Index').reset_index()
all_df

Unnamed: 0,Orig_Index,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,0,10/17/16,Work to Orwasher's,2.09,0.0,18.0,0.30,6.97,Citibike
1,1,10/13/19,Down ES Up UWS,21.72,2.0,52.0,2.87,7.58,Specialized
2,2,9/14/16,Citibike to Work,1.09,0.0,11.0,0.18,5.95,Citibike
3,3,8/12/19,CP BL,6.79,0.0,48.0,0.80,8.49,Specialized
4,4,8/3/20,To Kalustiyans,1.96,0.0,16.0,0.27,7.35,Citibike
...,...,...,...,...,...,...,...,...,...
348,348,8/25/16,Citibike to Work,1.18,0.0,10.0,0.17,7.08,Citibike
349,349,6/4/19,To Essex Market,3.51,0.0,35.0,0.58,6.02,Citibike
350,350,7/18/20,CP ML,6.47,0.0,36.0,0.60,10.78,Specialized
351,351,6/29/16,Citibike to Work,1.16,0.0,10.0,0.17,6.96,Citibike


In [32]:
# Fields kept for modelling: three numeric features followed by the label
columns = [
    'Miles',
    'Duration',
    'Speed',
    'Type',
]
# target = ['Type']

In [36]:
# Copy only the selected fields into a separate modelling dataframe
bike_df = all_df[columns].copy()
bike_df

Unnamed: 0,Miles,Duration,Speed,Type
0,2.09,0.30,6.97,Citibike
1,21.72,2.87,7.58,Specialized
2,1.09,0.18,5.95,Citibike
3,6.79,0.80,8.49,Specialized
4,1.96,0.27,7.35,Citibike
...,...,...,...,...
348,1.18,0.17,7.08,Citibike
349,3.51,0.58,6.02,Citibike
350,6.47,0.60,10.78,Specialized
351,1.16,0.17,6.96,Citibike


In [37]:
# Create Dataframe with desired columns
# Demonstrating different way to do it
# bike_df = all_df[['Miles', 'Duration', 'Speed','Type']]
# bike_df

In [38]:
# Separate the feature matrix (X) from the label column (y)
X = bike_df.drop("Type", axis=1)
y = bike_df["Type"]

In [39]:
# Split into training and testing sets.
# - train_test_split is already imported in the imports cell at the top.
# - No test_size is given, so scikit-learn's default hold-out fraction applies.
# - stratify=y keeps the class proportions the same in both splits.
# - random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

In [40]:
# Sanity check: (rows, columns) of the training feature set
X_train.shape

(264, 3)

In [41]:
# Sanity check: (rows, columns) of the held-out testing feature set
X_test.shape

(89, 3)

In [42]:
# Build the (not yet trained) logistic regression classifier.
# LogisticRegression is already imported in the imports cell at the top.
# max_iter=200 gives the lbfgs solver extra iterations to converge;
# random_state is fixed for repeatability.
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

In [43]:
# Fit the classifier on the training features and labels
# (the cell output is the repr of the fitted estimator)
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [44]:
# Predict a bike type for every row of the held-out test features
y_pred = classifier.predict(X_test)

In [45]:
# Fraction of test trips whose predicted bike type matches the true label
# (accuracy_score is imported with the other libraries at the top of the notebook)
print(accuracy_score(y_test, y_pred))

0.9662921348314607


In [46]:
# Wrap the predictions in a single-column DataFrame named 'Prediction'.
# The constructor already assigns a fresh 0..n-1 index, which is what the later
# column-wise concat with test_df lines up on, so no reset_index call is needed.
pred_df = pd.DataFrame({'Prediction': y_pred})
pred_df

Unnamed: 0,Prediction
0,Specialized
1,Citibike
2,Specialized
3,Specialized
4,Specialized
...,...
84,Citibike
85,Citibike
86,Specialized
87,Citibike


In [47]:
# Turn the true test labels into a one-column DataFrame (row labels are kept)
test_df = y_test.to_frame()
test_df

Unnamed: 0,Type
224,Specialized
291,Citibike
125,Specialized
153,Specialized
100,Specialized
...,...
311,Citibike
227,Citibike
68,Specialized
159,Citibike


In [48]:
# Move the row labels carried over from the split into an 'Orig_Index' column,
# leaving a fresh 0..n-1 index behind.
# Guarded and non-mutating so that re-running this cell cannot insert a second,
# duplicate index column.
if 'Orig_Index' not in test_df.columns:
    test_df = test_df.rename_axis('Orig_Index').reset_index()
test_df

Unnamed: 0,Orig_Index,Type
0,224,Specialized
1,291,Citibike
2,125,Specialized
3,153,Specialized
4,100,Specialized
...,...,...
84,311,Citibike
85,227,Citibike
86,68,Specialized
87,159,Citibike


In [49]:
# Place the true labels and the predictions side by side (aligned on the row index)
test_pred_df = pd.concat(objs=[test_df, pred_df], axis='columns')
test_pred_df

Unnamed: 0,Orig_Index,Type,Prediction
0,224,Specialized,Specialized
1,291,Citibike,Citibike
2,125,Specialized,Specialized
3,153,Specialized,Specialized
4,100,Specialized,Specialized
...,...,...,...
84,311,Citibike,Citibike
85,227,Citibike,Citibike
86,68,Specialized,Specialized
87,159,Citibike,Citibike


In [22]:
# Exporting Combined Test and Prediction Dataframe for examination in Excel
# No longer necessary
# test_pred_file = "test_pred.csv"
# test_pred_df.to_csv(os.path.join(data_path, test_pred_file))

In [50]:
# Keep only the rows whose predicted type differs from the actual type
is_wrong = test_pred_df['Type'].ne(test_pred_df['Prediction'])
pred_errs_df = test_pred_df[is_wrong]
pred_errs_df

Unnamed: 0,Orig_Index,Type,Prediction
59,188,Citibike,Specialized
74,89,Specialized,Citibike
79,135,Citibike,Specialized


In [54]:
# Attach the full trip record from all_df to each misclassified row.
# 'Type' exists in both frames, so pandas suffixes it as Type_x (from pred_errs_df)
# and Type_y (from all_df).
# validate= makes the merge fail loudly if Orig_Index is ever duplicated on either
# side, instead of silently multiplying rows.
err_details_df = pd.merge(pred_errs_df, all_df, on='Orig_Index', validate='one_to_one')
err_details_df

Unnamed: 0,Orig_Index,Type_x,Prediction,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type_y
0,188,Citibike,Specialized,6/17/18,Work to Village,4.69,0.0,40.0,0.67,7.04,Citibike
1,89,Specialized,Citibike,7/15/19,Up Down ES,3.74,0.0,33.0,0.55,6.8,Specialized
2,135,Citibike,Specialized,9/16/19,From Pier 11,5.91,0.0,49.0,0.82,7.24,Citibike


In [55]:
# Tidy the merged frame: drop the redundant label column and restore the plain 'Type' name.
# Non-mutating, with errors='ignore', so re-running this cell is a harmless no-op
# rather than a KeyError on the already-removed 'Type_y'.
err_details_df = (
    err_details_df
    .drop(columns='Type_y', errors='ignore')
    .rename(columns={'Type_x': 'Type'})
)
err_details_df

Unnamed: 0,Orig_Index,Type,Prediction,Date,Destination,Miles,Hours,Minutes,Duration,Speed
0,188,Citibike,Specialized,6/17/18,Work to Village,4.69,0.0,40.0,0.67,7.04
1,89,Specialized,Citibike,7/15/19,Up Down ES,3.74,0.0,33.0,0.55,6.8
2,135,Citibike,Specialized,9/16/19,From Pier 11,5.91,0.0,49.0,0.82,7.24
