## <center>This Model will Attempt to Predict the Type of Bicycle Used from Trip Data</center>

In [1]:
# Silence every Python warning for the remainder of the kernel session.
# NOTE(review): this also hides library deprecation and solver warnings.
import warnings
warnings.simplefilter('ignore')

In [2]:
# Importing Libraries
import os
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# Folder (relative path) from which the input CSV is read
data_path = "data"

In [4]:
# Read the combined trip log (<data_path>/all_data.csv) into a DataFrame
all_data_file = 'all_data.csv'
all_df = pd.read_csv(Path(data_path) / all_data_file)
all_df

Unnamed: 0,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,6/6/20,CP BL,6.14,,37.0,0.62,9.96,Specialized
1,7/13/20,Wards Island,13.59,1.0,43.0,1.72,7.92,Specialized
2,6/24/16,Work to W 4th,3.63,,27.0,0.45,8.07,Citibike
3,7/8/20,To Citarella,1.08,,9.0,0.15,7.20,Citibike
4,4/14/20,CP BL x 3,19.93,2.0,6.0,2.10,9.49,Specialized
...,...,...,...,...,...,...,...,...
355,12/11/20,Dn ES Up WS,21.60,2.0,36.0,2.60,8.31,Specialized
356,6/27/16,Citibike to Work,1.08,,11.0,0.18,5.89,Citibike
357,6/23/20,UWS Thru CP to GWB+,22.42,2.0,54.0,2.90,7.73,Specialized
358,7/16/20,Wards Island,12.80,1.0,36.0,1.60,8.00,Specialized


In [5]:
# Replace missing 'Hours' values with 0. In the rows displayed above, the missing
# values belong to rides shorter than one hour, so 0 is the correct hour count.
# NOTE(review): the original author states NaN occurs only in 'Hours' - confirm.
all_df = all_df.assign(Hours=all_df['Hours'].fillna(0))
all_df

Unnamed: 0,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,6/6/20,CP BL,6.14,0.0,37.0,0.62,9.96,Specialized
1,7/13/20,Wards Island,13.59,1.0,43.0,1.72,7.92,Specialized
2,6/24/16,Work to W 4th,3.63,0.0,27.0,0.45,8.07,Citibike
3,7/8/20,To Citarella,1.08,0.0,9.0,0.15,7.20,Citibike
4,4/14/20,CP BL x 3,19.93,2.0,6.0,2.10,9.49,Specialized
...,...,...,...,...,...,...,...,...
355,12/11/20,Dn ES Up WS,21.60,2.0,36.0,2.60,8.31,Specialized
356,6/27/16,Citibike to Work,1.08,0.0,11.0,0.18,5.89,Citibike
357,6/23/20,UWS Thru CP to GWB+,22.42,2.0,54.0,2.90,7.73,Specialized
358,7/16/20,Wards Island,12.80,1.0,36.0,1.60,8.00,Specialized


In [6]:
# Turning Index into Orig_Index Column
# The row label of every trip is copied into a leading 'Orig_Index' column so
# that test rows can later be matched back to their full trip record.
# The guard keeps this cell idempotent: re-running it must not insert a second
# 'Orig_Index' column, which is what an in-place reset_index + rename did.
if 'Orig_Index' not in all_df.columns:
    all_df = all_df.rename_axis('Orig_Index').reset_index()
all_df

Unnamed: 0,Orig_Index,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,0,6/6/20,CP BL,6.14,0.0,37.0,0.62,9.96,Specialized
1,1,7/13/20,Wards Island,13.59,1.0,43.0,1.72,7.92,Specialized
2,2,6/24/16,Work to W 4th,3.63,0.0,27.0,0.45,8.07,Citibike
3,3,7/8/20,To Citarella,1.08,0.0,9.0,0.15,7.20,Citibike
4,4,4/14/20,CP BL x 3,19.93,2.0,6.0,2.10,9.49,Specialized
...,...,...,...,...,...,...,...,...,...
355,355,12/11/20,Dn ES Up WS,21.60,2.0,36.0,2.60,8.31,Specialized
356,356,6/27/16,Citibike to Work,1.08,0.0,11.0,0.18,5.89,Citibike
357,357,6/23/20,UWS Thru CP to GWB+,22.42,2.0,54.0,2.90,7.73,Specialized
358,358,7/16/20,Wards Island,12.80,1.0,36.0,1.60,8.00,Specialized


In [7]:
# Fields carried into the modelling frame: three numeric trip measurements
# plus 'Type', the label column that is split off as the target further down.
columns = [
    'Miles',
    'Duration',
    'Speed',
    'Type',
]

In [8]:
# Build the modelling frame from the selected columns only.
# .copy() gives bike_df its own data, independent of all_df.
bike_df = all_df[columns].copy()
bike_df

Unnamed: 0,Miles,Duration,Speed,Type
0,6.14,0.62,9.96,Specialized
1,13.59,1.72,7.92,Specialized
2,3.63,0.45,8.07,Citibike
3,1.08,0.15,7.20,Citibike
4,19.93,2.10,9.49,Specialized
...,...,...,...,...
355,21.60,2.60,8.31,Specialized
356,1.08,0.18,5.89,Citibike
357,22.42,2.90,7.73,Specialized
358,12.80,1.60,8.00,Specialized


In [9]:
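# Alternative way to create the DataFrame with the desired columns
# (kept for reference only; intentionally not executed).
# Double-bracket indexing selects the same columns as the .loc call above, but
# without .copy(), so later assignments to bike_df may raise SettingWithCopyWarning.
# bike_df = all_df[['Miles', 'Duration', 'Speed','Type']]
# bike_df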

In [10]:
# Separate the inputs from the label:
#   X - every column except 'Type' (the features)
#   y - the 'Type' column (the target to predict)
X = bike_df.drop("Type", axis=1)
y = bike_df.loc[:, "Type"]

In [11]:
# Splitting out Training and Testing data
# train_test_split is already imported in the import cell at the top of the
# notebook, so it is not re-imported here.
# stratify=y keeps the bike-type proportions the same in both partitions;
# random_state=1 makes the split reproducible. No test_size is passed, so
# scikit-learn's default split fraction applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

In [12]:
# Checking training data shape: (number of training rows, number of feature columns)
X_train.shape

(270, 3)

In [13]:
# Checking testing data shape: (number of held-out rows, number of feature columns)
X_test.shape

(90, 3)

In [14]:
# Creating the Logistic Regression Model
# LogisticRegression is already imported in the import cell at the top of the
# notebook, so it is not re-imported here.
# max_iter=200 caps the number of lbfgs solver iterations; random_state=1 is
# passed for reproducibility.
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=200,
    random_state=1,
)

In [15]:
# Training the Model: fit the logistic regression on the training features and labels
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [16]:
# Create Predictions: one predicted bike type per row of the held-out test features
y_pred = classifier.predict(X_test)

In [17]:
# Get accuracy score: the fraction of test rows whose predicted bike type
# equals the recorded one. accuracy_score comes from the notebook's import cell.
print(accuracy_score(y_test, y_pred))

0.9555555555555556


In [18]:
# Convert Predictions to DataFrame
# Building the frame from a dict names the column 'Prediction' directly.
# The frame gets pandas' default 0..n-1 index, so no reset_index call is
# needed (the previous one discarded its result and had no effect).
pred_df = pd.DataFrame({'Prediction': y_pred})
pred_df

Unnamed: 0,Prediction
0,Citibike
1,Specialized
2,Specialized
3,Specialized
4,Specialized
...,...
85,Specialized
86,Specialized
87,Specialized
88,Specialized


In [19]:
# Turn the held-out labels into a one-column frame ('Type'); the index still
# holds each row's original label from the full data set.
test_df = y_test.to_frame()
test_df

Unnamed: 0,Type
121,Citibike
326,Specialized
102,Specialized
94,Specialized
126,Specialized
...,...
188,Specialized
327,Specialized
236,Specialized
16,Specialized


In [20]:
# Making index into a column
# The original row labels move into a leading 'Orig_Index' column and the frame
# gets a fresh 0..n-1 index.
# The guard keeps this cell idempotent: re-running it must not add a second
# 'Orig_Index' column holding the already-reset 0..n-1 positions.
if 'Orig_Index' not in test_df.columns:
    test_df = test_df.rename_axis('Orig_Index').reset_index()
test_df

Unnamed: 0,Orig_Index,Type
0,121,Citibike
1,326,Specialized
2,102,Specialized
3,94,Specialized
4,126,Specialized
...,...,...
85,188,Specialized
86,327,Specialized
87,236,Specialized
88,16,Specialized


In [21]:
# Put the actual labels and the predictions side by side for comparison.
# concat aligns the two frames on their index; both carry a 0..n-1 index here,
# so rows pair up in test-set order.
test_pred_df = pd.concat([test_df, pred_df], axis='columns')
test_pred_df

Unnamed: 0,Orig_Index,Type,Prediction
0,121,Citibike,Citibike
1,326,Specialized,Specialized
2,102,Specialized,Specialized
3,94,Specialized,Specialized
4,126,Specialized,Specialized
...,...,...,...
85,188,Specialized,Specialized
86,327,Specialized,Specialized
87,236,Specialized,Specialized
88,16,Specialized,Specialized


In [22]:
# Keep only the test rows whose predicted bike type differs from the actual one
pred_errs_df = test_pred_df.loc[test_pred_df['Type'].ne(test_pred_df['Prediction'])]
pred_errs_df

Unnamed: 0,Orig_Index,Type,Prediction
17,291,Citibike,Specialized
33,330,Specialized,Citibike
35,132,Specialized,Citibike
78,261,Specialized,Citibike


In [23]:
# Merging pred_errs_df with all_df to see details of errors
# The join key is given once ('Orig_Index') rather than as a duplicated list.
# validate='one_to_one' makes pandas raise if the key is not unique on either
# side, which would otherwise silently multiply rows.
# Both frames contain 'Type', so the result carries 'Type_x' (from pred_errs_df)
# and 'Type_y' (from all_df).
err_details_df = pd.merge(
    pred_errs_df,
    all_df,
    on='Orig_Index',
    validate='one_to_one',
)
err_details_df

Unnamed: 0,Orig_Index,Type_x,Prediction,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type_y
0,291,Citibike,Specialized,6/17/18,Work to Village,4.69,0.0,40.0,0.67,7.04,Citibike
1,330,Specialized,Citibike,12/29/20,From Trek WS,3.75,0.0,26.0,0.43,8.65,Specialized
2,132,Specialized,Citibike,6/28/20,Downtown Trio,4.1,0.0,34.0,0.57,7.24,Specialized
3,261,Specialized,Citibike,5/25/20,Uptown Trio,4.27,0.0,34.0,0.57,7.54,Specialized


In [24]:
# Cleaning up err_details_df to eliminate redundancy and improve readability
# 'Type_y' duplicates 'Type_x' (both are the recorded bike type), so it is
# dropped and 'Type_x' is renamed back to 'Type'.
# Written as a non-inplace chain with errors='ignore' so the cell can be
# re-run safely: once the columns are cleaned, a second run changes nothing
# instead of raising KeyError on the missing 'Type_y'.
err_details_df = (
    err_details_df
    .drop(columns='Type_y', errors='ignore')
    .rename(columns={'Type_x': 'Type'})
)
err_details_df

Unnamed: 0,Orig_Index,Type,Prediction,Date,Destination,Miles,Hours,Minutes,Duration,Speed
0,291,Citibike,Specialized,6/17/18,Work to Village,4.69,0.0,40.0,0.67,7.04
1,330,Specialized,Citibike,12/29/20,From Trek WS,3.75,0.0,26.0,0.43,8.65
2,132,Specialized,Citibike,6/28/20,Downtown Trio,4.1,0.0,34.0,0.57,7.24
3,261,Specialized,Citibike,5/25/20,Uptown Trio,4.27,0.0,34.0,0.57,7.54
