## <center>This Machine Learning Model will Attempt to Predict the Type of Bicycle Used from Trip Data</center>

In [1]:
# Importing Libraries
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Relative directory that holds the input data file
data_path = "data"

In [3]:
# Read the base trip records from the CSV in the data directory
all_data_file = 'all_data.csv'
all_data_path = os.path.join(data_path, all_data_file)
all_df = pd.read_csv(all_data_path)
all_df

Unnamed: 0,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,5/2/21,Dn ES Up WS to GWB,29.47,3.0,7.0,3.12,9.46,Specialized
1,7/11/19,Up ES Down WS,19.47,3.0,9.0,3.15,6.18,Specialized
2,9/9/16,Citibike to Work,1.11,,11.0,0.18,6.05,Citibike
3,9/22/20,Up Dn WS to GWB Up ES,28.57,2.0,56.0,2.93,9.74,Specialized
4,11/2/19,Wards Island,14.20,2.0,9.0,2.15,6.60,Specialized
...,...,...,...,...,...,...,...,...
381,10/19/18,CP BL,7.44,,52.0,0.87,8.58,Specialized
382,10/17/16,Work to Orwasher's,2.09,,18.0,0.30,6.97,Citibike
383,7/12/21,Wards Island +,16.11,1.0,59.0,1.98,8.12,Specialized
384,9/29/19,Wards Island,14.66,1.0,48.0,1.80,8.14,Specialized


In [4]:
# Replace missing Hours with 0. NaN only shows up in the Hours column
# (rides logged with minutes only), so 0 is the accurate value there.
all_df = all_df.fillna({'Hours': 0})
all_df

Unnamed: 0,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,5/2/21,Dn ES Up WS to GWB,29.47,3.0,7.0,3.12,9.46,Specialized
1,7/11/19,Up ES Down WS,19.47,3.0,9.0,3.15,6.18,Specialized
2,9/9/16,Citibike to Work,1.11,0.0,11.0,0.18,6.05,Citibike
3,9/22/20,Up Dn WS to GWB Up ES,28.57,2.0,56.0,2.93,9.74,Specialized
4,11/2/19,Wards Island,14.20,2.0,9.0,2.15,6.60,Specialized
...,...,...,...,...,...,...,...,...
381,10/19/18,CP BL,7.44,0.0,52.0,0.87,8.58,Specialized
382,10/17/16,Work to Orwasher's,2.09,0.0,18.0,0.30,6.97,Citibike
383,7/12/21,Wards Island +,16.11,1.0,59.0,1.98,8.12,Specialized
384,9/29/19,Wards Island,14.66,1.0,48.0,1.80,8.14,Specialized


In [5]:
# Copy the row index into an 'Orig_Index' column so that rows can be traced
# back to the original records after the train/test split shuffles them.
# The guard keeps the cell idempotent: re-running it must not add a second
# 'Orig_Index' column on top of the one created by the first run.
if 'Orig_Index' not in all_df.columns:
    all_df = all_df.rename_axis('Orig_Index').reset_index()
all_df

Unnamed: 0,Orig_Index,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,0,5/2/21,Dn ES Up WS to GWB,29.47,3.0,7.0,3.12,9.46,Specialized
1,1,7/11/19,Up ES Down WS,19.47,3.0,9.0,3.15,6.18,Specialized
2,2,9/9/16,Citibike to Work,1.11,0.0,11.0,0.18,6.05,Citibike
3,3,9/22/20,Up Dn WS to GWB Up ES,28.57,2.0,56.0,2.93,9.74,Specialized
4,4,11/2/19,Wards Island,14.20,2.0,9.0,2.15,6.60,Specialized
...,...,...,...,...,...,...,...,...,...
381,381,10/19/18,CP BL,7.44,0.0,52.0,0.87,8.58,Specialized
382,382,10/17/16,Work to Orwasher's,2.09,0.0,18.0,0.30,6.97,Citibike
383,383,7/12/21,Wards Island +,16.11,1.0,59.0,1.98,8.12,Specialized
384,384,9/29/19,Wards Island,14.66,1.0,48.0,1.80,8.14,Specialized


In [6]:
# Columns kept for modelling: the three trip measurements plus the 'Type' label
columns = [
    'Miles',
    'Duration',
    'Speed',
    'Type',
]

In [7]:
# Work on an independent copy that holds only the selected columns
bike_df = all_df[columns].copy()
bike_df

Unnamed: 0,Miles,Duration,Speed,Type
0,29.47,3.12,9.46,Specialized
1,19.47,3.15,6.18,Specialized
2,1.11,0.18,6.05,Citibike
3,28.57,2.93,9.74,Specialized
4,14.20,2.15,6.60,Specialized
...,...,...,...,...
381,7.44,0.87,8.58,Specialized
382,2.09,0.30,6.97,Citibike
383,16.11,1.98,8.12,Specialized
384,14.66,1.80,8.14,Specialized


In [8]:
# Separate the label column (y) from the predictor columns (X)
target_column = "Type"
X = bike_df.drop(columns=[target_column])
y = bike_df[target_column]

In [9]:
# Hold out a test set. Stratifying on y keeps the bike-type proportions the
# same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [10]:
# Show the training feature matrix size as (rows, columns); the columns are
# the Miles, Duration and Speed features left after dropping 'Type'.
X_train.shape

(289, 3)

In [11]:
# Show the held-out test feature matrix size as (rows, columns)
X_test.shape

(97, 3)

In [12]:
# Build the (not yet fitted) logistic regression classifier
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=200,
    random_state=1,
)

In [13]:
# Fit the classifier on the training features and their bike-type labels
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [14]:
# Predict a bike type for every row of the held-out test features
y_pred = classifier.predict(X_test)

In [15]:
# Fraction of test rows whose predicted bike type matches the true label
accu_score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accu_score)

0.9896907216494846


In [16]:
# Put the predictions into a one-column DataFrame named 'Prediction'.
# The default 0..n-1 index is kept so the rows can later be lined up by
# position with the test labels. (The former bare `pred_df.reset_index()`
# call discarded its result and had no effect, so it has been removed.)
pred_df = pd.DataFrame({'Prediction': y_pred})
pred_df

Unnamed: 0,Prediction
0,Specialized
1,Specialized
2,Specialized
3,Specialized
4,Specialized
...,...
92,Citibike
93,Specialized
94,Specialized
95,Specialized


In [17]:
# Turn the test labels (a Series) into a one-column DataFrame;
# its index still carries the row labels from the original data.
test_df = y_test.to_frame()
test_df

Unnamed: 0,Type
124,Specialized
348,Specialized
32,Specialized
40,Specialized
277,Specialized
...,...
344,Citibike
49,Specialized
12,Specialized
245,Specialized


In [18]:
# Move the original row labels out of the index into an 'Orig_Index' column,
# leaving a fresh 0..n-1 index behind.
# The guard keeps the cell idempotent: re-running it must not push the new
# positional index into a second 'Orig_Index' column.
if 'Orig_Index' not in test_df.columns:
    test_df = test_df.rename_axis('Orig_Index').reset_index()
test_df

Unnamed: 0,Orig_Index,Type
0,124,Specialized
1,348,Specialized
2,32,Specialized
3,40,Specialized
4,277,Specialized
...,...,...
92,344,Citibike
93,49,Specialized
94,12,Specialized
95,245,Specialized


In [19]:
# Place the test labels and the predictions side by side for comparison.
# Both frames carry a 0..n-1 index, so rows are paired by position.
test_pred_df = pd.concat([test_df, pred_df], axis='columns')
test_pred_df

Unnamed: 0,Orig_Index,Type,Prediction
0,124,Specialized,Specialized
1,348,Specialized,Specialized
2,32,Specialized,Specialized
3,40,Specialized,Specialized
4,277,Specialized,Specialized
...,...,...,...
92,344,Citibike,Citibike
93,49,Specialized,Specialized
94,12,Specialized,Specialized
95,245,Specialized,Specialized


In [20]:
# Keep only the rows whose predicted bike type differs from the actual one
is_wrong = test_pred_df['Type'].ne(test_pred_df['Prediction'])
pred_errs_df = test_pred_df[is_wrong]
pred_errs_df

Unnamed: 0,Orig_Index,Type,Prediction
61,74,Specialized,Citibike


In [21]:
# Look up the full trip record for each misclassified row by joining on
# 'Orig_Index' (the key was previously listed twice; once is sufficient).
# 'Type' exists in both frames, so the result carries 'Type_x' (from
# pred_errs_df) and 'Type_y' (from all_df).
err_details_df = pd.merge(pred_errs_df, all_df, on='Orig_Index')
err_details_df

Unnamed: 0,Orig_Index,Type_x,Prediction,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type_y
0,74,Specialized,Citibike,6/28/20,Uptown Trio,3.95,0.0,26.0,0.43,9.12,Specialized


In [22]:
# Remove the duplicated label column and restore the plain 'Type' name.
# Written as a non-mutating chain with errors='ignore' so the cell can be
# re-run safely: on a second run 'Type_y' is already gone and 'Type_x' has
# already been renamed, and both steps then simply do nothing.
err_details_df = (
    err_details_df
    .drop(columns='Type_y', errors='ignore')
    .rename(columns={'Type_x': 'Type'})
)
err_details_df

Unnamed: 0,Orig_Index,Type,Prediction,Date,Destination,Miles,Hours,Minutes,Duration,Speed
0,74,Specialized,Citibike,6/28/20,Uptown Trio,3.95,0.0,26.0,0.43,9.12


In [23]:
# Print the accuracy as a percentage with two decimals
# (fixes the misspelled "Accruacy" label in the printed message)
print('Accuracy is: {:.2f}%'.format(accu_score * 100))

Accruacy is: 98.97%
