## <center>This Machine Learning Model will Attempt to Predict the Type of Bicycle Used from Trip Data</center>

In [18]:
# Importing Libraries
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Relative directory that holds the CSV files read later in this notebook
data_path = "data"

In [3]:
# Read the full set of trip records (all_data.csv under data_path) into a DataFrame
all_data_file = 'all_data.csv'
all_df = pd.read_csv(
    os.path.join(data_path, all_data_file)
)
all_df

Unnamed: 0,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,11/27/20,Wards Island x 2 Astoria,22.87,2.0,48.0,2.80,8.17,Specialized
1,5/24/20,CP BL,7.71,,49.0,0.82,9.44,Specialized
2,10/8/20,Wards Island x 2,17.71,2.0,7.0,2.12,8.37,Specialized
3,9/3/16,Down ES UP WS,16.72,1.0,52.0,1.87,8.96,Specialized
4,7/10/16,Down ES UP WS,18.38,2.0,20.0,2.33,7.88,Specialized
...,...,...,...,...,...,...,...,...
388,11/11/20,Dn ES Up LWS,14.81,1.0,40.0,1.67,8.89,Specialized
389,9/20/16,Citibike from Work Midday,0.90,,12.0,0.20,4.50,Citibike
390,8/17/20,Up Dn UWS to GWB,16.93,1.0,54.0,1.90,8.91,Specialized
391,9/3/21,Up Dn WS to GWB Up ES,29.80,3.0,23.0,3.38,8.81,Specialized


In [4]:
# Replace missing Hours values with 0. In the sample shown above, the rows with a
# blank Hours value are rides of under an hour (only Minutes is filled in).
# Original author's note: NaN only occurs in the Hours column.
all_df = all_df.assign(Hours=all_df['Hours'].fillna(0))
all_df

Unnamed: 0,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,11/27/20,Wards Island x 2 Astoria,22.87,2.0,48.0,2.80,8.17,Specialized
1,5/24/20,CP BL,7.71,0.0,49.0,0.82,9.44,Specialized
2,10/8/20,Wards Island x 2,17.71,2.0,7.0,2.12,8.37,Specialized
3,9/3/16,Down ES UP WS,16.72,1.0,52.0,1.87,8.96,Specialized
4,7/10/16,Down ES UP WS,18.38,2.0,20.0,2.33,7.88,Specialized
...,...,...,...,...,...,...,...,...
388,11/11/20,Dn ES Up LWS,14.81,1.0,40.0,1.67,8.89,Specialized
389,9/20/16,Citibike from Work Midday,0.90,0.0,12.0,0.20,4.50,Citibike
390,8/17/20,Up Dn UWS to GWB,16.93,1.0,54.0,1.90,8.91,Specialized
391,9/3/21,Up Dn WS to GWB Up ES,29.80,3.0,23.0,3.38,8.81,Specialized


In [5]:
# Turning Index into Orig_Index Column to trace back to original records.
# rename_axis names the index before reset_index moves it into a column, so the
# new column is called 'Orig_Index' directly (no separate rename step needed).
# The guard keeps this cell idempotent: without it, re-running the cell would
# silently insert a second 'Orig_Index' column.
if 'Orig_Index' not in all_df.columns:
    all_df = all_df.rename_axis('Orig_Index').reset_index()
all_df

Unnamed: 0,Orig_Index,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,0,11/27/20,Wards Island x 2 Astoria,22.87,2.0,48.0,2.80,8.17,Specialized
1,1,5/24/20,CP BL,7.71,0.0,49.0,0.82,9.44,Specialized
2,2,10/8/20,Wards Island x 2,17.71,2.0,7.0,2.12,8.37,Specialized
3,3,9/3/16,Down ES UP WS,16.72,1.0,52.0,1.87,8.96,Specialized
4,4,7/10/16,Down ES UP WS,18.38,2.0,20.0,2.33,7.88,Specialized
...,...,...,...,...,...,...,...,...,...
388,388,11/11/20,Dn ES Up LWS,14.81,1.0,40.0,1.67,8.89,Specialized
389,389,9/20/16,Citibike from Work Midday,0.90,0.0,12.0,0.20,4.50,Citibike
390,390,8/17/20,Up Dn UWS to GWB,16.93,1.0,54.0,1.90,8.91,Specialized
391,391,9/3/21,Up Dn WS to GWB Up ES,29.80,3.0,23.0,3.38,8.81,Specialized


In [6]:
# Fields kept for modelling: three numeric ride measurements plus 'Type',
# which is split off as the prediction target further down
columns = [
    'Miles',
    'Duration',
    'Speed',
    'Type',
]

In [7]:
# Keep only the modelling fields; .copy() gives bike_df its own data so later
# changes to it cannot affect all_df
bike_df = all_df[columns].copy()
bike_df

Unnamed: 0,Miles,Duration,Speed,Type
0,22.87,2.80,8.17,Specialized
1,7.71,0.82,9.44,Specialized
2,17.71,2.12,8.37,Specialized
3,16.72,1.87,8.96,Specialized
4,18.38,2.33,7.88,Specialized
...,...,...,...,...
388,14.81,1.67,8.89,Specialized
389,0.90,0.20,4.50,Citibike
390,16.93,1.90,8.91,Specialized
391,29.80,3.38,8.81,Specialized


In [8]:
# Separate the feature columns (X) from the label column (y = bike Type)
X = bike_df.drop("Type", axis=1)
y = bike_df["Type"]

In [9]:
# Hold out part of the data for testing.
# stratify=y keeps the bike-type proportions the same in both splits;
# random_state=1 makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

In [10]:
# Checking training data shape: (number of training rows, number of feature columns)
X_train.shape

(294, 3)

In [11]:
# Checking testing data shape: (number of held-out rows, number of feature columns)
X_test.shape

(99, 3)

In [12]:
# Build the (not yet fitted) Logistic Regression classifier
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=200,
    random_state=1,
)

In [13]:
# Training the Model: fit the classifier on the training split only,
# leaving X_test / y_test unseen for evaluation
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [14]:
# Create Predictions for the held-out test rows
y_pred = classifier.predict(X_test)
# Report how well the model does on data it was not trained on.
# accuracy_score is imported at the top of the notebook but was never called,
# so the test-set predictions were previously computed and then never scored.
accuracy_score(y_test, y_pred)

In [16]:
# Display the held-out feature rows that were passed to classifier.predict
X_test

Unnamed: 0,Miles,Duration,Speed
221,7.81,0.80,9.76
185,7.55,0.87,8.71
347,6.78,0.72,9.46
261,6.75,0.72,9.42
343,7.43,1.10,6.75
...,...,...,...
351,2.72,0.43,6.28
132,7.51,0.70,10.73
82,14.57,2.18,6.67
330,17.72,2.00,8.86


In [32]:
# Read the single-ride test record (TestTrip.csv under data_path) into a DataFrame
test_trip_file = 'TestTrip.csv'
test_trip_df = pd.read_csv(
    os.path.join(data_path, test_trip_file)
)
test_trip_df

Unnamed: 0,Miles,Duration,Speed
0,1.46,0.28,5.15


In [33]:
# Create Prediction for a single ride record.
# Select the feature columns explicitly, in the same order used for training, so
# extra or re-ordered columns in TestTrip.csv cannot be fed to the model as the
# wrong features (a missing column raises a clear KeyError here instead).
# NOTE(review): this rebinds y_pred, replacing the test-set predictions made
# earlier; re-run the test-set prediction cell before comparing y_pred with y_test.
trip_features = test_trip_df[X_train.columns]
y_pred = classifier.predict(trip_features)
print(y_pred[0])

Citibike
