## <center>This Model will Attempt to Predict the Type of Bicycle Used from Trip Data</center>

In [105]:
# Eliminating Warnings
import warnings
warnings.filterwarnings('ignore')

In [106]:
# Importing Libraries
# Standard library
import os
from collections import Counter
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [107]:
# Relative directory used for every CSV read and write in this notebook
data_path = "data"

In [108]:
# Read the full trip log from <data_path>/all_data.csv into a DataFrame
all_data_file = 'all_data.csv'
all_data_path = os.path.join(data_path, all_data_file)
all_df = pd.read_csv(all_data_path)
all_df

Unnamed: 0,Date,Destination,Miles,Hours,Minutes,Duration,Speed,Type
0,2/24/21,Dn ES Up LWS,14.65,1.0,54.0,1.90,7.71,Specialized
1,10/3/16,Citibike to Work,1.31,,10.0,0.17,7.86,Citibike
2,7/13/20,Wards Island,13.59,1.0,43.0,1.72,7.92,Specialized
3,6/27/16,Citibike to Work,1.08,,11.0,0.18,5.89,Citibike
4,12/27/20,To Zabars,2.30,,21.0,0.35,6.57,Citibike
...,...,...,...,...,...,...,...,...
348,10/4/16,Citibike to Work,1.36,,11.0,0.18,7.42,Citibike
349,5/13/20,CP BL x 2,13.82,1.0,32.0,1.53,9.01,Specialized
350,8/6/20,Down ES Up WS to GWB,28.47,3.0,10.0,3.17,8.99,Specialized
351,11/9/20,Dn ES Up WS,17.70,2.0,3.0,2.05,8.63,Specialized


In [109]:
# Columns kept for modelling.
# 'Type' is the label; it is separated from the other three columns in a later cell.
columns = [
    'Miles',
    'Duration',
    'Speed',
    'Type',
]

In [110]:
# Build an independent working copy that holds only the selected columns
bike_df = all_df[columns].copy()
bike_df

Unnamed: 0,Miles,Duration,Speed,Type
0,14.65,1.90,7.71,Specialized
1,1.31,0.17,7.86,Citibike
2,13.59,1.72,7.92,Specialized
3,1.08,0.18,5.89,Citibike
4,2.30,0.35,6.57,Citibike
...,...,...,...,...
348,1.36,0.18,7.42,Citibike
349,13.82,1.53,9.01,Specialized
350,28.47,3.17,8.99,Specialized
351,17.70,2.05,8.63,Specialized


In [111]:
# Create Dataframe with desired columns
# Demonstrating a different way to do it (left commented out, so it is not executed)
#new2_df = all_df[['Miles', 'Duration', 'Speed', 'Type']]
#new2_df

In [112]:
# Separate the feature columns (X) from the label column (y)
X = bike_df.drop(columns=["Type"])
y = bike_df["Type"]

In [113]:
# Splitting out Training and Testing data
# train_test_split is already imported in the imports cell at the top of the notebook.
# No test_size is passed, so scikit-learn's default split proportion applies;
# stratify=y keeps the bicycle-type proportions the same in both splits and
# random_state=1 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [114]:
# Checking training data shape: (number of rows, number of feature columns)
X_train.shape

(264, 3)

In [115]:
# Checking testing data shape: (number of rows, number of feature columns)
X_test.shape

(89, 3)

In [116]:
# Creating the Logistic Regression Model
# LogisticRegression is already imported in the imports cell at the top of the notebook.
# max_iter=200 sets the iteration cap for the lbfgs solver; random_state=1 fixes the seed.
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

In [117]:
# Training the Model: fit the classifier on the training features and their labels.
# NOTE(review): the features are passed as-is; StandardScaler is imported but is not
# applied in the cells visible here — confirm that unscaled inputs are intended.
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [118]:
# Create Predictions: predicted bicycle type for each row of the held-out test features
y_pred = classifier.predict(X_test)

In [119]:
# Get accuracy score: compare the predicted types against the true test labels.
# (accuracy_score is imported in the imports cell at the top of the notebook)
print(accuracy_score(y_test, y_pred))

0.9550561797752809


In [121]:
# Wrap the predicted labels in a DataFrame so they can be lined up against the true labels.
# The frame gets a default 0..n-1 index and a single column named 0.
pred_df = pd.DataFrame(data=y_pred)
pred_df

Unnamed: 0,0
0,Specialized
1,Citibike
2,Specialized
3,Specialized
4,Specialized
...,...
84,Citibike
85,Citibike
86,Specialized
87,Citibike


In [122]:
# Turn the true test labels into a one-column DataFrame.
# It keeps the row labels of the original records rather than a 0..n-1 index.
test_df = y_test.to_frame()
test_df

Unnamed: 0,Type
222,Specialized
294,Citibike
100,Specialized
132,Specialized
85,Specialized
...,...
318,Citibike
227,Citibike
53,Specialized
173,Citibike


### The following 3 cells are a sledgehammer approach to eliminating the index on the test data so that it can be combined with the predicted data for comparison

In [130]:
# Write the true test labels to <data_path>/test_data.csv, leaving the index out
test_data_file = 'test_data.csv'
test_data_path = os.path.join(data_path, test_data_file)
test_df.to_csv(test_data_path, index=False)

In [133]:
# Second export, this time keeping the index, so each test row can be traced back
# to its original record
test_data_index_file = 'test_data_index.csv'
test_data_index_path = os.path.join(data_path, test_data_index_file)
test_df.to_csv(test_data_index_path)

In [127]:
# Create a new dataframe without the original index.
# reset_index(drop=True) discards the old row labels and assigns a fresh 0..n-1 index
# in memory, so the frame no longer has to be re-read from the exported CSV file
# (a round trip through disk can also alter values, e.g. blank or "NA" entries).
test2_df = test_df.reset_index(drop=True)
test2_df

Unnamed: 0,Type
0,Specialized
1,Citibike
2,Specialized
3,Specialized
4,Specialized
...,...
84,Citibike
85,Citibike
86,Specialized
87,Citibike


### Combining the test and prediction dataframes for comparison

In [128]:
# Place the index-free true labels and the predictions side by side (joined column-wise)
test_pred_df = pd.concat([test2_df, pred_df], axis='columns')
test_pred_df

Unnamed: 0,Type,0
0,Specialized,Specialized
1,Citibike,Citibike
2,Specialized,Specialized
3,Specialized,Specialized
4,Specialized,Specialized
...,...,...
84,Citibike,Citibike
85,Citibike,Citibike
86,Specialized,Specialized
87,Citibike,Citibike


In [129]:
# Save the side-by-side comparison to <data_path>/test_pred.csv for inspection
test_pred_file = "test_pred.csv"
test_pred_path = os.path.join(data_path, test_pred_file)
test_pred_df.to_csv(test_pred_path)