In [1]:
# Dataset load
# Pull the zipped Capital Bikeshare trip export directly from its public URL
# and parse it into a DataFrame; pandas unpacks the zip archive on the fly.
import pandas as pd

url = 'https://s3.amazonaws.com/capitalbikeshare-data/201908-capitalbikeshare-tripdata.zip'
df = pd.read_csv(filepath_or_buffer=url, compression='zip')

# Show the column labels exactly as they arrive from the source file.
print(df.columns)

Index(['Duration', 'Start date', 'End date', 'Start station number',
       'Start station', 'End station number', 'End station', 'Bike number',
       'Member type'],
      dtype='object')


In [2]:
# Preprocessing
# Drop incomplete rows and normalise the column labels to snake_case.
#
# Columns are renamed BY NAME rather than by position: assigning a bare list
# to `df.columns` would silently mislabel the data if the source file ever
# reordered its columns. `rename` ignores labels it does not find, so the
# check below fails loudly when the expected schema is not present.
# The cell is safe to re-run: on an already-renamed frame `rename` is a no-op
# and the check still passes.
column_names = {
    'Duration': 'duration',
    'Start date': 'start_date',
    'End date': 'end_date',
    'Start station number': 'start_station_number',
    'Start station': 'start_station',
    'End station number': 'end_station_number',
    'End station': 'end_station',
    'Bike number': 'bike_number',
    'Member type': 'class',
}
# Build a new frame instead of mutating in place, then rebind `df` so the
# downstream cells keep working with the same name.
df = df.dropna().rename(columns=column_names)

missing_columns = sorted(set(column_names.values()) - set(df.columns))
if missing_columns:
    raise KeyError(f"Expected columns missing after rename: {missing_columns}")

print(df.shape)
df.head()

(360044, 9)


Unnamed: 0,duration,start_date,end_date,start_station_number,start_station,end_station_number,end_station,bike_number,class
0,357,2019-08-01 00:00:34,2019-08-01 00:06:31,31117,15th & Euclid St NW,31115,Columbia Rd & Georgia Ave NW,W21052,Member
1,1100,2019-08-01 00:01:48,2019-08-01 00:20:09,31407,14th St & Colorado Ave NW,31115,Columbia Rd & Georgia Ave NW,W01101,Member
2,1406,2019-08-01 00:04:04,2019-08-01 00:27:31,31407,14th St & Colorado Ave NW,31234,20th & O St NW / Dupont South,W20870,Casual
3,3667,2019-08-01 00:05:09,2019-08-01 01:06:16,31907,Franklin St & S Washington St,31907,Franklin St & S Washington St,W00966,Member
4,282,2019-08-01 00:05:13,2019-08-01 00:09:56,31201,15th & P St NW,31234,20th & O St NW / Dupont South,W20443,Member


In [0]:
# Train test split
# Build the feature matrix / label vector and hold out 20% of the rows.
import numpy as np
from sklearn import model_selection

# Fixed seed: without it every run shuffles differently, so the reported
# metrics cannot be reproduced after a kernel restart.
SPLIT_SEED = 42

# Features: trip duration plus the start/end station numbers.
X = np.array(df[['duration', 'start_station_number', 'end_station_number']])
# Target: the rider type stored in the `class` column.
Y = np.array(df['class'])

# `stratify=Y` keeps the class proportions equal in both partitions; the
# label is imbalanced (far more 'Member' than 'Casual' rows in the report),
# so an unstratified split can skew the minority-class share of the test set.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=Y,
)

In [4]:
# Training the classifier
# Fit a k-nearest-neighbours model on the training partition.
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors=5 is the library default (see the estimator repr printed by
# this cell); it is spelled out here so the choice is visible.
# NOTE(review): the features are passed in unscaled and the repr shows a
# Minkowski metric with p=2 — confirm that mixing raw duration with station
# numbers in one distance is intended.
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X=X_train, y=Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [5]:
# Result and report
# Score the fitted classifier on the held-out partition.
from sklearn.metrics import classification_report

# Predict a label for every test row.
predictions = clf.predict(X_test)

# Per-class precision / recall / F1 plus the aggregate averages.
print(classification_report(y_true=Y_test, y_pred=predictions))

# Eyeball the first ten predicted labels against the first ten true labels.
_first_ten = slice(None, 10)
print(predictions[_first_ten])
print(Y_test[_first_ten])

              precision    recall  f1-score   support

      Casual       0.52      0.33      0.40     10321
      Member       0.89      0.95      0.92     61688

    accuracy                           0.86     72009
   macro avg       0.71      0.64      0.66     72009
weighted avg       0.84      0.86      0.85     72009

['Member' 'Casual' 'Member' 'Member' 'Member' 'Casual' 'Member' 'Member'
 'Member' 'Member']
['Member' 'Casual' 'Member' 'Casual' 'Member' 'Casual' 'Member' 'Member'
 'Casual' 'Member']
