In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import zero_one_loss
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# ML algs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Data preparation
We plan to predict the user type (subscriber or customer) and the duration of trip using a few given features.

In [2]:
dat = pd.read_csv('../data.csv', parse_dates=['starttime', 'stoptime']).rename(columns={
    'start.station.latitude': 'lat',
    'start.station.longitude': 'lon'
    })

# define day of week, hour of day, and day-time of week for the starting time
dat['wday'] = dat['starttime'].dt.dayofweek
dat['hour'] = dat['starttime'].dt.hour
dat['wdaytime'] = dat['wday'] + dat['hour']/24.0

# duration of trip: convert seconds to minutes
dat['trip_dur'] = dat['tripduration']/60.0

# predictors: lat, lon, wday, and hour
# labels: usertype (class) and trip_dur (numeric)
dat = dat[['lat', 'lon', 'wday', 'hour', 'usertype', 'trip_dur']]
dat.head()

Unnamed: 0,lat,lon,wday,hour,usertype,trip_dur
0,40.736245,-73.984738,1,20,Subscriber,5.216667
1,40.730477,-73.999061,4,8,Subscriber,6.116667
2,40.735354,-74.004831,4,8,Subscriber,15.95
3,40.760193,-73.991255,3,22,Subscriber,15.45
4,40.711464,-74.005524,6,7,Subscriber,11.583333


In [3]:
# predictors
X = dat[['lat', 'lon', 'wday', 'hour']]

# label for classification
Y = dat['usertype']
print('Class counts for usertype:\n{}'.format(Y.value_counts(dropna=False)))

# label for regression
Z = dat['trip_dur']
print('\nDistribution of trip_dur:\n{}'.format(Z.describe()))

Class counts for usertype:
usertype
Subscriber    43781
Customer       6219
Name: count, dtype: int64

Distribution of trip_dur:
count    50000.000000
mean        14.311048
std         14.348527
min          1.000000
25%          6.800000
50%         10.816667
75%         17.833333
max        346.200000
Name: trip_dur, dtype: float64


# Train classifier to predict usertype

In [4]:
# split training and test data to predict Y (usertype)
Xy_train, Xy_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1)

# normalize data for Y
scaler_y = StandardScaler()
scaler_y.fit(Xy_train)
Xy_train_scaled = scaler_y.transform(Xy_train)
Xy_test_scaled = scaler_y.transform(Xy_test)

# create model
model_y = RandomForestClassifier(max_depth=2, max_samples=0.5, random_state=0)

# train model
model_y.fit(Xy_train_scaled, Y_train)

# score trained model with cross validation
model_y_cv_scores = cross_val_score(model_y, Xy_train_scaled, Y_train, cv=3)

# mean and deviation of accuracy in cv
print('Accuracy {} +/- {}'.format(np.mean(model_y_cv_scores), np.std(model_y_cv_scores)))

Accuracy 0.8780499998475664 +/- 4.3114758167406825e-06


In [5]:
Y_pred = model_y.predict(Xy_test_scaled)
model_y_error_rate = zero_one_loss(Y_test, Y_pred, normalize=True)
print('Classification error rate = {}'.format(model_y_error_rate))

Classification error rate = 0.1341


# Train regressor to predict trip_dur

In [6]:
# split training and test data to predict Z (trip_dur)
Xz_train, Xz_test, Z_train, Z_test = train_test_split(
    X, Z, test_size=0.2, random_state=1)

# normalize data for Z
scaler_z = StandardScaler()
scaler_z.fit(Xz_train)
Xz_train_scaled = scaler_z.transform(Xz_train)
Xz_test_scaled = scaler_z.transform(Xz_test)

# create model
model_z = MLPRegressor(hidden_layer_sizes=(50, 25, ),
                       activation='relu',
                       alpha=0.0001,
                       max_iter=300,
                       random_state=0)
# model_z = KNeighborsRegressor(n_neighbors=500)
# model_z = RandomForestRegressor(max_depth=6, max_samples=0.5, random_state=0)

# train model
model_z.fit(Xz_train_scaled, Z_train)

# score trained model with cross validation
model_z_cv_scores = cross_val_score(model_z, Xz_train_scaled, Z_train, cv=3)

# mean and deviation of accuracy in cv
print('Accuracy {} +/- {}'.format(np.mean(model_z_cv_scores), np.std(model_z_cv_scores)))

Accuracy 0.04054042227435987 +/- 0.005343509423974904


In [7]:
Z_pred = model_z.predict(Xz_test_scaled)
model_z_mae = mean_absolute_error(Z_test, Z_pred)
model_z_mse = mean_squared_error(Z_test, Z_pred)
print('Regression MAE = {}, MSE = {}'.format(model_z_mae, model_z_mse))

Regression MAE = 7.836058693051617, MSE = 214.93003663634136
