# Testing the model candidates

## Base features

* start time
* end time
* duration
* station A ID
* station B ID
* user type
* season (for future?)

## Constraints

Imbalanced dataset - 95% of the recorded trips are members.

## Candidates

1. Logistic regression
2. Random forest

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score

from sklearn.pipeline import make_pipeline

## Loading the dataset

In [2]:
from pathlib import Path

csv_path = Path.cwd().parent / 'data' / 'bikeshare-2017-q1.csv'
bikes = pd.read_csv(csv_path, index_col='trip_id')

bikes.sample(5)

Unnamed: 0_level_0,trip_start_time,trip_stop_time,trip_duration_seconds,from_station_id,from_station_name,to_station_id,to_station_name,user_type
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
829358,10/3/2017 13:28,10/3/2017 13:40,712,7168,Queens Quay / Yonge St,7062,University Ave / College St,Member
767133,3/2/2017 23:53,4/2/2017 0:03,604,7069,Queen St W / Spadina Ave,7019,Temperance St / Yonge St,Member
792684,21/02/2017 11:01,21/02/2017 11:21,1201,7024,Dundonald St / Church St,7108,Front St E / Cherry St,Casual
737479,19/01/2017 13:49,19/01/2017 14:00,692,7158,King St W / Stafford St,7053,Metro Hall Plaza,Casual
810503,1/3/2017 2:26,1/3/2017 2:47,1301,7006,Bay St / College St (East Side),7150,Dufferin St / Sylvan Av (Dufferin Grove Park),Member


In [5]:
bikes.dtypes

trip_start_time          object
trip_stop_time           object
trip_duration_seconds     int64
from_station_id           int64
from_station_name        object
to_station_id             int64
to_station_name          object
user_type                object
dtype: object

## Preprocessing

1. start/stop time should be converted to two fields: day of week and time of day
    * future dev: sync the date with public holiday calendar, i.e. an extra field indicating whether that day is a public holiday
2. station_id are basically encoded station names and so:
    * station names can be dropped
    * station_id should be one-hot encoded to be treated as categoricals
3. user_type should convert to 1/0: 1 for member, 0 for casual

In [5]:
df = bikes.sample(5)
datetimes = df['trip_start_time']
# can't convert two columns at the same time; method assumes that each column
# represents a year/month/day column in a dataframe???
# datetimes = df[['trip_start_time', 'trip_stop_time']]
print(datetimes.info())
dt = pd.to_datetime(
    datetimes,
    # format="%d/%m/%Y %H:%M",
    infer_datetime_format=True,
)
print(dt.info())

<class 'pandas.core.series.Series'>
Int64Index: 5 entries, 824695 to 801667
Series name: trip_start_time
Non-Null Count  Dtype 
--------------  ----- 
5 non-null      object
dtypes: object(1)
memory usage: 80.0+ bytes
None
<class 'pandas.core.series.Series'>
Int64Index: 5 entries, 824695 to 801667
Series name: trip_start_time
Non-Null Count  Dtype         
--------------  -----         
5 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 80.0 bytes
None


In [23]:
print(dt.iloc[0])

2017-01-13 23:05:00


In [20]:
dt.iloc[0].day_of_week

4

In [37]:
print(dt.iloc[0].hour, dt.iloc[0].minute)

23 5


In [15]:
def prep_bikes(df_bikes):
    """Preprocesses the bikeshare data
    
    Converts the datetimes from str obj to datetime objects
    """
    df_bikes['dt_start'] = pd.to_datetime(
        df_bikes['trip_start_time'],
        infer_datetime_format=True,
    )
    df_bikes['dt_end'] = pd.to_datetime(
        df_bikes['trip_stop_time'],
        infer_datetime_format=True,
    )
    # get day of week
    def get_day_of_week(trip):
        return trip['dt_start'].day_of_week

    df_bikes['day_of_week'] = df_bikes.apply(
        # get_day_of_week,
        lambda x: x['dt_start'].day_of_week,
        axis=1
    )
    
    # get hours
    df_bikes['start_hour'] = df_bikes.apply(
        lambda x: x['dt_start'].hour + x['dt_start'].minute / 60,
        axis=1,
    )
    df_bikes['end_hour'] = df_bikes.apply(
        lambda x: x['dt_end'].hour + x['dt_end'].minute / 60,
        axis=1,
    )
    drops = [
        'trip_start_time', 
        'trip_stop_time', 
        'from_station_name', 
        'to_station_name',
        'dt_start',
        'dt_end',
        'user_type',
        ]
    df_bikes = df_bikes.drop(drops, axis=1)
    return df_bikes


In [6]:
prepped = prep_bikes(df)

Steps 2 and 3 involve encoding the features and so will be part of the `sklearn` pipeline.

In [60]:
enc = OneHotEncoder(dtype=int)
station_ids_cols = ['from_station_id', 'to_station_id']
station_ids = prepped[station_ids_cols]

enc.fit_transform(station_ids,).toarray()


array([[1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0]])

In [88]:
enc.get_feature_names_out()

array(['from_station_id_7061', 'from_station_id_7124',
       'from_station_id_7158', 'from_station_id_7161',
       'from_station_id_7202', 'to_station_id_7049', 'to_station_id_7055',
       'to_station_id_7057', 'to_station_id_7064', 'to_station_id_7176'],
      dtype=object)

In [89]:
pd.DataFrame(enc.fit_transform(station_ids).toarray(), 
            columns=enc.get_feature_names_out())

Unnamed: 0,from_station_id_7061,from_station_id_7124,from_station_id_7158,from_station_id_7161,from_station_id_7202,to_station_id_7049,to_station_id_7055,to_station_id_7057,to_station_id_7064,to_station_id_7176
0,1,0,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0,1,0,0
2,0,1,0,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,1,1,0,0,0,0


In [46]:
station_ids

Unnamed: 0_level_0,from_station_id,to_station_id
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1
729152,7061,7055
806425,7161,7057
823936,7124,7064
738827,7158,7176
812654,7202,7049


In [100]:
label_enc = LabelBinarizer()
y = label_enc.fit_transform(prepped['user_type'])
y

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [87]:
inv_y = label_enc.inverse_transform(y)
inv_y

array(['Member', 'Member', 'Member', 'Member', 'Member'], dtype=object)

In [None]:
bikes_dt = prep_bikes(bikes)

In [79]:
# drop datetime columns to avoid passthrough error when using ColumnTransformer
# bikes_dt = bikes_dt.drop(['dt_start', 'dt_end'], axis=1)
bikes_dt.columns

Index(['trip_duration_seconds', 'from_station_id', 'to_station_id',
       'user_type', 'day_of_week', 'start_hour', 'end_hour'],
      dtype='object')

In [None]:
# testing how stratify= works in train split

X, y = np.ones((50, 1)), np.hstack(([0] * 45, [1] * 5))
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    )


In [9]:
bikes_samp = bikes.sample(20)
bikes_samp

Unnamed: 0_level_0,trip_start_time,trip_stop_time,trip_duration_seconds,from_station_id,from_station_name,to_station_id,to_station_name,user_type
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
750034,25/01/2017 14:47,25/01/2017 14:57,608,7011,Wellington St W / Portland St,7039,Simcoe St / Dundas St W,Member
765640,3/2/2017 11:40,3/2/2017 11:49,547,7033,Union Station,7047,University Ave / Gerrard St W,Member
799305,23/02/2017 20:38,23/02/2017 20:57,1164,7194,Mortimer Ave / Coxwell Ave,7117,Castle Frank Station,Member
847203,21/03/2017 21:31,21/03/2017 21:38,465,7039,Simcoe St / Dundas St W,7043,Queens Quay W / Lower Simcoe St,Member
744464,22/01/2017 23:35,22/01/2017 23:37,140,7059,Front St W / Blue Jays Way,7059,Front St W / Blue Jays Way,Member
843416,20/03/2017 18:09,20/03/2017 18:28,1143,7035,Queen St W / Ossington Ave,7010,King St W / Spadina Ave,Member
836619,16/03/2017 18:11,16/03/2017 18:29,1074,7066,Willcocks St / St. George St,7071,161 Bleecker St (South of Wellesley),Member
764015,2/2/2017 14:11,2/2/2017 14:21,634,7100,Dundas St E / Regent Park Blvd,7012,Elizabeth St / Edward St (Bus Terminal),Member
832733,13/03/2017 12:48,13/03/2017 12:58,637,7032,Augusta Ave / Dundas St W,7005,University Ave / King St W,Member
772565,8/2/2017 3:34,8/2/2017 3:54,1168,7035,Queen St W / Ossington Ave,7007,College St W / Huron St,Member


In [16]:
bikes_samp['target'] = bikes_samp['user_type'].apply(
    lambda type:  type == 'Member'
)
bikes_samp[['user_type', 'target']]

Unnamed: 0_level_0,user_type,target
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1
750034,Member,True
765640,Member,True
799305,Member,True
847203,Member,True
744464,Member,True
843416,Member,True
836619,Member,True
764015,Member,True
832733,Member,True
772565,Member,True


In [19]:
bikes_samp_prep = prep_bikes(bikes_samp)
enc = LabelBinarizer()
# y = enc.fit_transform(bikes_samp['user_type'])
train, test = train_test_split(
    bikes_samp_prep, 
    test_size=0.4,
    stratify=bikes_samp_prep['target'],
)

In [23]:
cv = StratifiedKFold()
trips = []
for train_idx, test_idx in cv.split(train.drop('target', axis=1), train['target']):
    trips.append(train[:,train_idx], train[:,test_idx])





InvalidIndexError: (slice(None, None, None), array([ 3,  4,  5,  6,  7,  8,  9, 10, 11]))

In [None]:
print(np.bincount(y_train), np.bincount(y_test))

[33  4] [12  1]


Split to train/val/test.

First split with `train_test_split` to get the hold-out set, then split the resulting training set via `StratifiedKfold`

In [93]:
# split to train/test.

X = bikes_dt.drop('user_type', axis=1)
y = bikes_dt['user_type']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    )

In [97]:
y.head()

trip_id
712382    Member
712383    Member
712384    Member
712385    Member
712386    Member
Name: user_type, dtype: object

Need `ColumnTransformer` to include one-hot encode of only select columns in a pipeline

In [130]:
from sklearn.compose import make_column_transformer
from category_encoders.target_encoder import TargetEncoder

# enc = OneHotEncoder()
col_trans = make_column_transformer(
    (OneHotEncoder(), ['from_station_id']),
    (OneHotEncoder(), ['to_station_id']),
    (TargetEncoder(), []),
    # remainder='drop',
    remainder='passthrough',
)
col_trans.fit(X_train, y_train)

In [132]:
foo = col_trans.fit_transform(X_test, y_test)
foo.toarray().shape

(33031, 404)

In [134]:
foo.toarray()[:5, -8:]

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.81000000e+02, 2.00000000e+00, 1.47833333e+01, 1.48333333e+01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.42000000e+02, 1.00000000e+00, 2.20333333e+01, 2.22000000e+01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        3.32000000e+02, 5.00000000e+00, 5.66666667e+00, 5.75000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.14400000e+03, 2.00000000e+00, 4.38333333e+00, 4.70000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.12000000e+02, 5.00000000e+00, 1.80333333e+01, 1.82166667e+01]])

### LabelBinarizer in Pipeline?

All steps in a pipeline are assumed to be transformers able to take `fit_transform(self, X, y)`, but the binarizer is only acting on `y`.

Need to use `TransformedTargetRegressor` to include our target transformation as part of the pipeline. What the wrapper class does is take the regressor that will be used (logistic reg or RF in our case), as well as the transformation that will be applied to the target.

The snag here is that `y` is required by the `Transformer` to already be a numpy array. Wouldn't need the wrapper if it already was, buddy.

Pipeline will not support target transformation since it's thought to be done only initially during fit. In all subsequent instances, the pipeline will only be working with new unlabelled data; no use in including support for target transformation.

In [121]:
# from sklearn.compose import TransformedTargetRegressor

label_enc = LabelBinarizer()
regressor = LogisticRegression(
    class_weight='balanced',
    max_iter=5e2)
# regr = TransformedTargetRegressor(
#     regressor=regressor,
#     transformer=label_enc,
# )

X_train_proc = col_trans.transform(X_train)
y_train_proc = label_enc.fit_transform(y_train).ravel()
regressor.fit(X_train_proc, y_train_proc)


In [126]:
X_test_proc = col_trans.transform(X_test)
y_test_proc = label_enc.transform(y_test).ravel()
y_score = regressor.predict_proba(X_test_proc)[:, 0]
y_pred = regressor.predict(X_test_proc)

roc = roc_auc_score(y_test_proc, y_score)
f1 = f1_score(y_test_proc, y_pred, average=None)
print(roc, f1)

0.1623052545436927 [0.26134342 0.88361661]


In [123]:
baz = np.array([0, 1])
label_enc.inverse_transform(baz)

array(['Casual', 'Member'], dtype='<U6')

In [125]:
rf =  RandomForestClassifier(
    class_weight='balanced',
)
rf.fit(X_train_proc, y_train_proc)
rf_y_score = rf.predict_proba(X_test_proc)[:, 0]
rf_y_pred = rf.predict(X_test_proc)
rf_roc = roc_auc_score(y_test_proc, rf_y_score)
rf_f1 = f1_score(y_test_proc, rf_y_pred, average=None)
print(rf_roc, rf_f1)

0.1172606802898305 [0.5789681  0.98317145]
