# Testing the model candidates

## Base features

* start time
* end time
* duration
* station A ID
* station B ID
* user type
* season (for future?)

## Constraints

Imbalanced dataset - 95% of the recorded trips are members.

## Candidates

1. Logistic regression
2. Random forest

In [53]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score

from sklearn.pipeline import Pipeline

## Loading the dataset

In [4]:
from pathlib import Path

csv_path = Path.cwd().parent / 'data' / 'bikeshare-2017-q1.csv'
bikes = pd.read_csv(csv_path, index_col='trip_id')

bikes.sample(5)

Unnamed: 0_level_0,trip_start_time,trip_stop_time,trip_duration_seconds,from_station_id,from_station_name,to_station_id,to_station_name,user_type
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
784964,16/02/2017 21:33,16/02/2017 21:53,1207,7202,Queen St W / York St (City Hall),7097,Riverdale Park North (Broadview Ave),Member
863208,29/03/2017 12:55,29/03/2017 13:03,533,7006,Bay St / College St (East Side),7057,Simcoe St / Wellington St W,Member
800702,24/02/2017 3:44,24/02/2017 4:01,1071,7046,Niagara St / Richmond St W,7163,Yonge St / Wood St,Member
769516,6/2/2017 13:46,6/2/2017 14:00,839,7010,King St W / Spadina Ave,7028,Gould St / Mutual St,Member
778488,11/2/2017 23:52,12/2/2017 0:01,492,7008,Wellesley St / Queen's Park Cres,7031,Jarvis St / Isabella St,Member


In [5]:
bikes.dtypes

trip_start_time          object
trip_stop_time           object
trip_duration_seconds     int64
from_station_id           int64
from_station_name        object
to_station_id             int64
to_station_name          object
user_type                object
dtype: object

## Preprocessing

1. start/stop time should be converted to two fields: day of week and time of day
    * future dev: sync the date with public holiday calendar, i.e. an extra field indicating whether that day is a public holiday
2. station_id are basically encoded station names and so:
    * station names can be dropped
    * station_id should be one-hot encoded to be treated as categoricals
3. user_type should convert to 1/0: 1 for member, 0 for casual

In [17]:
df = bikes.sample(5)
datetimes = df['trip_start_time']
# can't convert two columns at the same time; method assumes that each column
# represents a year/month/day column in a dataframe???
# datetimes = df[['trip_start_time', 'trip_stop_time']]
print(datetimes.info())
dt = pd.to_datetime(
    datetimes,
    # format="%d/%m/%Y %H:%M",
    infer_datetime_format=True,
)
print(dt.info())

<class 'pandas.core.series.Series'>
Int64Index: 5 entries, 729152 to 812654
Series name: trip_start_time
Non-Null Count  Dtype 
--------------  ----- 
5 non-null      object
dtypes: object(1)
memory usage: 80.0+ bytes
None
<class 'pandas.core.series.Series'>
Int64Index: 5 entries, 729152 to 812654
Series name: trip_start_time
Non-Null Count  Dtype         
--------------  -----         
5 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 80.0 bytes
None


In [23]:
print(dt.iloc[0])

2017-01-13 23:05:00


In [20]:
dt.iloc[0].day_of_week

4

In [37]:
print(dt.iloc[0].hour, dt.iloc[0].minute)

23 5


In [40]:
def prep_bikes(df_bikes):
    """Preprocesses the bikeshare data
    
    Converts the datetimes from str obj to datetime objects
    """
    df_bikes['dt_start'] = pd.to_datetime(
        df_bikes['trip_start_time'],
        infer_datetime_format=True,
    )
    df_bikes['dt_end'] = pd.to_datetime(
        df_bikes['trip_stop_time'],
        infer_datetime_format=True,
    )
    # get day of week
    def get_day_of_week(trip):
        return trip['dt_start'].day_of_week

    df_bikes['day_of_week'] = df_bikes.apply(
        # get_day_of_week,
        lambda x: x['dt_start'].day_of_week,
        axis=1
    )
    
    # get hours
    df_bikes['start_hour'] = df_bikes.apply(
        lambda x: x['dt_start'].hour + x['dt_start'].minute / 60,
        axis=1,
    )
    df_bikes['end_hour'] = df_bikes.apply(
        lambda x: x['dt_end'].hour + x['dt_end'].minute / 60,
        axis=1,
    )
    drops = [
        'trip_start_time', 
        'trip_stop_time', 
        'from_station_name', 
        'to_station_name',
        'dt_start',
        'dt_end',
        ]
    df_bikes = df_bikes.drop(drops, axis=1)
    return df_bikes


In [42]:
prepped = prep_bikes(df)

Steps 2 and 3 involve encoding the features and so will be part of the `sklearn` pipeline.

In [60]:
enc = OneHotEncoder(dtype=int)
station_ids_cols = ['from_station_id', 'to_station_id']
station_ids = prepped[station_ids_cols]

enc.fit_transform(station_ids,).toarray()


array([[1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0]])

In [46]:
station_ids

Unnamed: 0_level_0,from_station_id,to_station_id
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1
729152,7061,7055
806425,7161,7057
823936,7124,7064
738827,7158,7176
812654,7202,7049


In [48]:
label_enc = LabelEncoder()
y = label_enc.fit_transform(prepped['user_type'])


array([0, 0, 0, 0, 0])

In [None]:
bikes_dt = prep_bikes(bikes)

In [79]:
bikes_dt = bikes_dt.drop(['dt_start', 'dt_end'], axis=1)
bikes_dt.columns

Index(['trip_duration_seconds', 'from_station_id', 'to_station_id',
       'user_type', 'day_of_week', 'start_hour', 'end_hour'],
      dtype='object')

In [None]:
X, y = np.ones((50, 1)), np.hstack(([0] * 45, [1] * 5))
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    )


In [None]:
print(np.bincount(y_train), np.bincount(y_test))

[33  4] [12  1]


Split to train/val/test.

First split with `train_test_split` to get the hold-out set, then split the resulting training set via `StratifiedKfold`

In [80]:
# split to train/test.

X = bikes_dt.drop('user_type', axis=1)
y = bikes_dt['user_type']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    )

Need `ColumnTransformer` to include one-hot encode of only select columns in a pipeline

In [81]:
from sklearn.compose import make_column_transformer

# enc = OneHotEncoder()
col_trans = make_column_transformer(
    (OneHotEncoder(), ['from_station_id']),
    (OneHotEncoder(), ['to_station_id']),
    # remainder='drop',
    remainder='passthrough',
)
col_trans.fit(X_train)

In [82]:
foo = col_trans.transform(X_test)
foo.toarray().shape

(33031, 404)