# Improving the Functionality and Reliability of Bluebikes by Using ML to Predict Station Availability

## Background and Motivation

## Setup

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Data Read-In, Cleaning, and Preprocessing

First, read in one sample Bluebikes dataset from July 2020 to develop the preprocessing pipeline. Later, the same preprocessing will be applied to each month's Bluebikes trips dataset in the desired timeframe (July 2020 to June 2021).

In [58]:
# Load the July 2020 trips file that serves as the sample dataset
trips_2020_07 = pd.read_csv(filepath_or_buffer='data/202007-bluebikes-tripdata.csv')

In [59]:
# Preview the first rows and the summary statistics, then list the column dtypes
display(trips_2020_07.head(), trips_2020_07.describe())
trips_2020_07.dtypes

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code
0,259,2020-07-01 00:01:15.0430,2020-07-01 00:05:34.1010,16,Back Bay T Stop - Dartmouth St at Stuart St,42.348074,-71.07657,26,Washington St at Waltham St,42.341575,-71.068904,6059,Subscriber,2118.0
1,436,2020-07-01 00:03:39.1810,2020-07-01 00:10:55.4600,6,Cambridge St at Joy St,42.361257,-71.065287,152,Ink Block - Harrison Ave at Herald St,42.345901,-71.063187,2322,Customer,2114.0
2,1346,2020-07-01 00:04:27.0790,2020-07-01 00:26:53.2030,404,Mass Ave T Station,42.341356,-71.08337,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,4062,Customer,
3,2069,2020-07-01 00:04:56.0140,2020-07-01 00:39:25.1100,436,Maverick St at Massport Path,42.367741,-71.03336,436,Maverick St at Massport Path,42.367741,-71.03336,3858,Subscriber,2128.0
4,1266,2020-07-01 00:05:43.0180,2020-07-01 00:26:49.0580,404,Mass Ave T Station,42.341356,-71.08337,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,6031,Customer,


Unnamed: 0,tripduration,start station id,start station latitude,start station longitude,end station id,end station latitude,end station longitude,bikeid
count,259726.0,259726.0,259726.0,259726.0,259726.0,259726.0,259726.0,259726.0
mean,2320.02,162.39954,42.354993,-71.086877,161.173818,42.354872,-71.086755,4233.672366
std,32348.21,133.904613,0.018732,0.027911,133.891372,0.018805,0.027893,1245.847556
min,61.0,1.0,42.167226,-71.166491,1.0,42.167226,-71.166491,31.0
25%,557.0,54.0,42.344137,-71.105668,53.0,42.343749,-71.105495,3170.0
50%,958.0,113.0,42.353391,-71.086336,111.0,42.353334,-71.085954,4251.0
75%,1633.0,239.0,42.365445,-71.065287,236.0,42.365445,-71.065287,5328.0
max,3030358.0,455.0,42.414963,-70.905558,455.0,42.414963,-70.905558,6332.0


tripduration                 int64
starttime                   object
stoptime                    object
start station id             int64
start station name          object
start station latitude     float64
start station longitude    float64
end station id               int64
end station name            object
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                    object
postal code                 object
dtype: object

The `starttime` and `stoptime` columns have the generic `object` data type, which is not a good way to store date information. Instead, we should convert these columns to the datetime data type. The data types of the other variables seem reasonable.


In [60]:
# Parse the start and stop timestamps of each trip into datetime values
for timestamp_col in ('starttime', 'stoptime'):
    trips_2020_07[timestamp_col] = pd.to_datetime(trips_2020_07[timestamp_col])

In [61]:
# Use 'endtime' instead of 'stoptime' so the name pairs with 'starttime'
trips_2020_07 = trips_2020_07.rename(columns={'stoptime': 'endtime'})

In [62]:
# Swap every space in a column name for an underscore
trips_2020_07 = trips_2020_07.rename(columns=lambda col_name: col_name.replace(' ', '_'))

In [63]:
# Express tripduration in minutes rather than seconds for more intuitive values
trips_2020_07['tripduration'] = trips_2020_07['tripduration'].div(60)

In [64]:
## Derive a few other date-related columns from the starttime variable
start_parts = trips_2020_07['starttime'].dt
trips_2020_07 = trips_2020_07.assign(
    year=start_parts.year,        # calendar year of the trip start
    month=start_parts.month,      # month number of the trip start
    weekday=start_parts.weekday,  # day of the week as an integer
    hour=start_parts.hour,        # hour of the day of the trip start
)

In [65]:
# Function to express a year as its offset from 2020 (2020 -> 0, 2021 -> 1)
def encode_year(x):
    """Return the floored difference between `x` and 2020 as a NumPy int64."""
    offset_from_2020 = np.floor(x - 2020)
    return np.int64(offset_from_2020)

# Replace each calendar year in the 'year' column with its encoded value
trips_2020_07['year'] = trips_2020_07['year'].map(encode_year)

The `usertype` column should reflect the type of user:
- casual: Single Trip or Day Pass user
- member: Annual or Monthly Member


In [66]:
# Relabel the values in the usertype column so they describe the type of user.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selected from the frame acts on an intermediate object, which
# recent pandas versions warn about and which leaves the frame unchanged when
# Copy-on-Write is enabled.
trips_2020_07['usertype'] = trips_2020_07['usertype'].replace({'Customer': 'casual',
                                                               'Subscriber': 'member'})

In [67]:
# Check which distinct labels remain in the usertype column
print(trips_2020_07['usertype'].unique())

['member' 'casual']


In [68]:
# Show the first rows and the column dtypes of the preprocessed dataset
display(trips_2020_07.head(n=5))
trips_2020_07.dtypes

Unnamed: 0,tripduration,starttime,endtime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,postal_code,year,month,weekday,hour
0,4.316667,2020-07-01 00:01:15.043,2020-07-01 00:05:34.101,16,Back Bay T Stop - Dartmouth St at Stuart St,42.348074,-71.07657,26,Washington St at Waltham St,42.341575,-71.068904,6059,member,2118.0,0,7,2,0
1,7.266667,2020-07-01 00:03:39.181,2020-07-01 00:10:55.460,6,Cambridge St at Joy St,42.361257,-71.065287,152,Ink Block - Harrison Ave at Herald St,42.345901,-71.063187,2322,casual,2114.0,0,7,2,0
2,22.433333,2020-07-01 00:04:27.079,2020-07-01 00:26:53.203,404,Mass Ave T Station,42.341356,-71.08337,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,4062,casual,,0,7,2,0
3,34.483333,2020-07-01 00:04:56.014,2020-07-01 00:39:25.110,436,Maverick St at Massport Path,42.367741,-71.03336,436,Maverick St at Massport Path,42.367741,-71.03336,3858,member,2128.0,0,7,2,0
4,21.1,2020-07-01 00:05:43.018,2020-07-01 00:26:49.058,404,Mass Ave T Station,42.341356,-71.08337,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,6031,casual,,0,7,2,0


tripduration                      float64
starttime                  datetime64[ns]
endtime                    datetime64[ns]
start_station_id                    int64
start_station_name                 object
start_station_latitude            float64
start_station_longitude           float64
end_station_id                      int64
end_station_name                   object
end_station_latitude              float64
end_station_longitude             float64
bikeid                              int64
usertype                           object
postal_code                        object
year                                int64
month                               int64
weekday                             int64
hour                                int64
dtype: object

In [85]:
# Function to preprocess a month's trips dataset
def preprocess(month_df, base_year=2020):
    """Clean one month's raw trips DataFrame.

    The frame passed in is modified in place and is also returned.

    Steps applied:
    - parse `starttime` and `stoptime` to datetime, then rename `stoptime`
      to `endtime`
    - replace spaces in column names with underscores
    - convert `tripduration` from seconds to minutes
    - add `year`, `month`, `weekday` and `hour` columns derived from
      `starttime`; `year` is stored as an int64 offset from `base_year`
    - relabel `usertype`: 'Customer' -> 'casual', 'Subscriber' -> 'member'

    Parameters
    ----------
    month_df : pandas.DataFrame
        Raw trips data containing at least the columns `starttime`,
        `stoptime`, `tripduration` and `usertype`.
    base_year : int, default 2020
        Year that is encoded as 0 in the `year` column (2020 -> 0, 2021 -> 1).

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object, after preprocessing.
    """
    # Convert starttime and stoptime of each trip to datetime
    month_df['starttime'] = pd.to_datetime(month_df['starttime'])
    month_df['stoptime'] = pd.to_datetime(month_df['stoptime'])

    # Rename stoptime column to endtime for consistent column naming
    month_df.rename(columns={'stoptime': 'endtime'}, inplace=True)

    # Replace spaces in column names with underscores
    month_df.columns = month_df.columns.str.replace(' ', '_')

    # Convert tripduration from seconds to minutes for more intuitive values
    month_df['tripduration'] = month_df['tripduration'] / 60

    # Create a few other date-related columns based on the starttime variable
    start_parts = month_df['starttime'].dt
    # Vectorized year encoding: offset from base_year, kept as int64
    month_df['year'] = (start_parts.year - base_year).astype('int64')
    month_df['month'] = start_parts.month
    month_df['weekday'] = start_parts.weekday
    month_df['hour'] = start_parts.hour

    # Relabel the usertype values. The result is assigned back to the column:
    # an inplace replace on a column selected from the frame acts on an
    # intermediate object and leaves the frame unchanged under Copy-on-Write.
    month_df['usertype'] = month_df['usertype'].replace({'Customer': 'casual',
                                                         'Subscriber': 'member'})

    return month_df
    
# Function to read in all monthly datasets in a timeframe,
# apply preprocessing to each, and merge all into a single trips df
def read_trips_data(datasets, data_dir='data'):
    """Read, preprocess and combine several monthly trips CSV files.

    Parameters
    ----------
    datasets : iterable of str
        File names of the monthly CSV files, in the order they should be
        stacked.
    data_dir : str, default 'data'
        Directory that contains the CSV files.

    Returns
    -------
    pandas.DataFrame
        All months stacked row-wise, each one passed through `preprocess`.
        The result gets a fresh 0..n-1 index so that row labels are not
        repeated from one month to the next.

    Raises
    ------
    ValueError
        Raised by `pd.concat` when `datasets` is empty.
    """
    # List to store each month's preprocessed df
    dfs = []

    for month_dataset in datasets:
        month_df = pd.read_csv(f'{data_dir}/{month_dataset}')
        dfs.append(preprocess(month_df))

    # Concat all dfs; ignore_index avoids duplicate index labels across months
    return pd.concat(dfs, ignore_index=True)

In [106]:
# Read and merge every monthly file from July 2020 through June 2021
year_months = ('202007', '202008', '202009', '202010', '202011', '202012',
               '202101', '202102', '202103', '202104', '202105', '202106')
trips_df = read_trips_data([f'{year_month}-bluebikes-tripdata.csv'
                            for year_month in year_months])

In [107]:
# Clean indices: move the existing index into a regular 'index' column
trips_df = trips_df.reset_index()

In [111]:
# Remove the 'index' column from the merged trips df
trips_df = trips_df.drop(columns='index')

In [112]:
# Final merged dataset: preview, summary statistics, and column dtypes
display(trips_df.head(), trips_df.describe(), trips_df.dtypes)

Unnamed: 0,tripduration,starttime,endtime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,postal_code,year,month,weekday,hour
0,4.316667,2020-07-01 00:01:15.043,2020-07-01 00:05:34.101,16,Back Bay T Stop - Dartmouth St at Stuart St,42.348074,-71.07657,26,Washington St at Waltham St,42.341575,-71.068904,6059,member,2118.0,0,7,2,0
1,7.266667,2020-07-01 00:03:39.181,2020-07-01 00:10:55.460,6,Cambridge St at Joy St,42.361257,-71.065287,152,Ink Block - Harrison Ave at Herald St,42.345901,-71.063187,2322,casual,2114.0,0,7,2,0
2,22.433333,2020-07-01 00:04:27.079,2020-07-01 00:26:53.203,404,Mass Ave T Station,42.341356,-71.08337,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,4062,casual,,0,7,2,0
3,34.483333,2020-07-01 00:04:56.014,2020-07-01 00:39:25.110,436,Maverick St at Massport Path,42.367741,-71.03336,436,Maverick St at Massport Path,42.367741,-71.03336,3858,member,2128.0,0,7,2,0
4,21.1,2020-07-01 00:05:43.018,2020-07-01 00:26:49.058,404,Mass Ave T Station,42.341356,-71.08337,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,6031,casual,,0,7,2,0


Unnamed: 0,tripduration,start_station_id,start_station_latitude,start_station_longitude,end_station_id,end_station_latitude,end_station_longitude,bikeid,year,month,weekday,hour
count,2382726.0,2382726.0,2382726.0,2382726.0,2382726.0,2382726.0,2382726.0,2382726.0,2382726.0,2382726.0,2382726.0,2382726.0
mean,30.19542,167.6565,42.35618,-71.08903,166.5549,42.3561,-71.08877,4454.116,0.4373042,6.983388,3.155015,14.71261
std,423.7127,141.5987,0.01756302,0.02685581,141.7654,0.01761271,0.02682503,1377.318,0.4960538,2.717798,1.978414,5.803577
min,1.016667,1.0,42.16723,-71.22627,1.0,42.16723,-71.22627,31.0,0.0,1.0,0.0,0.0
25%,7.9,55.0,42.3459,-71.1055,54.0,42.34573,-71.1055,3276.0,0.0,5.0,1.0,12.0
50%,13.56667,110.0,42.35585,-71.08981,107.0,42.3556,-71.08974,4508.0,0.0,7.0,3.0,16.0
75%,23.31667,296.0,42.36551,-71.06985,282.0,42.36551,-71.06962,5638.0,1.0,9.0,5.0,19.0
max,56106.53,527.0,42.53467,-70.87021,526.0,42.53467,-70.87021,7029.0,1.0,12.0,6.0,23.0


tripduration                      float64
starttime                  datetime64[ns]
endtime                    datetime64[ns]
start_station_id                    int64
start_station_name                 object
start_station_latitude            float64
start_station_longitude           float64
end_station_id                      int64
end_station_name                   object
end_station_latitude              float64
end_station_longitude             float64
bikeid                              int64
usertype                           object
postal_code                        object
year                                int64
month                               int64
weekday                             int64
hour                                int64
dtype: object

In [113]:
# Names of the background variables to join to the trips df
background_vars = [
    'holiday',
    'weather',
    'temp',
    'atemp',
    'hum',
    'windspeed',
]

- `year` with 0 for 2020, 1 for 2021, etc.
- `month` with 1 through 12, with 1 denoting January.
- `weekday` (0 through 6, with 0 denoting Monday)
- `starttime` (date and time in the format YYYY-MM-DD HH:MM:SS.S, e.g. 2011-01-01 11:01:01.1)
- `endtime` (date and time in the format YYYY-MM-DD HH:MM:SS.S, e.g. 2011-01-01 11:01:01.1)
- `start_hour` (0 for midnight, 1 for 1:00am, 23 for 11:00pm)
- `end_hour` (0 for midnight, 1 for 1:00am, 23 for 11:00pm)
- `start_station_id` 
- `start_station_name`
- `start_station_latitude` and `start_station_longitude` (coordinates of the station)
- `end_station_id` 
- `end_station_name`
- `end_station_latitude` and `end_station_longitude` (coordinates of the station)
- `tripduration` in minutes (converted from seconds during preprocessing)
- *`holiday` (1 = the day is a holiday, 0 = otherwise)
- *`season` (1 = winter, 2 = spring, 3 = summer, 4 = fall)
- *`weather`
    - 1: Clear, Few clouds, Partly cloudy
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    - 3: Light Snow, Light Rain + Thunderstorm
    - 4: Heavy Rain + Thunderstorm + Mist, Snow + Fog 
- *`temp` (temperature in Celsius, normalized)
- *`atemp` (apparent temperature, or relative outdoor temperature, in Celsius, normalized)
- *`hum` (relative humidity, normalized)
- *`windspeed` (wind speed, normalized)

## EDA