# Background 

# Setup

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Data Read-In, Cleaning, and Preprocessing

In [4]:
# Read in bike trips datasets

In [45]:
trips_2020_07 = pd.read_csv('data/202007-bluebikes-tripdata.csv')
# trips_2021_06 = pd.read_csv('data/202106-bluebikes-tripdata.csv')

In [27]:
display(trips_2020_07.head())
display(trips_2020_07.describe())
trips_2020_07.dtypes

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code
0,259,2020-07-01 00:01:15.0430,2020-07-01 00:05:34.1010,16,Back Bay T Stop - Dartmouth St at Stuart St,42.348074,-71.07657,26,Washington St at Waltham St,42.341575,-71.068904,6059,Subscriber,2118.0
1,436,2020-07-01 00:03:39.1810,2020-07-01 00:10:55.4600,6,Cambridge St at Joy St,42.361257,-71.065287,152,Ink Block - Harrison Ave at Herald St,42.345901,-71.063187,2322,Customer,2114.0
2,1346,2020-07-01 00:04:27.0790,2020-07-01 00:26:53.2030,404,Mass Ave T Station,42.341356,-71.08337,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,4062,Customer,
3,2069,2020-07-01 00:04:56.0140,2020-07-01 00:39:25.1100,436,Maverick St at Massport Path,42.367741,-71.03336,436,Maverick St at Massport Path,42.367741,-71.03336,3858,Subscriber,2128.0
4,1266,2020-07-01 00:05:43.0180,2020-07-01 00:26:49.0580,404,Mass Ave T Station,42.341356,-71.08337,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,6031,Customer,


Unnamed: 0,tripduration,start station id,start station latitude,start station longitude,end station id,end station latitude,end station longitude,bikeid
count,259726.0,259726.0,259726.0,259726.0,259726.0,259726.0,259726.0,259726.0
mean,2320.02,162.39954,42.354993,-71.086877,161.173818,42.354872,-71.086755,4233.672366
std,32348.21,133.904613,0.018732,0.027911,133.891372,0.018805,0.027893,1245.847556
min,61.0,1.0,42.167226,-71.166491,1.0,42.167226,-71.166491,31.0
25%,557.0,54.0,42.344137,-71.105668,53.0,42.343749,-71.105495,3170.0
50%,958.0,113.0,42.353391,-71.086336,111.0,42.353334,-71.085954,4251.0
75%,1633.0,239.0,42.365445,-71.065287,236.0,42.365445,-71.065287,5328.0
max,3030358.0,455.0,42.414963,-70.905558,455.0,42.414963,-70.905558,6332.0


tripduration                 int64
starttime                   object
stoptime                    object
start station id             int64
start station name          object
start station latitude     float64
start station longitude    float64
end station id               int64
end station name            object
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                    object
postal code                 object
dtype: object

The data type of `starttime` and `stoptime` columns is an object, which is not the best way to store date information. Instead, we should convert these columns to datetime data type. Data types for the other variables seem reasonable.


In [62]:
# Replace spaces in column names with underscores
trips_2020_07.columns = trips_2020_07.columns.str.replace(' ','_')

In [None]:
# Convert tripduration from seconds to minutes for more intuitive values
trips_2020_07['tripduration'] = trips_2020_07['tripduration']/60

In [63]:
trips_2020_07

Unnamed: 0,tripduration,starttime,stoptime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,postal_code,year,month,weekday
0,259,2020-07-01 00:01:15.043,2020-07-01 00:01:15.043,16,Back Bay T Stop - Dartmouth St at Stuart St,42.348074,-71.076570,26,Washington St at Waltham St,42.341575,-71.068904,6059,member,02118,2020,7,2
1,436,2020-07-01 00:03:39.181,2020-07-01 00:03:39.181,6,Cambridge St at Joy St,42.361257,-71.065287,152,Ink Block - Harrison Ave at Herald St,42.345901,-71.063187,2322,casual,02114,2020,7,2
2,1346,2020-07-01 00:04:27.079,2020-07-01 00:04:27.079,404,Mass Ave T Station,42.341356,-71.083370,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,4062,casual,,2020,7,2
3,2069,2020-07-01 00:04:56.014,2020-07-01 00:04:56.014,436,Maverick St at Massport Path,42.367741,-71.033360,436,Maverick St at Massport Path,42.367741,-71.033360,3858,member,02128,2020,7,2
4,1266,2020-07-01 00:05:43.018,2020-07-01 00:05:43.018,404,Mass Ave T Station,42.341356,-71.083370,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,6031,casual,,2020,7,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259721,2538,2020-07-31 23:59:20.468,2020-07-31 23:59:20.468,318,Ames St at Broadway,42.363693,-71.087567,68,Central Square at Mass Ave / Essex St,42.365070,-71.103100,2056,member,02142,2020,7,4
259722,1293,2020-07-31 23:59:27.653,2020-07-31 23:59:27.653,136,ID Building West,42.344796,-71.031614,416,Blossom St at Charles St,42.364356,-71.069594,4514,member,02210,2020,7,4
259723,2128,2020-07-31 23:59:42.780,2020-07-31 23:59:42.780,7,Fan Pier,42.353391,-71.044571,342,Boylston St at Jersey St,42.344651,-71.097325,5528,casual,,2020,7,4
259724,834,2020-07-31 23:59:48.095,2020-07-31 23:59:48.095,364,Tremont St at Northampton St,42.338896,-71.081500,342,Boylston St at Jersey St,42.344651,-71.097325,6251,member,02215,2020,7,4


In [64]:
# Rename stoptime column to endtime for consistent column naming
trips_2020_07.rename(columns={'stoptime': 'endtime'}, inplace=True)

In [66]:
# Convert starttime and stoptime of each trip to datetime
trips_2020_07['starttime'] = pd.to_datetime(trips_2020_07['starttime'])
trips_2020_07['endtime'] = pd.to_datetime(trips_2020_07['endtime'])


In [47]:
# Create a few other date-related columns based on starttime variable
# extract 'year'
trips_2020_07['year'] = trips_2020_07.starttime.dt.year

# extract 'month'
trips_2020_07['month'] = trips_2020_07.starttime.dt.month

# extract 'weekday'
trips_2020_07['weekday'] = trips_2020_07.starttime.dt.weekday

# extract 'hour'
trips_2020_07['hour'] = trips_2020_07.starttime.dt.hour


In [48]:
# function to assign 0 to 2020 and 1 to 2021
def encode_year(x):
    return np.int64(np.floor(x-2020))

# encode 'year' column with 0 and 1
trips_2020_07['year'] = trips_2020_07['year'].apply(encode_year)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code,year,month,weekday
0,259,2020-07-01 00:01:15.043,2020-07-01 00:01:15.043,16,Back Bay T Stop - Dartmouth St at Stuart St,42.348074,-71.076570,26,Washington St at Waltham St,42.341575,-71.068904,6059,Subscriber,02118,2020,7,2
1,436,2020-07-01 00:03:39.181,2020-07-01 00:03:39.181,6,Cambridge St at Joy St,42.361257,-71.065287,152,Ink Block - Harrison Ave at Herald St,42.345901,-71.063187,2322,Customer,02114,2020,7,2
2,1346,2020-07-01 00:04:27.079,2020-07-01 00:04:27.079,404,Mass Ave T Station,42.341356,-71.083370,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,4062,Customer,,2020,7,2
3,2069,2020-07-01 00:04:56.014,2020-07-01 00:04:56.014,436,Maverick St at Massport Path,42.367741,-71.033360,436,Maverick St at Massport Path,42.367741,-71.033360,3858,Subscriber,02128,2020,7,2
4,1266,2020-07-01 00:05:43.018,2020-07-01 00:05:43.018,404,Mass Ave T Station,42.341356,-71.083370,167,Ryan Playground - Dorchester Ave at Harbor Vie...,42.317642,-71.056664,6031,Customer,,2020,7,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259721,2538,2020-07-31 23:59:20.468,2020-07-31 23:59:20.468,318,Ames St at Broadway,42.363693,-71.087567,68,Central Square at Mass Ave / Essex St,42.365070,-71.103100,2056,Subscriber,02142,2020,7,4
259722,1293,2020-07-31 23:59:27.653,2020-07-31 23:59:27.653,136,ID Building West,42.344796,-71.031614,416,Blossom St at Charles St,42.364356,-71.069594,4514,Subscriber,02210,2020,7,4
259723,2128,2020-07-31 23:59:42.780,2020-07-31 23:59:42.780,7,Fan Pier,42.353391,-71.044571,342,Boylston St at Jersey St,42.344651,-71.097325,5528,Customer,,2020,7,4
259724,834,2020-07-31 23:59:48.095,2020-07-31 23:59:48.095,364,Tremont St at Northampton St,42.338896,-71.081500,342,Boylston St at Jersey St,42.344651,-71.097325,6251,Subscriber,02215,2020,7,4


In [57]:
trips_2020_07.usertype.replace({'Customer': 'casual',
                                'Subscriber': 'member'},
                              inplace=True)

In [58]:
trips_2020_07.usertype.unique()

array(['member', 'casual'], dtype=object)

In [5]:
# Read in station dataset

In [22]:
# Join background variables to the trips dataset
background_vars = ['season', 'hour', 'weekday', 'holiday', 'weather', 
                   'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered']

- `year` with 0 for 2020, 1 for 2021, etc.
- `month` with 1 through 12, with 1 denoting January.
- `weekday` (0 through 6, with 0 denoting Monday)
- `starttime` (date and time in the format YYYY-MM-DD HH:MM:SS.S, e.g. 2011-01-01 11:01:01.1)
- `endtime` (date and time in the format YYYY-MM-DD HH:MM:SS.S, e.g. 2011-01-01 11:01:01.1)
- `start_hour` (0 for midnight, 1 for 1:00am, 23 for 11:00pm)
- `end_hour` (0 for midnight, 1 for 1:00am, 23 for 11:00pm)
- `start_station_id` 
- `start_station_name`
- `start_station_latitude` and `start_station_longitude` (coordinates of the station)
- `end_station_id` 
- `end_station_name`
- `end_station_latitude` and `end_station_longitude` (coordinates of the station)
- `tripduration` in seconds
- *`holiday` (1 = the day is a holiday, 0 = otherwise)
- *`season` (1 = winter, 2 = spring, 3 = summer, 4 = fall)
- *`weather`
    - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    - 3: Light Snow, Light Rain + Thunderstorm
    - 4: Heavy Rain + Thunderstorm + Mist, Snow + Fog 
- *`temp` (temperature in Celsius, normalized)
- *`atemp` (apparent temperature, or relative outdoor temperature, in Celsius, normalized)
- *`hum` (relative humidity, normalized)
- *`windspeed` (wind speed, normalized)
- *`casual` (number of rides that day made by casual riders, not registered in the system)
- *`registered` (number of rides that day made by registered riders)

# EDA