In [224]:
import numpy as np
import pandas as pd
import janitor
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import glob

In [225]:
bike_2019 = pd.read_csv("/Users/lee14257/Development/CMU/Perspectives_in Data_Science/Project/bike2019_transformed.csv")

## Initial Check, drop NA rows for "capacity"

Since we will mainly be using the column "capacity" to set upper limits to the availability of a station, we must make sure to drop those stations with no information on capacity.

In [226]:
bike_2019.head()

Unnamed: 0,station_id,station_name,date,bike_id,act,reshuffle,availability,capacity
0,31000,STARTING BIKE NUM,2019-01-01 00:00:01,starting,5,0,5,15.0
1,31000,Eads St & 15th St S,2019-01-02 09:46:15,W23821,-1,0,4,15.0
2,31000,Eads St & 15th St S,2019-01-02 09:58:13,W21229,-1,0,3,15.0
3,31000,Eads St & 15th St S,2019-01-02 12:09:20,W23657,1,0,4,15.0
4,31000,Eads St & 15th St S,2019-01-02 12:12:59,W23657,-1,0,3,15.0


In [227]:
len(bike_2019['station_id'].unique())

581

In [228]:
# Drop rows that have NA values for "capacity"
bike_2019 = bike_2019.dropna()

In [229]:
# Make sure there are no more NA values
bike_2019[['capacity']].isnull().values.any()

False

## Explore average availability in each station

We will be excluding all stations that had an average availability of -10 or lower, because this probably meant that there were additional large reshuffles going on behind the scenes that cannot be explained by the data. We will later use lower and upper limit to cap the availability of a station, because we are interested in the overall movement of the availability curve for a station. 

In [230]:
avg_availability = bike_2019.groupby(['station_id'])['availability'].agg('mean')

In [231]:
len(avg_availability)

572

In [232]:
bike_2019.groupby(['station_id'])['availability'].agg(['mean', 'max']).to_csv("average_availability_per_station.csv")

In [233]:
# Create a list of stations to delete (that have less than -10 availability on average)
avg_availability = avg_availability.reset_index()
del_list = avg_availability[avg_availability['availability'] < -10][['station_id', 'availability']]

In [234]:
# Drop rows that are in the delete list
bike_2019_2 = bike_2019[~bike_2019['station_id'].isin(del_list['station_id'])]

In [235]:
# Check that stations were dropped
len(bike_2019_2['station_id'].unique())

545

## Availability capping (lower, upper limit)

After removing stations having extreme negative outlying availability, we will be running the cumulative sum to calculate availability again, but this time with an upper (total capacity of a station) and lower limit (0).

In [236]:
grouped = bike_2019_2.groupby('station_id')

In [237]:
new_availability = []

# Lower limit is 0
lower = 0

# Initialize cumsum to 0
cumsum = 0

# Start with the first row
cur_station = 31000

for name, group in grouped:
    for row_index, row in group.iterrows():
        if row['station_id'] != cur_station:
            cur_station = row['station_id']
            cumsum = 0
        upper = row['capacity']
        cumsum += row['act']
        cumsum = max(min(cumsum, upper), lower)
        new_availability.append(cumsum)

In [238]:
bike_2019_2[['availability']] = new_availability

In [239]:
bike_2019_2.to_csv("bike_2019_transformed_new.csv", index=False)

## Change availability to proportion value

In [240]:
bike_2019_2[['availability_p']] = round(bike_2019_2['availability'] / bike_2019_2['capacity'], 3)

In [242]:
bike_2019_2

Unnamed: 0,station_id,station_name,date,reshuffle,capacity,availability,availability_p
0,31000,STARTING BIKE NUM,2019-01-01 00:00:01,0,15.0,5.0,0.333
1,31000,Eads St & 15th St S,2019-01-02 09:46:15,0,15.0,4.0,0.267
2,31000,Eads St & 15th St S,2019-01-02 09:58:13,0,15.0,3.0,0.200
3,31000,Eads St & 15th St S,2019-01-02 12:09:20,0,15.0,4.0,0.267
4,31000,Eads St & 15th St S,2019-01-02 12:12:59,0,15.0,3.0,0.200
...,...,...,...,...,...,...,...
7494715,32609,W Columbia St & N Washington St,2019-12-28 14:16:09,0,12.0,4.0,0.333
7494716,32609,W Columbia St & N Washington St,2019-12-28 14:16:24,0,12.0,5.0,0.417
7494717,32609,W Columbia St & N Washington St,2019-12-28 14:17:07,0,12.0,4.0,0.333
7494718,32609,W Columbia St & N Washington St,2019-12-28 14:17:08,0,12.0,3.0,0.250


## Add holidays

In [251]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
cal = calendar()
holidays = cal.holidays(start=bike_2019_2['date'].min(), end=bike_2019_2['date'].max())
bike_2019_2['date_2'] = pd.to_datetime(bike_2019_2['date'], format='%Y-%m-%d').dt.date

In [255]:
bike_2019_2['is_holiday'] = bike_2019_2['date_2'].isin(holidays) * 1

In [259]:
bike_2019_2[bike_2019_2['is_holiday'] == 1]

Unnamed: 0,station_id,station_name,date,reshuffle,capacity,availability,availability_p,is_holiday,date_2
2318,31000,Eads St & 15th St S,2019-09-02 08:11:55,0,15.0,3.0,0.200,1,2019-09-02
2319,31000,Eads St & 15th St S,2019-09-02 08:49:14,0,15.0,2.0,0.133,1,2019-09-02
2320,31000,Eads St & 15th St S,2019-09-02 10:41:07,0,15.0,3.0,0.200,1,2019-09-02
2321,31000,Eads St & 15th St S,2019-09-02 11:33:13,0,15.0,2.0,0.133,1,2019-09-02
2322,31000,Eads St & 15th St S,2019-09-02 12:11:46,0,15.0,1.0,0.067,1,2019-09-02
...,...,...,...,...,...,...,...,...,...
7493698,32608,Falls Church City Hall / Park Ave & Little Fal...,2019-11-11 16:37:11,0,12.0,7.0,0.583,1,2019-11-11
7494327,32609,W Columbia St & N Washington St,2019-09-02 08:47:04,0,12.0,9.0,0.750,1,2019-09-02
7494328,32609,W Columbia St & N Washington St,2019-09-02 18:52:11,0,12.0,10.0,0.833,1,2019-09-02
7494329,32609,W Columbia St & N Washington St,2019-09-02 18:53:06,0,12.0,9.0,0.750,1,2019-09-02


## Add is_weekend variable

In [262]:
bike_2019_2['is_weekend'] = ((pd.DatetimeIndex(bike_2019_2['date_2']).dayofweek) // 5 == 1).astype(int)

In [264]:
bike_2019_2[bike_2019_2['is_weekend'] == 1]

Unnamed: 0,station_id,station_name,date,reshuffle,capacity,availability,availability_p,is_holiday,date_2,is_weekend
13,31000,Eads St & 15th St S,2019-01-05 17:43:30,0,15.0,2.0,0.133,0,2019-01-05,1
14,31000,Eads St & 15th St S,2019-01-06 13:29:17,0,15.0,3.0,0.200,0,2019-01-06,1
15,31000,Eads St & 15th St S,2019-01-06 16:09:35,0,15.0,4.0,0.267,0,2019-01-06,1
16,31000,Eads St & 15th St S,2019-01-06 16:10:08,0,15.0,3.0,0.200,0,2019-01-06,1
17,31000,Eads St & 15th St S,2019-01-06 23:10:15,0,15.0,2.0,0.133,0,2019-01-06,1
...,...,...,...,...,...,...,...,...,...,...
7494714,32609,W Columbia St & N Washington St,2019-12-28 11:57:50,0,12.0,3.0,0.250,0,2019-12-28,1
7494715,32609,W Columbia St & N Washington St,2019-12-28 14:16:09,0,12.0,4.0,0.333,0,2019-12-28,1
7494716,32609,W Columbia St & N Washington St,2019-12-28 14:16:24,0,12.0,5.0,0.417,0,2019-12-28,1
7494717,32609,W Columbia St & N Washington St,2019-12-28 14:17:07,0,12.0,4.0,0.333,0,2019-12-28,1


In [265]:
bike_2019_2 = bike_2019_2[['station_id', 'station_name', 'date', 'is_holiday', 'is_weekend', 'reshuffle', 'capacity', 'availability', 'availability_p']]

In [267]:
bike_2019_2.to_csv("bike_2019_transformed_new.csv", index=False)