In [1]:
import numpy as np
import pandas as pd
import random
from datetime import datetime
from scipy.integrate import quad
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time
import mplleaflet
pd.set_option('display.max_columns',60)

In [2]:
csv_files_2017 = [('2017' + "%.2d" + '-citibike-tripdata.csv') % i for i in range(1, 13)]
csv_files_2018 = [('2018' + "%.2d" + '-citibike-tripdata.csv') % i for i in range(1, 13)]
csv_files_2019 = [('2019' + "%.2d" + '-citibike-tripdata.csv') % i for i in range(1, 13)]
csv_files = csv_files_2017 + csv_files_2018 + csv_files_2019

In [3]:
def to_datetime(df):
    df1 = df.copy()
    df1['starttime'] = pd.to_datetime(df1['starttime'])
    df1['stoptime'] = pd.to_datetime(df1['stoptime'])
    df1['start_date'] = df1['starttime'].dt.date
    df1['start_time'] = df1['starttime'].dt.time
    df1['start_hour'] = df1['starttime'].dt.hour
    df1['start_min'] = df1['starttime'].dt.minute
    df1['start_year'] = df1['starttime'].dt.year
    df1['start_month'] = df1['starttime'].dt.month
    df1['start_dayofweek'] = df1['starttime'].dt.weekday   # Monday is 0, Sunday is 6
    #df1['start_dayofweek'] = df1['starttime'].dt.weekday_name  # The name of day in a week (e.g. Monday)
    df1['stop_date'] = df1['stoptime'].dt.date
    df1['stop_time'] = df1['stoptime'].dt.time
    df1['stop_hour'] = df1['stoptime'].dt.hour
    df1['stop_min'] = df1['stoptime'].dt.minute
    df1['stop_dayofweek'] = df1['stoptime'].dt.weekday
    return df1

In [4]:
def ignore_offpeak(df):
    df1 = df.copy()
    mask = df1['start_hour'].apply(lambda x: 5 <= x <= 22)
    return df1[mask]

In [5]:
def aggregated_data(df):
    # Define distance: if same start and end location, use average speed of 7.456mph to estimate distance.
    # If different locations, calculate Manhattan distance between two stations
    df['distance']=np.where(df['start_station_ID'] == df['end_station_ID'],df['trip_duration']*7.456/3600,
                            abs(df['start_station_longitude']-df['end_station_longitude'])*53+\
                            abs(df['start_station_latitude']-df['end_station_latitude'])*69)
    # Daily average of all stations for time-series analysis
    df_daily = df.groupby('start_date').agg({'trip_duration':['count','mean'],'distance':'mean'}).reset_index()
    df_daily.columns = ['start_date','trip_per_day','daily_avg_trip_duration','daily_avg_distance']
    df_daily_merged = df.merge(df_daily, how = 'left', on = 'start_date')
    # Hourly average for each station regardeless of days 
    df_hourly = df.groupby(['start_station_ID','start_hour']).\
    agg({'trip_duration':['count','mean'],'distance':'mean'}).reset_index()
    df_hourly.columns = ['start_station_ID','start_hour','trip_per_hour','hourly_avg_trip_duration','hourly_avg_distance']
    df_hourly_merged = df_daily_merged.merge(df_hourly, how = 'left', on = ['start_station_ID','start_hour'])
    # Calculate hourly trip counts, avg trip duration, and avg trip distance per station and merge to above df
    df_hourly_eachday = df.groupby(['start_station_ID','start_date','start_hour']).\
    agg({'trip_duration':['count','mean'],'distance':'mean'}).reset_index()
    df_hourly_eachday.columns = ['start_station_ID','start_date','start_hour','trip_per_hour_eachday','hourly_avg_trip_duration_eachday','hourly_avg_distance_eachday']
    df_hourly_eachday_merged = df_hourly_merged.merge(df_hourly_eachday, how = 'left', on = ['start_station_ID','start_date','start_hour'])
    return df_hourly_eachday_merged    

In [6]:
def merge_bikecount(df):
    # groupby start station ID, date and hour to get hourly counts of trips per start station
    checkout = df.groupby(['start_station_ID','start_date','start_hour'])['trip_duration'].count().reset_index()
    checkout.columns = ['start_station_ID','start_date','start_hour','checkout_counts']
    # groupby end station ID, date, and hour to get hourly counts of trips per end station 
    checkin = df.groupby(['end_station_ID','stop_date','stop_hour'])[['trip_duration']].count().reset_index()
    checkin.columns=['end_station_ID','stop_date','stop_hour','checkin_counts']
    # Join dataframe to get station checkin and checkout counts 
    temp = pd.merge(checkout, checkin,  how='outer', left_on=['start_station_ID','start_date','start_hour'], 
                    right_on = ['end_station_ID','stop_date','stop_hour'])
    temp['start_station_ID'] = temp['start_station_ID'].fillna(temp['end_station_ID'])
    temp['start_date'] = temp['start_date'].fillna(temp['stop_date'])
    temp['start_hour'] = temp['start_hour'].fillna(temp['stop_hour'])
    temp['checkout_counts'] = temp['checkout_counts'].fillna(0)
    temp['checkin_counts'] = temp['checkin_counts'].fillna(0)
    temp = temp.drop(['end_station_ID','stop_date','stop_hour'],axis=1)
    temp.columns=['station_ID','date','hour','checkout_counts','checkin_counts']
    temp['bike_added'] = temp['checkin_counts'] - temp['checkout_counts']
    # merge orginal dataframe to get hourly checkin/checkout information for both start and stop stations 
    df_temp_merged = pd.merge(df, temp,  how='left', left_on=['start_station_ID','start_date','start_hour'], 
         right_on = ['station_ID','date','hour']).drop(['station_ID','date','hour'],axis = 1)
    df_temp_merged = pd.merge(df_temp_merged, temp,  how='left', left_on=['end_station_ID','stop_date','stop_hour'], 
         right_on = ['station_ID','date','hour']).drop(['station_ID','date','hour'],axis = 1) 
    df_temp_merged = df_temp_merged.rename(columns={'checkout_counts_x':'start_station_checkout_counts',
                                                    'checkin_counts_x':'start_station_checkin_counts',
                                                    'bike_added_x':'start_station_bike_added', 
                                                    'checkout_counts_y':'end_station_checkout_counts',
                                                    'checkin_counts_y':'end_station_checkin_counts',
                                                    'bike_added_y':'end_station_bike_added'}) 
    return df_temp_merged
    

In [7]:
df = pd.DataFrame()
start_time = time.time()
random.seed(0)
path = './CitiBikeOriginal/'
divide_by = 20

In [8]:
for i, csv in enumerate(csv_files):
    df_temp = pd.read_csv(path+csv)
    df_temp.columns = ['trip_duration','starttime','stoptime','start_station_ID','start_station_name',
                       'start_station_latitude','start_station_longitude','end_station_ID','end_station_name',
                       'end_station_latitude','end_station_longitude','bike_ID','user_type','birth_year','gender']
    df_temp = df_temp.loc[df_temp['trip_duration']<= 24*3600]  
    df_temp = df_temp.loc[(df_temp['start_station_latitude']>40) & (df_temp['start_station_latitude']<41)]
    df_temp = to_datetime(df_temp)
    df_temp = ignore_offpeak(df_temp)
    #df_temp = merge_bikecount(df_temp)
    #df_temp = aggregated_data(df_temp)
    
    rows = len(df_temp)
    size = int(rows/divide_by)
    selected_idx = random.sample(range(1,rows), size)
    skip_idx = list(set(df_temp.index)-set(selected_idx))
    test_idx = random.sample(skip_idx,int(len(skip_idx)/divide_by))
    df_train = df_temp.iloc[selected_idx,:]
    #df_test = df_temp.iloc[test_idx,:]
    df_temp = df_train[['start_station_ID','start_station_name','start_station_latitude','start_station_longitude',
                        'end_station_ID','end_station_name','end_station_latitude','end_station_longitude',
                       'trip_duration']]    
    df = pd.concat([df, df_temp], axis = 0)
    print('Finishing ' + csv)
    
print('This block uses %.2f'%(time.time() - start_time) + ' seconds.' )

Finishing 201701-citibike-tripdata.csv
Finishing 201702-citibike-tripdata.csv
Finishing 201703-citibike-tripdata.csv
Finishing 201704-citibike-tripdata.csv
Finishing 201705-citibike-tripdata.csv
Finishing 201706-citibike-tripdata.csv
Finishing 201707-citibike-tripdata.csv
Finishing 201708-citibike-tripdata.csv
Finishing 201709-citibike-tripdata.csv
Finishing 201710-citibike-tripdata.csv
Finishing 201711-citibike-tripdata.csv
Finishing 201712-citibike-tripdata.csv
Finishing 201801-citibike-tripdata.csv
Finishing 201802-citibike-tripdata.csv
Finishing 201803-citibike-tripdata.csv
Finishing 201804-citibike-tripdata.csv
Finishing 201805-citibike-tripdata.csv
Finishing 201806-citibike-tripdata.csv
Finishing 201807-citibike-tripdata.csv
Finishing 201808-citibike-tripdata.csv
Finishing 201809-citibike-tripdata.csv
Finishing 201810-citibike-tripdata.csv
Finishing 201811-citibike-tripdata.csv
Finishing 201812-citibike-tripdata.csv
Finishing 201901-citibike-tripdata.csv
Finishing 201902-citibike

In [9]:
print('This dataframe has %d observations and %d variables.' %(df.shape[0],df.shape[1]))

This dataframe has 2625378 observations and 9 variables.


In [14]:
# combine start/end station ID/name into tuples
df_temp = df.astype('object')
#df_temp['route'] = list(zip(df_temp.start_station_ID, df_temp.end_station_ID))
df_temp['route_name'] = list(zip(df_temp.start_station_name, df_temp.end_station_name))
# create dictionaries from route and routename
route_dict = df_temp.route.value_counts().to_dict()
routename_dict = df_temp.route_name.value_counts().to_dict()

In [15]:
print('The proportion of of unique routes among all trips is %.2f'%(len(df_temp.route_name.unique())/df.shape[0]))
print('Total Possible Combinations is %d'%(1011*1010/2))
print('Total number of unique routes is %d'%len(df_temp.route_name.unique()))

The proportion of of unique routes among all trips is 0.10
Total Possible Combinations is 510555
Total number of unique routes is 252168


In [29]:
routename_dict

{('Central Park S & 6 Ave', 'Central Park S & 6 Ave'): 1120,
 ('Central Park S & 6 Ave', '5 Ave & E 88 St'): 955,
 ('E 7 St & Avenue A', 'Cooper Square & Astor Pl'): 851,
 ('Grand Army Plaza & Central Park S',
  'Grand Army Plaza & Central Park S'): 836,
 ('W 21 St & 6 Ave', '9 Ave & W 22 St'): 751,
 ('Pershing Square North', 'W 33 St & 7 Ave'): 723,
 ('12 Ave & W 40 St', 'West St & Chambers St'): 711,
 ('Pershing Square North', 'E 24 St & Park Ave S'): 661,
 ('West Drive & Prospect Park West', 'West Drive & Prospect Park West'): 653,
 ('S 4 St & Wythe Ave', 'N 6 St & Bedford Ave'): 650,
 ('N 6 St & Bedford Ave', 'S 4 St & Wythe Ave'): 649,
 ('Richardson St & N Henry St', 'Graham Ave & Conselyea St'): 606,
 ('Pier 40 - Hudson River Park', 'West St & Chambers St'): 604,
 ('Wythe Ave & Metropolitan Ave', 'N 6 St & Bedford Ave'): 602,
 ('Centre St & Chambers St', 'Centre St & Chambers St'): 598,
 ('Soissons Landing', 'Soissons Landing'): 595,
 ('Grand Army Plaza & Central Park S', '5 Ave 

In [43]:
import json

In [117]:
with open('./station_information.json') as f:
  data = json.load(f)
#print(json.dumps(data, indent = 4, sort_keys=True))
station_dict = data['data']['stations']
dock_station_df = pd.DataFrame.from_dict(station_dict)[['station_id', 'name', 'capacity']]

In [118]:
with open('./stations.json') as f:
  data1 = json.load(f)
station_dict1 = data1['stationBeanList']
dock_station_df1 = pd.DataFrame.from_dict(station_dict1)[['id', 'stationName', 'totalDocks']]

In [120]:
dock_station_df.columns = ['start_station_ID', 'start_station_name', 'capacity']
dock_station_df1.columns = ['start_station_ID', 'start_station_name', 'capacity']

In [126]:
dock_station_df['capacity1'] = dock_station_df1['capacity']
dock_station_df['capacity'] = dock_station_df[["capacity", "capacity1"]].max(axis=1)
dock_station_df.drop('capacity1', axis = 1, inplace = True)
dock_station_df.to_csv('dock_station.csv')

In [18]:
docks = pd.read_csv('total_docks.csv', index_col = 0, parse_dates = True)

In [23]:
docks.start_date = pd.to_datetime(docks['start_date'])

In [30]:
docks_temp = docks.groupby('start_station_name').agg({'start_date': ['min', 'max']}).reset_index()

In [32]:
docks_temp.columns = ['start_station_name', 'min_date', 'max_date']

In [36]:
docks_temp

Unnamed: 0,start_station_name,min_date,max_date
0,1 Ave & E 110 St,2017-01-18,2019-04-19
1,1 Ave & E 16 St,2017-01-17,2019-04-19
2,1 Ave & E 18 St,2017-01-17,2019-04-19
3,1 Ave & E 30 St,2017-01-17,2019-04-19
4,1 Ave & E 44 St,2017-01-17,2019-04-19
...,...,...,...
762,Wyckoff St & 3 Ave,2017-01-17,2019-04-19
763,Wythe Ave & Metropolitan Ave,2017-01-17,2019-04-19
764,Yankee Ferry Terminal,2017-05-17,2018-10-18
765,York St,2017-01-18,2019-04-19


In [37]:
pd.concat([pd.DataFrame({'Start': pd.date_range(row.min_date, row.max_date, freq='D')}, columns=['Date']) 
           for i, row in docks_temp.iterrows()], ignore_index=True)

Unnamed: 0,Date


In [35]:
docks

Unnamed: 0,start_station_name,start_date,tot_docks,start_year,start_month
207,1 Ave & E 110 St,2017-01-18,25,2017,1
208,1 Ave & E 110 St,2017-01-19,25,2017,1
209,1 Ave & E 110 St,2017-02-18,25,2017,2
210,1 Ave & E 110 St,2017-02-19,25,2017,2
211,1 Ave & E 110 St,2017-03-18,25,2017,3
...,...,...,...,...,...
472822,York St & Jay St,2019-02-17,27,2019,2
472823,York St & Jay St,2019-02-18,27,2019,2
472824,York St & Jay St,2019-03-17,27,2019,3
472825,York St & Jay St,2019-03-18,1,2019,3


In [29]:
docks

Unnamed: 0,start_station_name,start_date,tot_docks,start_year,start_month
207,1 Ave & E 110 St,2017-01-18,25,2017,1
208,1 Ave & E 110 St,2017-01-19,25,2017,1
209,1 Ave & E 110 St,2017-02-18,25,2017,2
210,1 Ave & E 110 St,2017-02-19,25,2017,2
211,1 Ave & E 110 St,2017-03-18,25,2017,3
...,...,...,...,...,...
472822,York St & Jay St,2019-02-17,27,2019,2
472823,York St & Jay St,2019-02-18,27,2019,2
472824,York St & Jay St,2019-03-17,27,2019,3
472825,York St & Jay St,2019-03-18,1,2019,3


In [38]:
from itertools import product

In [40]:
station_list = list(docks['start_station_name'].unique())
date_list = list(pd.date_range(start='1/1/2017', end='12/31/2019').date)

In [41]:
station_date = pd.DataFrame(list(product(station_list, date_list)), columns=['start_station_name', 'start_date'])
station_date['start_date'] = pd.to_datetime(station_date['start_date'])

In [48]:
df = station_date.merge(docks, how = 'left', on = ['start_station_name', 'start_date'])

In [56]:
df.drop(['start_year', 'start_month'], axis = 1, inplace = True)

In [139]:
def fix(df, value):
    list_ = []
    length = len(df)
    for i in range(length):
        a = min(i, value)
        b = min(length - i, value)
        list_.append(max(df['tot_docks'][i - a: i + b]))
    return list_

In [140]:
df.tot_docks = df.tot_docks.apply(lambda x: round(x))
mask = np.isnan(df.tot_docks)
df['tot_docks'][mask] = np.interp(np.flatnonzero(mask), np.flatnonzero(~mask), df['tot_docks'][~mask])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [141]:
station_list = list(df.start_station_name.unique())

In [142]:
dock_df = pd.DataFrame()

In [143]:
for station in station_list:
    df_temp = df[df.start_station_name == station]
    df_temp['tot_docks'] = fix(df_temp, 10)
    dock_df = pd.concat([dock_df, df_temp], axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


KeyboardInterrupt: 

In [153]:
df.groupby('start_station_name').agg(lambda x: x.median())

Unnamed: 0_level_0,tot_docks
start_station_name,Unnamed: 1_level_1
1 Ave & E 110 St,25
1 Ave & E 16 St,45
1 Ave & E 18 St,39
1 Ave & E 30 St,29
1 Ave & E 44 St,41
...,...
Wyckoff St & 3 Ave,22
Wythe Ave & Metropolitan Ave,27
Yankee Ferry Terminal,41
York St,22
