In [1]:
import numpy as np
import pandas as pd
import random
from datetime import datetime
from scipy.integrate import quad
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time
import mplleaflet
pd.set_option('display.max_columns',60)
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

In [27]:
csv_files_2017 = [('2017' + "%.2d" + '-citibike-tripdata.csv') % i for i in range(1, 13)]
csv_files_2018 = [('2018' + "%.2d" + '-citibike-tripdata.csv') % i for i in range(1, 13)]
csv_files_2019 = [('2019' + "%.2d" + '-citibike-tripdata.csv') % i for i in range(1, 13)]
csv_files = csv_files_2017 + csv_files_2018 + csv_files_2019

In [189]:
def to_datetime(df):
    df1 = df.copy()
    df1['starttime'] = pd.to_datetime(df1['starttime'])
    df1['stoptime'] = pd.to_datetime(df1['stoptime'])
    df1['start_date'] = df1['starttime'].dt.date
    df1['start_time'] = df1['starttime'].dt.time
    df1['start_hour'] = df1['starttime'].dt.hour
    df1['start_min'] = df1['starttime'].dt.minute
    df1['start_year'] = df1['starttime'].dt.year
    df1['start_month'] = df1['starttime'].dt.month
    df1['start_dayofweek'] = df1['starttime'].dt.weekday   # Monday is 0, Sunday is 6
    #df1['start_dayofweek'] = df1['starttime'].dt.weekday_name  # The name of day in a week (e.g. Monday)
    df1['stop_date'] = df1['stoptime'].dt.date
    df1['stop_time'] = df1['stoptime'].dt.time
    df1['stop_hour'] = df1['stoptime'].dt.hour
    df1['stop_min'] = df1['stoptime'].dt.minute
    df1['stop_dayofweek'] = df1['stoptime'].dt.weekday
    return df1

In [4]:
def ignore_offpeak(df):
    df1 = df.copy()
    mask = df1['start_hour'].apply(lambda x: 5 <= x <= 22)
    return df1[mask]

In [5]:
def aggregated_data(df):
    # Define distance: if same start and end location, use average speed of 7.456mph to estimate distance.
    # If different locations, calculate Manhattan distance between two stations
    df['distance']=np.where(df['start_station_ID'] == df['end_station_ID'],df['trip_duration']*7.456/3600,
                            abs(df['start_station_longitude']-df['end_station_longitude'])*53+\
                            abs(df['start_station_latitude']-df['end_station_latitude'])*69)
    # Daily average of all stations for time-series analysis
    df_daily = df.groupby('start_date').agg({'trip_duration':['count','mean'],'distance':'mean'}).reset_index()
    df_daily.columns = ['start_date','trip_per_day','daily_avg_trip_duration','daily_avg_distance']
    df_daily_merged = df.merge(df_daily, how = 'left', on = 'start_date')
    # Hourly average for each station regardeless of days 
    df_hourly = df.groupby(['start_station_ID','start_hour']).\
    agg({'trip_duration':['count','mean'],'distance':'mean'}).reset_index()
    df_hourly.columns = ['start_station_ID','start_hour','trip_per_hour','hourly_avg_trip_duration','hourly_avg_distance']
    df_hourly_merged = df_daily_merged.merge(df_hourly, how = 'left', on = ['start_station_ID','start_hour'])
    # Calculate hourly trip counts, avg trip duration, and avg trip distance per station and merge to above df
    df_hourly_eachday = df.groupby(['start_station_ID','start_date','start_hour']).\
    agg({'trip_duration':['count','mean'],'distance':'mean'}).reset_index()
    df_hourly_eachday.columns = ['start_station_ID','start_date','start_hour','trip_per_hour_eachday','hourly_avg_trip_duration_eachday','hourly_avg_distance_eachday']
    df_hourly_eachday_merged = df_hourly_merged.merge(df_hourly_eachday, how = 'left', on = ['start_station_ID','start_date','start_hour'])
    return df_hourly_eachday_merged    

In [6]:
def merge_bikecount(df):
    # groupby start station ID, date and hour to get hourly counts of trips per start station
    checkout = df.groupby(['start_station_ID','start_date','start_hour'])['trip_duration'].count().reset_index()
    checkout.columns = ['start_station_ID','start_date','start_hour','checkout_counts']
    # groupby end station ID, date, and hour to get hourly counts of trips per end station 
    checkin = df.groupby(['end_station_ID','stop_date','stop_hour'])[['trip_duration']].count().reset_index()
    checkin.columns=['end_station_ID','stop_date','stop_hour','checkin_counts']
    # Join dataframe to get station checkin and checkout counts 
    temp = pd.merge(checkout, checkin,  how='outer', left_on=['start_station_ID','start_date','start_hour'], 
                    right_on = ['end_station_ID','stop_date','stop_hour'])
    temp['start_station_ID'] = temp['start_station_ID'].fillna(temp['end_station_ID'])
    temp['start_date'] = temp['start_date'].fillna(temp['stop_date'])
    temp['start_hour'] = temp['start_hour'].fillna(temp['stop_hour'])
    temp['checkout_counts'] = temp['checkout_counts'].fillna(0)
    temp['checkin_counts'] = temp['checkin_counts'].fillna(0)
    temp = temp.drop(['end_station_ID','stop_date','stop_hour'],axis=1)
    temp.columns=['station_ID','date','hour','checkout_counts','checkin_counts']
    temp['bike_added'] = temp['checkin_counts'] - temp['checkout_counts']
    # merge orginal dataframe to get hourly checkin/checkout information for both start and stop stations 
    df_temp_merged = pd.merge(df, temp,  how='left', left_on=['start_station_ID','start_date','start_hour'], 
         right_on = ['station_ID','date','hour']).drop(['station_ID','date','hour'],axis = 1)
    df_temp_merged = pd.merge(df_temp_merged, temp,  how='left', left_on=['end_station_ID','stop_date','stop_hour'], 
         right_on = ['station_ID','date','hour']).drop(['station_ID','date','hour'],axis = 1) 
    df_temp_merged = df_temp_merged.rename(columns={'checkout_counts_x':'start_station_checkout_counts',
                                                    'checkin_counts_x':'start_station_checkin_counts',
                                                    'bike_added_x':'start_station_bike_added', 
                                                    'checkout_counts_y':'end_station_checkout_counts',
                                                    'checkin_counts_y':'end_station_checkin_counts',
                                                    'bike_added_y':'end_station_bike_added'}) 
    return df_temp_merged
    

In [7]:
df = pd.DataFrame()
start_time = time.time()
random.seed(0)
path = './CitiBikeOriginal/'
divide_by = 20

In [8]:
for i, csv in enumerate(csv_files):
    df_temp = pd.read_csv(path+csv)
    df_temp.columns = ['trip_duration','starttime','stoptime','start_station_ID','start_station_name',
                       'start_station_latitude','start_station_longitude','end_station_ID','end_station_name',
                       'end_station_latitude','end_station_longitude','bike_ID','user_type','birth_year','gender']
    df_temp = df_temp.loc[df_temp['trip_duration']<= 24*3600]  
    df_temp = df_temp.loc[(df_temp['start_station_latitude']>40) & (df_temp['start_station_latitude']<41)]
    df_temp = to_datetime(df_temp)
    df_temp = ignore_offpeak(df_temp)
    #df_temp = merge_bikecount(df_temp)
    #df_temp = aggregated_data(df_temp)
    
    rows = len(df_temp)
    size = int(rows/divide_by)
    selected_idx = random.sample(range(1,rows), size)
    skip_idx = list(set(df_temp.index)-set(selected_idx))
    test_idx = random.sample(skip_idx,int(len(skip_idx)/divide_by))
    df_train = df_temp.iloc[selected_idx,:]
    #df_test = df_temp.iloc[test_idx,:]
    df_temp = df_train[['start_station_ID','start_station_name','start_station_latitude','start_station_longitude',
                        'end_station_ID','end_station_name','end_station_latitude','end_station_longitude',
                       'trip_duration']]    
    df = pd.concat([df, df_temp], axis = 0)
    print('Finishing ' + csv)
    
print('This block uses %.2f'%(time.time() - start_time) + ' seconds.' )

Finishing 201701-citibike-tripdata.csv
Finishing 201702-citibike-tripdata.csv
Finishing 201703-citibike-tripdata.csv
Finishing 201704-citibike-tripdata.csv
Finishing 201705-citibike-tripdata.csv
Finishing 201706-citibike-tripdata.csv
Finishing 201707-citibike-tripdata.csv
Finishing 201708-citibike-tripdata.csv
Finishing 201709-citibike-tripdata.csv
Finishing 201710-citibike-tripdata.csv
Finishing 201711-citibike-tripdata.csv
Finishing 201712-citibike-tripdata.csv
Finishing 201801-citibike-tripdata.csv
Finishing 201802-citibike-tripdata.csv
Finishing 201803-citibike-tripdata.csv
Finishing 201804-citibike-tripdata.csv
Finishing 201805-citibike-tripdata.csv
Finishing 201806-citibike-tripdata.csv
Finishing 201807-citibike-tripdata.csv
Finishing 201808-citibike-tripdata.csv
Finishing 201809-citibike-tripdata.csv
Finishing 201810-citibike-tripdata.csv
Finishing 201811-citibike-tripdata.csv
Finishing 201812-citibike-tripdata.csv
Finishing 201901-citibike-tripdata.csv
Finishing 201902-citibike

In [9]:
print('This dataframe has %d observations and %d variables.' %(df.shape[0],df.shape[1]))

This dataframe has 2625378 observations and 9 variables.


In [14]:
# combine start/end station ID/name into tuples
df_temp = df.astype('object')
#df_temp['route'] = list(zip(df_temp.start_station_ID, df_temp.end_station_ID))
df_temp['route_name'] = list(zip(df_temp.start_station_name, df_temp.end_station_name))
# create dictionaries from route and routename
route_dict = df_temp.route.value_counts().to_dict()
routename_dict = df_temp.route_name.value_counts().to_dict()

In [15]:
print('The proportion of of unique routes among all trips is %.2f'%(len(df_temp.route_name.unique())/df.shape[0]))
print('Total Possible Combinations is %d'%(1011*1010/2))
print('Total number of unique routes is %d'%len(df_temp.route_name.unique()))

The proportion of of unique routes among all trips is 0.10
Total Possible Combinations is 510555
Total number of unique routes is 252168


In [29]:
routename_dict

{('Central Park S & 6 Ave', 'Central Park S & 6 Ave'): 1120,
 ('Central Park S & 6 Ave', '5 Ave & E 88 St'): 955,
 ('E 7 St & Avenue A', 'Cooper Square & Astor Pl'): 851,
 ('Grand Army Plaza & Central Park S',
  'Grand Army Plaza & Central Park S'): 836,
 ('W 21 St & 6 Ave', '9 Ave & W 22 St'): 751,
 ('Pershing Square North', 'W 33 St & 7 Ave'): 723,
 ('12 Ave & W 40 St', 'West St & Chambers St'): 711,
 ('Pershing Square North', 'E 24 St & Park Ave S'): 661,
 ('West Drive & Prospect Park West', 'West Drive & Prospect Park West'): 653,
 ('S 4 St & Wythe Ave', 'N 6 St & Bedford Ave'): 650,
 ('N 6 St & Bedford Ave', 'S 4 St & Wythe Ave'): 649,
 ('Richardson St & N Henry St', 'Graham Ave & Conselyea St'): 606,
 ('Pier 40 - Hudson River Park', 'West St & Chambers St'): 604,
 ('Wythe Ave & Metropolitan Ave', 'N 6 St & Bedford Ave'): 602,
 ('Centre St & Chambers St', 'Centre St & Chambers St'): 598,
 ('Soissons Landing', 'Soissons Landing'): 595,
 ('Grand Army Plaza & Central Park S', '5 Ave 

In [33]:
train = pd.read_csv('./CitiBikeData/train.csv', index_col = 0)

  mask |= (ar1 == a)


In [49]:
test = pd.read_csv('./CitiBikeData/test.csv', index_col = 0)

In [34]:
cluster0_station = pd.read_csv('./CitiBikeData/cluster0_list.csv')
cluster0_station = cluster0_station[['start_station_name']]

In [35]:
df_temp = train.merge(cluster0_station, how = 'inner', on = 'start_station_name')
df_temp = df_temp[['start_station_ID','start_station_name','start_station_latitude','start_station_longitude',
                   'end_station_ID','end_station_name','end_station_latitude',
                   'end_station_longitude','trip_duration']]

In [36]:
start_station_geo_location = df_temp[['start_station_name','start_station_latitude',
                   'start_station_longitude']].drop_duplicates()
end_station_geo_location = df_temp[['end_station_name','end_station_latitude',
                   'end_station_longitude']].drop_duplicates()

In [37]:
df = pd.DataFrame(dict(start = df_temp.start_station_name, end = df_temp.end_station_name, cnt = 1))
df = df.groupby(['start','end']).sum()
df = df[df.groupby('start')['cnt'].transform(max) == df['cnt']].reset_index()
df.columns = ['start_station_name', 'end_station_name', 'trip_counts']
merge_df = df[['end_station_name', 'trip_counts']].merge(end_station_geo_location, 
                                                         how = 'left', on = 'end_station_name')

In [38]:
df_temp['route_name'] = list(zip(df_temp.start_station_name, df_temp.end_station_name))
routename_dict = df_temp.route_name.value_counts().to_dict()

In [39]:
pd.DataFrame({'route_name': list(routename_dict.keys()), 'trip_counts': list(routename_dict.values())})

Unnamed: 0,route_name,trip_counts
0,"(West Drive & Prospect Park West, West Drive &...",674
1,"(West Drive & Prospect Park West, Grand Army P...",350
2,"(Bus Slip & State St, South End Ave & Liberty St)",317
3,"(South St & Whitehall St, South St & Gouverneu...",243
4,"(Water - Whitehall Plaza, South End Ave & Libe...",228
...,...,...
25157,"(Suydam St & Broadway, Flushing Ave & Vanderbi...",1
25158,"(Reed St & Van Brunt St, Lefferts Pl & Frankli...",1
25159,"(Riverside Dr & W 91 St, Lafayette St & E 8 St)",1
25160,"(Throop Ave & Myrtle Ave, DeKalb Ave & Hudson ...",1


### Geolocation Borough

In [77]:
# train_start = train[['start_station_name','start_station_latitude','start_station_longitude']]
# train_end = train[['end_station_name','end_station_latitude','end_station_longitude']]
# test_start = test[['start_station_name','start_station_latitude','start_station_longitude']]
# test_end = test[['start_station_name','start_station_latitude','start_station_longitude']]
# train_start.columns = ['station_name', 'latitude', 'longitude']
# train_end.columns = ['station_name', 'latitude', 'longitude']
# test_start.columns = ['station_name', 'latitude', 'longitude']
# test_end.columns = ['station_name', 'latitude', 'longitude']
# station_df = pd.concat([train_start, train_end, test_start, test_end], axis = 0).drop_duplicates()
# station_df = station_df.astype('str')
# station_df['geopair'] = station_df.latitude + ', ' + station_df.longitude
# station_df = station_df.reset_index().drop('index', axis =1)
# station_df.to_csv('./CitiBikeData/all_station_list.csv')

In [2]:
station_df = pd.read_csv('./CitiBikeData/all_station_list.csv', index_col = 0)

In [3]:
station_df.head()

Unnamed: 0,station_name,latitude,longitude,geopair
0,Pershing Square North,40.751873,-73.977706,"40.751872999999996, -73.97770600000003"
1,W 17 St & 8 Ave,40.741776,-74.001497,"40.74177603, -74.00149746"
2,E 2 St & Avenue B,40.722174,-73.983688,"40.72217444, -73.98368779"
3,E 12 St & 3 Ave,40.732233,-73.9889,"40.73223272, -73.98889957"
4,W 34 St & 11 Ave,40.755942,-74.002116,"40.75594159, -74.0021163"


In [4]:
geolocator = Nominatim(user_agent="specify_your_app_name_here")

In [5]:
geolocator.reverse(station_df.geopair[0])[0].split(', ')

['Citi Bike - Pershing Square North',
 'Park Avenue',
 'Murray Hill',
 'Manhattan Community Board 5',
 'Manhattan',
 'New York County',
 'New York',
 '10037',
 'United States of America']

In [6]:
def do_geocode(pair):
    try:
        return geolocator.reverse(pair)[0].split(', ')[2]
    except GeocoderTimedOut:
        return do_geocode(pair)

In [7]:
district_list = []

In [8]:
for i, pair in enumerate(station_df.geopair):
    try:
        district = do_geocode(pair)
        print(district)
        district_list.append(district)
        if i%20 == 0:
            time.sleep(2)
    except:
        district_list.append(np.nan)

Murray Hill
Robert Fulton Houses
Alphabet City
Washington Square Village
Hudson Yards
East Village
Robert F. Wagner Houses
Turtle Bay
Chelsea
Garment District
Civic Center
Midtown
Manhattan Community Board 5
Chelsea
Little Italy
Montague Street
DUMBO
Chelsea
Times Square
Tribeca
Park Slope
Chinatown
Upper East Side
East Village
Upper West Side
Manhattan Valley
East 25th Street
NoHo
Manhattan Valley
Union Square
Riverside Boulevard
Clinton Street
Cobble Hill Historic District
Manhattan Community Board 5
Kane Street
Chelsea
Lower East Side
Kips Bay
Murray Hill
West Village
Chelsea
Kips Bay
Stuy Town
Prospect Heights
West 27th Street
Midtown East
Times Square
Manhattan
Mercer street Broom & Spring Block
Harlem
Yorkville
Financial District
DUMBO
Carroll Gardens
Boerum Hill Historic District
Lenox Hill
Washington Square Village
Boerum Hill
Herald Square
Manhattan Community Board 8
Rose Hill
SoHo
Tribeca
NoHo
Bowne Street
West 42nd Street
Kips Bay
West Village
Tribeca
Broadway
Hudson Square


Boerum Hill
Columbia Street Waterfront District
East 81st Street
East Harlem
Williamsburg
George Washington Carver Houses
47th Avenue
Windsor Terrace
Boerum Hill Historic District
Reed Street
5th Avenue
East Harlem
Clinton Hill
East Harlem
Bedford-Stuyvesant
Long Island City
DUMBO
Bedford-Stuyvesant
Williamsburg
Bedford-Stuyvesant
Williamsburg
Bedford-Stuyvesant
Williamsburg
Park Slope
Red Hook
Bedford-Stuyvesant
Greenpoint
Bedford-Stuyvesant
Brooklyn Navy Yard
Marcy Houses
Williamsburg
Red Hook Houses
3rd Avenue
Long Island City
Red Hook Houses
Williamsburg Houses
Red Hook Houses
Red Hook Houses
Williamsburg
Red Hook Houses
Red Hook Houses
DUMBO
Long Island City
Tribeca
3rd Avenue
Red Hook
East Harlem
Williamsburg
Franklin Plaza
Long Island City
Chelsea
Williamsburg
8th Avenue
Upper West Side
Lenox Hill
Williamsburg
Hell's Kitchen
East Village
Brooklyn Heights
Greenpoint
Greenpoint
Schermerhorn Street
Stuy Town
Red Hook Houses
Washington Square Village
Greenpoint
Upper East Side
East 

In [9]:
station_df['district'] = district_list

In [10]:
station_df.to_csv('./CitiBikeData/station_info.csv')

In [11]:
def do_geocode2(pair):
    try:
        return geolocator.reverse(pair)[0].split(', ')[-5]
    except GeocoderTimedOut:
        return do_geocode(pair)

In [12]:
borough_list = []

In [None]:
for i, pair in enumerate(station_df.geopair):
    try:
        borough = do_geocode2(pair)
        print(borough)
        borough_list.append(borough)
        if i%20 == 0:
            time.sleep(2)
    except:
        district_list.append(np.nan)

Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan Community Board 5
Manhattan
Manhattan
Brooklyn
Brooklyn
Manhattan
Manhattan
Manhattan
Brooklyn
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Brooklyn
Brooklyn
Manhattan
Brooklyn
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Manhattan
Brooklyn
Manhattan
Manhattan
Manhattan
Manhattan Community Board 1
Manhattan
Manhattan
Manhattan
Manhattan
Brooklyn
Brooklyn
Brooklyn
Manhattan
Manhattan


In [None]:
station_df['borough'] = borough_list

In [None]:
station_df.to_csv('./CitiBikeData/station_info.csv')

In [43]:
import json

In [117]:
with open('./station_information.json') as f:
  data = json.load(f)
#print(json.dumps(data, indent = 4, sort_keys=True))
station_dict = data['data']['stations']
dock_station_df = pd.DataFrame.from_dict(station_dict)[['station_id', 'name', 'capacity']]

In [118]:
with open('./stations.json') as f:
  data1 = json.load(f)
station_dict1 = data1['stationBeanList']
dock_station_df1 = pd.DataFrame.from_dict(station_dict1)[['id', 'stationName', 'totalDocks']]

In [120]:
dock_station_df.columns = ['start_station_ID', 'start_station_name', 'capacity']
dock_station_df1.columns = ['start_station_ID', 'start_station_name', 'capacity']

In [126]:
dock_station_df['capacity1'] = dock_station_df1['capacity']
dock_station_df['capacity'] = dock_station_df[["capacity", "capacity1"]].max(axis=1)
dock_station_df.drop('capacity1', axis = 1, inplace = True)
dock_station_df.to_csv('dock_station.csv')

In [221]:
jun_2019 = pd.read_csv('./CitiBikeOriginal/201906-citibike-tripdata.csv')

In [223]:
jun_2019.columns = ['trip_duration','starttime','stoptime','start_station_ID','start_station_name',
                   'start_station_latitude','start_station_longitude','end_station_ID','end_station_name',
                   'end_station_latitude','end_station_longitude','bike_ID','user_type','birth_year','gender']
jun_2019 = jun_2019.loc[jun_2019['trip_duration']<= 24*3600]  
jun_2019 = jun_2019.loc[(jun_2019['start_station_latitude']>40) & (jun_2019['start_station_latitude']<41)]
jun_2019 = jun_2019.loc[(jun_2019['end_station_latitude']>40) & (jun_2019['end_station_latitude']<41)]
jun_2019['starttime'] = pd.to_datetime(jun_2019['starttime'])
jun_2019['stoptime'] = pd.to_datetime(jun_2019['stoptime'])
jun_2019['start_week'] = jun_2019['starttime'].dt.week
jun_2019['start_dayofweek'] = jun_2019['starttime'].dt.weekday + 1
jun_2019['start_day'] = jun_2019['starttime'].dt.day
jun_2019['start_hour'] = jun_2019['starttime'].dt.hour
jun_2019['stop_hour'] = jun_2019['stoptime'].dt.hour

In [224]:
station = pd.read_csv('./CitiBikeData/station_activity.csv', index_col = 0)
station_info = pd.read_csv('./CitiBikeData/station_info.csv', index_col = 0)
station_info = station_info[['station_name', 'district']]

In [225]:
# merge to get district infomration
station_merged = station.merge(station_info, how = 'left', left_on = 'Station', right_on = 'station_name')
station_merged.drop('station_name', axis = 1, inplace = True)

In [226]:
station_merged

Unnamed: 0,Station,start_hour,net_change,net_change_percent,start_station_longitude,start_station_latitude,Normality,district
0,1 Ave & E 44 St,8,18.552910,0.475072,-73.969053,40.750020,Highly Positive,Tudor City
1,1 Ave & E 44 St,9,12.304348,0.314142,-73.969053,40.750020,Normal,Tudor City
2,1 Ave & E 44 St,16,-10.136082,-0.259416,-73.969053,40.750020,Normal,Tudor City
3,1 Ave & E 44 St,17,-21.878307,-0.559593,-73.969053,40.750020,Highly Negative,Tudor City
4,1 Ave & E 44 St,18,-6.788060,-0.172730,-73.969053,40.750020,Normal,Tudor City
...,...,...,...,...,...,...,...,...
598,West St & Chambers St,9,19.192453,0.639904,-74.013221,40.717548,Highly Positive,Battery Park City
599,West St & Chambers St,16,-2.363359,-0.078915,-74.013221,40.717548,Normal,Battery Park City
600,West St & Chambers St,17,-17.645314,-0.588479,-74.013221,40.717548,Highly Negative,Battery Park City
601,West St & Chambers St,18,-19.978388,-0.666492,-74.013221,40.717548,Highly Negative,Battery Park City


In [227]:
hour = 8
#station_list = []

In [337]:
highly_neg_8_part = station_merged[(station_merged.start_hour == hour) 
                                   & (station_merged.Normality == 'Highly Negative')
                                   & (station_merged.district.isin(['Alphabet City']))]
highneg_station_8 = set(highly_neg_8_part.Station)

In [338]:
highneg_station_8

{'E 2 St & Avenue B',
 'E 2 St & Avenue C',
 'E 5 St & Avenue C',
 'E 7 St & Avenue A',
 'E 9 St & Avenue C'}

In [340]:
trip_record = jun_2019[(jun_2019.start_hour == 8) 
                       & (jun_2019.start_station_name.isin(highneg_station_8))].reset_index()
trip_record

Unnamed: 0,index,trip_duration,starttime,stoptime,start_station_ID,start_station_name,start_station_latitude,start_station_longitude,end_station_ID,end_station_name,end_station_latitude,end_station_longitude,bike_ID,user_type,birth_year,gender,start_week,start_dayofweek,start_day,start_hour,stop_hour
0,5055,343,2019-06-01 08:01:57.355,2019-06-01 08:07:41.311,432,E 7 St & Avenue A,40.726218,-73.983799,285,Broadway & E 14 St,40.734546,-73.990741,30001,Subscriber,1994,2,22,6,1,8,8
1,5185,253,2019-06-01 08:06:10.482,2019-06-01 08:10:24.134,432,E 7 St & Avenue A,40.726218,-73.983799,3263,Cooper Square & Astor Pl,40.729515,-73.990753,19187,Subscriber,1961,1,22,6,1,8,8
2,5467,594,2019-06-01 08:16:04.972,2019-06-01 08:25:59.708,150,E 2 St & Avenue C,40.720874,-73.980858,361,Allen St & Hester St,40.716059,-73.991908,32869,Subscriber,1993,2,22,6,1,8,8
3,5887,1247,2019-06-01 08:28:40.795,2019-06-01 08:49:28.369,394,E 9 St & Avenue C,40.725213,-73.977688,337,Old Slip & Front St,40.703799,-74.008387,18223,Subscriber,1958,1,22,6,1,8,8
4,5900,346,2019-06-01 08:29:03.432,2019-06-01 08:34:49.647,432,E 7 St & Avenue A,40.726218,-73.983799,229,Great Jones St,40.727434,-73.993790,16918,Subscriber,1978,1,22,6,1,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3052,2067579,311,2019-06-30 08:49:42.558,2019-06-30 08:54:53.597,150,E 2 St & Avenue C,40.720874,-73.980858,403,E 2 St & 2 Ave,40.725029,-73.990697,28251,Subscriber,1994,1,26,7,30,8,8
3053,2067620,173,2019-06-30 08:51:06.083,2019-06-30 08:53:59.599,301,E 2 St & Avenue B,40.722174,-73.983688,349,Rivington St & Ridge St,40.718502,-73.983299,39406,Subscriber,1986,1,26,7,30,8,8
3054,2067647,592,2019-06-30 08:52:14.168,2019-06-30 09:02:06.960,150,E 2 St & Avenue C,40.720874,-73.980858,408,Market St & Cherry St,40.710762,-73.994004,39271,Subscriber,1993,1,26,7,30,8,9
3055,2067682,1556,2019-06-30 08:53:23.711,2019-06-30 09:19:20.021,432,E 7 St & Avenue A,40.726218,-73.983799,316,Fulton St & William St,40.709560,-74.006536,38780,Subscriber,1987,1,26,7,30,8,9


In [341]:
def closest_index(trip):
    return jun_2019[(jun_2019.index < trip['index']) & (jun_2019.bike_ID == trip['bike_ID'])].index.max()

In [342]:
last_station_list = []
last_station_hour = []
for i in range(len(trip_record)):
    trip = trip_record.iloc[i,:]
    close_index = closest_index(trip)
    if np.isnan(close_index) == False:
        last_station_list.append(jun_2019.loc[close_index,'end_station_name'])
        last_station_hour.append(jun_2019.loc[close_index,'stoptime'])
    else:
        last_station_list.append(np.nan)
        last_station_hour.append(np.nan)

In [343]:
trip_record['last_end_station'] = last_station_list
trip_record['last_station_stoptime'] = last_station_hour
trip_record['reBalance'] = np.where(trip_record['last_end_station'] == trip_record['start_station_name'], 0, 1)

In [344]:
np.sum(trip_record['reBalance']) - trip_record.isna().sum().last_end_station

498

In [345]:
df_temp = trip_record[trip_record.reBalance == 1].dropna()

In [346]:
df_temp['rebalance_route'] =list(zip(df_temp.last_end_station, df_temp.start_station_name))

In [347]:
df_temp.rebalance_route.value_counts().to_dict()

{('E 10 St & Avenue A', 'E 9 St & Avenue C'): 110,
 ('E 13 St & Avenue A', 'E 9 St & Avenue C'): 29,
 ('E 10 St & Avenue A', 'E 5 St & Avenue C'): 28,
 ('E 7 St & Avenue A', 'E 9 St & Avenue C'): 27,
 ('E 10 St & Avenue A', 'E 7 St & Avenue A'): 19,
 ('E 13 St & Avenue A', 'E 5 St & Avenue C'): 19,
 ('Avenue D & E 12 St', 'E 9 St & Avenue C'): 18,
 ('E 11 St & Avenue B', 'E 9 St & Avenue C'): 16,
 ('E 7 St & Avenue A', 'E 5 St & Avenue C'): 13,
 ('E 13 St & Avenue A', 'E 2 St & Avenue C'): 13,
 ('E 11 St & Avenue B', 'E 5 St & Avenue C'): 13,
 ('Avenue D & E 12 St', 'E 5 St & Avenue C'): 12,
 ('E 7 St & Avenue A', 'E 2 St & Avenue C'): 11,
 ('E 13 St & Avenue A', 'E 7 St & Avenue A'): 10,
 ('E 11 St & Avenue B', 'E 2 St & Avenue C'): 10,
 ('Broadway & Battery Pl', 'E 2 St & Avenue C'): 9,
 ('E 2 St & Avenue B', 'E 5 St & Avenue C'): 8,
 ('E 11 St & Avenue B', 'E 7 St & Avenue A'): 6,
 ('Division St & Bowery', 'E 2 St & Avenue C'): 6,
 ('Avenue D & E 12 St', 'E 2 St & Avenue C'): 6,
 ('

In [349]:
df_temp.head()

Unnamed: 0,index,trip_duration,starttime,stoptime,start_station_ID,start_station_name,start_station_latitude,start_station_longitude,end_station_ID,end_station_name,end_station_latitude,end_station_longitude,bike_ID,user_type,birth_year,gender,start_week,start_dayofweek,start_day,start_hour,stop_hour,last_end_station,last_station_stoptime,reBalance,rebalance_route
72,153689,969,2019-06-03 08:17:25.513,2019-06-03 08:33:34.558,394,E 9 St & Avenue C,40.725213,-73.977688,470,W 20 St & 8 Ave,40.743453,-74.00004,25815,Subscriber,1974,1,23,1,3,8,8,Pierrepont St & Monroe Pl,2019-06-03 07:55:14.065,1,"(Pierrepont St & Monroe Pl, E 9 St & Avenue C)"
217,230385,736,2019-06-04 08:00:55.907,2019-06-04 08:13:12.236,394,E 9 St & Avenue C,40.725213,-73.977688,496,E 16 St & 5 Ave,40.737262,-73.99239,15261,Subscriber,1992,2,23,2,4,8,8,E 13 St & Avenue A,2019-06-03 17:55:10.147,1,"(E 13 St & Avenue A, E 9 St & Avenue C)"
221,230468,1009,2019-06-04 08:01:33.246,2019-06-04 08:18:22.471,150,E 2 St & Avenue C,40.720874,-73.980858,259,South St & Whitehall St,40.701221,-74.012342,29711,Subscriber,1984,1,23,2,4,8,8,E 11 St & Avenue B,2019-06-03 17:39:55.835,1,"(E 11 St & Avenue B, E 2 St & Avenue C)"
225,230670,90,2019-06-04 08:03:22.145,2019-06-04 08:04:52.731,150,E 2 St & Avenue C,40.720874,-73.980858,301,E 2 St & Avenue B,40.722174,-73.983688,29422,Subscriber,1968,1,23,2,4,8,8,E 2 St & Avenue B,2019-06-03 21:47:25.442,1,"(E 2 St & Avenue B, E 2 St & Avenue C)"
228,230866,1038,2019-06-04 08:05:27.076,2019-06-04 08:22:45.265,394,E 9 St & Avenue C,40.725213,-73.977688,362,Broadway & W 37 St,40.751726,-73.987535,32802,Subscriber,1996,1,23,2,4,8,8,E 10 St & Avenue A,2019-06-03 20:22:43.261,1,"(E 10 St & Avenue A, E 9 St & Avenue C)"


In [350]:
df_temp['time_difference'] = df_temp['starttime'] - df_temp['last_station_stoptime']
df_temp['time_difference'] = df_temp.time_difference/2
df_temp['possible_rebalance_time'] = df_temp['last_station_stoptime'] + df_temp['time_difference']

In [351]:
df_temp.groupby('start_station_name').count().sort_values(by = 'index', ascending = False)[['index']]

Unnamed: 0_level_0,index
start_station_name,Unnamed: 1_level_1
E 9 St & Avenue C,242
E 5 St & Avenue C,113
E 2 St & Avenue C,77
E 7 St & Avenue A,56
E 2 St & Avenue B,10


In [357]:
(242+113+77+56+10)/5/23

4.3304347826086955

In [356]:
len(df_temp.start_day.unique())

23