# Visualizing NYC Waterways Data

Data source: https://github.com/BetaNYC/NY-Waterways-GTFS-data

In [1]:
%%bash

git clone https://github.com/BetaNYC/NY-Waterways-GTFS-data

Cloning into 'NY-Waterways-GTFS-data'...


In [2]:
import pandas as pd
import numpy as np
import datetime as dt

In [4]:
agency = pd.read_csv('NY-Waterways-GTFS-data/Data/agency.txt')
calendar_dates = pd.read_csv('NY-Waterways-GTFS-data/Data/calendar_dates.txt')
calendar = pd.read_csv('NY-Waterways-GTFS-data/Data/calendar.txt')
routes = pd.read_csv('NY-Waterways-GTFS-data/Data/routes.txt')
shapes = pd.read_csv('NY-Waterways-GTFS-data/Data/shapes.txt')
stop_times = pd.read_csv('NY-Waterways-GTFS-data/Data/stop_times.txt')
stops = pd.read_csv('NY-Waterways-GTFS-data/Data/stops.txt')
trips = pd.read_csv('NY-Waterways-GTFS-data/Data/trips.txt')

In [5]:
service_trip_id = pd.merge(calendar[['service_id']],trips[['service_id', 'trip_id', 'route_id']])
service_trip_id.head()

Unnamed: 0,service_id,trip_id,route_id
0,2012ERFSummerWeekday,ERF-NORTH-SUMMER-WEEKDAY-01,ER
1,2012ERFSummerWeekday,ERF-NORTH-SUMMER-WEEKDAY-02,ER
2,2012ERFSummerWeekday,ERF-NORTH-SUMMER-WEEKDAY-03,ER
3,2012ERFSummerWeekday,ERF-NORTH-SUMMER-WEEKDAY-04,ER
4,2012ERFSummerWeekday,ERF-NORTH-SUMMER-WEEKDAY-05,ER


In [6]:
calendar = pd.merge(calendar,service_trip_id)
calendar.head()

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,trip_id,route_id
0,2012ERFSummerWeekday,0,0,0,0,0,0,0,20120401,20130525,ERF-NORTH-SUMMER-WEEKDAY-01,ER
1,2012ERFSummerWeekday,0,0,0,0,0,0,0,20120401,20130525,ERF-NORTH-SUMMER-WEEKDAY-02,ER
2,2012ERFSummerWeekday,0,0,0,0,0,0,0,20120401,20130525,ERF-NORTH-SUMMER-WEEKDAY-03,ER
3,2012ERFSummerWeekday,0,0,0,0,0,0,0,20120401,20130525,ERF-NORTH-SUMMER-WEEKDAY-04,ER
4,2012ERFSummerWeekday,0,0,0,0,0,0,0,20120401,20130525,ERF-NORTH-SUMMER-WEEKDAY-05,ER


### Add abritrary arrival and departure dates to timetables

Define a function to add arrival dates to stop_times tables. We will use 1/5/15 as the start date because that is the date we are currently using to visualize taxi trips. This code deals with arrival times > 24 hours and converts them to 1/6/15.

In [8]:
arbitrary_dates = ['1/5/2015', '1/6/2015']

In [9]:
def add_arrival_date(df, dates = arbitrary_dates):
    df = df.copy()
    arrival_date = []
    arrival_time = []

    for i in df['arrival_time']:
        hour = i[:i.find(':')]
        minute = i[i.find(':')+1:i.find(':',4)]
        second = i[i.find(':',5)+1:]
        
        if int(hour) < 24:
            arrival_date.append(dates[0])
            arrival_time.append(i)
        elif 24 <= int(hour) < 48:
            arrival_date.append(dates[1])
            hour = int(hour) - 24
            arrival_time.append(str(hour)+":"+minute+":"+second)
        else:
            arrival_date.append('NA')
            
    df['arrival_date'] = arrival_date
    df['arrival_time'] = arrival_time
    return df

In [10]:
def add_departure_date(df, dates = arbitrary_dates):
    df = df.copy()
    departure_date = []
    departure_time = []

    for i in df['departure_time']:
        hour = i[:i.find(':')]
        minute = i[i.find(':')+1:i.find(':',4)]
        second = i[i.find(':',5)+1:]
        
        if int(hour) < 24:
            departure_date.append(dates[0])
            departure_time.append(i)
        elif 24 <= int(hour) < 48:
            departure_date.append(dates[1])
            hour = int(hour) - 24
            departure_time.append(str(hour)+":"+minute+":"+second)
        else:
            departure_date.append('NA')
            
    df['departure_date'] = departure_date
    df['departure_time'] = departure_time
    return df

In [11]:
monday = calendar[calendar['monday'] == 1]
tuesday = calendar[calendar['tuesday'] == 1]
wednesday = calendar[calendar['wednesday'] == 1]
thursday = calendar[calendar['thursday'] == 1]
friday = calendar[calendar['friday'] == 1]
saturday = calendar[calendar['saturday'] == 1]
sunday = calendar[calendar['sunday'] == 1]

## Monday

In [12]:
monday_trips = monday['trip_id']
print "There are ", len(monday_trips), "trips on Monday"
monday_trips.head()

There are  1647 trips on Monday


87    BELFORD-PAULUS-WEEKDAY-01
88    BELFORD-PAULUS-WEEKDAY-02
89    BELFORD-PAULUS-WEEKDAY-03
90    BELFORD-PAULUS-WEEKDAY-04
91    BELFORD-PAULUS-WEEKDAY-05
Name: trip_id, dtype: object

In [13]:
triplist = []
count = 0

for i in monday_trips:
    count += 1
    df = stop_times[stop_times['trip_id'] == i]
    
    # add arrival and departure dates
    df = add_arrival_date(df, dates = arbitrary_dates)
    df = add_departure_date(df, dates = arbitrary_dates)
    df['arrival_datetime'] = pd.to_datetime(df['arrival_date'] + ' ' + df['arrival_time'])
    df['departure_datetime'] = pd.to_datetime(df['departure_date'] + ' ' + df['departure_time'])

    # join df with stops
    df = pd.merge(df, stops[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']],left_on='stop_id', right_on='stop_id')

    # join df with trips to get direction and route id
    try:
        df = pd.merge(df, trips[['trip_id', 'direction_id', 'route_id']], left_on='trip_id', right_on='trip_id', how='left')
    except:
        df = pd.merge(df, trips[['trip_id', 'route_id']], left_on='trip_id', right_on='trip_id', how='left')
    
    # join df with routes to get route id
    df = pd.merge(df, routes[['route_id', 'route_long_name']], left_on='route_id', right_on='route_id', how='left')
    
    # create new dataframe to store results
    legs = pd.DataFrame()
    legs['type_id'] = df['route_id']
    legs['starttime'] = df['departure_datetime']
    legs['stoptime'] = df['arrival_datetime'].shift(-1).fillna(method='ffill')
    legs['tripduration'] = ((legs['stoptime'] - legs['starttime'])/np.timedelta64(1, 's')).astype(int)
    legs['start_lat'] = df['stop_lat']
    legs['start_lon'] = df['stop_lon']
    legs['end_lat'] = legs['start_lat'].shift(-1).fillna(method='ffill')
    legs['end_lon'] = legs['start_lon'].shift(-1).fillna(method='ffill')
    
    # append results to triplist
    triplist.append(legs)
    
    if count % 100 == 0:
        print str(dt.datetime.now()), "finished trip number", count, "/", len(monday_trips)

2017-04-08 12:20:33.519282 finished trip number 100 / 1647
2017-04-08 12:20:37.221995 finished trip number 200 / 1647
2017-04-08 12:20:40.920204 finished trip number 300 / 1647
2017-04-08 12:20:44.621693 finished trip number 400 / 1647
2017-04-08 12:20:48.298338 finished trip number 500 / 1647
2017-04-08 12:20:52.221854 finished trip number 600 / 1647
2017-04-08 12:20:56.332659 finished trip number 700 / 1647
2017-04-08 12:20:59.659532 finished trip number 800 / 1647
2017-04-08 12:21:03.251423 finished trip number 900 / 1647
2017-04-08 12:21:07.390588 finished trip number 1000 / 1647
2017-04-08 12:21:11.321864 finished trip number 1100 / 1647
2017-04-08 12:21:15.230589 finished trip number 1200 / 1647
2017-04-08 12:21:18.873882 finished trip number 1300 / 1647
2017-04-08 12:21:22.139726 finished trip number 1400 / 1647
2017-04-08 12:21:26.149447 finished trip number 1500 / 1647
2017-04-08 12:21:30.623964 finished trip number 1600 / 1647


In [14]:
triplist[0].head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,BELFORD+PAULUS,2015-01-05 05:45:00,2015-01-05 06:33:00,2880,40.433258,-74.078826,40.71383,-74.033207
1,BELFORD+PAULUS,2015-01-05 06:33:00,2015-01-05 06:33:00,0,40.71383,-74.033207,40.71383,-74.033207


In [15]:
len(triplist)

1647

In [16]:
data = pd.concat(triplist)

In [17]:
data = data[data.tripduration != 0]

In [18]:
data = data[data.start_lon != 0]

In [19]:
data = data.sort_values(by='starttime')
data = data.reset_index(drop=True)

In [20]:
data.shape

(2157, 8)

In [21]:
data['type_id'] = 16 # 16 = waterways

In [22]:
data.to_csv('waterways_monday_raw.csv')

In [23]:
%%bash 

head waterways_monday_raw.csv

,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,16,2015-01-05 05:40:00,2015-01-05 05:50:00,600,41.500896,-74.005494,41.505781,-73.985538
1,16,2015-01-05 05:45:00,2015-01-05 06:33:00,2880,40.433258,-74.078826,40.71383,-74.033207
2,16,2015-01-05 05:45:00,2015-01-05 06:35:00,3000,40.433258,-74.078826,40.715145,-74.017282
3,16,2015-01-05 05:45:00,2015-01-05 06:33:00,2880,40.433258,-74.078826,40.703611,-74.006631
4,16,2015-01-05 05:49:00,2015-01-05 05:59:00,600,41.505781,-73.985538,41.500896,-74.005494
5,16,2015-01-05 05:53:00,2015-01-05 06:04:00,660,41.192703,-73.95833,41.157187,-73.870225
6,16,2015-01-05 06:00:00,2015-01-05 06:07:00,420,40.71383,-74.033207,40.715145,-74.017282
7,16,2015-01-05 06:00:00,2015-01-05 06:08:00,480,40.752735,-74.023132,40.760147,-74.003069
8,16,2015-01-05 06:00:00,2015-01-05 06:20:00,1200,40.687798,-74.074674,40.703611,-74.006631


In [24]:
%%bash

awk -F"," -v OFS="," '{ $1="" ; $0=substr($0,2) } 1' waterways_monday_raw.csv > waterways_monday.csv

In [25]:
%%bash 

head waterways_monday.csv

type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
16,2015-01-05 05:40:00,2015-01-05 05:50:00,600,41.500896,-74.005494,41.505781,-73.985538
16,2015-01-05 05:45:00,2015-01-05 06:33:00,2880,40.433258,-74.078826,40.71383,-74.033207
16,2015-01-05 05:45:00,2015-01-05 06:35:00,3000,40.433258,-74.078826,40.715145,-74.017282
16,2015-01-05 05:45:00,2015-01-05 06:33:00,2880,40.433258,-74.078826,40.703611,-74.006631
16,2015-01-05 05:49:00,2015-01-05 05:59:00,600,41.505781,-73.985538,41.500896,-74.005494
16,2015-01-05 05:53:00,2015-01-05 06:04:00,660,41.192703,-73.95833,41.157187,-73.870225
16,2015-01-05 06:00:00,2015-01-05 06:07:00,420,40.71383,-74.033207,40.715145,-74.017282
16,2015-01-05 06:00:00,2015-01-05 06:08:00,480,40.752735,-74.023132,40.760147,-74.003069
16,2015-01-05 06:00:00,2015-01-05 06:20:00,1200,40.687798,-74.074674,40.703611,-74.006631
