# Visualizing Private Bus Company GTFS Data in NYC

Data source: http://web.mta.info/developers/developer-data-terms.html#data

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# Load each GTFS text table into its own DataFrame; the files are read in the
# order listed and bound to the matching names on the left.
(agency, calendar_dates, calendar, routes,
 shapes, stop_times, stops, trips) = [
    pd.read_csv(gtfs_path)
    for gtfs_path in (
        'gtfs/agency.txt',
        'gtfs/calendar_dates.txt',
        'gtfs/calendar.txt',
        'gtfs/routes.txt',
        'gtfs/shapes.txt',
        'gtfs/stop_times.txt',
        'gtfs/stops.txt',
        'gtfs/trips.txt',
    )
]

In [3]:
# Pair every service_id listed in calendar with the trips (and their routes)
# that run under it. service_id is the only column the two inputs share.
service_trip_id = calendar[['service_id']].merge(
    trips[['service_id', 'trip_id', 'route_id']],
    on='service_id',
    how='inner',
)
service_trip_id.head()

Unnamed: 0,service_id,trip_id,route_id
0,BPPB7-BP_B7-Sunday-10,15411672-BPPB7-BP_B7-Sunday-10,Q64
1,BPPB7-BP_B7-Sunday-10,15411673-BPPB7-BP_B7-Sunday-10,Q64
2,BPPB7-BP_B7-Sunday-10,15411674-BPPB7-BP_B7-Sunday-10,Q64
3,BPPB7-BP_B7-Sunday-10,15411675-BPPB7-BP_B7-Sunday-10,Q64
4,BPPB7-BP_B7-Sunday-10,15411676-BPPB7-BP_B7-Sunday-10,Q64


In [4]:
# Attach trip_id / route_id to each calendar row. The join keys are left
# implicit (all shared columns), exactly as pd.merge does by default.
calendar = calendar.merge(service_trip_id, how='inner')
calendar.head()

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,trip_id,route_id
0,BPPB7-BP_B7-Sunday-10,0,0,0,0,0,0,1,20170409,20170625,15411672-BPPB7-BP_B7-Sunday-10,Q64
1,BPPB7-BP_B7-Sunday-10,0,0,0,0,0,0,1,20170409,20170625,15411673-BPPB7-BP_B7-Sunday-10,Q64
2,BPPB7-BP_B7-Sunday-10,0,0,0,0,0,0,1,20170409,20170625,15411674-BPPB7-BP_B7-Sunday-10,Q64
3,BPPB7-BP_B7-Sunday-10,0,0,0,0,0,0,1,20170409,20170625,15411675-BPPB7-BP_B7-Sunday-10,Q64
4,BPPB7-BP_B7-Sunday-10,0,0,0,0,0,0,1,20170409,20170625,15411676-BPPB7-BP_B7-Sunday-10,Q64


### Add arbitrary arrival and departure dates to timetables

Define functions to add arrival and departure dates to the stop_times table. We will use 1/5/15 as the start date because that is the date we are currently using to visualize taxi trips. GTFS times can exceed 24 hours for trips that run past midnight; this code moves times of 24:00:00 or later to the next day (1/6/15) and subtracts 24 from the hour.

In [6]:
# Placeholder dates attached to the timetable: index 0 is used for times with
# hour < 24, index 1 for times with 24 <= hour < 48 (see add_arrival_date and
# add_departure_date).
arbitrary_dates = ['1/5/2015', '1/6/2015']

In [7]:
def add_arrival_date(df, dates = arbitrary_dates):
    """Return a copy of ``df`` with an ``arrival_date`` column and wrapped times.

    Parameters
    ----------
    df : DataFrame
        Must contain an ``arrival_time`` column of 'H:MM:SS' / 'HH:MM:SS'
        strings; the hour may exceed 23.
    dates : sequence of two date strings
        ``dates[0]`` is assigned to times with hour < 24, ``dates[1]`` to
        times with 24 <= hour < 48.

    Returns
    -------
    DataFrame
        A copy of ``df`` where ``arrival_date`` holds the assigned date and
        ``arrival_time`` has 24 subtracted from the hour for next-day times.
        Times with hour >= 48 get the date 'NA' and keep their original time
        string, so both new columns always have one entry per input row.
    """
    df = df.copy()
    arrival_date = []
    arrival_time = []

    for raw_time in df['arrival_time']:
        # Split at the first ':' only, so one- and two-digit hours are handled
        # alike and the minutes/seconds part is carried over untouched.
        hour_text, _, remainder = raw_time.partition(':')
        hour = int(hour_text)

        if hour < 24:
            arrival_date.append(dates[0])
            arrival_time.append(raw_time)
        elif hour < 48:
            arrival_date.append(dates[1])
            arrival_time.append(str(hour - 24) + ":" + remainder)
        else:
            # Previously only the date list grew in this branch, so the column
            # assignment below failed on a length mismatch.
            arrival_date.append('NA')
            arrival_time.append(raw_time)

    df['arrival_date'] = arrival_date
    df['arrival_time'] = arrival_time
    return df

In [8]:
def add_departure_date(df, dates = arbitrary_dates):
    """Return a copy of ``df`` with a ``departure_date`` column and wrapped times.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``departure_time`` column of 'H:MM:SS' / 'HH:MM:SS'
        strings; the hour may exceed 23.
    dates : sequence of two date strings
        ``dates[0]`` is assigned to times with hour < 24, ``dates[1]`` to
        times with 24 <= hour < 48.

    Returns
    -------
    DataFrame
        A copy of ``df`` where ``departure_date`` holds the assigned date and
        ``departure_time`` has 24 subtracted from the hour for next-day times.
        Times with hour >= 48 get the date 'NA' and keep their original time
        string, so both new columns always have one entry per input row.
    """
    df = df.copy()
    departure_date = []
    departure_time = []

    for raw_time in df['departure_time']:
        # Split at the first ':' only, so one- and two-digit hours are handled
        # alike and the minutes/seconds part is carried over untouched.
        hour_text, _, remainder = raw_time.partition(':')
        hour = int(hour_text)

        if hour < 24:
            departure_date.append(dates[0])
            departure_time.append(raw_time)
        elif hour < 48:
            departure_date.append(dates[1])
            departure_time.append(str(hour - 24) + ":" + remainder)
        else:
            # Previously only the date list grew in this branch, so the column
            # assignment below failed on a length mismatch.
            departure_date.append('NA')
            departure_time.append(raw_time)

    df['departure_date'] = departure_date
    df['departure_time'] = departure_time
    return df

In [9]:
def _runs_on(day_column):
    """Rows of calendar whose flag in ``day_column`` equals 1."""
    flag = calendar[day_column]
    return calendar[flag == 1]

# One subset of calendar per weekday flag column.
monday = _runs_on('monday')
tuesday = _runs_on('tuesday')
wednesday = _runs_on('wednesday')
thursday = _runs_on('thursday')
friday = _runs_on('friday')
saturday = _runs_on('saturday')
sunday = _runs_on('sunday')

## Monday

In [10]:
# trip_ids of every trip whose service runs on Mondays.
monday_trips = monday['trip_id']
# A single-argument print call emits the same text whether it is parsed as a
# Python 2 print statement or a Python 3 function call. The double space
# after "are" reproduces the original comma-separated output.
print("There are  %d trips on Monday" % len(monday_trips))
monday_trips.head()

There are  10783 trips on Monday


817    15408564-BPPB7-BP_B7-Weekday-10-SDon
818    15408565-BPPB7-BP_B7-Weekday-10-SDon
819    15408566-BPPB7-BP_B7-Weekday-10-SDon
820    15408567-BPPB7-BP_B7-Weekday-10-SDon
821    15408568-BPPB7-BP_B7-Weekday-10-SDon
Name: trip_id, dtype: object

In [11]:
# Build one DataFrame of stop-to-stop legs per Monday trip, in the order the
# trips appear in monday_trips.
triplist = []
count = 0
total_trips = len(monday_trips)

# Loop-invariant lookup tables, sliced once instead of once per trip.
stop_lookup = stops[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]
route_lookup = routes[['route_id', 'route_long_name']]
# Fall back to the narrower selection when trips has no direction_id column.
# This replaces a bare `except:` that would also have hidden unrelated errors.
if 'direction_id' in trips.columns:
    trip_lookup = trips[['trip_id', 'direction_id', 'route_id']]
else:
    trip_lookup = trips[['trip_id', 'route_id']]

# Group stop_times by trip once; the previous version re-scanned the whole
# stop_times table for every trip.
monday_stop_times = stop_times[stop_times['trip_id'].isin(monday_trips)]
stop_times_by_trip = dict(iter(monday_stop_times.groupby('trip_id', sort=False)))
no_stop_times = stop_times.iloc[0:0]  # what the old boolean filter gave for an unknown trip

for i in monday_trips:
    count += 1
    df = stop_times_by_trip.get(i, no_stop_times)

    # Legs pair each row with the next one, so rows must be in stop order.
    # Use stop_sequence when available (stable sort keeps file order for ties).
    if 'stop_sequence' in df.columns:
        df = df.sort_values(by='stop_sequence', kind='mergesort')

    # add arrival and departure dates
    df = add_arrival_date(df, dates = arbitrary_dates)
    df = add_departure_date(df, dates = arbitrary_dates)
    df['arrival_datetime'] = pd.to_datetime(df['arrival_date'] + ' ' + df['arrival_time'])
    df['departure_datetime'] = pd.to_datetime(df['departure_date'] + ' ' + df['departure_time'])

    # join df with stops
    df = pd.merge(df, stop_lookup, left_on='stop_id', right_on='stop_id')

    # join df with trips to get direction (when present) and route id
    df = pd.merge(df, trip_lookup, left_on='trip_id', right_on='trip_id', how='left')

    # join df with routes to get the route name
    df = pd.merge(df, route_lookup, left_on='route_id', right_on='route_id', how='left')

    # create new dataframe to store results: one row per stop, describing the
    # leg from that stop to the next one
    legs = pd.DataFrame()
    legs['type_id'] = df['route_id']
    legs['starttime'] = df['departure_datetime']
    # The last stop has no successor; forward-fill so it carries the previous
    # row's values (.ffill() replaces the deprecated fillna(method='ffill')).
    legs['stoptime'] = df['arrival_datetime'].shift(-1).ffill()
    legs['tripduration'] = ((legs['stoptime'] - legs['starttime'])/np.timedelta64(1, 's')).astype(int)
    legs['start_lat'] = df['stop_lat']
    legs['start_lon'] = df['stop_lon']
    legs['end_lat'] = legs['start_lat'].shift(-1).ffill()
    legs['end_lon'] = legs['start_lon'].shift(-1).ffill()

    # append results to triplist
    triplist.append(legs)

    if count % 100 == 0:
        # Single-argument print: same output on Python 2 and Python 3.
        print("%s finished trip number %d / %d" % (dt.datetime.now(), count, total_trips))

2017-04-08 12:03:10.814570 finished trip number 100 / 10783
2017-04-08 12:03:20.469591 finished trip number 200 / 10783
2017-04-08 12:03:30.341038 finished trip number 300 / 10783
2017-04-08 12:03:43.637175 finished trip number 400 / 10783
2017-04-08 12:03:56.531381 finished trip number 500 / 10783
2017-04-08 12:04:09.072302 finished trip number 600 / 10783
2017-04-08 12:04:20.430129 finished trip number 700 / 10783
2017-04-08 12:04:30.994823 finished trip number 800 / 10783
2017-04-08 12:04:42.460349 finished trip number 900 / 10783
2017-04-08 12:04:52.728138 finished trip number 1000 / 10783
2017-04-08 12:05:03.373361 finished trip number 1100 / 10783
2017-04-08 12:05:14.630538 finished trip number 1200 / 10783
2017-04-08 12:05:26.383442 finished trip number 1300 / 10783
2017-04-08 12:05:36.865267 finished trip number 1400 / 10783
2017-04-08 12:05:48.966804 finished trip number 1500 / 10783
2017-04-08 12:06:00.787077 finished trip number 1600 / 10783
2017-04-08 12:06:12.081417 finish

In [12]:
# Preview the legs built for the first Monday trip.
triplist[0].head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,Q64,2015-01-05 00:00:00,2015-01-05 00:00:29,29,40.731841,-73.805384,40.732107,-73.808141
1,Q64,2015-01-05 00:00:29,2015-01-05 00:01:00,31,40.732107,-73.808141,40.732469,-73.811062
2,Q64,2015-01-05 00:01:00,2015-01-05 00:01:24,24,40.732469,-73.811062,40.732236,-73.812835
3,Q64,2015-01-05 00:01:24,2015-01-05 00:01:58,34,40.732236,-73.812835,40.732135,-73.815248
4,Q64,2015-01-05 00:01:58,2015-01-05 00:02:33,35,40.732135,-73.815248,40.731808,-73.817786


In [13]:
# Number of per-trip leg tables collected (one is appended per Monday trip).
len(triplist)

10783

In [14]:
# Stack all per-trip leg tables row-wise into a single DataFrame; the original
# per-trip index labels are kept.
data = pd.concat(triplist, axis=0)

In [15]:
# Keep only legs whose duration is non-zero.
has_duration = data['tripduration'] != 0
data = data[has_duration]

In [16]:
# Keep only legs whose starting longitude is non-zero.
has_start_lon = data['start_lon'] != 0
data = data[has_start_lon]

In [17]:
# Order legs chronologically by start time and renumber rows from 0.
data = (
    data
    .sort_values(by='starttime')
    .reset_index(drop=True)
)

In [18]:
# (rows, columns) of the filtered, sorted leg table.
data.shape

(340895, 8)

In [19]:
# Replace the per-route type_id with a single numeric code for all rows.
BUS_COMPANY_TYPE_ID = 13  # 13 = bus company
data['type_id'] = BUS_COMPANY_TYPE_ID

In [20]:
# Write the legs to disk. The row index is written as the first (unnamed)
# column, which the awk cell further down strips off again.
data.to_csv('bus_company_monday_raw.csv', index=True)

In [21]:
%%bash
# Show the first 10 lines of the raw export (header plus 9 rows).
head -n 10 bus_company_monday_raw.csv

,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,13,2015-01-05 00:00:00,2015-01-05 00:00:29,29,40.731841,-73.805384,40.732107,-73.808141
1,13,2015-01-05 00:00:00,2015-01-05 00:01:11,71,40.773573,-73.89329,40.772406,-73.895761
2,13,2015-01-05 00:00:00,2015-01-05 00:00:09,9,40.770302,-73.875496,40.770078,-73.876107
3,13,2015-01-05 00:00:00,2015-01-05 00:01:15,75,40.759351,-73.830739,40.762739,-73.831742
4,13,2015-01-05 00:00:00,2015-01-05 00:04:00,240,40.745603,-73.902741,40.746288,-73.890604
5,13,2015-01-05 00:00:00,2015-01-05 00:01:03,63,40.746637,-73.890854,40.74704,-73.889365
6,13,2015-01-05 00:00:00,2015-01-05 00:04:00,240,40.745696,-73.902305,40.746515,-73.890689
7,13,2015-01-05 00:00:09,2015-01-05 00:00:32,23,40.770078,-73.876107,40.768294,-73.87641
8,13,2015-01-05 00:00:29,2015-01-05 00:01:00,31,40.732107,-73.808141,40.732469,-73.811062


In [22]:
%%bash
# Drop the first comma-separated field (the pandas row index) from every line:
# blank field 1, then cut the leading separator left behind.
awk 'BEGIN { FS = ","; OFS = "," } { $1 = ""; $0 = substr($0, 2); print }' \
    bus_company_monday_raw.csv > bus_company_monday.csv

In [23]:
%%bash
# Show the first 10 lines of the cleaned export to confirm the index column is gone.
head -n 10 bus_company_monday.csv

type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
13,2015-01-05 00:00:00,2015-01-05 00:00:29,29,40.731841,-73.805384,40.732107,-73.808141
13,2015-01-05 00:00:00,2015-01-05 00:01:11,71,40.773573,-73.89329,40.772406,-73.895761
13,2015-01-05 00:00:00,2015-01-05 00:00:09,9,40.770302,-73.875496,40.770078,-73.876107
13,2015-01-05 00:00:00,2015-01-05 00:01:15,75,40.759351,-73.830739,40.762739,-73.831742
13,2015-01-05 00:00:00,2015-01-05 00:04:00,240,40.745603,-73.902741,40.746288,-73.890604
13,2015-01-05 00:00:00,2015-01-05 00:01:03,63,40.746637,-73.890854,40.74704,-73.889365
13,2015-01-05 00:00:00,2015-01-05 00:04:00,240,40.745696,-73.902305,40.746515,-73.890689
13,2015-01-05 00:00:09,2015-01-05 00:00:32,23,40.770078,-73.876107,40.768294,-73.87641
13,2015-01-05 00:00:29,2015-01-05 00:01:00,31,40.732107,-73.808141,40.732469,-73.811062
