# Visualizing NJ PATH train GTFS data

Data Source: http://www.panynj.gov/path/developers.html

In [1]:
url = "http://data.trilliumtransit.com/gtfs/path-nj-us/path-nj-us.zip"

In [4]:
%%bash

mkdir gtfs
cd gtfs
wget http://data.trilliumtransit.com/gtfs/path-nj-us/path-nj-us.zip
unzip path-nj-us.zip

Archive:  path-nj-us.zip
  inflating: agency.txt              
  inflating: calendar_dates.txt      
  inflating: calendar.txt            
  inflating: fare_attributes.txt     
  inflating: fare_rules.txt          
  inflating: feed_info.txt           
  inflating: frequencies.txt         
  inflating: route_directions.txt    
  inflating: routes.txt              
  inflating: shapes.txt              
  inflating: stop_times.txt          
  inflating: stops.txt               
  inflating: timetable_stop_order-new.txt  
  inflating: timetables-new.txt      
  inflating: transfers.txt           
  inflating: trips.txt               


--2017-04-08 10:19:02--  http://data.trilliumtransit.com/gtfs/path-nj-us/path-nj-us.zip
Resolving data.trilliumtransit.com... 52.24.177.63
Connecting to data.trilliumtransit.com|52.24.177.63|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 180404 (176K) [application/zip]
Saving to: ‘path-nj-us.zip’

     0K .......... .......... .......... .......... .......... 28%  222K 1s
    50K .......... .......... .......... .......... .......... 56%  209K 0s
   100K .......... .......... .......... .......... .......... 85%  423K 0s
   150K .......... .......... ......                          100%  211K=0.7s

2017-04-08 10:19:03 (249 KB/s) - ‘path-nj-us.zip’ saved [180404/180404]



Import libraries:

In [5]:
import pandas as pd
import numpy as np
import datetime as dt

### Load GTFS data

In [7]:
agency = pd.read_csv('gtfs/agency.txt')
calendar_dates = pd.read_csv('gtfs/calendar_dates.txt')
calendar = pd.read_csv('gtfs/calendar.txt')
routes = pd.read_csv('gtfs/routes.txt')
shapes = pd.read_csv('gtfs/shapes.txt')
stop_times = pd.read_csv('gtfs/stop_times.txt')
stops = pd.read_csv('gtfs/stops.txt')
trips = pd.read_csv('gtfs/trips.txt')

### Merge calendar, service_id and trip_id information


In [8]:
service_trip_id = pd.merge(calendar[['service_id']],trips[['service_id', 'trip_id', 'route_id']])
service_trip_id.head()

Unnamed: 0,service_id,trip_id,route_id
0,5302A6349,658979A5302B6349,5745
1,5302A6349,622824A5302B6349,5746
2,5302A6349,660183A5302B6349,5745
3,5302A6349,660184A5302B6349,5746
4,5302A6349,660185A5302B6349,5746


In [9]:
calendar = pd.merge(calendar,service_trip_id)
calendar.head()

Unnamed: 0,service_id,service_name,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,trip_id,route_id
0,5302A6349,Beginning February 20th (Mon-Sun),1,1,1,1,1,1,1,20170208,20180101,658979A5302B6349,5745
1,5302A6349,Beginning February 20th (Mon-Sun),1,1,1,1,1,1,1,20170208,20180101,622824A5302B6349,5746
2,5302A6349,Beginning February 20th (Mon-Sun),1,1,1,1,1,1,1,20170208,20180101,660183A5302B6349,5745
3,5302A6349,Beginning February 20th (Mon-Sun),1,1,1,1,1,1,1,20170208,20180101,660184A5302B6349,5746
4,5302A6349,Beginning February 20th (Mon-Sun),1,1,1,1,1,1,1,20170208,20180101,660185A5302B6349,5746


### Add abritrary arrival and departure dates to timetables

Define a function to add arrival dates to stop_times tables. We will use 1/5/15 as the start date because that is the date we are currently using to visualize taxi trips. This code deals with arrival times > 24 hours and converts them to 1/6/15.

In [10]:
arbitrary_dates = ['1/5/2015', '1/6/2015']

In [11]:
def add_arrival_date(df, dates = arbitrary_dates):
    df = df.copy()
    arrival_date = []
    arrival_time = []

    for i in df['arrival_time']:
        hour = i[:i.find(':')]
        minute = i[i.find(':')+1:i.find(':',4)]
        second = i[i.find(':',5)+1:]
        
        if int(hour) < 24:
            arrival_date.append(dates[0])
            arrival_time.append(i)
        elif 24 <= int(hour) < 48:
            arrival_date.append(dates[1])
            hour = int(hour) - 24
            arrival_time.append(str(hour)+":"+minute+":"+second)
        else:
            arrival_date.append('NA')
            
    df['arrival_date'] = arrival_date
    df['arrival_time'] = arrival_time
    return df

In [12]:
def add_departure_date(df, dates = arbitrary_dates):
    df = df.copy()
    departure_date = []
    departure_time = []

    for i in df['departure_time']:
        hour = i[:i.find(':')]
        minute = i[i.find(':')+1:i.find(':',4)]
        second = i[i.find(':',5)+1:]
        
        if int(hour) < 24:
            departure_date.append(dates[0])
            departure_time.append(i)
        elif 24 <= int(hour) < 48:
            departure_date.append(dates[1])
            hour = int(hour) - 24
            departure_time.append(str(hour)+":"+minute+":"+second)
        else:
            departure_date.append('NA')
            
    df['departure_date'] = departure_date
    df['departure_time'] = departure_time
    return df

### Get timetables for each day of the week

In [13]:
monday = calendar[calendar['monday'] == 1]
tuesday = calendar[calendar['tuesday'] == 1]
wednesday = calendar[calendar['wednesday'] == 1]
thursday = calendar[calendar['thursday'] == 1]
friday = calendar[calendar['friday'] == 1]
saturday = calendar[calendar['saturday'] == 1]
sunday = calendar[calendar['sunday'] == 1]

### Monday


In [14]:
monday.head()

Unnamed: 0,service_id,service_name,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,trip_id,route_id
0,5302A6349,Beginning February 20th (Mon-Sun),1,1,1,1,1,1,1,20170208,20180101,658979A5302B6349,5745
1,5302A6349,Beginning February 20th (Mon-Sun),1,1,1,1,1,1,1,20170208,20180101,622824A5302B6349,5746
2,5302A6349,Beginning February 20th (Mon-Sun),1,1,1,1,1,1,1,20170208,20180101,660183A5302B6349,5745
3,5302A6349,Beginning February 20th (Mon-Sun),1,1,1,1,1,1,1,20170208,20180101,660184A5302B6349,5746
4,5302A6349,Beginning February 20th (Mon-Sun),1,1,1,1,1,1,1,20170208,20180101,660185A5302B6349,5746


In [15]:
monday_trips = monday['trip_id']
print "There are ", len(monday_trips), "trips on Monday"
monday_trips.head()

There are  1039 trips on Monday


0    658979A5302B6349
1    622824A5302B6349
2    660183A5302B6349
3    660184A5302B6349
4    660185A5302B6349
Name: trip_id, dtype: object

In [16]:
monday_trips.tail()

1034    636190A5394B6507
1035    636997A5394B6507
1036    637648A5394B6507
1037    637696A5394B6507
1038    636204A5394B6507
Name: trip_id, dtype: object

In [17]:
triplist = []
count = 0

for i in monday_trips:
    count += 1
    df = stop_times[stop_times['trip_id'] == i]
    
    # add arrival and departure dates
    df = add_arrival_date(df, dates = arbitrary_dates)
    df = add_departure_date(df, dates = arbitrary_dates)
    df['arrival_datetime'] = pd.to_datetime(df['arrival_date'] + ' ' + df['arrival_time'])
    df['departure_datetime'] = pd.to_datetime(df['departure_date'] + ' ' + df['departure_time'])

    # join df with stops
    df = pd.merge(df, stops[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']],left_on='stop_id', right_on='stop_id')

    # join df with trips to get direction and route id
    try:
        df = pd.merge(df, trips[['trip_id', 'direction_id', 'route_id']], left_on='trip_id', right_on='trip_id', how='left')
    except:
        df = pd.merge(df, trips[['trip_id', 'route_id']], left_on='trip_id', right_on='trip_id', how='left')
    
    # join df with routes to get route id
    df = pd.merge(df, routes[['route_id', 'route_long_name']], left_on='route_id', right_on='route_id', how='left')
    
    # create new dataframe to store results
    legs = pd.DataFrame()
    legs['type_id'] = df['route_id']
    legs['starttime'] = df['departure_datetime']
    legs['stoptime'] = df['arrival_datetime'].shift(-1).fillna(method='ffill')
    legs['tripduration'] = ((legs['stoptime'] - legs['starttime'])/np.timedelta64(1, 's')).astype(int)
    legs['start_lat'] = df['stop_lat']
    legs['start_lon'] = df['stop_lon']
    legs['end_lat'] = legs['start_lat'].shift(-1).fillna(method='ffill')
    legs['end_lon'] = legs['start_lon'].shift(-1).fillna(method='ffill')
    
    # append results to triplist
    triplist.append(legs)
    
    if count % 100 == 0:
        print str(dt.datetime.now()), "finished trip number", count, "/", len(monday_trips)

2017-04-08 10:21:32.076240 finished trip number 100 / 1039
2017-04-08 10:21:36.252281 finished trip number 200 / 1039
2017-04-08 10:21:40.544042 finished trip number 300 / 1039
2017-04-08 10:21:44.918853 finished trip number 400 / 1039
2017-04-08 10:21:49.181194 finished trip number 500 / 1039
2017-04-08 10:21:53.493988 finished trip number 600 / 1039
2017-04-08 10:21:57.436426 finished trip number 700 / 1039
2017-04-08 10:22:01.247265 finished trip number 800 / 1039
2017-04-08 10:22:05.068849 finished trip number 900 / 1039
2017-04-08 10:22:08.963401 finished trip number 1000 / 1039


In [18]:
triplist[0].head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,5745,2015-01-05 04:00:00,2015-01-05 04:45:00,2700,40.77481,-73.884575,40.77481,-73.884575
1,5745,2015-01-05 04:45:00,2015-01-05 04:03:00,-2520,40.77481,-73.884575,40.773265,-73.886284
2,5745,2015-01-05 04:03:00,2015-01-05 04:04:00,60,40.773265,-73.886284,40.771058,-73.885487
3,5745,2015-01-05 04:04:00,2015-01-05 04:21:00,1020,40.771058,-73.885487,40.767808,-73.860764
4,5745,2015-01-05 04:21:00,2015-01-05 04:22:00,60,40.767808,-73.860764,40.768344,-73.861868


In [19]:
len(triplist)

1039

In [20]:
data = pd.concat(triplist)

In [21]:
data.head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,5745,2015-01-05 04:00:00,2015-01-05 04:45:00,2700,40.77481,-73.884575,40.77481,-73.884575
1,5745,2015-01-05 04:45:00,2015-01-05 04:03:00,-2520,40.77481,-73.884575,40.773265,-73.886284
2,5745,2015-01-05 04:03:00,2015-01-05 04:04:00,60,40.773265,-73.886284,40.771058,-73.885487
3,5745,2015-01-05 04:04:00,2015-01-05 04:21:00,1020,40.771058,-73.885487,40.767808,-73.860764
4,5745,2015-01-05 04:21:00,2015-01-05 04:22:00,60,40.767808,-73.860764,40.768344,-73.861868


In [22]:
data.shape

(6458, 8)

In [23]:
data = data[data.tripduration != 0]

In [24]:
data = data[data.start_lon != 0]

In [25]:
data = data.sort_values(by='starttime')
data = data.reset_index(drop=True)

In [26]:
data.shape

(5419, 8)

In [27]:
data['type_id'] = 12 # 12 = PATH train

In [28]:
data.to_csv('path_monday_raw.csv')

In [29]:
%%bash 

head path_monday_raw.csv

,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,12,2015-01-05 00:10:00,2015-01-05 00:14:00,240,40.73301,-74.06289,40.71966,-74.04245
1,12,2015-01-05 00:10:00,2015-01-05 00:12:00,120,40.74912,-73.98827,40.7429,-73.99278
2,12,2015-01-05 00:12:00,2015-01-05 00:13:00,60,40.7429,-73.99278,40.73735,-73.99684
3,12,2015-01-05 00:13:00,2015-01-05 00:14:00,60,40.73735,-73.99684,40.73424,-73.9991
4,12,2015-01-05 00:14:00,2015-01-05 00:16:00,120,40.73424,-73.9991,40.73295,-74.00707
5,12,2015-01-05 00:14:00,2015-01-05 00:18:00,240,40.71966,-74.04245,40.72699,-74.03383
6,12,2015-01-05 00:16:00,2015-01-05 00:23:00,420,40.73295,-74.00707,40.73586,-74.02922
7,12,2015-01-05 00:18:00,2015-01-05 00:20:00,120,40.72699,-74.03383,40.73586,-74.02922
8,12,2015-01-05 00:23:00,2015-01-05 00:30:00,420,40.73586,-74.02922,40.73295,-74.00707


In [30]:
%%bash

awk -F"," -v OFS="," '{ $1="" ; $0=substr($0,2) } 1' path_monday_raw.csv > path_monday.csv

In [31]:
%%bash 

head path_monday.csv

type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
12,2015-01-05 00:10:00,2015-01-05 00:14:00,240,40.73301,-74.06289,40.71966,-74.04245
12,2015-01-05 00:10:00,2015-01-05 00:12:00,120,40.74912,-73.98827,40.7429,-73.99278
12,2015-01-05 00:12:00,2015-01-05 00:13:00,60,40.7429,-73.99278,40.73735,-73.99684
12,2015-01-05 00:13:00,2015-01-05 00:14:00,60,40.73735,-73.99684,40.73424,-73.9991
12,2015-01-05 00:14:00,2015-01-05 00:16:00,120,40.73424,-73.9991,40.73295,-74.00707
12,2015-01-05 00:14:00,2015-01-05 00:18:00,240,40.71966,-74.04245,40.72699,-74.03383
12,2015-01-05 00:16:00,2015-01-05 00:23:00,420,40.73295,-74.00707,40.73586,-74.02922
12,2015-01-05 00:18:00,2015-01-05 00:20:00,120,40.72699,-74.03383,40.73586,-74.02922
12,2015-01-05 00:23:00,2015-01-05 00:30:00,420,40.73586,-74.02922,40.73295,-74.00707
