# Visualizing NYC Subway Timetables

Data source: http://web.mta.info/developers/data/nyct/subway/google_transit.zip

Import libraries:

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

### Load GTFS data

In [22]:
# Read every GTFS text file into its own DataFrame. The names on the left are
# bound, in order, to the files listed on the right.
(agency, calendar_dates, calendar, routes,
 shapes, stop_times, stops, trips) = [
    pd.read_csv(gtfs_path)
    for gtfs_path in (
        'gtfs/agency.txt',
        'gtfs/calendar_dates.txt',
        'gtfs/calendar.txt',
        'gtfs/routes.txt',
        'gtfs/shapes.txt',
        'gtfs/stop_times.txt',
        'gtfs/stops.txt',
        'gtfs/trips.txt',
    )
]

### Preview the data

In [23]:
agency.head()

Unnamed: 0,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone
0,MTA NYCT,MTA New York City Transit,http://www.mta.info,America/New_York,en,718-330-1234


In [24]:
calendar_dates.head()

Unnamed: 0,service_id,date,exception_type
0,A20161106WKD,20161124,2
1,A20161106SUN,20161124,1
2,A20161106WKD,20161226,2
3,A20161106SUN,20161226,1
4,A20161106WKD,20170102,2


In [25]:
calendar.head()

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
0,A20161106WKD,1,1,1,1,1,0,0,20161106,20171231
1,A20161106SAT,0,0,0,0,0,1,0,20161106,20171231
2,A20161106SUN,0,0,0,0,0,0,1,20161106,20171231
3,B20161106WKD,1,1,1,1,1,0,0,20161106,20171231
4,B20161106SAT,0,0,0,0,0,1,0,20161106,20171231


In [26]:
routes.head()

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,1,MTA NYCT,1,Broadway - 7 Avenue Local,Trains operate between 242 St in the Bronx and...,1,http://web.mta.info/nyct/service/pdf/t1cur.pdf,EE352E,
1,2,MTA NYCT,2,7 Avenue Express,"Trains operate between Wakefield-241 St, Bronx...",1,http://web.mta.info/nyct/service/pdf/t2cur.pdf,EE352E,
2,3,MTA NYCT,3,7 Avenue Express,"Trains operate between 148 St, 7 Av, Manhattan...",1,http://web.mta.info/nyct/service/pdf/t3cur.pdf,EE352E,
3,4,MTA NYCT,4,Lexington Avenue Express,Trains operate daily between Woodlawn/Jerome A...,1,http://web.mta.info/nyct/service/pdf/t4cur.pdf,00933C,
4,5,MTA NYCT,5,Lexington Avenue Express,"Weekdays daytime, most trains operate between ...",1,http://web.mta.info/nyct/service/pdf/t5cur.pdf,00933C,


In [27]:
shapes.head()

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,1..N03R,40.702068,-74.013664,0,
1,1..N03R,40.703199,-74.014792,1,
2,1..N03R,40.703226,-74.01482,2,
3,1..N03R,40.703253,-74.014846,3,
4,1..N03R,40.70328,-74.01487,4,


In [28]:
stop_times.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,A20161106SAT_036000_GS.N01R,06:00:00,06:00:00,901N,1,,0,0,
1,A20161106SAT_036000_GS.N01R,06:01:30,06:01:30,902N,2,,0,0,
2,A20161106SAT_036400_GS.S01R,06:04:00,06:04:00,902S,1,,0,0,
3,A20161106SAT_036400_GS.S01R,06:05:30,06:05:30,901S,2,,0,0,
4,A20161106SAT_037000_GS.N01R,06:10:00,06:10:00,901N,1,,0,0,


In [29]:
stops.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
0,101,,Van Cortlandt Park - 242 St,,40.889248,-73.898583,,,1,
1,101N,,Van Cortlandt Park - 242 St,,40.889248,-73.898583,,,0,101.0
2,101S,,Van Cortlandt Park - 242 St,,40.889248,-73.898583,,,0,101.0
3,103,,238 St,,40.884667,-73.90087,,,1,
4,103N,,238 St,,40.884667,-73.90087,,,0,103.0


In [30]:
trips.head()

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id
0,GS,A20161106SAT,A20161106SAT_036000_GS.N01R,TIMES SQ - 42 ST,0,,GS.N01R
1,GS,A20161106SAT,A20161106SAT_036400_GS.S01R,GRAND CENTRAL - 42 ST,1,,GS.S01R
2,GS,A20161106SAT,A20161106SAT_037000_GS.N01R,TIMES SQ - 42 ST,0,,GS.N01R
3,GS,A20161106SAT,A20161106SAT_037400_GS.S01R,GRAND CENTRAL - 42 ST,1,,GS.S01R
4,GS,A20161106SAT,A20161106SAT_038000_GS.N01R,TIMES SQ - 42 ST,0,,GS.N01R


### Merge calendar, service_id and trip_id information

Join calendar and trips on service_id to attach each trip_id (and its route_id) to the service it runs under.

In [31]:
# Attach every trip (and its route) to the service_id it runs under.
# - The join key is spelled out instead of relying on "whatever columns the two
#   frames happen to share".
# - drop_duplicates() keeps this cell safe to re-run: a later cell rebinds
#   `calendar` to a one-row-per-trip table, and merging that repeated
#   service_id column against trips again would multiply rows.
service_trip_id = pd.merge(
    calendar[['service_id']].drop_duplicates(),
    trips[['service_id', 'trip_id', 'route_id']],
    on='service_id',
)
service_trip_id.head()

Unnamed: 0,service_id,trip_id,route_id
0,A20161106WKD,A20161106WKD_029750_1..S02R,1
1,A20161106WKD,A20161106WKD_036150_1..N02R,1
2,A20161106WKD,A20161106WKD_042550_1..S02R,1
3,A20161106WKD,A20161106WKD_048950_1..N02R,1
4,A20161106WKD,A20161106WKD_055300_1..S02R,1


Join calendar with trip_id.

In [32]:
calendar = pd.merge(calendar,service_trip_id)
calendar.head()

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,trip_id,route_id
0,A20161106WKD,1,1,1,1,1,0,0,20161106,20171231,A20161106WKD_029750_1..S02R,1
1,A20161106WKD,1,1,1,1,1,0,0,20161106,20171231,A20161106WKD_036150_1..N02R,1
2,A20161106WKD,1,1,1,1,1,0,0,20161106,20171231,A20161106WKD_042550_1..S02R,1
3,A20161106WKD,1,1,1,1,1,0,0,20161106,20171231,A20161106WKD_048950_1..N02R,1
4,A20161106WKD,1,1,1,1,1,0,0,20161106,20171231,A20161106WKD_055300_1..S02R,1


### Add arbitrary arrival and departure dates to timetables

Define a function to add arrival dates to stop_times tables. We will use 1/5/15 as the start date because that is the date we are currently using to visualize taxi trips. Arrival times of 24:00:00 or later (up to 47:59:59) belong to the following calendar day, so the function subtracts 24 hours from them and assigns them to 1/6/15.

In [33]:
arbitrary_dates = ['1/5/2015', '1/6/2015']

In [34]:
def add_arrival_date(df, dates = arbitrary_dates):
    """Return a copy of ``df`` with an ``arrival_date`` column and a wall-clock ``arrival_time``.

    Each value in ``df['arrival_time']`` is expected to be a string of the form
    ``H:MM:SS`` / ``HH:MM:SS`` whose hour field may be 24 or more.

    Parameters
    ----------
    df : DataFrame with an ``arrival_time`` column of time strings.
    dates : sequence of two date strings; ``dates[0]`` is used for hours
        below 24 and ``dates[1]`` for hours 24-47.

    Returns
    -------
    A new DataFrame (the input is not modified) where:
      * hour < 24        -> ``arrival_date = dates[0]``, time unchanged;
      * 24 <= hour < 48  -> ``arrival_date = dates[1]``, time has 24 hours
        subtracted (the hour is not zero-padded, e.g. ``"0:05:30"``);
      * hour >= 48       -> ``arrival_date = 'NA'``, time left as given.

    Raises
    ------
    ValueError if a time string does not have exactly three ``:``-separated
    fields or its hour field is not an integer.
    """
    df = df.copy()
    arrival_date = []
    arrival_time = []

    for raw_time in df['arrival_time']:
        hour_text, minute, second = raw_time.split(':')
        hour = int(hour_text)

        if hour < 24:
            arrival_date.append(dates[0])
            arrival_time.append(raw_time)
        elif hour < 48:
            arrival_date.append(dates[1])
            arrival_time.append(str(hour - 24) + ":" + minute + ":" + second)
        else:
            arrival_date.append('NA')
            # Keep the two lists the same length. Previously nothing was
            # appended here, so the column assignment below raised on a
            # length mismatch as soon as one time was 48:00:00 or later.
            arrival_time.append(raw_time)

    df['arrival_date'] = arrival_date
    df['arrival_time'] = arrival_time
    return df

Define a function to add departure dates to stop_times tables. We will use 1/5/15 as the start date because that is the date we are currently using to visualize taxi trips. Departure times of 24:00:00 or later (up to 47:59:59) belong to the following calendar day, so the function subtracts 24 hours from them and assigns them to 1/6/15.

In [35]:
def add_departure_date(df, dates = arbitrary_dates):
    """Return a copy of ``df`` with a ``departure_date`` column and a wall-clock ``departure_time``.

    Each value in ``df['departure_time']`` is expected to be a string of the
    form ``H:MM:SS`` / ``HH:MM:SS`` whose hour field may be 24 or more.

    Parameters
    ----------
    df : DataFrame with a ``departure_time`` column of time strings.
    dates : sequence of two date strings; ``dates[0]`` is used for hours
        below 24 and ``dates[1]`` for hours 24-47.

    Returns
    -------
    A new DataFrame (the input is not modified) where:
      * hour < 24        -> ``departure_date = dates[0]``, time unchanged;
      * 24 <= hour < 48  -> ``departure_date = dates[1]``, time has 24 hours
        subtracted (the hour is not zero-padded, e.g. ``"0:05:30"``);
      * hour >= 48       -> ``departure_date = 'NA'``, time left as given.

    Raises
    ------
    ValueError if a time string does not have exactly three ``:``-separated
    fields or its hour field is not an integer.
    """
    df = df.copy()
    departure_date = []
    departure_time = []

    for raw_time in df['departure_time']:
        hour_text, minute, second = raw_time.split(':')
        hour = int(hour_text)

        if hour < 24:
            departure_date.append(dates[0])
            departure_time.append(raw_time)
        elif hour < 48:
            departure_date.append(dates[1])
            departure_time.append(str(hour - 24) + ":" + minute + ":" + second)
        else:
            departure_date.append('NA')
            # Keep the two lists the same length. Previously nothing was
            # appended here, so the column assignment below raised on a
            # length mismatch as soon as one time was 48:00:00 or later.
            departure_time.append(raw_time)

    df['departure_date'] = departure_date
    df['departure_time'] = departure_time
    return df

### Get timetables for each day of the week

In [36]:
# One slice of the trip calendar per weekday: the rows whose flag for that
# day is 1.
monday = calendar.loc[calendar['monday'].eq(1)]
tuesday = calendar.loc[calendar['tuesday'].eq(1)]
wednesday = calendar.loc[calendar['wednesday'].eq(1)]
thursday = calendar.loc[calendar['thursday'].eq(1)]
friday = calendar.loc[calendar['friday'].eq(1)]
saturday = calendar.loc[calendar['saturday'].eq(1)]
sunday = calendar.loc[calendar['sunday'].eq(1)]

### Monday

In [37]:
monday.head()

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,trip_id,route_id
0,A20161106WKD,1,1,1,1,1,0,0,20161106,20171231,A20161106WKD_029750_1..S02R,1
1,A20161106WKD,1,1,1,1,1,0,0,20161106,20171231,A20161106WKD_036150_1..N02R,1
2,A20161106WKD,1,1,1,1,1,0,0,20161106,20171231,A20161106WKD_042550_1..S02R,1
3,A20161106WKD,1,1,1,1,1,0,0,20161106,20171231,A20161106WKD_048950_1..N02R,1
4,A20161106WKD,1,1,1,1,1,0,0,20161106,20171231,A20161106WKD_055300_1..S02R,1


In [38]:
monday_trips = monday['trip_id']
# print() with a single pre-formatted string behaves the same on Python 2 and
# Python 3 (the old `print a, b, c` statement is a syntax error on Python 3).
# The double space after "are" reproduces the original statement's output.
print("There are  {} trips on Monday".format(len(monday_trips)))
monday_trips.head()

There are  8364 trips on Monday


0    A20161106WKD_029750_1..S02R
1    A20161106WKD_036150_1..N02R
2    A20161106WKD_042550_1..S02R
3    A20161106WKD_048950_1..N02R
4    A20161106WKD_055300_1..S02R
Name: trip_id, dtype: object

In [39]:
monday_trips.tail()

19711    R20161106WKD_129600_SI.S03R
19712    R20161106WKD_135100_SI.N03R
19713    R20161106WKD_141600_SI.S03R
19714    R20161106WKD_147100_SI.N03R
19715    R20161106WKD_119300_SI.S03R
Name: trip_id, dtype: object

Make a dataframe for every trip in monday_trips and append it to triplist.

In [40]:
# Build one "legs" DataFrame per Monday trip and collect them in triplist.
# Each row of a legs frame is the hop from one stop to the next stop of the
# same trip.
triplist = []

# Loop-invariant lookups, built once instead of once per trip.
total_trips = len(monday_trips)
stop_columns = stops[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]
trip_columns = trips[['trip_id', 'direction_id', 'route_id']]
route_columns = routes[['route_id', 'route_long_name']]

# Split stop_times by trip in a single pass. Filtering the whole table with a
# boolean mask inside the loop rescanned every row once per trip.
monday_stop_times = stop_times[stop_times['trip_id'].isin(monday_trips)]
stop_times_by_trip = dict(iter(monday_stop_times.groupby('trip_id', sort=False)))
no_stop_times = stop_times.iloc[0:0]  # same columns, zero rows

for count, i in enumerate(monday_trips, 1):
    # A trip with no stop_times rows yields an empty frame, as the mask did.
    df = stop_times_by_trip.get(i, no_stop_times)

    # The shift(-1) logic below pairs each stop with the *next* row, so make
    # the stop order explicit rather than relying on file order.
    df = df.sort_values(by='stop_sequence')

    # add arrival and departure dates
    df = add_arrival_date(df, dates = arbitrary_dates)
    df = add_departure_date(df, dates = arbitrary_dates)
    df['arrival_datetime'] = pd.to_datetime(df['arrival_date'] + ' ' + df['arrival_time'])
    df['departure_datetime'] = pd.to_datetime(df['departure_date'] + ' ' + df['departure_time'])

    # join df with stops to get stop names and coordinates
    df = pd.merge(df, stop_columns, on='stop_id')

    # join df with trips to get direction id and route id
    df = pd.merge(df, trip_columns, on='trip_id', how='left')

    # join df with routes to get the route name
    df = pd.merge(df, route_columns, on='route_id', how='left')

    # create new dataframe to store results
    legs = pd.DataFrame()
    legs['type_id'] = df['route_id']
    # A leg starts when the train departs this stop and ends when it arrives
    # at the next one. The final stop has no successor, so forward-filling
    # turns it into a leg that ends where and when the train already is; those
    # rows have tripduration 0 whenever arrival == departure at the terminal.
    legs['starttime'] = df['departure_datetime']
    legs['stoptime'] = df['arrival_datetime'].shift(-1).ffill()
    legs['tripduration'] = ((legs['stoptime'] - legs['starttime'])/np.timedelta64(1, 's')).astype(int)
    legs['start_lat'] = df['stop_lat']
    legs['start_lon'] = df['stop_lon']
    legs['end_lat'] = legs['start_lat'].shift(-1).ffill()
    legs['end_lon'] = legs['start_lon'].shift(-1).ffill()

    # append results to triplist
    triplist.append(legs)

    # Progress line every 100 trips; a single formatted string prints the same
    # text on Python 2 and Python 3.
    if count % 100 == 0:
        print("{} finished trip number {} / {}".format(dt.datetime.now(), count, total_trips))

2017-04-06 14:15:37.882753 finished trip number 100 / 8364
2017-04-06 14:15:47.997911 finished trip number 200 / 8364
2017-04-06 14:15:57.748757 finished trip number 300 / 8364
2017-04-06 14:16:06.943861 finished trip number 400 / 8364
2017-04-06 14:16:16.239498 finished trip number 500 / 8364
2017-04-06 14:16:26.505995 finished trip number 600 / 8364
2017-04-06 14:16:37.513529 finished trip number 700 / 8364
2017-04-06 14:16:47.016765 finished trip number 800 / 8364
2017-04-06 14:16:55.836228 finished trip number 900 / 8364
2017-04-06 14:17:04.782258 finished trip number 1000 / 8364
2017-04-06 14:17:13.901980 finished trip number 1100 / 8364
2017-04-06 14:17:23.600472 finished trip number 1200 / 8364
2017-04-06 14:17:32.268891 finished trip number 1300 / 8364
2017-04-06 14:17:40.665597 finished trip number 1400 / 8364
2017-04-06 14:17:48.594313 finished trip number 1500 / 8364
2017-04-06 14:17:56.812529 finished trip number 1600 / 8364
2017-04-06 14:18:06.384772 finished trip number 1

In [41]:
triplist[0].head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,1,2015-01-05 04:57:30,2015-01-05 04:59:00,90,40.889248,-73.898583,40.884667,-73.90087
1,1,2015-01-05 04:59:00,2015-01-05 05:00:30,90,40.884667,-73.90087,40.878856,-73.904834
2,1,2015-01-05 05:00:30,2015-01-05 05:02:00,90,40.878856,-73.904834,40.874561,-73.909831
3,1,2015-01-05 05:02:00,2015-01-05 05:03:30,90,40.874561,-73.909831,40.869444,-73.915279
4,1,2015-01-05 05:03:30,2015-01-05 05:04:30,60,40.869444,-73.915279,40.864621,-73.918822


In [45]:
triplist[8363].tail()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
16,SI,2015-01-05 20:26:00,2015-01-05 20:28:00,120,40.525507,-74.200064,40.52241,-74.217847
17,SI,2015-01-05 20:28:00,2015-01-05 20:30:00,120,40.52241,-74.217847,40.519631,-74.229141
18,SI,2015-01-05 20:30:00,2015-01-05 20:32:00,120,40.519631,-74.229141,40.516578,-74.242096
19,SI,2015-01-05 20:32:00,2015-01-05 20:35:00,180,40.516578,-74.242096,40.512764,-74.251961
20,SI,2015-01-05 20:35:00,2015-01-05 20:35:00,0,40.512764,-74.251961,40.512764,-74.251961


In [46]:
len(triplist)

8364

In [47]:
data = pd.concat(triplist)

In [48]:
data.head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,1,2015-01-05 04:57:30,2015-01-05 04:59:00,90,40.889248,-73.898583,40.884667,-73.90087
1,1,2015-01-05 04:59:00,2015-01-05 05:00:30,90,40.884667,-73.90087,40.878856,-73.904834
2,1,2015-01-05 05:00:30,2015-01-05 05:02:00,90,40.878856,-73.904834,40.874561,-73.909831
3,1,2015-01-05 05:02:00,2015-01-05 05:03:30,90,40.874561,-73.909831,40.869444,-73.915279
4,1,2015-01-05 05:03:30,2015-01-05 05:04:30,60,40.869444,-73.915279,40.864621,-73.918822


In [49]:
len(data)

230954

In [50]:
data = data[data.tripduration != 0]

In [51]:
len(data)

222674

In [54]:
# Put the legs in chronological order by start time and renumber the rows 0..n-1.
data = data.sort_values(by='starttime').reset_index(drop=True)

Set the type_id to 8 for every subway leg for now (since 1-2 are taxis and 3-7 are buses).

In [60]:
data['type_id'] = 8

In [61]:
# Write the Monday subway legs to CSV. No `index` argument is passed, so the
# row index is written as an unnamed leading column; the awk cell further down
# strips that column to produce subway_monday.csv.
data.to_csv('subway_monday_raw.csv')

See how pandas leaves a blank first column with the index?

In [62]:
%%bash 

head subway_monday_raw.csv

,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,8,2015-01-05 00:00:00,2015-01-05 00:02:00,120,40.680596,-73.955827,40.674772,-73.957624
1,8,2015-01-05 00:00:00,2015-01-05 00:02:30,150,40.697207,-73.935657,40.697857,-73.927397
2,8,2015-01-05 00:01:00,2015-01-05 00:02:30,90,40.903125,-73.85062,40.898379,-73.854376
3,8,2015-01-05 00:02:00,2015-01-05 00:04:00,120,40.8883,-73.830834,40.878663,-73.838591
4,8,2015-01-05 00:02:00,2015-01-05 00:03:30,90,40.674772,-73.957624,40.670343,-73.959245
5,8,2015-01-05 00:02:30,2015-01-05 00:04:00,90,40.697857,-73.927397,40.698664,-73.919711
6,8,2015-01-05 00:02:30,2015-01-05 00:04:30,120,40.82388,-73.93647,40.820421,-73.936245
7,8,2015-01-05 00:02:30,2015-01-05 00:04:00,90,40.898379,-73.854376,40.893193,-73.857473
8,8,2015-01-05 00:03:30,2015-01-05 00:06:30,180,40.670343,-73.959245,40.661614,-73.962246


We need to delete it. Bash command to remove first column from csv output:

In [63]:
%%bash

# Drop the first comma-separated field of every line (the unnamed index column):
# blank out $1, let awk rebuild the line with OFS=",", then substr() removes the
# leading comma that is left behind. The trailing `1` prints each line.
awk -F"," -v OFS="," '{ $1="" ; $0=substr($0,2) } 1' subway_monday_raw.csv > subway_monday.csv

In [64]:
%%bash

head subway_monday.csv

type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
8,2015-01-05 00:00:00,2015-01-05 00:02:00,120,40.680596,-73.955827,40.674772,-73.957624
8,2015-01-05 00:00:00,2015-01-05 00:02:30,150,40.697207,-73.935657,40.697857,-73.927397
8,2015-01-05 00:01:00,2015-01-05 00:02:30,90,40.903125,-73.85062,40.898379,-73.854376
8,2015-01-05 00:02:00,2015-01-05 00:04:00,120,40.8883,-73.830834,40.878663,-73.838591
8,2015-01-05 00:02:00,2015-01-05 00:03:30,90,40.674772,-73.957624,40.670343,-73.959245
8,2015-01-05 00:02:30,2015-01-05 00:04:00,90,40.697857,-73.927397,40.698664,-73.919711
8,2015-01-05 00:02:30,2015-01-05 00:04:30,120,40.82388,-73.93647,40.820421,-73.936245
8,2015-01-05 00:02:30,2015-01-05 00:04:00,90,40.898379,-73.854376,40.893193,-73.857473
8,2015-01-05 00:03:30,2015-01-05 00:06:30,180,40.670343,-73.959245,40.661614,-73.962246


# Merge subway data with existing bus + taxi output file

In [76]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

head taxi_and_bus_raw.csv

,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,6,2015-01-05 00:00:00,2015-01-05 00:01:00,60,40.829098,-73.874573,40.826763,-73.874023
1,7,2015-01-05 00:00:00,2015-01-05 00:00:45,45,40.643429,-74.073654,40.641872,-74.075699
2,2,2015-01-05 00:00:00,2015-01-05 00:06:44,404,40.7239456177,-73.9512939453,40.6974105835,-73.9648513794
3,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
4,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
5,3,2015-01-05 00:00:00,2015-01-05 00:00:23,23,40.645714,-73.902313,40.644871,-73.902176
6,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
7,3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116
8,3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116


Remove blank first column.

In [78]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

# Drop the first comma-separated field of every line (the unnamed index column):
# blank out $1, let awk rebuild the line with OFS=",", then substr() removes the
# leading comma that is left behind. The trailing `1` prints each line.
awk -F"," -v OFS="," '{ $1="" ; $0=substr($0,2) } 1' taxi_and_bus_raw.csv > taxi_and_bus.csv

In [79]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

head taxi_and_bus.csv

type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
6,2015-01-05 00:00:00,2015-01-05 00:01:00,60,40.829098,-73.874573,40.826763,-73.874023
7,2015-01-05 00:00:00,2015-01-05 00:00:45,45,40.643429,-74.073654,40.641872,-74.075699
2,2015-01-05 00:00:00,2015-01-05 00:06:44,404,40.7239456177,-73.9512939453,40.6974105835,-73.9648513794
3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,2015-01-05 00:00:00,2015-01-05 00:00:23,23,40.645714,-73.902313,40.644871,-73.902176
3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116
3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116


In [73]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

head subway_monday.csv

type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
8,2015-01-05 00:00:00,2015-01-05 00:02:00,120,40.680596,-73.955827,40.674772,-73.957624
8,2015-01-05 00:00:00,2015-01-05 00:02:30,150,40.697207,-73.935657,40.697857,-73.927397
8,2015-01-05 00:01:00,2015-01-05 00:02:30,90,40.903125,-73.85062,40.898379,-73.854376
8,2015-01-05 00:02:00,2015-01-05 00:04:00,120,40.8883,-73.830834,40.878663,-73.838591
8,2015-01-05 00:02:00,2015-01-05 00:03:30,90,40.674772,-73.957624,40.670343,-73.959245
8,2015-01-05 00:02:30,2015-01-05 00:04:00,90,40.697857,-73.927397,40.698664,-73.919711
8,2015-01-05 00:02:30,2015-01-05 00:04:30,120,40.82388,-73.93647,40.820421,-73.936245
8,2015-01-05 00:02:30,2015-01-05 00:04:00,90,40.898379,-73.854376,40.893193,-73.857473
8,2015-01-05 00:03:30,2015-01-05 00:06:30,180,40.670343,-73.959245,40.661614,-73.962246


In [2]:
subway = pd.read_csv("/Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data/subway_monday.csv")

NameError: name 'pd' is not defined

In [3]:
taxi_bus = pd.read_csv("/Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data/taxi_and_bus.csv")

NameError: name 'pd' is not defined

In [82]:
output = pd.concat([taxi_bus, subway])

In [83]:
output.head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,6,2015-01-05 00:00:00,2015-01-05 00:01:00,60,40.829098,-73.874573,40.826763,-73.874023
1,7,2015-01-05 00:00:00,2015-01-05 00:00:45,45,40.643429,-74.073654,40.641872,-74.075699
2,2,2015-01-05 00:00:00,2015-01-05 00:06:44,404,40.723946,-73.951294,40.697411,-73.964851
3,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
4,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555


In [84]:
output.tail()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
222669,8,2015-01-06 03:38:30,2015-01-06 03:40:30,120,40.898379,-73.854376,40.903125,-73.85062
222670,8,2015-01-06 03:39:30,2015-01-06 03:40:30,60,40.847391,-73.939704,40.851695,-73.937969
222671,8,2015-01-06 03:40:30,2015-01-06 03:42:30,120,40.851695,-73.937969,40.859022,-73.93418
222672,8,2015-01-06 03:42:30,2015-01-06 03:45:00,150,40.859022,-73.93418,40.865491,-73.927271
222673,8,2015-01-06 03:47:30,2015-01-06 03:49:00,90,40.865491,-73.927271,40.868072,-73.919899


In [88]:
output.shape

(4315381, 8)

In [91]:
# Order the combined taxi/bus/subway legs by start time, then by type_id among
# legs that start at the same time, and renumber the rows 0..n-1.
output = output.sort_values(by=['starttime', 'type_id']).reset_index(drop=True)

In [93]:
output.head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,1,2015-01-05 00:00:00,2015-01-05 00:09:43,583,40.686058,-73.990952,40.677547,-73.938622
1,2,2015-01-05 00:00:00,2015-01-05 00:06:44,404,40.723946,-73.951294,40.697411,-73.964851
2,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
4,3,2015-01-05 00:00:00,2015-01-05 00:00:23,23,40.645714,-73.902313,40.644871,-73.902176


In [94]:
output.tail()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
4315376,4,2015-01-06 04:42:13,2015-01-06 04:42:47,34,40.711842,-73.790161,40.710716,-73.793091
4315377,4,2015-01-06 04:42:47,2015-01-06 04:44:00,73,40.710716,-73.793091,40.707237,-73.794998
4315378,4,2015-01-06 04:42:47,2015-01-06 04:44:00,73,40.710716,-73.793091,40.707237,-73.794998
4315379,4,2015-01-06 04:44:00,2015-01-06 04:45:00,60,40.707237,-73.794998,40.704933,-73.79332
4315380,4,2015-01-06 04:44:00,2015-01-06 04:45:00,60,40.707237,-73.794998,40.704933,-73.79332


In [120]:
len(output[output.type_id == 8])

222674

In [96]:
# Write the combined legs to data/output.csv (path relative to the notebook's
# working directory). No `index` argument is passed, so the row index is
# written as an unnamed leading column; a later awk cell strips it to produce
# output_clean.csv.
output.to_csv("data/output.csv")

In [97]:
ls

Subway Data Notebook.ipynb  subway_dynamic.pde
Trips.pde                   subway_monday.csv
[34mdata[m[m/                       subway_monday_raw.csv
[34mgtfs[m[m/                       taxi_and_bus.csv


In [98]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

head output.csv

,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,1,2015-01-05 00:00:00,2015-01-05 00:09:43,583,40.6860580444,-73.9909515381,40.6775474548,-73.938621521
1,2,2015-01-05 00:00:00,2015-01-05 00:06:44,404,40.7239456177,-73.9512939453,40.6974105835,-73.9648513794
2,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
4,3,2015-01-05 00:00:00,2015-01-05 00:00:23,23,40.645714,-73.902313,40.644871,-73.902176
5,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
6,3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116
7,3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116
8,3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116


In [99]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

# Drop the first comma-separated field of every line (the unnamed index column):
# blank out $1, let awk rebuild the line with OFS=",", then substr() removes the
# leading comma that is left behind. The trailing `1` prints each line.
awk -F"," -v OFS="," '{ $1="" ; $0=substr($0,2) } 1' output.csv > output_clean.csv

In [100]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

head output_clean.csv

type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
1,2015-01-05 00:00:00,2015-01-05 00:09:43,583,40.6860580444,-73.9909515381,40.6775474548,-73.938621521
2,2015-01-05 00:00:00,2015-01-05 00:06:44,404,40.7239456177,-73.9512939453,40.6974105835,-73.9648513794
3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,2015-01-05 00:00:00,2015-01-05 00:00:23,23,40.645714,-73.902313,40.644871,-73.902176
3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116
3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116
3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116


In [101]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

# count lines in csv
awk '{n+=1} END {print n}' output_clean.csv

4315382


Split file into two smaller files

In [105]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

# Write the first 2,500,000 lines of output_clean.csv (header row included) to output_clean_h1.csv.
head -n 2500000 output_clean.csv > output_clean_h1.csv

In [106]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

# Write the last 2,500,000 lines of output_clean.csv to output_clean_h2.csv (no header row).
# NOTE(review): the line count printed earlier for output_clean.csv is 4,315,382, so this
# range shares 684,618 lines with the 2,500,000 lines that `head` wrote to
# output_clean_h1.csv - confirm the overlap is intended.
tail -n 2500000 output_clean.csv > output_clean_h2.csv

In [109]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

tail output_clean_h1.csv

4,2015-01-05 15:54:23,2015-01-05 15:55:25,62,40.76762,-73.827103,40.765469,-73.827927
4,2015-01-05 15:54:23,2015-01-05 15:55:23,60,40.694389,-73.781174,40.696316,-73.782944
4,2015-01-05 15:54:23,2015-01-05 15:55:11,48,40.716011,-73.770683,40.716713,-73.768654
4,2015-01-05 15:54:23,2015-01-05 15:55:28,65,40.729076,-73.744034,40.726543,-73.741341
4,2015-01-05 15:54:23,2015-01-05 15:55:10,47,40.740452,-73.789452,40.739834,-73.791695
4,2015-01-05 15:54:23,2015-01-05 15:54:50,27,40.696316,-73.782944,40.697052,-73.783958
4,2015-01-05 15:54:23,2015-01-05 15:55:37,74,40.755901,-73.739388,40.752762,-73.741585
4,2015-01-05 15:54:23,2015-01-05 15:55:26,63,40.767597,-73.881813,40.767113,-73.88456
4,2015-01-05 15:54:23,2015-01-05 15:54:57,34,40.753956,-73.80661,40.754143,-73.809006
5,2015-01-05 15:54:23,2015-01-05 15:55:11,48,40.778725,-73.985138,40.779552,-73.984116


In [108]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

head output_clean_h2.csv

3,2015-01-05 12:55:00,2015-01-05 12:58:03,183,40.762562,-73.972794,40.759525,-73.976601
3,2015-01-05 12:55:00,2015-01-05 12:56:01,61,40.593021,-73.99218,40.594521,-73.990623
3,2015-01-05 12:55:00,2015-01-05 12:56:49,109,40.701019,-73.940872,40.700516,-73.944206
3,2015-01-05 12:55:00,2015-01-05 12:56:40,100,40.691196,-73.997696,40.689487,-74.00071
3,2015-01-05 12:55:00,2015-01-05 12:55:48,48,40.679749,-73.937424,40.679344,-73.93856
3,2015-01-05 12:55:00,2015-01-05 12:56:30,90,40.636593,-73.994728,40.637897,-73.996773
3,2015-01-05 12:55:00,2015-01-05 12:56:17,77,40.68837,-73.988808,40.687565,-73.986725
3,2015-01-05 12:55:00,2015-01-05 12:57:08,128,40.676746,-73.89994,40.67527,-73.905182
3,2015-01-05 12:55:00,2015-01-05 12:56:57,117,40.611763,-74.035118,40.614799,-74.0345
3,2015-01-05 12:55:00,2015-01-05 12:55:50,50,40.678745,-73.921585,40.680058,-73.922081


Split in half again

In [110]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

# Write the first 1,250,000 lines of output_clean_h1.csv (header row included) to output_clean_q1.csv.
head -n 1250000 output_clean_h1.csv > output_clean_q1.csv

In [123]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

# Write the last 1,350,000 lines of output_clean_h1.csv to output_clean_q2.csv (no header row).
# NOTE(review): output_clean_h1.csv was created with `head -n 2500000`, so this range
# shares 100,000 lines with the 1,250,000 lines in output_clean_q1.csv - presumably a
# deliberate overlap; confirm.
tail -n 1350000 output_clean_h1.csv > output_clean_q2.csv

In [124]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

# Prepend the CSV header row that `tail` left behind, writing the result to a new file.
# Replaces a one-iteration `for` loop whose variable was never used, and no longer
# goes through a scratch file named `tmp` (which would overwrite any existing `tmp`).
{
  echo "type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon"
  cat output_clean_q2.csv
} > output_clean_q2_header.csv

In [112]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

# Write the first 1,250,000 lines of output_clean_h2.csv to output_clean_q3.csv.
# output_clean_h2.csv was produced with `tail`, so there is no header row here.
head -n 1250000 output_clean_h2.csv > output_clean_q3.csv

In [121]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

# Prepend the CSV header row that is missing from this slice, writing the result to a new file.
# Replaces a one-iteration `for` loop whose variable was never used, and no longer
# goes through a scratch file named `tmp` (which would overwrite any existing `tmp`).
{
  echo "type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon"
  cat output_clean_q3.csv
} > output_clean_q3_header.csv

In [125]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_dynamic/data

# Write the last 1,350,000 lines of output_clean_h2.csv to output_clean_q4.csv (no header row).
# NOTE(review): output_clean_h2.csv was created with `tail -n 2500000`, so this range
# shares 100,000 lines with the 1,250,000 lines in output_clean_q3.csv - presumably a
# deliberate overlap; confirm.
tail -n 1350000 output_clean_h2.csv > output_clean_q4.csv

rename files

In [None]:
# Shell pipeline (not Python): for every *.tiff found under the current directory, awk
# prints an `mv "<path>" NNNNNN.tiff` command with a zero-padded six-digit counter
# starting at 0, and the trailing `| bash` executes those commands.
# NOTE(review): presumably meant for a terminal or a %%bash cell - confirm before running.
find . -name '*.tiff' | awk 'BEGIN{ a=0 }{ printf "mv \"%s\" %06d.tiff\n", $0, a++ }' | bash

In [None]:
# Same renumbering pipeline as the previous cell, with the counter starting at 1797.
# NOTE(review): presumably run in a different directory of frames so the numbering
# continues from the previous batch - confirm.
find . -name '*.tiff' | awk 'BEGIN{ a=1797 }{ printf "mv \"%s\" %06d.tiff\n", $0, a++ }' | bash

In [None]:
# Same renumbering pipeline, with the counter starting at 3463.
# NOTE(review): presumably run in a different directory of frames so the numbering
# continues from the previous batch - confirm.
find . -name '*.tiff' | awk 'BEGIN{ a=3463 }{ printf "mv \"%s\" %06d.tiff\n", $0, a++ }' | bash

In [None]:
# Same renumbering pipeline, with the counter starting at 4198.
# NOTE(review): presumably run in a different directory of frames so the numbering
# continues from the previous batch - confirm.
find . -name '*.tiff' | awk 'BEGIN{ a=4198 }{ printf "mv \"%s\" %06d.tiff\n", $0, a++ }' | bash