# Visualizing Staten Island Ferry GTFS Data

Data source: https://data.cityofnewyork.us/Transportation/Staten-Island-Ferry-Schedule-GTFS-/mwxp-krtu

In [1]:
url = "http://www.nyc.gov/html/dot/downloads/misc/siferry-gtfs.zip"

In [1]:
ls

Ferry Data Wrangling Notebook v2.ipynb  [34mgtfs[m[m/
Ferry Data Wrangling.ipynb              output_clean.csv
ferry_monday.csv                        [34msketch[m[m/
ferry_monday_raw.csv


In [4]:
%%bash

mkdir gtfs
cd gtfs
wget http://www.nyc.gov/html/dot/downloads/misc/siferry-gtfs.zip
unzip siferry-gtfs.zip

Archive:  siferry-gtfs.zip


mkdir: gtfs: File exists
--2017-04-06 19:17:26--  http://www.nyc.gov/html/dot/downloads/misc/siferry-gtfs.zip
Resolving www.nyc.gov... 2600:1400:d:38a::1500, 2600:1400:d:385::1500, 23.217.171.105
Connecting to www.nyc.gov|2600:1400:d:38a::1500|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14232 (14K) [application/zip]
Saving to: ‘siferry-gtfs.zip’

     0K .......... ...                                        100% 11.1M=0.001s

2017-04-06 19:17:26 (11.1 MB/s) - ‘siferry-gtfs.zip’ saved [14232/14232]

replace agency.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


In [6]:
%%bash
ls

Ferry Data Wrangling Notebook v2.ipynb
Ferry Data Wrangling.ipynb
gtfs


Import libraries:

In [7]:
import pandas as pd
import numpy as np
import datetime as dt

### Load GTFS data

In [8]:
agency = pd.read_csv('gtfs/agency.txt')
calendar_dates = pd.read_csv('gtfs/calendar_dates.txt')
calendar = pd.read_csv('gtfs/calendar.txt')
routes = pd.read_csv('gtfs/routes.txt')
shapes = pd.read_csv('gtfs/shapes.txt')
stop_times = pd.read_csv('gtfs/stop_times.txt')
stops = pd.read_csv('gtfs/stops.txt')
trips = pd.read_csv('gtfs/trips.txt')

### Preview the data

In [9]:
agency.head()

Unnamed: 0,agency_phone,agency_url,agency_id,agency_name,agency_timezone,agency_lang
0,311,http://nyc.gov/dot,NYC DOT,New York City Department of Transportation,America/New_York,en-US


In [10]:
calendar_dates.head()

Unnamed: 0,service_id,date,exception_type
0,monday,20160905,2
1,monday,20190902,2
2,monday,20170102,2
3,monday,20180101,2
4,monday,20210705,2


In [11]:
calendar.head()

Unnamed: 0,service_id,start_date,end_date,monday,tuesday,wednesday,thursday,friday,saturday,sunday
0,monday,20140510,20211231,1,0,0,0,0,0,0
1,prevmonday,20140430,20140509,1,0,0,0,0,0,0
2,saturday,20140510,20211231,0,0,0,0,0,1,0
3,prevsunday,20140430,20140509,0,0,0,0,0,0,1
4,sunday,20140510,20211231,0,0,0,0,0,0,1


In [12]:
routes.head()

Unnamed: 0,route_long_name,short_name,route_type,route_text_color,agency_id,route_id,route_color,route_desc,route_url
0,Staten Island Ferry,,4,0,NYC DOT,SIF,FF8330,The Ferry operates between the St. George Ferr...,http://nyc.gov/siferry


In [13]:
shapes.head()

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,stgeorge,40.644165,-74.072235,1,
1,stgeorge,40.645058,-74.070122,2,
2,stgeorge,40.645466,-74.069527,3,
3,stgeorge,40.68565,-74.032486,4,
4,stgeorge,40.698242,-74.013351,5,


In [14]:
stop_times.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,prevsaturdaywhitehall203000,20:30:00,20:30:00,whitehall,1,,,,
1,prevsaturdaywhitehall203000,20:55:00,20:55:00,stgeorge,2,,,,
2,prevmondaywhitehall223000,22:30:00,22:30:00,whitehall,1,,,,
3,prevmondaywhitehall223000,22:55:00,22:55:00,stgeorge,2,,,,
4,saturdaystgeorge103000,10:30:00,10:30:00,stgeorge,1,,,,


In [15]:
stops.head()

Unnamed: 0,stop_desc,stop_lat,stop_name,stop_lon,stop_id,location_type
0,"Located at 1 Bay Street, Staten Island, at the...",40.644169,St. George Ferry Terminal,-74.072201,stgeorge,0
1,"Located at Peter Minuit Plaza, at the intersec...",40.70136,Whitehall Ferry Terminal,-74.012666,whitehall,0


In [16]:
trips.head()

Unnamed: 0,route_id,service_id,trip_id,headsign,shape_id
0,SIF,prevsaturday,prevsaturdaywhitehall203000,To St. George,whitehall
1,SIF,prevmonday,prevmondaywhitehall223000,To St. George,whitehall
2,SIF,saturday,saturdaystgeorge103000,To Whitehall,stgeorge
3,SIF,twtf,twtfwhitehall060000,To St. George,whitehall
4,SIF,prevsaturday,prevsaturdaywhitehall023000,To St. George,whitehall


### Merge calendar, service_id and trip_id information


In [17]:
service_trip_id = pd.merge(calendar[['service_id']],trips[['service_id', 'trip_id', 'route_id']])
service_trip_id.head()

Unnamed: 0,service_id,trip_id,route_id
0,monday,mondaywhitehall090000,SIF
1,monday,mondaystgeorge190000,SIF
2,monday,mondaystgeorge210000,SIF
3,monday,mondaystgeorge110000,SIF
4,monday,mondaywhitehall074500,SIF


In [18]:
calendar = pd.merge(calendar,service_trip_id)
calendar.head()

Unnamed: 0,service_id,start_date,end_date,monday,tuesday,wednesday,thursday,friday,saturday,sunday,trip_id,route_id
0,monday,20140510,20211231,1,0,0,0,0,0,0,mondaywhitehall090000,SIF
1,monday,20140510,20211231,1,0,0,0,0,0,0,mondaystgeorge190000,SIF
2,monday,20140510,20211231,1,0,0,0,0,0,0,mondaystgeorge210000,SIF
3,monday,20140510,20211231,1,0,0,0,0,0,0,mondaystgeorge110000,SIF
4,monday,20140510,20211231,1,0,0,0,0,0,0,mondaywhitehall074500,SIF


### Add abritrary arrival and departure dates to timetables

Define a function to add arrival dates to stop_times tables. We will use 1/5/15 as the start date because that is the date we are currently using to visualize taxi trips. This code deals with arrival times > 24 hours and converts them to 1/6/15.

In [19]:
arbitrary_dates = ['1/5/2015', '1/6/2015']

In [20]:
def add_arrival_date(df, dates = arbitrary_dates):
    df = df.copy()
    arrival_date = []
    arrival_time = []

    for i in df['arrival_time']:
        hour = i[:i.find(':')]
        minute = i[i.find(':')+1:i.find(':',4)]
        second = i[i.find(':',5)+1:]
        
        if int(hour) < 24:
            arrival_date.append(dates[0])
            arrival_time.append(i)
        elif 24 <= int(hour) < 48:
            arrival_date.append(dates[1])
            hour = int(hour) - 24
            arrival_time.append(str(hour)+":"+minute+":"+second)
        else:
            arrival_date.append('NA')
            
    df['arrival_date'] = arrival_date
    df['arrival_time'] = arrival_time
    return df

In [21]:
def add_departure_date(df, dates = arbitrary_dates):
    df = df.copy()
    departure_date = []
    departure_time = []

    for i in df['departure_time']:
        hour = i[:i.find(':')]
        minute = i[i.find(':')+1:i.find(':',4)]
        second = i[i.find(':',5)+1:]
        
        if int(hour) < 24:
            departure_date.append(dates[0])
            departure_time.append(i)
        elif 24 <= int(hour) < 48:
            departure_date.append(dates[1])
            hour = int(hour) - 24
            departure_time.append(str(hour)+":"+minute+":"+second)
        else:
            departure_date.append('NA')
            
    df['departure_date'] = departure_date
    df['departure_time'] = departure_time
    return df

### Get timetables for each day of the week

In [23]:
monday = calendar[calendar['monday'] == 1]
tuesday = calendar[calendar['tuesday'] == 1]
wednesday = calendar[calendar['wednesday'] == 1]
thursday = calendar[calendar['thursday'] == 1]
friday = calendar[calendar['friday'] == 1]
saturday = calendar[calendar['saturday'] == 1]
sunday = calendar[calendar['sunday'] == 1]

### Monday

In [24]:
monday.head()

Unnamed: 0,service_id,start_date,end_date,monday,tuesday,wednesday,thursday,friday,saturday,sunday,trip_id,route_id
0,monday,20140510,20211231,1,0,0,0,0,0,0,mondaywhitehall090000,SIF
1,monday,20140510,20211231,1,0,0,0,0,0,0,mondaystgeorge190000,SIF
2,monday,20140510,20211231,1,0,0,0,0,0,0,mondaystgeorge210000,SIF
3,monday,20140510,20211231,1,0,0,0,0,0,0,mondaystgeorge110000,SIF
4,monday,20140510,20211231,1,0,0,0,0,0,0,mondaywhitehall074500,SIF


In [25]:
monday_trips = monday['trip_id']
print "There are ", len(monday_trips), "trips on Monday"
monday_trips.head()

There are  213 trips on Monday


0    mondaywhitehall090000
1     mondaystgeorge190000
2     mondaystgeorge210000
3     mondaystgeorge110000
4    mondaywhitehall074500
Name: trip_id, dtype: object

In [26]:
monday_trips.tail()

208     prevmondaystgeorge220000
209     prevmondaystgeorge153000
210     prevmondaystgeorge081500
211     prevmondaystgeorge060000
212    prevmondaywhitehall093000
Name: trip_id, dtype: object

In [28]:
triplist = []
count = 0

for i in monday_trips:
    count += 1
    df = stop_times[stop_times['trip_id'] == i]
    
    # add arrival and departure dates
    df = add_arrival_date(df, dates = arbitrary_dates)
    df = add_departure_date(df, dates = arbitrary_dates)
    df['arrival_datetime'] = pd.to_datetime(df['arrival_date'] + ' ' + df['arrival_time'])
    df['departure_datetime'] = pd.to_datetime(df['departure_date'] + ' ' + df['departure_time'])

    # join df with stops
    df = pd.merge(df, stops[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']],left_on='stop_id', right_on='stop_id')

    # join df with trips to get direction and route id
    try:
        df = pd.merge(df, trips[['trip_id', 'direction_id', 'route_id']], left_on='trip_id', right_on='trip_id', how='left')
    except:
        df = pd.merge(df, trips[['trip_id', 'route_id']], left_on='trip_id', right_on='trip_id', how='left')
    
    # join df with routes to get route id
    df = pd.merge(df, routes[['route_id', 'route_long_name']], left_on='route_id', right_on='route_id', how='left')
    
    # create new dataframe to store results
    legs = pd.DataFrame()
    legs['type_id'] = df['route_id']
    legs['starttime'] = df['departure_datetime']
    legs['stoptime'] = df['arrival_datetime'].shift(-1).fillna(method='ffill')
    legs['tripduration'] = ((legs['stoptime'] - legs['starttime'])/np.timedelta64(1, 's')).astype(int)
    legs['start_lat'] = df['stop_lat']
    legs['start_lon'] = df['stop_lon']
    legs['end_lat'] = legs['start_lat'].shift(-1).fillna(method='ffill')
    legs['end_lon'] = legs['start_lon'].shift(-1).fillna(method='ffill')
    
    # append results to triplist
    triplist.append(legs)
    
    if count % 100 == 0:
        print str(dt.datetime.now()), "finished trip number", count, "/", len(monday_trips)

2017-04-06 19:24:36.669794 finished trip number 100 / 213
2017-04-06 19:24:40.406313 finished trip number 200 / 213


In [30]:
triplist[0].head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,SIF,2015-01-05 09:00:00,2015-01-05 09:25:00,1500,40.70136,-74.012666,40.644169,-74.072201
1,SIF,2015-01-05 09:25:00,2015-01-05 09:25:00,0,40.644169,-74.072201,40.644169,-74.072201


In [31]:
len(triplist)

213

In [32]:
triplist[0].head()

In [33]:
data.head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,SIF,2015-01-05 09:00:00,2015-01-05 09:25:00,1500,40.70136,-74.012666,40.644169,-74.072201
1,SIF,2015-01-05 09:25:00,2015-01-05 09:25:00,0,40.644169,-74.072201,40.644169,-74.072201
0,SIF,2015-01-05 19:00:00,2015-01-05 19:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
1,SIF,2015-01-05 19:25:00,2015-01-05 19:25:00,0,40.70136,-74.012666,40.70136,-74.012666
0,SIF,2015-01-05 21:00:00,2015-01-05 21:25:00,1500,40.644169,-74.072201,40.70136,-74.012666


In [35]:
data.shape

(426, 8)

In [36]:
data = data[data.tripduration != 0]

In [37]:
data.shape

(213, 8)

In [38]:
data = data.sort_values(by='starttime')
data = data.reset_index(drop=True)

In [39]:
data['type_id'] = 9 # 9 = Ferry

In [40]:
data.to_csv('ferry_monday_raw.csv')

In [41]:
%%bash 

head ferry_monday_raw.csv

,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,9,2015-01-05 00:00:00,2015-01-05 00:25:00,1500,40.70136,-74.012666,40.644169,-74.072201
1,9,2015-01-05 00:00:00,2015-01-05 00:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
2,9,2015-01-05 00:00:00,2015-01-05 00:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
3,9,2015-01-05 00:30:00,2015-01-05 00:55:00,1500,40.70136,-74.012666,40.644169,-74.072201
4,9,2015-01-05 00:30:00,2015-01-05 00:55:00,1500,40.70136,-74.012666,40.644169,-74.072201
5,9,2015-01-05 01:00:00,2015-01-05 01:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
6,9,2015-01-05 01:00:00,2015-01-05 01:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
7,9,2015-01-05 01:30:00,2015-01-05 01:55:00,1500,40.70136,-74.012666,40.644169,-74.072201
8,9,2015-01-05 01:30:00,2015-01-05 01:55:00,1500,40.70136,-74.012666,40.644169,-74.072201


In [42]:
%%bash

awk -F"," -v OFS="," '{ $1="" ; $0=substr($0,2) } 1' ferry_monday_raw.csv > ferry_monday.csv

In [43]:
%%bash 

head ferry_monday.csv

type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
9,2015-01-05 00:00:00,2015-01-05 00:25:00,1500,40.70136,-74.012666,40.644169,-74.072201
9,2015-01-05 00:00:00,2015-01-05 00:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
9,2015-01-05 00:00:00,2015-01-05 00:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
9,2015-01-05 00:30:00,2015-01-05 00:55:00,1500,40.70136,-74.012666,40.644169,-74.072201
9,2015-01-05 00:30:00,2015-01-05 00:55:00,1500,40.70136,-74.012666,40.644169,-74.072201
9,2015-01-05 01:00:00,2015-01-05 01:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
9,2015-01-05 01:00:00,2015-01-05 01:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
9,2015-01-05 01:30:00,2015-01-05 01:55:00,1500,40.70136,-74.012666,40.644169,-74.072201
9,2015-01-05 01:30:00,2015-01-05 01:55:00,1500,40.70136,-74.012666,40.644169,-74.072201


# Merge with taxi, bus, subway data

In [46]:
ls

Ferry Data Wrangling Notebook v2.ipynb  ferry_monday_raw.csv
Ferry Data Wrangling.ipynb              [34mgtfs[m[m/
ferry_monday.csv                        [34msketch[m[m/


In [48]:
ferry = pd.read_csv("/Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data/ferry_monday.csv")

In [52]:
ferry.head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,9,2015-01-05 00:00:00,2015-01-05 00:25:00,1500,40.70136,-74.012666,40.644169,-74.072201
1,9,2015-01-05 00:00:00,2015-01-05 00:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
2,9,2015-01-05 00:00:00,2015-01-05 00:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
3,9,2015-01-05 00:30:00,2015-01-05 00:55:00,1500,40.70136,-74.012666,40.644169,-74.072201
4,9,2015-01-05 00:30:00,2015-01-05 00:55:00,1500,40.70136,-74.012666,40.644169,-74.072201


In [54]:
taxiBusSubway = pd.read_csv("/Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data/taxiBusSubway.csv")

In [55]:
taxiBusSubway.head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,1,2015-01-05 00:00:00,2015-01-05 00:09:43,583,40.686058,-73.990952,40.677547,-73.938622
1,2,2015-01-05 00:00:00,2015-01-05 00:06:44,404,40.723946,-73.951294,40.697411,-73.964851
2,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
4,3,2015-01-05 00:00:00,2015-01-05 00:00:23,23,40.645714,-73.902313,40.644871,-73.902176


In [67]:
output = pd.concat([ferry, taxiBusSubway])

In [68]:
output.head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,9,2015-01-05 00:00:00,2015-01-05 00:25:00,1500,40.70136,-74.012666,40.644169,-74.072201
1,9,2015-01-05 00:00:00,2015-01-05 00:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
2,9,2015-01-05 00:00:00,2015-01-05 00:25:00,1500,40.644169,-74.072201,40.70136,-74.012666
3,9,2015-01-05 00:30:00,2015-01-05 00:55:00,1500,40.70136,-74.012666,40.644169,-74.072201
4,9,2015-01-05 00:30:00,2015-01-05 00:55:00,1500,40.70136,-74.012666,40.644169,-74.072201


In [69]:
output.shape

(4315594, 8)

In [70]:
output = output[output.tripduration != 0]
output = output.sort_values(by=['starttime', 'type_id'])
output = output.reset_index(drop=True)

In [71]:
output.head()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,1,2015-01-05 00:00:00,2015-01-05 00:09:43,583,40.686058,-73.990952,40.677547,-73.938622
1,2,2015-01-05 00:00:00,2015-01-05 00:06:44,404,40.723946,-73.951294,40.697411,-73.964851
2,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
4,3,2015-01-05 00:00:00,2015-01-05 00:00:23,23,40.645714,-73.902313,40.644871,-73.902176


In [72]:
output.tail()

Unnamed: 0,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
4315559,4,2015-01-06 04:42:13,2015-01-06 04:42:47,34,40.711842,-73.790161,40.710716,-73.793091
4315560,4,2015-01-06 04:42:47,2015-01-06 04:44:00,73,40.710716,-73.793091,40.707237,-73.794998
4315561,4,2015-01-06 04:42:47,2015-01-06 04:44:00,73,40.710716,-73.793091,40.707237,-73.794998
4315562,4,2015-01-06 04:44:00,2015-01-06 04:45:00,60,40.707237,-73.794998,40.704933,-73.79332
4315563,4,2015-01-06 04:44:00,2015-01-06 04:45:00,60,40.707237,-73.794998,40.704933,-73.79332


In [74]:
output.to_csv("/Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data/output.csv")

In [78]:
%%bash 
cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data/
head output.csv

,type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
0,1,2015-01-05 00:00:00,2015-01-05 00:09:43,583,40.6860580444,-73.9909515381,40.6775474548,-73.938621521
1,2,2015-01-05 00:00:00,2015-01-05 00:06:44,404,40.7239456177,-73.9512939453,40.6974105835,-73.9648513794
2,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
4,3,2015-01-05 00:00:00,2015-01-05 00:00:23,23,40.645714,-73.902313,40.644871,-73.902176
5,3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
6,3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116
7,3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116
8,3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116


In [79]:
%%bash
cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data/
awk -F"," -v OFS="," '{ $1="" ; $0=substr($0,2) } 1' output.csv > output_clean.csv

In [80]:
%%bash 
cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data/
head output_clean.csv

type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
1,2015-01-05 00:00:00,2015-01-05 00:09:43,583,40.6860580444,-73.9909515381,40.6775474548,-73.938621521
2,2015-01-05 00:00:00,2015-01-05 00:06:44,404,40.7239456177,-73.9512939453,40.6974105835,-73.9648513794
3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,2015-01-05 00:00:00,2015-01-05 00:00:23,23,40.645714,-73.902313,40.644871,-73.902176
3,2015-01-05 00:00:00,2015-01-05 00:01:09,69,40.642979,-73.878334,40.645187,-73.878555
3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116
3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116
3,2015-01-05 00:00:00,2015-01-05 00:00:48,48,40.748638,-73.938446,40.747673,-73.941116


In [82]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data

# count lines in csv
awk '{n+=1} END {print n}' output_clean.csv

4315565


In [83]:
4315565 / 2

2157782

In [84]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data

# top lines in csv
head -n 2300000 output_clean.csv > output_clean_h1.csv

In [85]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data

# bottom lines in csv
tail -n 2300000 output_clean.csv > output_clean_h2.csv

In [86]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data

# count lines in csv
awk '{n+=1} END {print n}' output_clean_h1.csv

2300000


In [87]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data

# count lines in csv
awk '{n+=1} END {print n}' output_clean_h2.csv

2300000


Add header back to h2

In [88]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data

# add header back in
for csv in output_clean_h2.csv; do (echo "type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon"; cat output_clean_h2.csv) > tmp; mv tmp output_clean_h2_header.csv; done

In [89]:
2300000 / 2

1150000

In [90]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data

# top lines in csv
head -n 1250000 output_clean_h1.csv > output_clean_q1.csv

In [91]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data

# top lines in csv
tail -n 1250000 output_clean_h1.csv > output_clean_q2.csv

In [94]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data

# add header back in
for csv in output_clean_q2.csv; do (echo "type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon"; cat output_clean_q2.csv) > tmp; mv tmp output_clean_q2_header.csv; done

In [95]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data

# top lines in csv
head -n 1250000 output_clean_h2.csv > output_clean_q3.csv

In [98]:
%%bash 
cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data/
for csv in output_clean_q3.csv; do (echo "type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon"; cat output_clean_q3.csv) > tmp; mv tmp output_clean_q3_header.csv; done

7,2015-01-05 13:52:06,2015-01-05 13:52:46,40,40.61274,-74.126541,40.612896,-74.124496
1,2015-01-05 13:52:07,2015-01-05 14:13:08,1261,40.7685089111,-73.863067627,40.7648200989,-73.9472198486
1,2015-01-05 13:52:07,2015-01-05 13:58:27,380,40.7590904236,-73.9923553467,40.7511367798,-73.9905319214
1,2015-01-05 13:52:07,2015-01-05 13:58:59,412,40.7253074646,-73.9841995239,40.7356300354,-73.9855422974
3,2015-01-05 13:52:07,2015-01-05 13:52:53,46,40.621059,-73.988106,40.621883,-73.987274
3,2015-01-05 13:52:07,2015-01-05 13:54:09,122,40.650909,-73.94622,40.651012,-73.943321
3,2015-01-05 13:52:07,2015-01-05 13:53:34,87,40.68021,-73.905914,40.678925,-73.903595
3,2015-01-05 13:52:07,2015-01-05 13:53:23,76,40.580509,-73.967697,40.578152,-73.968239
3,2015-01-05 13:52:07,2015-01-05 13:53:14,67,40.656029,-73.973763,40.65361,-73.973038
3,2015-01-05 13:52:07,2015-01-05 13:53:14,67,40.656029,-73.973763,40.65361,-73.973038


In [100]:
%%bash
cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data/
head output_clean_q3_header.csv

type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon
7,2015-01-05 13:52:06,2015-01-05 13:52:46,40,40.61274,-74.126541,40.612896,-74.124496
1,2015-01-05 13:52:07,2015-01-05 14:13:08,1261,40.7685089111,-73.863067627,40.7648200989,-73.9472198486
1,2015-01-05 13:52:07,2015-01-05 13:58:27,380,40.7590904236,-73.9923553467,40.7511367798,-73.9905319214
1,2015-01-05 13:52:07,2015-01-05 13:58:59,412,40.7253074646,-73.9841995239,40.7356300354,-73.9855422974
3,2015-01-05 13:52:07,2015-01-05 13:52:53,46,40.621059,-73.988106,40.621883,-73.987274
3,2015-01-05 13:52:07,2015-01-05 13:54:09,122,40.650909,-73.94622,40.651012,-73.943321
3,2015-01-05 13:52:07,2015-01-05 13:53:34,87,40.68021,-73.905914,40.678925,-73.903595
3,2015-01-05 13:52:07,2015-01-05 13:53:23,76,40.580509,-73.967697,40.578152,-73.968239
3,2015-01-05 13:52:07,2015-01-05 13:53:14,67,40.656029,-73.973763,40.65361,-73.973038


In [96]:
%%bash

cd /Users/Will/Dropbox/Portfolio/taxi_and_bus_and_subway_and_ferry_dynamic/data

# top lines in csv
tail -n 1250000 output_clean_h2.csv > output_clean_q4.csv

# add header back in
for csv in output_clean_q4.csv; do (echo "type_id,starttime,stoptime,tripduration,start_lat,start_lon,end_lat,end_lon"; cat output_clean_q4.csv) > tmp; mv tmp output_clean_q4_header.csv; done