In [119]:
import pandas as pd 
import numpy as np
# Let pandas render up to 1000 rows before truncating displayed frames.
pd.set_option("display.max_rows", 1000)

In [120]:
# Load the routes table from the google_transit data directory.
# NOTE(review): `routes` is not referenced by any later cell visible here — confirm it is still needed.
routes = pd.read_csv("../data/google_transit/routes.csv")

In [121]:
# Load the trips table; trip_id, trip_headsign, direction_id and route_id are used further down.
# NOTE(review): this is the only google_transit table read with a .txt extension
# (the others are .csv) — confirm the file name is intentional.
trips = pd.read_csv("../data/google_transit/trips.txt")

In [122]:
# Load the stops table; its stop_id -> parent_station mapping is used to
# resolve each stop in stop_times to a parent stop id.
stops = pd.read_csv("../data/google_transit/stops.csv")

In [123]:
# Load stop_times: rows carry trip_id, stop_id, stop_sequence and arrival/departure times.
stop_times = pd.read_csv("../data/google_transit/stop_times.csv")

In [124]:
# Load the station master table (station codes, GTFS stop ids, census tract / NTA columns).
stations = pd.read_csv("../data/output/stationsWithTracts.csv")

In [125]:
# Preview the first rows of the station table to check the loaded columns.
stations.head()

Unnamed: 0,station_code,station,GTFS_stop_id,C/A,line_name,ct2010,ntaname,ctlabel,geometry,lat,long
0,H007AR248,1 AV,L06,H007A,L,3400,East Village,34.0,POINT (-73.981628 40.730953),40.730953,-73.981628
1,H007R248,1 AV,L06,H007,L,3400,East Village,34.0,POINT (-73.981628 40.730953),40.730953,-73.981628
2,H008R248,1 AV,L06,H008,L,3400,East Village,34.0,POINT (-73.981628 40.730953),40.730953,-73.981628
3,N037R314,103 ST,A18,N037,BC,14300,park-cemetery-etc-Manhattan,143.0,POINT (-73.961454 40.796092),40.796092,-73.961454
4,R170R191,103 ST,119,R170,1,19100,Upper West Side,191.0,POINT (-73.968379 40.799446),40.799446,-73.968379


In [126]:
# Collapse the station table to exactly one row per GTFS stop id:
#   - station:      the alphabetically smallest station name in the group
#   - station_code: every station code in the group, joined with commas
# The columns are selected with a list ([[...]]); indexing a groupby with a
# bare tuple of column names was deprecated and raises in pandas >= 2.0.
# The "min" string alias replaces np.min, which newer pandas asks for.
stations_with_unified_id = (
    stations
    .groupby(["GTFS_stop_id"], as_index=False)[["station_code", "station"]]
    .agg({"station": "min", "station_code": ",".join})
)

In [127]:
# Preview the collapsed table (one row per GTFS_stop_id).
stations_with_unified_id.head()

Unnamed: 0,GTFS_stop_id,station,station_code
0,101,V.CORTLANDT PK,R197R117
1,103,238 ST,R196R306
2,104,231 ST,R194R040
3,106,MARBLE HILL-225,R192R039
4,107,215 ST,R190R038


In [128]:

# Enrich every stop_times row with station information in two left joins,
# so no stop_times row is dropped:
#   1. stop_id -> parent_station: stops are split into directional variants
#      (e.g. 'R11N' and 'R11S' both belong to parent stop 'R11').
#   2. parent_station -> GTFS_stop_id: pulls the station name and the combined
#      station codes from the one-row-per-stop station table.
stop_times_with_stations = (
    stop_times
    .merge(
        stops[["stop_id", "parent_station"]],
        how="left",
        on="stop_id",
    )
    .merge(
        stations_with_unified_id,
        how="left",
        left_on="parent_station",
        right_on="GTFS_stop_id",
    )
)

In [129]:
# Preview the joined frame: stop_times columns plus parent_station, GTFS_stop_id, station, station_code.
stop_times_with_stations.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,parent_station,GTFS_stop_id,station,station_code
0,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:06:00,00:06:00,101S,1,,0,0,,101,101,V.CORTLANDT PK,R197R117
1,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:07:30,00:07:30,103S,2,,0,0,,103,103,238 ST,R196R306
2,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:09:00,00:09:00,104S,3,,0,0,,104,104,231 ST,R194R040
3,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:10:30,00:10:30,106S,4,,0,0,,106,106,MARBLE HILL-225,R192R039
4,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:12:00,00:12:00,107S,5,,0,0,,107,107,215 ST,R190R038


In [130]:
# ref: https://stackoverflow.com/a/40490276
# Build one row per trip_id whose 'stop_sequence' column holds that trip's
# stops as a list of dicts (stop_id, stop_sequence, station_code, station).
sequencesPerTrip = (
    stop_times_with_stations
    # Do not rely on the row order of the input file: sort so each trip's
    # list is in stop_sequence order (groupby keeps row order inside a group).
    .sort_values(["trip_id", "stop_sequence"])
    .groupby(["trip_id"])
    # The orient is spelled out as "records": the abbreviated form "r" was
    # deprecated and is rejected by pandas >= 2.0.
    .apply(
        lambda x: x[["stop_id", "stop_sequence", "station_code", "station"]].to_dict("records")
    )
    .reset_index()
    # After reset_index the unnamed apply result is in column 0; give it a real name.
    .rename(columns={0: "stop_sequence"})
)

In [131]:
# Preview the per-trip frame: trip_id plus the list of stop dicts in 'stop_sequence'.
sequencesPerTrip.head()

Unnamed: 0,trip_id,stop_sequence
0,AFA19GEN-1037-Sunday-00_000600_1..S03R,"[{'stop_id': '101S', 'stop_sequence': 1, 'stat..."
1,AFA19GEN-1037-Sunday-00_002600_1..S03R,"[{'stop_id': '101S', 'stop_sequence': 1, 'stat..."
2,AFA19GEN-1037-Sunday-00_004600_1..S03R,"[{'stop_id': '101S', 'stop_sequence': 1, 'stat..."
3,AFA19GEN-1037-Sunday-00_006600_1..S03R,"[{'stop_id': '101S', 'stop_sequence': 1, 'stat..."
4,AFA19GEN-1037-Sunday-00_007200_1..N03R,"[{'stop_id': '142N', 'stop_sequence': 1, 'stat..."


In [132]:
# Attach headsign, direction and route to each trip's stop list.
# Inner join on trip_id (the merge default): a trip missing on either side is dropped.
tripWithRoute = pd.merge(
    sequencesPerTrip,
    trips[["trip_id", "trip_headsign", "direction_id", "route_id"]],
    how="inner",
    on="trip_id",
)

In [133]:
# Preview the raw trips table for comparison with the merged result.
trips.head()

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id
0,1,AFA19GEN-1037-Sunday-00,AFA19GEN-1037-Sunday-00_000600_1..S03R,South Ferry,1,,1..S03R
1,1,AFA19GEN-1037-Sunday-00,AFA19GEN-1037-Sunday-00_002600_1..S03R,South Ferry,1,,1..S03R
2,1,AFA19GEN-1037-Sunday-00,AFA19GEN-1037-Sunday-00_004600_1..S03R,South Ferry,1,,1..S03R
3,1,AFA19GEN-1037-Sunday-00,AFA19GEN-1037-Sunday-00_006600_1..S03R,South Ferry,1,,1..S03R
4,1,AFA19GEN-1037-Sunday-00,AFA19GEN-1037-Sunday-00_007200_1..N03R,Van Cortlandt Park - 242 St,0,,1..N03R


In [134]:
# Preview the merged frame: each trip's stop list alongside its headsign, direction and route.
tripWithRoute.head()

Unnamed: 0,trip_id,stop_sequence,trip_headsign,direction_id,route_id
0,AFA19GEN-1037-Sunday-00_000600_1..S03R,"[{'stop_id': '101S', 'stop_sequence': 1, 'stat...",South Ferry,1,1
1,AFA19GEN-1037-Sunday-00_002600_1..S03R,"[{'stop_id': '101S', 'stop_sequence': 1, 'stat...",South Ferry,1,1
2,AFA19GEN-1037-Sunday-00_004600_1..S03R,"[{'stop_id': '101S', 'stop_sequence': 1, 'stat...",South Ferry,1,1
3,AFA19GEN-1037-Sunday-00_006600_1..S03R,"[{'stop_id': '101S', 'stop_sequence': 1, 'stat...",South Ferry,1,1
4,AFA19GEN-1037-Sunday-00_007200_1..N03R,"[{'stop_id': '142N', 'stop_sequence': 1, 'stat...",Van Cortlandt Park - 242 St,0,1


In [135]:
# Reduce to one row per (route_id, trip_headsign, direction_id) combination.
# groupby().first() takes, per group, the first non-null value of each
# remaining column (trip_id and stop_sequence).
routeSequences = (
    tripWithRoute
    .groupby(by=["route_id", "trip_headsign", "direction_id"], as_index=False)
    .first()
)

In [136]:
# Preview the per-route/headsign/direction rows and the trip chosen for each.
routeSequences.head()

Unnamed: 0,route_id,trip_headsign,direction_id,trip_id,stop_sequence
0,1,137 St - City College,0,AFA19GEN-1087-Weekday-00_047050_1..N12R,"[{'stop_id': '142N', 'stop_sequence': 1, 'stat..."
1,1,215 St,0,AFA19GEN-1087-Weekday-00_052550_1..N13R,"[{'stop_id': '142N', 'stop_sequence': 1, 'stat..."
2,1,South Ferry,1,AFA19GEN-1037-Sunday-00_000600_1..S03R,"[{'stop_id': '101S', 'stop_sequence': 1, 'stat..."
3,1,Van Cortlandt Park - 242 St,0,AFA19GEN-1037-Sunday-00_007200_1..N03R,"[{'stop_id': '142N', 'stop_sequence': 1, 'stat..."
4,2,Flatbush Av - Brooklyn College,1,AFA19GEN-2042-Saturday-00_001900_2..S08R,"[{'stop_id': '201S', 'stop_sequence': 1, 'stat..."


In [137]:
# Write the result as a JSON array with one object per row (orient="records").
routeSequences.to_json("../data/output/route_sequences.json", orient="records")