# Post-Processing
- We've got all the data we need including the `inzone` dummy variable from ArcGIS Pro.
- Now we need to do some post-processing to get the data into a format that can be used for analysis
    - Removing variables we don't need
    - Making sure each station ID has a row for each date spanning from 1-1 to 4-3
    - With 121 stations and 92 days, theoretically the final dataframe should have with 11,112 rows

In [55]:
import pandas as pd

In [56]:
manhattan_2025 = pd.read_csv("data-postarcpro/Manhattan_Subway_Ridership_2025_w_dummy.csv")
manhattan_2024 = pd.read_csv("data-postarcpro/Manhattan_Subway_Ridership_2024_w_dummy.csv")

In [57]:
#First view the data to see if anything is missing
manhattan_2025.head()

Unnamed: 0,transit_timestamp,station_complex_id,station_complex,borough,ridership,latitude,longitude,Georeference,transit_date,transit_time,total_ridership_ondate_all_station,total_ridership_ondate_and_station,station_percentage_of_total_ridership,average_ridership_station,inside_zone
0,3/9/2025 9:00:00,316,50 St (1),Manhattan,1,40.761726,-73.98385,POINT (-73.98385 40.761726),3/9/2025,09:00:00,1129837,1,8.9e-05,14654.411888,1
1,1/19/2025 2:00:00,605,"168 St (A,C,1)",Manhattan,3,40.840717,-73.93956,POINT (-73.93956 40.840717),1/19/2025,02:00:00,1047999,3,0.000286,13344.571429,0
2,3/27/2025 2:00:00,118,3 Av (L),Manhattan,2,40.73285,-73.98612,POINT (-73.98612 40.73285),3/27/2025,02:00:00,2379341,4020,0.168954,4543.664122,1
3,3/27/2025 10:00:00,160,"72 St (C,B)",Manhattan,242,40.775593,-73.97641,POINT (-73.97641 40.775593),3/27/2025,10:00:00,2379341,5920,0.248808,6410.384089,0
4,3/27/2025 7:00:00,405,23 St (6),Manhattan,103,40.739864,-73.9866,POINT (-73.9866 40.739864),3/27/2025,07:00:00,2379341,17875,0.751258,15326.066034,1


In [58]:
manhattan_2024.head()

Unnamed: 0,transit_timestamp,station_complex_id,station_complex,borough,ridership,latitude,longitude,Georeference,transit_date,transit_time,total_ridership_ondate_all_station,total_ridership_ondate_and_station,station_percentage_of_total_ridership,average_ridership_station_2024,inside_zone
0,1/1/2024 12:00:00,157,"96 St (C,B)",Manhattan,3,40.79164,-73.9647,POINT (-73.9647 40.79164),1/1/2024,12:00:00,992908,2171,0.218651,6284.050682,0
1,1/1/2024 12:00:00,440,"116 St (2,3)",Manhattan,5,40.802097,-73.94962,POINT (-73.94962 40.802097),1/1/2024,12:00:00,992908,2796,0.281597,5141.996618,0
2,1/1/2024 12:00:00,404,28 St (6),Manhattan,36,40.74307,-73.98426,POINT (-73.98426 40.74307),1/1/2024,12:00:00,992908,5664,0.570446,12643.067707,1
3,1/1/2024 12:00:00,477,72 St (Q),Manhattan,44,40.7688,-73.95843,POINT (-73.95843 40.7688),1/1/2024,12:00:00,992908,6425,0.647089,18237.057288,0
4,1/1/2024 12:00:00,306,125 St (1),Manhattan,2,40.815582,-73.958374,POINT (-73.958374 40.815582),1/1/2024,12:00:00,992908,2159,0.217442,5585.126057,0


In [59]:
# We have the following columns of interest: 'ridership' (ridership at that hour), 'transit_date','transit_time' (in hours), and 'station_complex_id' (the station)
# Lets look at each station and sum the ridership for every day, saved into a field called station_ridership -- we'll compare to the 'total_ridership_ondate_and_station' column to see if they match
# We technically don't need to do this because there's a a 'total_ridership_ondate_and_station' column, but we will do it anyway to see if the two match
manhattan_2024['station_ridership'] = manhattan_2024.groupby(['transit_date', 'station_complex_id'])['ridership'].transform('sum')
manhattan_2025['station_ridership'] = manhattan_2025.groupby(['transit_date', 'station_complex_id'])['ridership'].transform('sum')

In [60]:
manhattan_2024.head()

Unnamed: 0,transit_timestamp,station_complex_id,station_complex,borough,ridership,latitude,longitude,Georeference,transit_date,transit_time,total_ridership_ondate_all_station,total_ridership_ondate_and_station,station_percentage_of_total_ridership,average_ridership_station_2024,inside_zone,station_ridership
0,1/1/2024 12:00:00,157,"96 St (C,B)",Manhattan,3,40.79164,-73.9647,POINT (-73.9647 40.79164),1/1/2024,12:00:00,992908,2171,0.218651,6284.050682,0,2171
1,1/1/2024 12:00:00,440,"116 St (2,3)",Manhattan,5,40.802097,-73.94962,POINT (-73.94962 40.802097),1/1/2024,12:00:00,992908,2796,0.281597,5141.996618,0,2796
2,1/1/2024 12:00:00,404,28 St (6),Manhattan,36,40.74307,-73.98426,POINT (-73.98426 40.74307),1/1/2024,12:00:00,992908,5664,0.570446,12643.067707,1,5664
3,1/1/2024 12:00:00,477,72 St (Q),Manhattan,44,40.7688,-73.95843,POINT (-73.95843 40.7688),1/1/2024,12:00:00,992908,6425,0.647089,18237.057288,0,6425
4,1/1/2024 12:00:00,306,125 St (1),Manhattan,2,40.815582,-73.958374,POINT (-73.958374 40.815582),1/1/2024,12:00:00,992908,2159,0.217442,5585.126057,0,2159


In [61]:
manhattan_2025
# Doesn't match for 2025, but does for 2024. Still worth checking for this reason.

Unnamed: 0,transit_timestamp,station_complex_id,station_complex,borough,ridership,latitude,longitude,Georeference,transit_date,transit_time,total_ridership_ondate_all_station,total_ridership_ondate_and_station,station_percentage_of_total_ridership,average_ridership_station,inside_zone,station_ridership
0,3/9/2025 9:00:00,316,50 St (1),Manhattan,1,40.761726,-73.98385,POINT (-73.98385 40.761726),3/9/2025,09:00:00,1129837,1,0.000089,14654.411888,1,8940
1,1/19/2025 2:00:00,605,"168 St (A,C,1)",Manhattan,3,40.840717,-73.93956,POINT (-73.93956 40.840717),1/19/2025,02:00:00,1047999,3,0.000286,13344.571429,0,6597
2,3/27/2025 2:00:00,118,3 Av (L),Manhattan,2,40.732850,-73.98612,POINT (-73.98612 40.73285),3/27/2025,02:00:00,2379341,4020,0.168954,4543.664122,1,5813
3,3/27/2025 10:00:00,160,"72 St (C,B)",Manhattan,242,40.775593,-73.97641,POINT (-73.97641 40.775593),3/27/2025,10:00:00,2379341,5920,0.248808,6410.384089,0,7675
4,3/27/2025 7:00:00,405,23 St (6),Manhattan,103,40.739864,-73.98660,POINT (-73.9866 40.739864),3/27/2025,07:00:00,2379341,17875,0.751258,15326.066034,1,23648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2299698,1/5/2025 11:00:00,436,Harlem-148 St (3),Manhattan,5,40.823880,-73.93647,POINT (-73.93647 40.82388),1/5/2025,11:00:00,1014010,5,0.000493,1806.595637,0,1656
2299699,1/5/2025 11:00:00,438,"135 St (2,3)",Manhattan,6,40.814228,-73.94077,POINT (-73.94077 40.814228),1/5/2025,11:00:00,1014010,313,0.030868,6729.080565,0,4646
2299700,1/5/2025 11:00:00,476,86 St (Q),Manhattan,4,40.777890,-73.95179,POINT (-73.95179 40.77789),1/5/2025,11:00:00,1014010,1302,0.128401,14525.205236,0,10493
2299701,1/5/2025 11:00:00,602,"14 St-Union Sq (L,N,Q,R,W,4,5,6)",Manhattan,28,40.735737,-73.98995,POINT (-73.98995 40.735737),1/5/2025,11:00:00,1014010,297,0.029290,59142.219493,1,35828


In [None]:
#We need an entry for each station for each day, removing duplicates now that we have the station_ridership column:
manhattan_2025 = manhattan_2025[['transit_date', 'station_complex_id', 'station_ridership',]].drop_duplicates().reset_index(drop=True)
manhattan_2025


In [77]:
manhattan_2024 = manhattan_2024[['transit_date', 'station_complex_id', 'station_ridership',]].drop_duplicates().reset_index(drop=True)
manhattan_2024

Unnamed: 0,transit_date,station_complex_id,station_ridership
0,1/1/2024,157,2171
1,1/1/2024,440,2796
2,1/1/2024,404,5664
3,1/1/2024,477,6425
4,1/1/2024,306,2159
...,...,...,...
11362,4/3/2024,296,3028
11363,4/3/2024,395,8214
11364,4/3/2024,320,8812
11365,4/3/2024,328,14211


In [78]:
#Save both to csvs in the data-final/ directory
manhattan_2025.to_csv("data-final/manhattan_2025.csv", index=False)
manhattan_2024.to_csv("data-final/manhattan_2024.csv", index=False)