In [87]:
import pandas as pd
import numpy as np
import geopandas as gpd
import geopy
import geocoder
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from shapely.geometry import Point
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rc
rc('animation', html='jshtml')
from IPython.display import HTML

Congestion pricing in Manhattan is a little over four months old, yet the backlash from conservative media and the Trump administration has been brewing for far longer. In part, this is because congestion pricing _as a thing_ is entirely new to American society despite being the norm in many European cities. But it is also due to a lack of conclusive evidence of its effectiveness — granted, there are no long-term benefits to speak of after only four months — and that evidence is what this story will (attempt to) provide.

While New York prides itself mostly on its subway system, it is also home to over 2,000 Citi Bike stations. Citi Bike is New York City’s bike-sharing program, offering residents and visitors a sustainable way to travel around the city. Launched in 2013, it provides thousands of bikes at docking stations located across Manhattan, Brooklyn, Queens, the Bronx, and Jersey City. Users can rent bikes for short trips, commuting, or leisure through a mobile app or at kiosks, paying per ride or with membership options.

This story will compare Citi Bike ridership in March 2024 (pre-congestion pricing) and March 2025 (roughly three months after it took effect). Citi Bike's public data contains __enormous__ datasets with information on __every single ride,__ which is why I recruited the `pandas` Python library to analyze it.

In [88]:
def load_data(filename):
    """Read one Citi Bike trip CSV and derive hour-of-day and duration columns.

    Parameters
    ----------
    filename : str or anything ``pd.read_csv`` accepts
        Location of the monthly trip file.

    Returns
    -------
    pandas.DataFrame
        The trips without the four station id/name columns, with
        ``started_at`` / ``ended_at`` parsed as datetimes, ``rideable_type``
        and ``member_casual`` stored as categoricals, and three derived
        columns: ``start_hour``, ``end_hour`` (hour of day) and
        ``trip_duration`` (minutes, fractional).
    """
    station_columns = ['start_station_id', 'end_station_id',
                       'start_station_name', 'end_station_name']

    # The station ids are read as plain objects and then discarded together
    # with the station names; only the coordinates are used downstream.
    trips = pd.read_csv(
        filename,
        dtype={"start_station_id": "object", "end_station_id": "object"},
    ).drop(columns=station_columns)

    for stamp in ('started_at', 'ended_at'):
        trips[stamp] = pd.to_datetime(trips[stamp])
    for label in ('rideable_type', 'member_casual'):
        trips[label] = trips[label].astype('category')

    trips['start_hour'] = trips['started_at'].dt.hour
    trips['end_hour'] = trips['ended_at'].dt.hour

    elapsed = trips['ended_at'] - trips['started_at']
    trips['trip_duration'] = elapsed.dt.total_seconds() / 60

    return trips

In [89]:
def sjoin(df, nhoods_path='nta2020.shp'):
    """Attach the neighborhood name of each ride's start and end point.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips with ``start_lng`` / ``start_lat`` and ``end_lng`` / ``end_lat``
        columns. The coordinates are treated as EPSG:4326 longitude/latitude.
    nhoods_path : str, optional
        Polygon layer to join against; it must carry an ``ntaname`` column.
        Defaults to the shapefile this notebook has always read.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``start_neighborhood`` and ``end_neighborhood``
        added. A point that lies within no polygon gets a missing value.
        ``df`` itself is not modified.
    """
    # Only the geometry and the name are needed on the polygon side.
    nhoods = gpd.read_file(nhoods_path).to_crs(epsg=4326)[['geometry', 'ntaname']]

    def _neighborhood_of(lng_col, lat_col):
        # Geometry-only frame on a positional index: avoids carrying every trip
        # column through the join, and points_from_xy builds the points in one
        # vectorised call instead of one Python-level Point() per ride.
        points = gpd.GeoDataFrame(
            geometry=gpd.points_from_xy(df[lng_col], df[lat_col]),
            crs="EPSG:4326",
        )
        joined = gpd.sjoin(points, nhoods, how="left", predicate='within')

        # A left join emits one row per matching polygon, so a point matched by
        # more than one polygon would appear several times. Keep the first
        # match so there is exactly one name per ride.
        names = joined['ntaname']
        names = names[~names.index.duplicated(keep='first')]

        # Back to one value per input row, in input order; returned as a plain
        # array so the assignment below is positional and independent of
        # whatever index ``df`` carries.
        return names.reindex(points.index).to_numpy()

    df_withnhoods = df.copy()
    df_withnhoods['start_neighborhood'] = _neighborhood_of('start_lng', 'start_lat')
    df_withnhoods['end_neighborhood'] = _neighborhood_of('end_lng', 'end_lat')

    return df_withnhoods

In [90]:
# Same calendar month one year apart: March 2025 (congestion pricing in
# effect) and March 2024 (before it started).
df25 = load_data(filename='202503-citibike-tripdata.csv')
df24 = load_data(filename='202403-citibike-tripdata.csv')

In [91]:
# Tag every ride with the neighborhood of its start and end coordinates.
df25 = sjoin(df=df25)
df24 = sjoin(df=df24)

In [92]:
# Neighborhood names (as they appear in the `ntaname` column) that make up
# Manhattan below 60th Street.
below60 = [
    "Chelsea-Hudson Yards",
    "Chinatown-Two Bridges",
    "East Midtown-Turtle Bay",
    "East Village",
    "Financial District-Battery Park City",
    "Gramercy",
    "Greenwich Village",
    "West Village",
    "Tribeca-Civic Center",
    "The Battery-Governors Island-Ellis Island-Liberty Island",
    "Stuyvesant Town-Peter Cooper Village",
    "SoHo-Little Italy-Hudson Square",
    "Murray Hill-Kips Bay",
    "Midtown-Times Square",
    "Midtown South-Flatiron-Union Square",
    "Lower East Side",
    "Hell's Kitchen",
]

# Brooklyn neighborhood names, spelled as they must appear in the joined
# `start_neighborhood` / `end_neighborhood` columns for `isin` to match.
# NOTE(review): the 2020 NTA layer appears to define further Brooklyn areas
# that are not listed here (e.g. Brownsville, Canarsie, Prospect Park) — rides
# touching those would be labelled 'OTHER'; confirm this is intended.

bk = [
    "Greenpoint",
    "Williamsburg",
    "South Williamsburg",
    "East Williamsburg",
    "Brooklyn Heights",
    "Downtown Brooklyn-DUMBO-Boerum Hill",
    "Fort Greene",
    "Clinton Hill",
    "Brooklyn Navy Yard",
    "Bedford-Stuyvesant (West)",
    "Bedford-Stuyvesant (East)",
    "Bushwick (West)",
    "Bushwick (East)",
    "The Evergreens Cemetery",
    "Cypress Hills",
    "East New York (North)",
    "East New York-New Lots",
    "Spring Creek-Starrett City",
    "East New York-City Line",
    "Highland Park-Cypress Hills Cemeteries (South)",
    "Carroll Gardens-Cobble Hill-Gowanus-Red Hook",
    "Park Slope",
    "Windsor Terrace-South Slope",
    "Sunset Park (West)",
    "Sunset Park (Central)",
    "Green-Wood Cemetery",
    "Prospect Heights",
    "Crown Heights (North)",
    "Lincoln Terrace Park",
    "Crown Heights (South)",
    "Prospect Lefferts Gardens-Wingate",
    "Bay Ridge",
    "Dyker Heights",
    "Fort Hamilton",
    "Dyker Beach Park",
    "Bensonhurst",
    "Bath Beach",
    "Gravesend (West)",
    "Sunset Park (East)-Borough Park (West)",
    "Borough Park",
    "Kensington",
    "Mapleton-Midwood (West)",
    "Gravesend (South)",
    "Coney Island-Sea Gate",
    "Brighton Beach",
    "Calvert Vaux Park",
    "Flatbush",
    "Flatbush (West)-Ditmas Park-Parkville",
    "Midwood",
    "Gravesend (East)-Homecrest",
    "Madison"
]

In [93]:
# this function returns only the rides from/to the congestion pricing zone to/from Brooklyn

def count_by_boro(df, brooklyn=None, below_60th=None):
    """Label each ride's start/end area and keep only zone <-> Brooklyn rides.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips with ``start_neighborhood`` and ``end_neighborhood`` columns.
        NOTE: ``start_boro`` and ``end_boro`` are added to (or overwritten
        on) this frame in place, exactly as before.
    brooklyn : list of str, optional
        Neighborhood names counted as Brooklyn. Defaults to the notebook-level
        ``bk`` list.
    below_60th : list of str, optional
        Neighborhood names counted as Manhattan below 60th Street. Defaults to
        the notebook-level ``below60`` list.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows that start in one of the two areas and
        end in the other. Rides within a single area, or touching any
        neighborhood in neither list (including a missing one), are excluded.
    """
    # Fall back to the notebook globals only when no list was passed, so the
    # function can also be used (and tested) without them.
    brooklyn = bk if brooklyn is None else brooklyn
    below_60th = below60 if below_60th is None else below_60th

    zone_label = 'MANHATTAN BELOW 60TH ST'
    bk_label = 'BROOKLYN'

    for side in ('start', 'end'):
        nhood_col = side + '_neighborhood'
        boro_col = side + '_boro'

        df[boro_col] = 'OTHER'
        df.loc[df[nhood_col].isin(brooklyn), boro_col] = bk_label
        # Manhattan is assigned last, so it wins if a name is in both lists.
        df.loc[df[nhood_col].isin(below_60th), boro_col] = zone_label

    zone_to_bk = (df['start_boro'] == zone_label) & (df['end_boro'] == bk_label)
    bk_to_zone = (df['start_boro'] == bk_label) & (df['end_boro'] == zone_label)

    # Explicit copy: callers can add columns to the result without pandas
    # warning that they may be writing to a slice of ``df``.
    return df[zone_to_bk | bk_to_zone].copy()

In [94]:
def get_total_ride_count(df):
    """Count rides for every (start_boro, end_boro) pair present in ``df``.

    Returns a DataFrame with the columns ``start_boro``, ``end_boro`` and
    ``ride_count``, one row per pair.
    """
    pair_keys = ['start_boro', 'end_boro']
    rides_per_pair = df.groupby(pair_keys).size()
    return rides_per_pair.rename('ride_count').reset_index()
def get_total_ride_count_by_hour(df):
    """Count rides per (start_boro, end_boro) pair and starting hour.

    Returns a DataFrame with the columns ``start_boro``, ``end_boro``,
    ``start_hour`` and ``ride_count``, one row per combination found in ``df``.
    """
    hourly_keys = ['start_boro', 'end_boro', 'start_hour']
    rides_per_hour = df.groupby(hourly_keys).size()
    return rides_per_hour.rename('ride_count').reset_index()

In [95]:
# Keep only the rides between Brooklyn and Manhattan below 60th Street ...
df25_filtered = count_by_boro(df=df25)
df24_filtered = count_by_boro(df=df24)

# ... then count them per direction of travel.
df25_grouped = get_total_ride_count(df=df25_filtered)
df24_grouped = get_total_ride_count(df=df24_filtered)

In [96]:
def export(df, name):
    """Write ``df`` to ``name`` as CSV, leaving the index out of the file."""
    df.to_csv(path_or_buf=name, index=False)

In [98]:
# Ride-level extracts first, then the per-direction totals, for both years.
export(df=df25_filtered, name='March 2025 FILTERED.csv')
export(df=df24_filtered, name='March 2024 FILTERED.csv')
export(df=df25_grouped, name='March 2025 GROUPED.csv')
export(df=df24_grouped, name='March 2024 GROUPED.csv')