In [3]:
import pandas as pd
import numpy as np
import geopandas as gpd
import geopy
import geocoder
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from shapely.geometry import Point
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rc
rc('animation', html='jshtml')
from IPython.display import HTML

In [None]:
# This function reads the CSV file, drops the station id/name columns (remove them from the
# drop list if you need them), converts the date columns to datetime, casts the ride/member
# type columns to categories, and adds start/end hour and trip duration (minutes) columns

def load_data(filename):
    """Read a rides CSV and return a cleaned DataFrame with derived time columns.

    Parameters
    ----------
    filename : str, path-like or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without the four station id/name columns, with
        ``started_at`` / ``ended_at`` parsed as datetimes, ``rideable_type`` /
        ``member_casual`` cast to ``category``, plus three new columns:
        ``start_hour``, ``end_hour`` and ``trip_duration`` (in minutes).
    """
    station_cols = ['start_station_id', 'end_station_id', 'start_station_name', 'end_station_name']

    rides = (
        # Station ids are read as strings before being dropped.
        pd.read_csv(filename, dtype={"start_station_id": "object", "end_station_id": "object"})
        .drop(columns=station_cols)
        # assign() evaluates keyword arguments in order, so the derived columns
        # below already see the parsed datetime columns.
        .assign(
            started_at=lambda d: pd.to_datetime(d['started_at']),
            ended_at=lambda d: pd.to_datetime(d['ended_at']),
            rideable_type=lambda d: d['rideable_type'].astype('category'),
            member_casual=lambda d: d['member_casual'].astype('category'),
            start_hour=lambda d: d['started_at'].dt.hour,
            end_hour=lambda d: d['ended_at'].dt.hour,
            trip_duration=lambda d: (d['ended_at'] - d['started_at']).dt.total_seconds() / 60,
        )
    )

    return rides

In [None]:
# Function that performs a spatial join to find neighborhoods for start and end coordinates
# It returns a DataFrame with additional columns for start and end neighborhoods
# It uses the 'nta2020.shp' shapefile for neighborhood boundaries

def sjoin(df, shapefile='shapefiles/nta2020.shp'):
    """Attach start and end neighborhood names to every ride via a spatial join.

    Parameters
    ----------
    df : pandas.DataFrame
        Rides with ``start_lng``, ``start_lat``, ``end_lng`` and ``end_lat``
        columns (longitude/latitude, treated as EPSG:4326).
    shapefile : str or path-like, optional
        Neighborhood boundary file providing ``geometry`` and ``ntaname``
        columns. Defaults to the path the notebook has always used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``start_neighborhood`` and ``end_neighborhood``
        columns. Points that fall inside no polygon get a missing value.
        ``df`` itself is never wrapped in a GeoDataFrame or modified.
    """
    nhoods = gpd.read_file(shapefile).to_crs(epsg=4326)[['geometry', 'ntaname']]

    def _neighborhood_names(lng_col, lat_col):
        # Build a geometry-only frame on a fresh 0..n-1 index. Joining just the
        # points avoids dragging every ride column through the join, and the
        # positional index makes the result independent of df's own index
        # (which may contain duplicate labels, e.g. after pd.concat).
        points = gpd.GeoDataFrame(
            geometry=gpd.points_from_xy(df[lng_col], df[lat_col]),
            crs="EPSG:4326",
        )
        joined = gpd.sjoin(points, nhoods, how="left", predicate='within')
        # A left join emits one row per match; keep the first match per point
        # so there is exactly one name per ride.
        joined = joined[~joined.index.duplicated(keep='first')]
        # Restore input order and hand back plain values for positional assignment.
        return joined['ntaname'].reindex(points.index).to_numpy()

    df_withnhoods = df.copy()
    df_withnhoods['start_neighborhood'] = _neighborhood_names('start_lng', 'start_lat')
    df_withnhoods['end_neighborhood'] = _neighborhood_names('end_lng', 'end_lat')

    return df_withnhoods

In [None]:
# Neighborhood names treated as "Manhattan below 60th Street".
# count_by_boro matches these strings exactly against the neighborhood columns.
below60 = [
    "Chelsea-Hudson Yards",
    "Chinatown-Two Bridges",
    "East Midtown-Turtle Bay",
    "East Village",
    "Financial District-Battery Park City",
    "Gramercy",
    "Greenwich Village",
    "West Village",
    "Tribeca-Civic Center",
    "The Battery-Governors Island-Ellis Island-Liberty Island",
    "Stuyvesant Town-Peter Cooper Village",
    "SoHo-Little Italy-Hudson Square",
    "Murray Hill-Kips Bay",
    "Midtown-Times Square",
    "Midtown South-Flatiron-Union Square",
    "Lower East Side",
    "Hell's Kitchen"
]

# Neighborhood names treated as Brooklyn.
# count_by_boro matches these strings exactly against the neighborhood columns,
# so spelling and punctuation must equal the names produced by the spatial join.
# NOTE(review): this list may not cover every Brooklyn neighborhood in the
# shapefile - confirm against the shapefile's Brooklyn entries before relying
# on it as "all" of the borough.
bk = [
    "Greenpoint",
    "Williamsburg",
    "South Williamsburg",
    "East Williamsburg",
    "Brooklyn Heights",
    "Downtown Brooklyn-DUMBO-Boerum Hill",
    "Fort Greene",
    "Clinton Hill",
    "Brooklyn Navy Yard",
    "Bedford-Stuyvesant (West)",
    "Bedford-Stuyvesant (East)",
    "Bushwick (West)",
    "Bushwick (East)",
    "The Evergreens Cemetery",
    "Cypress Hills",
    "East New York (North)",
    "East New York-New Lots",
    "Spring Creek-Starrett City",
    "East New York-City Line",
    "Highland Park-Cypress Hills Cemeteries (South)",
    "Carroll Gardens-Cobble Hill-Gowanus-Red Hook",
    "Park Slope",
    "Windsor Terrace-South Slope",
    "Sunset Park (West)",
    "Sunset Park (Central)",
    "Green-Wood Cemetery",
    "Prospect Heights",
    "Crown Heights (North)",
    "Lincoln Terrace Park",
    "Crown Heights (South)",
    "Prospect Lefferts Gardens-Wingate",
    "Bay Ridge",
    "Dyker Heights",
    "Fort Hamilton",
    "Dyker Beach Park",
    "Bensonhurst",
    "Bath Beach",
    "Gravesend (West)",
    "Sunset Park (East)-Borough Park (West)",
    "Borough Park",
    "Kensington",
    "Mapleton-Midwood (West)",
    "Gravesend (South)",
    "Coney Island-Sea Gate",
    "Brighton Beach",
    "Calvert Vaux Park",
    "Flatbush",
    "Flatbush (West)-Ditmas Park-Parkville",
    "Midwood",
    "Gravesend (East)-Homecrest",
    "Madison"
]

In [None]:
# this function returns only the rides between Brooklyn and Manhattan below 60th Street
# it assigns the start and end boroughs to 'BROOKLYN' or 'MANHATTAN BELOW 60TH ST'
# and filters the DataFrame to include only those rides

def count_by_boro(df):
    """Label each ride's start/end area and keep only Brooklyn <-> Manhattan-below-60th rides.

    Parameters
    ----------
    df : pandas.DataFrame
        Rides with ``start_neighborhood`` and ``end_neighborhood`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that start in one of the two areas and end in the other.

    Notes
    -----
    ``df`` is modified in place: ``start_boro`` and ``end_boro`` columns are
    added (or overwritten) with 'BROOKLYN', 'MANHATTAN BELOW 60TH ST' or 'OTHER'.
    Neighborhood membership comes from the module-level ``bk`` and ``below60`` lists.
    """
    # Applied in this order, so a name present in both lists ends up as Manhattan.
    area_labels = (
        ('BROOKLYN', bk),
        ('MANHATTAN BELOW 60TH ST', below60),
    )
    column_pairs = (
        ('start_neighborhood', 'start_boro'),
        ('end_neighborhood', 'end_boro'),
    )

    for hood_col, boro_col in column_pairs:
        df[boro_col] = 'OTHER'
        for label, names in area_labels:
            df.loc[df[hood_col].isin(names), boro_col] = label

    manhattan_to_brooklyn = (df['start_boro'] == 'MANHATTAN BELOW 60TH ST') & (df['end_boro'] == 'BROOKLYN')
    brooklyn_to_manhattan = (df['start_boro'] == 'BROOKLYN') & (df['end_boro'] == 'MANHATTAN BELOW 60TH ST')

    return df[manhattan_to_brooklyn | brooklyn_to_manhattan]

In [None]:
# Groups the rides by start and end borough

def get_total_ride_count(df):
    """Count rides for each (start_boro, end_boro) pair.

    Returns a DataFrame with columns ``start_boro``, ``end_boro`` and
    ``ride_count``, one row per pair present in ``df``.
    """
    pair_sizes = df.groupby(['start_boro', 'end_boro']).size()
    return pair_sizes.rename('ride_count').reset_index()
def get_total_ride_count_by_hour(df):
    """Count rides for each (start_boro, end_boro, start_hour) combination.

    Returns a DataFrame with columns ``start_boro``, ``end_boro``,
    ``start_hour`` and ``ride_count``, one row per combination present in ``df``.
    """
    hourly_sizes = df.groupby(['start_boro', 'end_boro', 'start_hour']).size()
    return hourly_sizes.rename('ride_count').reset_index()

In [None]:
# Counts the number of membership holders vs casual riders

def count_members(df):
    """Return a ``{member_casual value: number of rides}`` dictionary."""
    rider_type_counts = df['member_casual'].value_counts()
    return rider_type_counts.to_dict()

In [None]:
# Wrapper function to export DataFrame to CSV

def export(df, name):
    """Write ``df`` to ``name`` as CSV, leaving out the index column.

    ``name`` is passed straight to ``DataFrame.to_csv`` as the target.
    """
    df.to_csv(path_or_buf=name, index=False)

In [None]:
# Counts rides per trip-duration value (in minutes)

def filter_by_duration(df):
    """Count rides per trip duration rounded to the nearest whole minute.

    Parameters
    ----------
    df : pandas.DataFrame
        Rides with a numeric ``trip_duration`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``trip_duration`` (the rounded value) and ``ride_count``,
        sorted by duration. Rows with a missing duration are not counted.
        ``df`` is left unchanged.
    """
    # Series.round returns a new Series; the result has to be used as the
    # grouping key, otherwise rides are bucketed by their unrounded duration.
    rounded_minutes = df['trip_duration'].round(0)
    grouped = df.groupby(rounded_minutes).size().reset_index(name='ride_count')
    return grouped

In [None]:
# Cleans trip duration data by removing outliers using the interquartile range method
def clean_dur(df):
    """Drop rides whose ``trip_duration`` is an outlier by the 1.5 * IQR rule.

    Keeps rows with ``Q1 - 1.5*IQR <= trip_duration <= Q3 + 1.5*IQR`` (both
    ends inclusive), where Q1/Q3 are the 25th/75th percentiles of the column.
    Rows with a missing duration are dropped as well.
    """
    durations = df['trip_duration']
    quartiles = durations.quantile([0.25, 0.75])
    q1, q3 = quartiles.loc[0.25], quartiles.loc[0.75]
    spread = q3 - q1

    within_fences = durations.between(q1 - 1.5 * spread, q3 + 1.5 * spread)
    return df[within_fences]