# Bulk inserting segment IDs
This notebook handles the translation of latitude/longitude columns within a dataset to their nearest segment ID. This is a very compute-intensive operation which should only need to be run one time on any given dataset.

In [97]:
import pandas as pd
import requests
from zipfile import ZipFile as zzip
import fiona
import geopandas as gpd
from shapely.geometry import Point
import json

In [109]:
# Paths used throughout the notebook, relative to the working directory.

# Where the downloaded LION zip archive is written. Despite the name this is
# the path of the zip FILE, not of a directory.
LION_ZIP_DIR = "input_data/nyclion_19b.zip"
# Geodatabase that fiona/geopandas read the LION layers from.
GDB_FILE = r"input_data/lion/lion.gdb"
# Output directory, written WITHOUT a trailing slash: every use in this
# notebook is f"{OUTPUT_DIR}/<file name>", so the former "output/" value
# produced "output//<file name>" paths.
OUTPUT_DIR = "output"

## Download LION data

In [None]:
# Download and store lion files
url = r"https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyclion_19b.zip"

# Stream the archive to disk in chunks instead of holding the whole download
# in memory. The timeout stops a stalled connection from hanging the cell.
with requests.get(url, stream=True, timeout=60) as r:
    # Fail loudly on an HTTP error; otherwise the error page itself would be
    # written to disk and only surface later as a corrupt zip file.
    r.raise_for_status()
    with open(LION_ZIP_DIR, "wb") as file:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            file.write(chunk)

# opening the zip file in READ mode
with zzip(LION_ZIP_DIR, 'r') as file:
    # printing all the contents of the zip file
    file.printdir()

    # extracting all the files
    file.extractall("input_data/")
    print('Done!')

In [9]:
# List the layer names stored in the geodatabase so the right one can be
# chosen when reading it with geopandas.
layers = fiona.listlayers(GDB_FILE)
print(layers)

['node', 'node_stname', 'altnames', 'lion']


In [13]:
# Read only the 'lion' layer of the geodatabase, using the pyogrio engine.
lion_gdf = gpd.read_file(GDB_FILE, engine='pyogrio', layer='lion')

In [14]:
# Remove rows outside of manhattan just to clean up and speed up operations
print("Before: ", len(lion_gdf.index))
# .copy() detaches the filtered rows from the full frame. Without it the
# result is a slice, and code that later assigns a column to lion_gdf
# (get_segment_id_from_coords writes a 'distance' column) can trigger
# pandas' SettingWithCopyWarning.
lion_gdf = lion_gdf[lion_gdf.LBoro == 1].copy()
print("After: ", len(lion_gdf.index))

Before:  226977
After:  32966


In [25]:
def get_segment_id_from_coords(lat, lng, gdf=None):
    """Return the SegmentID of the geometry closest to a latitude/longitude.

    Args:
        lat: Latitude of the query point.
        lng: Longitude of the query point.
        gdf: GeoDataFrame to search. It must have a CRS, a geometry column and
            a 'SegmentID' column. Defaults to the notebook-level ``lion_gdf``.

    Returns:
        The 'SegmentID' value of the row whose geometry has the smallest
        distance to the point.

    Raises:
        ValueError: If the frame being searched has no rows.
    """
    if gdf is None:
        gdf = lion_gdf
    if len(gdf) == 0:
        raise ValueError("Cannot look up a segment in an empty GeoDataFrame")

    # shapely points are (x, y), i.e. (longitude, latitude)
    point = Point(lng, lat)
    if not gdf.crs.is_geographic:
        # The frame uses a projected CRS: treat the input as EPSG:4326 and
        # reproject it so the distances are measured in the frame's units.
        # (For a geographic CRS the point is used as given, as before.)
        point = gpd.GeoSeries([point], crs="EPSG:4326").to_crs(gdf.crs).iloc[0]

    # Keep the distances in a local Series. Previously they were written to a
    # 'distance' column of the shared frame, mutating it on every lookup.
    distances = gdf.geometry.distance(point)
    return gdf.loc[distances.idxmin(), 'SegmentID']

In [27]:
# Quick test
# Smoke test: one lookup for a hard-coded coordinate. The returned SegmentID
# is displayed as the cell's output.
get_segment_id_from_coords(40.748433, -73.985656)

'0297696'

In [110]:
def load_segment_ids(path=None):
    """Load the cached coordinate -> segment-ID mapping from its JSON file.

    The file holds one JSON object whose keys are "lat,lng" strings. Each key
    is converted back to a ``(lat, lng)`` tuple of floats.

    Args:
        path: JSON file to read. Defaults to
            ``f"{OUTPUT_DIR}/segment_id_dict.json"``.

    Returns:
        dict mapping ``(lat, lng)`` float tuples to the stored segment IDs.
        An empty dict is returned when the file does not exist yet (e.g. on
        the very first run, before any cache has been saved).
    """
    if path is None:
        path = f"{OUTPUT_DIR}/segment_id_dict.json"

    try:
        # Context manager so the handle is closed even if parsing fails
        with open(path) as cache_file:
            raw = json.load(cache_file)
    except FileNotFoundError:
        return {}

    return {
        tuple(map(float, key.split(','))): segment_id
        for key, segment_id in raw.items()
    }

In [111]:
# Module-level cache of (lat, lng) -> segment ID. add_segment_id_column reads
# and extends this same dict, and it is written back to JSON further down.
segment_ids = load_segment_ids()

In [93]:
def add_segment_id_column(df):
    """Fill ``df['SegmentId']`` with the nearest segment ID for every row.

    Each distinct (Latitude, Longitude) pair is resolved exactly once: from the
    module-level ``segment_ids`` cache when present, otherwise through
    ``get_segment_id_from_coords`` (the result is then added to the cache).
    The resolved IDs are joined back onto all rows in a single merge.

    Unlike the previous row-by-row version, coordinates that are ALREADY in
    the cache are written to the frame too; before, they were skipped and
    their rows kept an empty SegmentId.

    Args:
        df: DataFrame with 'Latitude' and 'Longitude' columns. It is modified
            in place: a 'SegmentId' column is created or updated. Rows whose
            coordinate is missing or cannot be translated keep their existing
            'SegmentId' value (or None if the column did not exist).

    Returns:
        None.
    """
    coord_cols = ['Latitude', 'Longitude']
    unique_coords = df[coord_cols].drop_duplicates().reset_index(drop=True)

    resolved_ids = []
    for lat, lng in unique_coords.itertuples(index=False, name=None):
        key = (lat, lng)
        if pd.isna(lat) or pd.isna(lng):
            # A missing coordinate has no nearest segment
            resolved_ids.append(None)
            continue
        if key not in segment_ids:
            print(f'Getting segment for ({lat}, {lng})')
            try:
                segment_ids[key] = get_segment_id_from_coords(lat, lng)
            except Exception as exc:
                # Best effort: report and move on. `Exception` (not a bare
                # except) so Ctrl-C / kernel interrupt still stops the loop.
                print(f"Unable to translate ({lat}, {lng}), skipping: {exc}")
        resolved_ids.append(segment_ids.get(key))
    unique_coords['SegmentId'] = resolved_ids

    # A left merge keeps the row order of `df`, and `unique_coords` has one row
    # per coordinate, so `merged` lines up with `df` positionally.
    merged = df[coord_cols].merge(unique_coords, on=coord_cols, how='left')
    resolved = pd.Series(merged['SegmentId'].to_numpy(), index=df.index)
    if 'SegmentId' in df.columns:
        # Do not wipe values that were already present for unresolved rows
        resolved = resolved.where(resolved.notna(), df['SegmentId'])
    df['SegmentId'] = resolved

    rows_left = df['SegmentId'].isnull().sum()
    print(f'Rows left: {rows_left}')

In [18]:
# Input CSVs, relative to the notebook's working directory: one subway
# ridership export and the two parts of the 202401 Citi Bike trip data.
SUBWAY_DATA = "input_data/MTA_Subway_Hourly_Ridership_20240607.csv"
BIKE_DATA_1 = "input_data/202401-citibike-tripdata_1.csv"
BIKE_DATA_2 = "input_data/202401-citibike-tripdata_2.csv"

In [19]:
# Load the raw subway ridership export; column types are inferred by pandas.
df_transit = pd.read_csv(SUBWAY_DATA)

In [53]:
# Keep Manhattan rows only and rename the coordinate columns to the
# 'Latitude'/'Longitude' names that add_segment_id_column reads.
# rename() returns a new frame, so the column assignment below acts on an
# independent object rather than on a slice of the original (which is what
# the previous inplace=True rename on the filtered frame did).
df_transit = (
    df_transit[df_transit.borough == 'Manhattan']
    .rename(columns={'latitude': 'Latitude', 'longitude': 'Longitude'})
)
# Placeholder column, filled in later by add_segment_id_column
df_transit['SegmentId'] = None

df_transit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1987659 entries, 0 to 1987658
Data columns (total 16 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   transit_timestamp         object 
 1   transit_mode              object 
 2   station_complex_id        object 
 3   station_complex           object 
 4   borough                   object 
 5   payment_method            object 
 6   fare_class_category       object 
 7   ridership                 int64  
 8   transfers                 int64  
 9   Latitude                  float64
 10  Longitude                 float64
 11  Georeference              object 
 12  Counties                  int64  
 13  NYS Municipal Boundaries  int64  
 14  New York Zip Codes        float64
 15  SegmentId                 object 
dtypes: float64(3), int64(4), object(9)
memory usage: 257.8+ MB


In [62]:
# Compare the number of distinct transit coordinates with the size of the
# segment-ID cache: keep the first row seen for each (Latitude, Longitude).
uniq = df_transit[~df_transit.duplicated(subset=['Latitude', 'Longitude'])]
print("Expected length of segment ids: ", len(uniq))
print("Actual: ", len(segment_ids))

Expected length of segment ids:  161
Actual:  161


In [60]:
# Look up segment IDs for the transit rows. The function writes into
# df_transit in place and returns nothing.
add_segment_id_column(df_transit)

KeyboardInterrupt: 

In [76]:
# Parse the timestamp column in place, then report the period the data covers.
df_transit['transit_timestamp'] = pd.to_datetime(df_transit['transit_timestamp'])
min_start_date, max_start_date = df_transit['transit_timestamp'].agg(['min', 'max'])

print(f"Minimum start date: {min_start_date}")
print(f"Maximum start date: {max_start_date}")

Minimum start date: 2024-01-01 00:00:00
Maximum start date: 2024-03-31 23:00:00


  df_transit['transit_timestamp'] = pd.to_datetime(df_transit['transit_timestamp'])


## Add column to bike data

In [68]:
# Read both station-ID columns as strings in BOTH files. Previously only the
# second file forced end_station_id to str, so the two frames could disagree
# on that column's dtype before being concatenated.
df_bike_1 = pd.read_csv(BIKE_DATA_1, dtype={"start_station_id": str, "end_station_id": str})
df_bike_2 = pd.read_csv(BIKE_DATA_2, dtype={"start_station_id": str, "end_station_id": str})

In [72]:
# Stack the two trip files into one frame. ignore_index=True renumbers the
# rows 0..N-1 instead of keeping each file's own row numbers.
df_bike = pd.concat([df_bike_1, df_bike_2], ignore_index=True)

In [73]:
# Print the combined frame's columns, dtypes and memory footprint.
df_bike.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1954376 entries, 0 to 1954375
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 193.8+ MB


In [74]:
# Parse the trip start times in place, then report the period the data covers.
df_bike['started_at'] = pd.to_datetime(df_bike['started_at'])
min_start_date, max_start_date = df_bike['started_at'].agg(['min', 'max'])

print(f"Minimum start date: {min_start_date}")
print(f"Maximum start date: {max_start_date}")

Minimum start date: 2024-01-01 00:00:03
Maximum start date: 2024-01-31 23:59:59


In [77]:
# Make sure both trip timestamps are datetimes before they are stacked.
df_bike['started_at'] = pd.to_datetime(df_bike['started_at'])
df_bike['ended_at'] = pd.to_datetime(df_bike['ended_at'])

# One output row per trip endpoint: every trip start first, then every trip
# end. ignore_index=True renumbers the stacked rows 0..2N-1, so the timestamp
# and coordinate columns built below line up with each other.
df_bike_new = pd.DataFrame({
    'Timestamp': pd.to_datetime(
        pd.concat([df_bike['started_at'], df_bike['ended_at']], ignore_index=True)
    ),
})
df_bike_new['Timestamp_Rounded'] = df_bike_new['Timestamp'].dt.round("h")

# Round the lat/lngs
# Coordinates are kept to 3 decimals as they are stacked (starts, then ends).
df_bike_new = df_bike_new.assign(
    Latitude=pd.concat([df_bike['start_lat'], df_bike['end_lat']], ignore_index=True).round(3),
    Longitude=pd.concat([df_bike['start_lng'], df_bike['end_lng']], ignore_index=True).round(3),
)

In [79]:
# Count distinct coordinates, and distinct coordinate/hour combinations, by
# de-duplicating just the relevant columns.
print("Unique lat/long: ", len(df_bike_new[['Latitude', 'Longitude']].drop_duplicates()))
print("Unique lat/long/time: ", len(df_bike_new[['Latitude', 'Longitude', 'Timestamp_Rounded']].drop_duplicates()))

Unique lat/long:  6475
Unique lat/long/time:  926827


In [81]:
# Estimate how large the cache will be once the bike coordinates are added.
uniq = df_bike_new.drop_duplicates(subset=['Latitude', 'Longitude'])
# Count the UNION of cached and bike coordinates. Simply adding the two sizes
# double-counts every coordinate that is already cached (e.g. when this cell
# is re-run after the lookups have been done).
expected_keys = set(segment_ids).union(zip(uniq['Latitude'], uniq['Longitude']))
print("Expected length of segment ids: ", len(expected_keys))
print("Actual: ", len(segment_ids))

Expected length of segment ids:  6636
Actual:  161


In [84]:
# Split up DF into chunks to be pooled
chunk_size = 100
num_chunks = len(df_bike_new) // chunk_size + (len(df_bike_new) % chunk_size > 0)
chunks = [df_bike_new.iloc[i * chunk_size:(i + 1) * chunk_size] for i in range(num_chunks)]
print(f"Broke up bike into {len(chunks)} chunks")

Broke up bike into 39088 chunks


In [None]:
# Look up segment IDs for the bike rows. The function writes into
# df_bike_new in place and adds new coordinates to the segment_ids cache.
add_segment_id_column(df_bike_new)

In [116]:
# Report the cache size and preview the first rows of the bike frame.
print("Length of segment ids: ", len(segment_ids))
df_bike_new.head()

Length of segment ids:  6635


Unnamed: 0,Timestamp,Timestamp_Rounded,Latitude,Longitude,SegmentId
0,2024-01-25 20:39:09,2024-01-25 21:00:00,40.735,-73.991,32805
1,2024-01-15 18:44:36,2024-01-15 19:00:00,40.735,-73.988,32949
2,2024-01-03 19:27:58,2024-01-03 19:00:00,40.735,-73.988,32949
3,2024-01-22 18:29:46,2024-01-22 18:00:00,40.735,-73.988,32949
4,2024-01-27 09:55:39,2024-01-27 10:00:00,40.735,-73.988,32949


In [115]:
# Convert segment ids into json format
segment_ids_as_json = {}
for key in segment_ids.keys():
    tuple_str = ",".join([str(key[0]), str(key[1])])
    segment_ids_as_json[tuple_str] = segment_ids[key]

# Save segment ids for future use
with open(f"{OUTPUT_DIR}/segment_id_dict.json", "w") as outfile: 
    json.dump(segment_ids_as_json, outfile)
    print("Segment ids saved")

Segment ids saved


In [118]:
# Save our dataframes
# (The input-path constants that used to be re-declared here were copies of
# the ones defined before the CSVs are read and were not used by this cell.)

# index=False: the row index carries no information, so it is not written out
bike_csv = f"{OUTPUT_DIR}/202401-citibike-tripdata_with_segments.csv"
df_bike_new.to_csv(bike_csv, index=False)
print(f"Bike data saved: {bike_csv}")

transit_csv = f"{OUTPUT_DIR}/MTA_Subway_Hourly_Ridership_20240607_with_segments.csv"
df_transit.to_csv(transit_csv, index=False)
print(f"Transit data saved: {transit_csv}")

Bike data saved: output//202401-citibike-tripdata_with_segments.csv
Transit data saved: output//MTA_Subway_Hourly_Ridership_20240607_with_segments.csv
