# 01 Prepare Spatial Backbone

**Project:** NORI  
**Author:** Yuseof J  
**Date:** 09/12/25

### **Purpose**
Load the raw NY State tract shapefile, filter for NYC tracts, reproject them to the NAD83 / New York Long Island CRS (EPSG:2263), and label each tract with its borough. 

### **Inputs**
- `data/raw/tiger_tracts_ny/tl_2025_36_tract.shp`
- `data/raw/nyc_boroughs/nybbwi.shp`

### **Outputs**
- `data/processed/nyc_tracts.gpkg` (layer `tracts`)
  
--------------------------------------------------------------------------

### 0. Imports and Setup

In [66]:
# package imports
import os
import pandas as pd
import geopandas as gpd
from pathlib import Path

# --- file locations (all relative to the project root) ---
path_tracts_shapefile = "data/raw/tiger_tracts_ny/tl_2025_36_tract.shp"  # input: NY State census tracts
path_nyc_boroughs = "data/raw/nyc_boroughs/nybbwi.shp"                   # input: NYC borough boundaries
path_output_processed_geodata = "data/processed/nyc_tracts.gpkg"         # output: processed NYC tracts

# --- filtering / projection settings ---
# County FIPS codes of the five NYC boroughs, used to subset the statewide tract file:
# 005 Bronx, 047 Kings (Brooklyn), 061 New York (Manhattan), 081 Queens, 085 Richmond (Staten Island)
nyc_county_fips = ["005", "047", "061", "081", "085"]

# EPSG:2263 (NAD83 / New York Long Island, US survey feet) is the projected CRS
# used for high-accuracy mapping of the NYC boroughs
nyc_crs = "EPSG:2263"

# ensure cwd is project root for file paths to function properly
# The project root is the nearest directory, starting at the current working
# directory and moving upwards, that contains a "data" folder. The search is
# bounded by the filesystem root: Path.parent of the root is the root itself,
# so an unbounded "keep going up" loop would never finish if no "data" folder
# exists anywhere above the notebook.
start_dir = Path(os.getcwd())
project_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / "data").exists()
    ),
    None,
)
if project_root is None:
    raise FileNotFoundError(
        f'No "data" directory found in {start_dir} or any of its parent directories'
    )
os.chdir(project_root)  # make the relative data paths above resolve correctly

### 1. Load Data

In [67]:
# read the statewide census tract layer and the NYC borough boundary layer
gdf_tracts, gdf_boroughs = [
    gpd.read_file(source_path)
    for source_path in (path_tracts_shapefile, path_nyc_boroughs)
]

### 2. EDA 

In [68]:
# list the attribute columns available in the raw tract layer
list(gdf_tracts.columns)

['STATEFP',
 'COUNTYFP',
 'TRACTCE',
 'GEOID',
 'GEOIDFQ',
 'NAME',
 'NAMELSAD',
 'MTFCC',
 'FUNCSTAT',
 'ALAND',
 'AWATER',
 'INTPTLAT',
 'INTPTLON',
 'geometry']

In [69]:
# preview the first rows of the raw (statewide) tract layer
gdf_tracts.head(n=5)

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,GEOID,GEOIDFQ,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,36,29,8400,36029008400,1400000US36029008400,84.0,Census Tract 84,G5020,S,10966624,3505091,42.9713848,-78.9194986,"POLYGON ((-78.94456 42.98506, -78.94216 42.992..."
1,36,103,123600,36103123600,1400000US36103123600,1236.0,Census Tract 1236,G5020,S,2302367,1082191,40.6608399,-73.4145754,"POLYGON ((-73.42559 40.65629, -73.42529 40.656..."
2,36,103,146001,36103146001,1400000US36103146001,1460.01,Census Tract 1460.01,G5020,S,2225464,0,40.7703277,-73.2532537,"POLYGON ((-73.26159 40.76307, -73.2615 40.7636..."
3,36,103,190402,36103190402,1400000US36103190402,1904.02,Census Tract 1904.02,G5020,S,44073411,23956,40.8468673,-72.6336641,"POLYGON ((-72.72668 40.8339, -72.72515 40.8387..."
4,36,103,158709,36103158709,1400000US36103158709,1587.09,Census Tract 1587.09,G5020,S,13099359,110761,40.8517499,-72.9216255,"POLYGON ((-72.94716 40.8556, -72.94649 40.8576..."


In [70]:
# preview the borough boundary layer
gdf_boroughs.head(n=5)

Unnamed: 0,BoroCode,BoroName,Shape_Leng,Shape_Area,geometry
0,5,Staten Island,220557.476076,2851518000.0,"POLYGON ((968762.067 175987.749, 968654.251 17..."
1,2,Bronx,188162.483488,1598501000.0,"POLYGON ((1021632.336 267934.439, 1022108.577 ..."
2,4,Queens,459301.799089,4962897000.0,"POLYGON ((1045438.075 235083.296, 1049675.845 ..."
3,1,Manhattan,203708.610992,944330100.0,"MULTIPOLYGON (((972081.788 190733.467, 972184...."
4,3,Brooklyn,236476.766501,2697661000.0,"POLYGON ((1004421.481 203543.225, 1004612.108 ..."


### 3. Data Processing and Filtering

In [71]:
# normalise county FIPS codes to zero-padded, 3-character strings so they
# compare equal to the entries of nyc_county_fips
gdf_tracts["COUNTYFP"] = gdf_tracts["COUNTYFP"].apply(
    lambda county_code: str(county_code).zfill(3)
)

# keep only the tracts that belong to one of the five NYC counties
in_nyc_county = gdf_tracts["COUNTYFP"].isin(nyc_county_fips)
gdf_tracts_nyc = gdf_tracts.loc[in_nyc_county]

print("Total tracts: ", len(gdf_tracts_nyc))

Total tracts:  2327


In [72]:
# reproject the NYC tracts from their source CRS into the project CRS (nyc_crs)
gdf_tracts_nyc = gdf_tracts_nyc.to_crs(crs=nyc_crs)

Here we'll also add borough name. This will be useful for spatial CV in the ML pipeline. Tract centroid is used rather than tract boundaries to handle tracts that span borough boundaries.  

In [73]:
# ensure same crs before sjoin
gdf_boroughs = gdf_boroughs.to_crs(nyc_crs)

# Start from the tracts without any borough columns so this cell can be re-run
# safely (a second run would otherwise produce BoroCode_left/_right duplicates).
tracts_base = gdf_tracts_nyc.drop(columns=['BoroCode', 'BoroName'], errors='ignore')

# calculate tract centroids for point-in-polygon sjoin; the centroid lives in a
# temporary column so the tract polygons in 'geometry' are left untouched
tracts_as_points = tracts_base.assign(
    centroid=tracts_base.geometry.centroid
).set_geometry('centroid')

# spatial join between tract centroids and boroughs
tracts_joined = gpd.sjoin(
    tracts_as_points,
    gdf_boroughs[['BoroCode', 'BoroName', 'geometry']],
    how='left',
    predicate='within'
)

# A left join must not add rows; it would only do so if a centroid fell inside
# more than one borough polygon (i.e. the borough polygons overlap).
assert len(tracts_joined) == len(tracts_base), "spatial join duplicated tracts"

# Make the tract polygons the active geometry again BEFORE dropping the helper
# columns. Dropping 'centroid' while it is still the active geometry leaves the
# GeoDataFrame without a usable active geometry column.
gdf_tracts_nyc = (
    tracts_joined
    .set_geometry('geometry')
    .drop(columns=['index_right', 'centroid'])
)

# NOTE: tracts whose centroid falls outside every borough polygon have a null
# BoroName/BoroCode at this point; completeness is asserted further down, once
# those tracts have been assigned manually.

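One tract (GEOID `36061000100`, Census Tract 1 in New York County) did not join to a borough because its centroid does not fall within any borough polygon. Upon visual inspection in QGIS, this tract belongs to Manhattan, so I'll set its borough manually.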

In [74]:
# find unnasigned tract
# gdf_tracts_nyc[gdf_tracts_nyc.BoroName.isna()]

# set borough manually
gdf_tracts_nyc.loc[gdf_tracts_nyc.GEOID == '36061000100', ['BoroName', 'BoroCode']] = ['Manhattan', 1]

In [75]:
# confirm the manually assigned tract now carries its borough
gdf_tracts_nyc.loc[gdf_tracts_nyc["GEOID"] == '36061000100']

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,GEOID,GEOIDFQ,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry,BoroCode,BoroName
1144,36,61,100,36061000100,1400000US36061000100,1,Census Tract 1,G5020,S,76389,0,40.689971,-74.0461025,"MULTIPOLYGON (((972951.667 194368.81, 972956.3...",1.0,Manhattan


In [76]:
# every tract must have a borough name after the manual fix
assert gdf_tracts_nyc["BoroName"].notna().all()

In [77]:
# preview the processed NYC tracts, including the borough columns
gdf_tracts_nyc.head(n=5)

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,GEOID,GEOIDFQ,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry,BoroCode,BoroName
6,36,85,24402,36085024402,1400000US36085024402,244.02,Census Tract 244.02,G5020,S,1823028,2065530,40.4997874,-74.2384712,"POLYGON ((912287.455 121583.989, 912302.099 12...",5.0,Staten Island
7,36,85,27705,36085027705,1400000US36085027705,277.05,Census Tract 277.05,G5020,S,531529,0,40.5882479,-74.156982,"POLYGON ((938978.947 152923.44, 939459.264 153...",5.0,Staten Island
8,36,85,12806,36085012806,1400000US36085012806,128.06,Census Tract 128.06,G5020,S,1319470,580167,40.557671,-74.1076715,"POLYGON ((950892.186 143465.985, 951155.755 14...",5.0,Staten Island
48,36,47,24400,36047024400,1400000US36047024400,244.0,Census Tract 244,G5020,S,155278,0,40.6217475,-73.9862364,"POLYGON ((987168.126 166387.055, 987335.492 16...",3.0,Brooklyn
61,36,47,23000,36047023000,1400000US36047023000,230.0,Census Tract 230,G5020,S,150941,0,40.637816,-73.9842809,"POLYGON ((987443.171 173158.468, 988054.351 17...",3.0,Brooklyn


### 4. Save Data

In [78]:
# export processed tract data
# Create the output folder first: on a fresh checkout the processed-data
# directory may not exist yet, and the write would fail.
Path(path_output_processed_geodata).parent.mkdir(parents=True, exist_ok=True)

# write the tracts to the "tracts" layer of the output file
gdf_tracts_nyc.to_file(path_output_processed_geodata, layer="tracts")