### ZONAL STATISTICS AND MERGE ALL DATA FROM NOTEBOOKS 1 THRU 4

In [1]:
# Modules.
import pandas as pd
import geopandas as gpd
import numpy as np
from pathlib import Path
from rasterstats import zonal_stats
import rasterio

In [22]:
# Paths.
tracts_path = Path("data/nyc_tracts_2020/nyc_tracts_2020.shp")

ndvi_path = Path("data/raster/processed/ndvi_summer_2025.tif")
ndwi_path = Path("data/raster/processed/ndwi_summer_2025.tif")
ndbi_path = Path("data/raster/processed/ndbi_summer_2025.tif")

tree_canopy_path = Path("data/raster/nlcd_raster/nlcd_tree_canopy_2023.tiff")
impervious_path = Path("data/raster/nlcd_raster/nyc_ncld_impervious_2024.tiff")
landcover_path = Path("data/raster/nlcd_raster/nyc_ncld_land_cover_2024.tiff")

acs_path = Path("data/acs/acs_socioeconomic_tract_2022.csv")
jfk_heat_path = Path("data/heat/jfk_daily_heat_2025.csv")
calls_path = Path("data/nyc_311/panel/nyc_311_tract_day_2025.csv")

output_path = Path("data/model/nyc_tract_summer_2025_final.csv")

tracts = gpd.read_file(tracts_path)
tracts = tracts.to_crs(4326)
tracts.head()

Unnamed: 0,ctlabel,borocode,boroname,ct2020,boroct2020,cdeligibil,ntaname,nta2020,cdta2020,cdtaname,geoid,shape_leng,shape_area,geometry
0,1.0,1,Manhattan,100,1000100,I,The Battery-Governors Island-Ellis Island-Libe...,MN0191,MN01,MN01 Financial District-Tribeca (CD 1 Equivalent),36061000100,10833.043929,1843005.0,"MULTIPOLYGON (((-74.04388 40.69019, -74.04351 ..."
1,14.01,1,Manhattan,1401,1001401,I,Lower East Side,MN0302,MN03,MN03 Lower East Side-Chinatown (CD 3 Equivalent),36061001401,5075.332,1006117.0,"POLYGON ((-73.98837 40.71645, -73.98754 40.716..."
2,14.02,1,Manhattan,1402,1001402,E,Lower East Side,MN0302,MN03,MN03 Lower East Side-Chinatown (CD 3 Equivalent),36061001402,4459.156019,1226206.0,"POLYGON ((-73.98507 40.71908, -73.98423 40.718..."
3,18.0,1,Manhattan,1800,1001800,I,Lower East Side,MN0302,MN03,MN03 Lower East Side-Chinatown (CD 3 Equivalent),36061001800,6391.921174,2399277.0,"POLYGON ((-73.98985 40.72052, -73.98972 40.720..."
4,22.01,1,Manhattan,2201,1002201,E,Lower East Side,MN0302,MN03,MN03 Lower East Side-Chinatown (CD 3 Equivalent),36061002201,5779.062607,1740174.0,"POLYGON ((-73.97875 40.71993, -73.97879 40.719..."


In [3]:
# Zonal statistics helper function.
def zonal_mean(file_path, target_gdf):
    """
    Computes mean zonal statistics for a raster over polygons.
    Automatically reprojects polygons to raster CRS.
    """    
    with rasterio.open(file_path) as src:
        raster_crs = src.crs

    gdf_proj = target_gdf.to_crs(raster_crs)

    zonal_statistic = zonal_stats(
        gdf_proj,
        file_path,
        stats = ["mean"],
        nodata = np.nan,
        geojson_out = False
    )
    
    return [x["mean"] for x in zonal_statistic]

In [4]:
# NLCD calculations with print checks, because I learned my lesson.
print("Computing NLCD zonal stats.")

tracts["tree_canopy_pct"] = zonal_mean(tree_canopy_path, tracts)
tracts["impervious_pct"] = zonal_mean(impervious_path, tracts)
tracts["landcover_mean"] = zonal_mean(landcover_path, tracts)

print("NLCD zonal stats complete.")

Computing NLCD zonal stats.
NLCD zonal stats complete.


In [5]:
# Landsat calculations with print checks, because I learned my lesson.
print("Computing Landsat NDVI zonal stats.")
tracts["ndvi_mean"] = zonal_mean(ndvi_path, tracts)

print("Computing Landsat NDWI zonal stats.")
tracts["ndwi_mean"] = zonal_mean(ndwi_path, tracts)

print("Computing Landsat NDBI zonal stats.")
tracts["ndbi_mean"] = zonal_mean(ndbi_path, tracts)

print("Finished Landsat zonal statistics.")

Computing Landsat NDVI zonal stats.
Computing Landsat NDWI zonal stats.
Computing Landsat NDBI zonal stats.
Finished Landsat zonal statistics.


In [13]:
# ACS.
acs = pd.read_csv(acs_path, dtype = {"GEOID":"string"})
acs.head()

Unnamed: 0,NAME,total_pop,median_income,poverty_all,poverty_count,no_vehicle_hh,edu_bachelors,edu_masters,edu_professional,edu_doctorate,...,state,county,tract,GEOID,poverty_rate,poverty_rate_c,edu_bachelors_plus,pct_bachelors_plus,pct_renters,pct_limited_english
0,Census Tract 1; Bronx County; New York,4446,-666666666,0,0,0,34,7,25,9,...,36,5,100,36005000100,0.257322,,75,0.019405,0.856193,0.001799
1,Census Tract 2; Bronx County; New York,4870,115064,4870,688,73,489,619,16,36,...,36,5,200,36005000200,0.141273,-0.024086,1160,0.323751,0.397895,0.019587
2,Census Tract 4; Bronx County; New York,6257,100553,6257,378,119,995,338,206,0,...,36,5,400,36005000400,0.060412,-0.104947,1539,0.337057,0.389779,0.006332
3,Census Tract 16; Bronx County; New York,6177,41362,5961,893,7,682,123,0,0,...,36,5,1600,36005001600,0.149807,-0.015552,805,0.194351,0.794104,0.015487
4,Census Tract 19.01; Bronx County; New York,2181,49500,2178,623,0,283,133,22,9,...,36,5,1901,36005001901,0.286042,0.120683,447,0.304911,1.0,0.0


In [18]:
# Heat.
heat = pd.read_csv(jfk_heat_path, parse_dates = ["DATE"])
heat.head()

Unnamed: 0,STATION,STATION.1,LATITUDE,LONGITUDE,ELEVATION,DATE,TEMP_MAX_F,TEMP_MIN_F,PRCP_IN,TEMP_MEAN_F,EXTREME_HEAT,HEAT_THRESHOLD,NAME
0,74486094789,JFK,40.63915,-73.7639,2.7,2025-06-01,73.0,52.0,0.42,60.3,0.0,94.37,JFK
1,74486094789,JFK,40.63915,-73.7639,2.7,2025-06-02,72.0,52.0,0.0,62.7,0.0,94.37,JFK
2,74486094789,JFK,40.63915,-73.7639,2.7,2025-06-03,75.9,52.0,0.0,65.6,0.0,94.37,JFK
3,74486094789,JFK,40.63915,-73.7639,2.7,2025-06-04,75.9,59.0,0.0,66.9,0.0,94.37,JFK
4,74486094789,JFK,40.63915,-73.7639,2.7,2025-06-05,81.0,64.0,0.0,71.8,0.0,94.37,JFK


In [23]:
# 311.
calls = pd.read_csv(calls_path, dtype = {"GEOID":"string"}, parse_dates = ["date"])
calls.head()

Unnamed: 0,GEOID,date,total_calls,qol_calls,qol_pct
0,36005000100,2025-06-30,1,1,1.0
1,36005000100,2025-07-23,1,0,0.0
2,36005000100,2025-08-04,1,1,1.0
3,36005000100,2025-08-24,1,1,1.0
4,36005000200,2025-06-01,8,6,0.75


In [41]:
# Merge ACS.
tracts_df = tracts.drop(columns = "geometry").copy()

# Column alignment.
tracts_df = tracts_df.rename(columns = {"geoid": "GEOID"})
heat = heat.rename(columns = {"date": "DATE"})
calls = calls.rename(columns = {"date": "DATE"})

merged = tracts_df.merge(acs, on = "GEOID", how = "left")
print("After ACS merge:", merged.shape)

# Add constant merge key.
merged["key"] = 1
heat["key"] = 1

# Merge heat.
tract_day = merged.merge(heat, on = "key").drop(columns = ["key"])
print("Tract by day:", tract_day.shape)

final = tract_day.merge(
    calls,
    on = ["GEOID", "DATE"],
    how = "left"
)

print("After merging 311:", final.shape)

After ACS merge: (2325, 38)
Tract by day: (204600, 51)
After merging 311: (204600, 54)


In [42]:
# Take care of NA values.
final["total_calls"] = final["total_calls"].fillna(0)
final["qol_calls"] = final["qol_calls"].fillna(0)

final["log_total_calls"] = np.log(final["total_calls"].replace(0, 1))

In [43]:
# Final data.
final.to_csv(output_path, index = False)

print(f"Final data for modeling: {output_path}")

Final data for modeling: data\model\nyc_tract_summer_2025_final.csv


In [52]:
# Write final data for high heat and normal heat days.
high_heat = final[final["EXTREME_HEAT"] == 1]
normal_heat = final[final["EXTREME_HEAT"] == 0]

high_heat.to_csv("data/model/model_high_heat.csv", index = False)
normal_heat.to_csv("data/model/model_normal_heat.csv", index = False)

print("Model data:")
print(" - model_high_heat.csv")
print(" - model_normal_heat.csv")

Model data:
 - model_high_heat.csv
 - model_normal_heat.csv
