# 02 Prepare CDC Places

**Project:** NORI  
**Author:** Yuseof J  
**Date:** 10/12/25  

### **Purpose**
Load the raw CDC PLACES CSV, filter it to NYC census tracts, and write the result as a parquet file.

### **Inputs**
- `data/raw/cdc_places.csv`
- `data/processed/nyc_tracts.gpkg` (layer `tracts`)

### **Outputs**
- `data/processed/cdc_places_nyc.parquet`
  
--------------------------------------------------------------------------

### 0. Imports and Setup

In [1]:
# package imports
import os
import pandas as pd
import geopandas as gpd
from pathlib import Path

# specify filepaths (all relative to the project root)
path_cdc_places = 'data/raw/cdc_places.csv'
path_nyc_tracts = 'data/processed/nyc_tracts.gpkg'
path_output_processed_data = 'data/processed/cdc_places_nyc.parquet'

# ensure cwd is project root for file paths to function properly.
# The project root is the nearest directory (the cwd itself or one of its
# ancestors) that contains a "data" folder. The search is bounded: the
# filesystem root is its own parent, so an open-ended "keep moving up" loop
# would never finish when no "data" folder exists anywhere above the cwd.
start_dir = Path(os.getcwd())
project_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / "data").exists()
    ),
    None,
)
if project_root is None:
    raise FileNotFoundError(
        f"Could not find a 'data' directory in {start_dir} or any of its parents"
    )
os.chdir(project_root)  # switch to the project root

### 1. Load Data

In [37]:
# nyc census tracts: the "tracts" layer of the processed geopackage
gdf_tracts_nyc = gpd.read_file(
    path_nyc_tracts,
    layer="tracts",
)

# raw CDC PLACES table, read straight from the csv
df_cdc = pd.read_csv(
    path_cdc_places,
)

### 2. EDA 

In [10]:
# list every column name in the CDC PLACES table
list(df_cdc.columns)

['StateAbbr',
 'StateDesc',
 'CountyName',
 'CountyFIPS',
 'TractFIPS',
 'TotalPopulation',
 'TotalPop18plus',
 'ACCESS2_CrudePrev',
 'ACCESS2_Crude95CI',
 'ARTHRITIS_CrudePrev',
 'ARTHRITIS_Crude95CI',
 'BINGE_CrudePrev',
 'BINGE_Crude95CI',
 'BPHIGH_CrudePrev',
 'BPHIGH_Crude95CI',
 'BPMED_CrudePrev',
 'BPMED_Crude95CI',
 'CANCER_CrudePrev',
 'CANCER_Crude95CI',
 'CASTHMA_CrudePrev',
 'CASTHMA_Crude95CI',
 'CHD_CrudePrev',
 'CHD_Crude95CI',
 'CHECKUP_CrudePrev',
 'CHECKUP_Crude95CI',
 'CHOLSCREEN_CrudePrev',
 'CHOLSCREEN_Crude95CI',
 'COLON_SCREEN_CrudePrev',
 'COLON_SCREEN_Crude95CI',
 'COPD_CrudePrev',
 'COPD_Crude95CI',
 'CSMOKING_CrudePrev',
 'CSMOKING_Crude95CI',
 'DENTAL_CrudePrev',
 'DENTAL_Crude95CI',
 'DEPRESSION_CrudePrev',
 'DEPRESSION_Crude95CI',
 'DIABETES_CrudePrev',
 'DIABETES_Crude95CI',
 'GHLTH_CrudePrev',
 'GHLTH_Crude95CI',
 'HIGHCHOL_CrudePrev',
 'HIGHCHOL_Crude95CI',
 'LPA_CrudePrev',
 'LPA_Crude95CI',
 'MAMMOUSE_CrudePrev',
 'MAMMOUSE_Crude95CI',
 'MHLTH_CrudePr

In [44]:
# preview the first five rows of the CDC PLACES table
df_cdc.head(5)

Unnamed: 0,StateAbbr,StateDesc,CountyName,CountyFIPS,TractFIPS,TotalPopulation,TotalPop18plus,ACCESS2_CrudePrev,ACCESS2_Crude95CI,ARTHRITIS_CrudePrev,...,FOODINSECU_Crude95CI,HOUSINSECU_CrudePrev,HOUSINSECU_Crude95CI,SHUTUTILITY_CrudePrev,SHUTUTILITY_Crude95CI,LACKTRPT_CrudePrev,LACKTRPT_Crude95CI,EMOTIONSPT_CrudePrev,EMOTIONSPT_Crude95CI,Geolocation
0,AL,Alabama,Autauga,1001,1001020100,1775,1370,9.7,"( 7.4, 12.3)",30.6,...,"(13.8, 20.9)",12.3,"(10.2, 14.7)",8.1,"( 6.9, 9.6)",8.9,"( 7.5, 10.4)",23.5,"(19.9, 27.4)",POINT (-86.4915648 32.4819731)
1,AL,Alabama,Autauga,1001,1001020200,2055,1584,10.6,"( 8.5, 13.0)",29.5,...,"(20.9, 29.1)",18.5,"(15.6, 21.6)",12.5,"(10.7, 14.4)",12.0,"(10.3, 13.8)",29.1,"(25.2, 33.6)",POINT (-86.4724678 32.475758)
2,AL,Alabama,Autauga,1001,1001020300,3216,2485,10.6,"( 8.4, 13.1)",31.5,...,"(15.9, 22.6)",14.1,"(11.8, 16.3)",9.0,"( 7.7, 10.4)",9.7,"( 8.3, 11.1)",25.9,"(22.2, 30.0)",POINT (-86.4597033 32.4740243)
3,AL,Alabama,Autauga,1001,1001020400,4246,3344,7.9,"( 6.1, 10.1)",32.5,...,"(10.0, 15.7)",9.3,"( 7.7, 11.4)",6.1,"( 5.2, 7.4)",6.8,"( 5.8, 8.1)",21.5,"(18.2, 25.5)",POINT (-86.4448353 32.4710304)
4,AL,Alabama,Autauga,1001,1001020501,4322,3369,6.2,"( 4.7, 7.8)",27.7,...,"( 7.6, 11.9)",8.1,"( 6.6, 9.8)",5.1,"( 4.2, 6.0)",5.5,"( 4.6, 6.6)",21.1,"(17.6, 24.8)",POINT (-86.4225578 32.4478607)


### 3. Data Filtering / Cleaning

In [47]:
# ensure matching dtypes for filtering. int64 is requested explicitly because
# 11-digit tract FIPS codes do not fit in a 32-bit integer.
nyc_tracts_fips = gdf_tracts_nyc["GEOID"].astype("int64")
cdc_tract_fips = df_cdc["TractFIPS"].astype("int64")

# filter cdc places for nyc tracts. df_cdc itself is left untouched so this
# cell can be re-run safely; the integer tract ids are attached to the new frame.
is_nyc_tract = cdc_tract_fips.isin(nyc_tracts_fips)
df_cdc_nyc = df_cdc.loc[is_nyc_tract].assign(TractFIPS=cdc_tract_fips[is_nyc_tract])

# report number of matched tracts (percentage is truncated, not rounded)
n_matched_tracts = df_cdc_nyc["TractFIPS"].nunique()
n_nyc_tracts = gdf_tracts_nyc["GEOID"].nunique()
percent_matched = int((n_matched_tracts / n_nyc_tracts) * 100)
print(f"Found places data for {n_matched_tracts} / {n_nyc_tracts} ({percent_matched}%) of nyc tracts")

Found places data for 2231 / 2327 (95%) of nyc tracts


### 4. Save Data

In [None]:
# write the NYC subset to the output path declared in the setup cell
# (data/processed/cdc_places_nyc.parquet), creating the folder if needed
output_path = Path(path_output_processed_data)
output_path.parent.mkdir(parents=True, exist_ok=True)
df_cdc_nyc.to_parquet(output_path)