# Puerto Rico Census Risk Features (Town, ZIP, Coordinates)

This notebook pulls the latest available ACS 5-year Census data for Puerto Rico, builds risk features, and exports model-ready tables for municipios, ZIP Code Tabulation Areas (ZCTAs), and town coordinate points.

Data sources in this notebook are public/open:
- U.S. Census API (ACS 5-year)
- U.S. Census Geocoder API (optional point-to-geography lookup)
- Local PR town coordinate lookup file (`Puerto_RIco_Towns_Coords.xlsx`)


## Run Instructions

1. Run cells top-to-bottom.
2. Optional `.env` values:
   - `CENSUS_API_KEY=<your_key>` (optional; only needed if you exceed the keyless limit of 500 requests per day per IP address)
   - `ACS_YEAR=2024` (optional; if omitted, notebook auto-detects latest available year)
   - `PR_TOWNS_COORDS_FILE=JupyterNotebooks/Puerto_RIco_Towns_Coords.xlsx` (optional override)
   - `ENABLE_CENSUS_GEOCODER=1` (optional; enriches town points with county/tract GEOIDs)
3. Outputs are written to `JupyterNotebooks/outputs/census_pr/`, relative to the current working directory (the folder is created if it does not exist).


In [None]:
# Cell 1: Install and import dependencies
#
# Standard-library imports come first; third-party packages are installed with
# pip into the running interpreter and only then imported, so this cell also
# works in a fresh environment.
import sys
import subprocess
import os
import re
import json
import logging
import unicodedata
from datetime import datetime, UTC
from pathlib import Path

# Install into the same interpreter that runs this notebook (sys.executable).
# check_call raises CalledProcessError if pip exits with a non-zero status.
print("Installing required packages...")
required_packages = ["pandas", "requests", "openpyxl", "python-dotenv", "numpy"]
subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", *required_packages])
print("Installation complete.")

# These imports must stay below the install step above.
import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv

# Outside IPython/Jupyter there is no rich display(); fall back to print.
try:
    from IPython.display import display
except ImportError:
    display = print

# Load .env from the current directory first, then from its parent.
# NOTE(review): python-dotenv presumably keeps already-set variables by default,
# so the current-directory file would take precedence - confirm.
load_dotenv(Path.cwd() / ".env")
load_dotenv(Path.cwd().parent / ".env")

# Show all columns and do not wrap wide frames when previewing tables.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", 120)

# Notebook-wide logger; used for the geocoder warnings later in the notebook.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("census-pr-risk")

print("Cell 1 complete: dependencies installed and imported.")

In [None]:
# Cell 2: Configuration and helper functions

# Geography / dataset identifiers used to build Census API URLs and queries.
PR_STATE_FIPS = "72"
ACS_DATASET = "acs/acs5"

# Default lookup workbook name and the export folder (relative to the current
# working directory). The folder is created eagerly so later cells can write.
DEFAULT_TOWNS_FILE = "Puerto_RIco_Towns_Coords.xlsx"
OUTPUT_DIR = Path("JupyterNotebooks/outputs/census_pr")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Optional settings read from the environment (populated from .env in Cell 1).
CENSUS_API_KEY = os.environ.get("CENSUS_API_KEY")
ACS_YEAR_ENV = os.environ.get("ACS_YEAR", "").strip()
ENABLE_CENSUS_GEOCODER = os.environ.get("ENABLE_CENSUS_GEOCODER", "").lower() in {"1", "true", "yes"}

# Friendly column name -> ACS variable ID. The values are requested from the
# API; the keys become the DataFrame column names after renaming in Cell 3.
ACS_VARIABLES = {
    "population": "B01003_001E",
    "median_income": "B19013_001E",
    "poverty_universe": "B17001_001E",
    "poverty_count": "B17001_002E",
    "housing_units": "B25001_001E",
    "occupied_units": "B25002_002E",
    "vacant_units": "B25002_003E",
    "no_vehicle_owner": "B25044_003E",
    "no_vehicle_renter": "B25044_010E",
}

# Large negative placeholder values that stand for "no usable estimate".
# to_numeric() replaces them with NaN so they never enter ratios or min-max
# scaling. -888888888 ("not applicable / not available" per the Census notes
# on ACS estimate and annotation values) was previously missing from this set.
CENSUS_SENTINEL_MISSING = {
    -999999999,
    -888888888,
    -666666666,
    -555555555,
    -333333333,
    -222222222,
    -111111111,
}


def resolve_file(filename, env_var=None, search_roots=None):
    """Locate a data file via an environment override or a recursive search.

    Args:
        filename: File name (or glob pattern) to look for under each root.
        env_var: Optional name of an environment variable holding an explicit
            path. When the variable is set it is authoritative: the path is
            returned if it exists, otherwise an error is raised without
            searching.
        search_roots: Optional iterable of directories to search recursively.
            Defaults to the current directory, ``./JupyterNotebooks`` and the
            parent of the current directory, in that order.

    Returns:
        A ``Path`` to the first match found.

    Raises:
        FileNotFoundError: If the override path does not exist, or no root
            contains ``filename``.
    """
    if env_var:
        env_value = os.environ.get(env_var)
        if env_value:
            candidate = Path(env_value).expanduser()
            if candidate.exists():
                return candidate
            raise FileNotFoundError(f"{env_var} is set but file was not found: {candidate}")

    roots = search_roots or [Path.cwd(), Path.cwd() / "JupyterNotebooks", Path.cwd().parent]
    for root in roots:
        root = Path(root)  # tolerate plain string roots
        if not root.exists():
            continue
        found = next(root.rglob(filename), None)
        if found:
            return found

    # Only mention the override variable when the caller actually offered one;
    # otherwise the message used to read "Set None or place the file ...".
    hint = f"Set {env_var} or place" if env_var else "Place"
    raise FileNotFoundError(f"Could not find {filename}. {hint} the file under this repo.")


def normalize_text(value):
    """Return a lowercase, accent-free, punctuation-free join key for ``value``.

    Missing values (as judged by ``pd.isna``) become the empty string. Anything
    else is stringified, stripped of combining marks after NFKD decomposition,
    has every character outside ``[a-zA-Z0-9]``/whitespace turned into a space,
    and finally has whitespace runs collapsed to a single space.
    """
    if pd.isna(value):
        return ""

    decomposed = unicodedata.normalize("NFKD", str(value).strip())
    # Dropping combining marks is what turns e.g. an accented letter into its base letter.
    base_chars = [char for char in decomposed if not unicodedata.combining(char)]
    spaced = re.sub(r"[^a-zA-Z0-9\s]", " ", "".join(base_chars))
    collapsed = re.sub(r"\s+", " ", spaced)
    return collapsed.lower().strip()


def parse_zip_codes(value):
    """Extract the distinct 5-digit ZIP codes found in a lookup-table cell.

    Args:
        value: A cell value. Usually free text such as ``"00601, 00602"``, but
            spreadsheets may also hand back a bare number for a single-ZIP cell.

    Returns:
        A sorted list of unique 5-digit strings; empty when the value is
        missing or contains no 5-digit token.
    """
    if pd.isna(value):
        return []

    # A numeric cell has lost its leading zeros (00601 is read as 601 or 601.0),
    # and every Puerto Rico ZIP starts with "00". Restore the padding so the
    # code is not silently dropped. Booleans are ints in Python, so exclude them.
    is_number = isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool)
    if is_number and float(value).is_integer():
        text = f"{int(value):05d}"
    else:
        text = str(value)

    return sorted(set(re.findall(r"\b\d{5}\b", text)))


def to_numeric(df, columns):
    """Coerce ``columns`` of ``df`` to numbers and blank out sentinel values.

    Each listed column is converted with ``pd.to_numeric(errors="coerce")``
    (unparseable entries become NaN) and every value found in
    ``CENSUS_SENTINEL_MISSING`` is replaced by NaN. The frame is modified in
    place and also returned for convenience.
    """
    sentinel_values = list(CENSUS_SENTINEL_MISSING)
    for column_name in columns:
        coerced = pd.to_numeric(df[column_name], errors="coerce")
        df[column_name] = coerced.replace(sentinel_values, np.nan)
    return df


def safe_divide(numerator, denominator):
    """Divide two Series element-wise, yielding NaN instead of infinities.

    Zeros in ``denominator`` are turned into NaN before dividing, and any
    remaining +/-inf in the quotient is replaced by NaN as well.
    """
    nonzero_denominator = denominator.replace({0: np.nan})
    quotient = numerator / nonzero_denominator
    finite_quotient = quotient.replace([np.inf, -np.inf], np.nan)
    return finite_quotient


def minmax_score(series, invert=False):
    """Rescale ``series`` to the 0-1 range using its own min and max.

    Values are first coerced to numeric (unparseable entries become NaN). If
    the series has no usable values, or all usable values are equal, every
    score is NaN. With ``invert=True`` the scale is flipped so the smallest
    input scores 1 and the largest scores 0.
    """
    values = pd.to_numeric(series, errors="coerce")
    low, high = values.min(skipna=True), values.max(skipna=True)

    # No spread to scale against: return an all-NaN score (inverting and
    # clipping NaN would leave it NaN anyway).
    if pd.isna(low) or pd.isna(high) or low == high:
        return pd.Series(np.nan, index=values.index, dtype="float64")

    scaled = (values - low) / (high - low)
    oriented = 1 - scaled if invert else scaled
    return oriented.clip(lower=0, upper=1)


def find_latest_acs5_year(state_fips="72", min_year=2010):
    """Probe the Census API for the newest year that serves this dataset.

    Starting at the current UTC year and walking backwards to ``min_year``
    (inclusive), a small ``NAME`` query for the given state is sent to each
    year's ``ACS_DATASET`` endpoint. The first year answering HTTP 200 wins.
    Request-level failures and non-200 answers just move on to the next year.

    Raises:
        RuntimeError: If no year in the range answers with HTTP 200.
    """
    newest_candidate = datetime.now(UTC).year
    probe_params = {"get": "NAME", "for": f"state:{state_fips}"}

    for candidate_year in reversed(range(min_year, newest_candidate + 1)):
        endpoint = f"https://api.census.gov/data/{candidate_year}/{ACS_DATASET}"
        try:
            status = requests.get(endpoint, params=probe_params, timeout=30).status_code
        except requests.RequestException:
            continue
        if status == 200:
            return candidate_year

    raise RuntimeError("Unable to find an available ACS 5-year dataset year.")


def census_get(year, variables, geography_params, api_key=None):
    """Query the Census API for ``ACS_DATASET`` and return the rows as a DataFrame.

    Args:
        year: Dataset year used in the endpoint URL.
        variables: Iterable of variable names; joined with commas into ``get``.
        geography_params: Dict of geography query parameters (for example
            ``for`` / ``in``) merged into the request.
        api_key: Optional API key; sent as ``key`` only when truthy.

    Returns:
        A DataFrame whose columns come from the first row of the JSON payload
        and whose records are the remaining rows. All values are left exactly
        as the API returned them.

    Raises:
        RuntimeError: On an HTTP error status, on a body that is not valid
            JSON (for example an empty body), or on a payload that is not a
            non-empty list.
    """
    url = f"https://api.census.gov/data/{year}/{ACS_DATASET}"
    params = {"get": ",".join(variables), **geography_params}
    if api_key:
        params["key"] = api_key

    response = requests.get(url, params=params, timeout=90)
    try:
        response.raise_for_status()
    except requests.HTTPError as exc:
        raise RuntimeError(f"Census API error for {url}: {response.text}") from exc

    # A successful status does not guarantee a JSON body; surface that as the
    # same RuntimeError family instead of an opaque decode error.
    try:
        payload = response.json()
    except ValueError as exc:
        raise RuntimeError(
            f"Census API returned a non-JSON response for {url} "
            f"(HTTP {response.status_code}): {response.text[:200]!r}"
        ) from exc

    # The expected shape is a list of rows with the header first.
    if not isinstance(payload, list) or not payload:
        raise RuntimeError(f"Census API returned an unexpected payload for {url}: {payload!r}")

    header, *rows = payload
    return pd.DataFrame(rows, columns=header)


def geocode_coordinates(lat, lon):
    """Look up county and tract identifiers for a point via the Census geocoder.

    ``lon`` is sent as ``x`` and ``lat`` as ``y``. The first entry of the
    ``Counties`` and ``Census Tracts`` layers in the response is used; when a
    layer is absent or empty the corresponding fields are ``None``.

    Returns:
        Dict with ``county_geoid``, ``county_name``, ``tract_geoid`` and
        ``tract_name``.

    Raises:
        requests.HTTPError: If the geocoder answers with an error status.
    """
    endpoint = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
    query = {
        "x": lon,
        "y": lat,
        "benchmark": "Public_AR_Current",
        "vintage": "Current_Current",
        "format": "json",
    }
    reply = requests.get(endpoint, params=query, timeout=30)
    reply.raise_for_status()
    geographies = reply.json().get("result", {}).get("geographies", {})

    def first_match(layer_name):
        # Missing or empty layer -> behave as if it held one empty record.
        matches = geographies.get(layer_name) or [{}]
        return matches[0]

    county = first_match("Counties")
    tract = first_match("Census Tracts")

    return {
        "county_geoid": county.get("GEOID"),
        "county_name": county.get("NAME"),
        "tract_geoid": tract.get("GEOID"),
        "tract_name": tract.get("NAME"),
    }


def choose_acs_year():
    """Return the ACS year to use: the ``ACS_YEAR`` override or the latest available.

    When ``ACS_YEAR_ENV`` is empty the year is auto-detected for Puerto Rico
    through ``find_latest_acs5_year``.

    Raises:
        ValueError: If ``ACS_YEAR_ENV`` is set but is not an integer.
    """
    if not ACS_YEAR_ENV:
        return find_latest_acs5_year(state_fips=PR_STATE_FIPS)

    try:
        override_year = int(ACS_YEAR_ENV)
    except ValueError as exc:
        raise ValueError("ACS_YEAR must be an integer like 2024") from exc
    return override_year


# Resolve the year once; every Census query in the following cells reuses it.
selected_year = choose_acs_year()

# Summarise the effective configuration (the key itself is never printed).
print(
    f"ACS year selected: {selected_year}",
    f"Census API key provided: {'yes' if CENSUS_API_KEY else 'no'}",
    f"Census geocoder enabled: {ENABLE_CENSUS_GEOCODER}",
    "Cell 2 complete: config and helpers ready.",
    sep="\n",
)

In [None]:
# Cell 3: Pull ACS data for PR municipios and ZCTAs
#
# Produces: municipio_df (one row per municipio), zcta_df (all ZCTAs returned
# by the API), pr_towns_df (local lookup with join keys and parsed ZIP lists),
# pr_zip_set (all ZIPs named in the lookup) and zcta_pr_df (ZCTAs in that set).
requested_columns = ["NAME", *ACS_VARIABLES.values()]
rename_map = {v: k for k, v in ACS_VARIABLES.items()}

# Municipio-level data (Census county geography for Puerto Rico)
municipio_df = census_get(
    year=selected_year,
    variables=requested_columns,
    geography_params={"for": "county:*", "in": f"state:{PR_STATE_FIPS}"},
    api_key=CENSUS_API_KEY,
)
municipio_df = municipio_df.rename(columns=rename_map)
municipio_df = to_numeric(municipio_df, list(ACS_VARIABLES.keys()))
# Strip the ", Puerto Rico" and " Municipio" parts of NAME to get the bare name.
municipio_df["municipio"] = (
    municipio_df["NAME"]
    .str.replace(", Puerto Rico", "", regex=False)
    .str.replace(" Municipio", "", regex=False)
)
municipio_df["municipio_key"] = municipio_df["municipio"].map(normalize_text)

# ZCTA data is queried nationally, then filtered to PR ZIP codes from local town table
zcta_df = census_get(
    year=selected_year,
    variables=requested_columns,
    geography_params={"for": "zip code tabulation area:*"},
    api_key=CENSUS_API_KEY,
)
zcta_rename = rename_map.copy()
zcta_rename["zip code tabulation area"] = "zip_code"
zcta_df = zcta_df.rename(columns=zcta_rename)
zcta_df = to_numeric(zcta_df, list(ACS_VARIABLES.keys()))

# Load local town lookup
pr_towns_file = resolve_file(DEFAULT_TOWNS_FILE, env_var="PR_TOWNS_COORDS_FILE")
pr_towns_df = pd.read_excel(pr_towns_file)

# Prefer municipio2 and fall back to municipio row by row. When the file has
# only municipio2, use it as-is: filling with a missing column would hand
# fillna() a None value and raise, even though the error message below says
# either column is sufficient.
if "municipio2" in pr_towns_df.columns:
    if "municipio" in pr_towns_df.columns:
        pr_towns_df["municipio"] = pr_towns_df["municipio2"].fillna(pr_towns_df["municipio"])
    else:
        pr_towns_df["municipio"] = pr_towns_df["municipio2"]
elif "municipio" not in pr_towns_df.columns:
    raise ValueError("Town lookup file must include municipio or municipio2 column.")

# Coordinates that cannot be parsed become NaN rather than failing the load.
for col in ["latitude", "longitude"]:
    if col in pr_towns_df.columns:
        pr_towns_df[col] = pd.to_numeric(pr_towns_df[col], errors="coerce")

if "Zip Codes" not in pr_towns_df.columns:
    raise ValueError("Town lookup file must include 'Zip Codes' column.")

pr_towns_df["municipio_key"] = pr_towns_df["municipio"].map(normalize_text)
pr_towns_df["zip_list"] = pr_towns_df["Zip Codes"].map(parse_zip_codes)
pr_zip_set = {zip_code for zip_list in pr_towns_df["zip_list"] for zip_code in zip_list}

# .copy() so later feature columns are added to an independent frame.
zcta_pr_df = zcta_df[zcta_df["zip_code"].isin(pr_zip_set)].copy()

print(f"Municipios pulled: {len(municipio_df)}")
print(f"US ZCTAs pulled: {len(zcta_df)}")
print(f"PR ZCTAs retained after filter: {len(zcta_pr_df)}")
print(f"Unique PR ZIP codes from lookup: {len(pr_zip_set)}")

print("\nMunicipio sample:")
display(municipio_df.head(3))
print("\nZCTA sample:")
display(zcta_pr_df.head(3))

In [None]:
# Cell 4: Build risk features and baseline risk indexes

def add_risk_features(df):
    """Return a copy of ``df`` with rate, score and baseline risk-index columns.

    Added columns, in order: ``poverty_rate``, ``no_vehicle_rate``,
    ``vacancy_rate``, the five ``score_*`` columns (min-max scaled within this
    frame; the income score is inverted so lower income scores higher),
    ``risk_index_raw`` (weighted sum of the scores, missing scores counted as
    0) and ``risk_index`` (raw index times 100, rounded to one decimal).
    The input frame is not modified.
    """
    out = df.copy()

    households_without_vehicle = out["no_vehicle_owner"] + out["no_vehicle_renter"]
    out["poverty_rate"] = safe_divide(out["poverty_count"], out["poverty_universe"])
    out["no_vehicle_rate"] = safe_divide(households_without_vehicle, out["occupied_units"])
    out["vacancy_rate"] = safe_divide(out["vacant_units"], out["housing_units"])

    # (score column, source column, invert?, weight) - weights sum to 1.0.
    # Adjust weights as your model evolves.
    components = [
        ("score_population", "population", False, 0.30),
        ("score_poverty", "poverty_rate", False, 0.25),
        ("score_income_vulnerability", "median_income", True, 0.20),
        ("score_transport_vulnerability", "no_vehicle_rate", False, 0.15),
        ("score_housing_vulnerability", "vacancy_rate", False, 0.10),
    ]
    for score_column, source_column, inverted, _weight in components:
        out[score_column] = minmax_score(out[source_column], invert=inverted)

    # Weighted baseline index: accumulate the terms left to right.
    weighted_terms = [
        weight * out[score_column].fillna(0)
        for score_column, _source, _inverted, weight in components
    ]
    raw_index = weighted_terms[0]
    for term in weighted_terms[1:]:
        raw_index = raw_index + term

    out["risk_index_raw"] = raw_index
    out["risk_index"] = (out["risk_index_raw"] * 100).round(1)
    return out


# Score municipios and PR ZCTAs independently (each is scaled within its own frame).
municipio_risk_df = add_risk_features(municipio_df)
zcta_risk_df = add_risk_features(zcta_pr_df)


def _show_top_risk(heading, frame, id_column, limit=10):
    """Print ``heading`` and display the ``limit`` highest-risk rows of ``frame``."""
    preview_columns = [
        id_column, "population", "median_income", "poverty_rate", "no_vehicle_rate", "risk_index"
    ]
    print(heading)
    display(frame[preview_columns].sort_values("risk_index", ascending=False).head(limit))


_show_top_risk("Top municipio risk rows:", municipio_risk_df, "municipio")
_show_top_risk("Top ZCTA risk rows:", zcta_risk_df, "zip_code")

In [None]:
# Cell 5: Join municipio + ZIP features to town coordinates (model-ready table)

# One row per (town, ZIP) pair so ZCTA features can be joined on zip_code.
towns_exploded_df = (
    pr_towns_df[["designated_area", "municipio", "municipio_key", "latitude", "longitude", "zip_list"]]
    .explode("zip_list")
    .rename(columns={"zip_list": "zip_code"})
)

zip_feature_cols = [
    "zip_code", "risk_index", "population", "median_income", "poverty_rate", "no_vehicle_rate", "vacancy_rate"
]
# Collapse back to one row per town. dropna=False keeps towns whose
# coordinates (or other grouping keys) are missing.
zip_aggregated_df = (
    towns_exploded_df
    .merge(zcta_risk_df[zip_feature_cols], on="zip_code", how="left")
    .groupby(["designated_area", "municipio", "municipio_key", "latitude", "longitude"], dropna=False)
    .agg(
        zip_count=("zip_code", "nunique"),
        zip_risk_index=("risk_index", "mean"),
        # min_count=1: a town with no ZCTA population data gets NaN, not a
        # misleading population of 0 (the default sum of an all-NaN group).
        zip_population=("population", lambda values: values.sum(min_count=1)),
        zip_median_income=("median_income", "mean"),
        zip_poverty_rate=("poverty_rate", "mean"),
        zip_no_vehicle_rate=("no_vehicle_rate", "mean"),
        zip_vacancy_rate=("vacancy_rate", "mean"),
    )
    .reset_index()
)

municipio_feature_cols = [
    "municipio_key", "risk_index", "population", "median_income", "poverty_rate", "no_vehicle_rate", "vacancy_rate"
]
# Attach municipio-level features under a municipio_ prefix, matched on the
# normalized municipio name; towns without a match keep NaN in those columns.
town_risk_df = zip_aggregated_df.merge(
    municipio_risk_df[municipio_feature_cols].rename(
        columns={
            "risk_index": "municipio_risk_index",
            "population": "municipio_population",
            "median_income": "municipio_median_income",
            "poverty_rate": "municipio_poverty_rate",
            "no_vehicle_rate": "municipio_no_vehicle_rate",
            "vacancy_rate": "municipio_vacancy_rate",
        }
    ),
    on="municipio_key",
    how="left",
)


def blend_risk(municipio_risk, zip_risk):
    """Combine municipio and ZIP risk into one value rounded to one decimal.

    With both inputs present the result is 60% municipio plus 40% ZIP. With
    only one present, that one is used on its own. With neither, NaN.
    """
    has_municipio = pd.notna(municipio_risk)
    has_zip = pd.notna(zip_risk)

    if has_municipio and has_zip:
        blended = 0.6 * municipio_risk + 0.4 * zip_risk
    elif has_municipio:
        blended = municipio_risk
    elif has_zip:
        blended = zip_risk
    else:
        return np.nan
    return round(blended, 1)


# Final per-town index: row-wise blend of the municipio and ZIP indexes.
town_risk_df["risk_index"] = town_risk_df.apply(
    lambda town: blend_risk(town["municipio_risk_index"], town["zip_risk_index"]),
    axis=1,
)

# Optional: enrich points with county/tract GEOIDs for coordinate-based joins
if ENABLE_CENSUS_GEOCODER:
    _GEOCODE_FIELDS = ("county_geoid", "county_name", "tract_geoid", "tract_name")

    def _geocode_town(town):
        """Return geocoder fields for one town row, or all-None when unavailable."""
        if pd.isna(town.latitude) or pd.isna(town.longitude):
            return dict.fromkeys(_GEOCODE_FIELDS)
        try:
            return geocode_coordinates(town.latitude, town.longitude)
        except Exception as exc:
            # Best effort: one failed lookup must not abort the whole table.
            logger.warning("Geocoder failed for %s: %s", town.designated_area, exc)
            return dict.fromkeys(_GEOCODE_FIELDS)

    geocoded_records = [_geocode_town(town) for town in town_risk_df.itertuples(index=False)]
    geocoded_df = pd.DataFrame(geocoded_records)
    # Both frames are positionally indexed here, so a column-wise concat lines
    # each geocode record up with its town row.
    town_risk_df = pd.concat([town_risk_df.reset_index(drop=True), geocoded_df], axis=1)

town_preview_columns = [
    "designated_area", "municipio", "latitude", "longitude", "risk_index",
    "municipio_risk_index", "zip_risk_index", "zip_count"
]
print("Town-level sample:")
display(town_risk_df[town_preview_columns].head(10))

print(f"Town rows generated: {len(town_risk_df)}")

In [None]:
# Cell 6: Export CSV + GeoJSON outputs
municipio_out = OUTPUT_DIR / "municipio_risk_features.csv"
zcta_out = OUTPUT_DIR / "zcta_risk_features.csv"
town_out = OUTPUT_DIR / "town_risk_features.csv"
geojson_out = OUTPUT_DIR / "town_risk_features.geojson"

# Tabular exports, written without the DataFrame index.
municipio_risk_df.to_csv(municipio_out, index=False)
zcta_risk_df.to_csv(zcta_out, index=False)
town_risk_df.to_csv(town_out, index=False)


def _json_safe(value):
    """Map missing values to None and NumPy scalars to plain Python numbers."""
    if pd.isna(value):
        return None
    if isinstance(value, (np.integer, np.floating)):
        return value.item()
    return value


def _town_feature(record, lat, lon):
    """Build one GeoJSON Point feature; coordinates are ordered [lon, lat]."""
    return {
        "type": "Feature",
        "geometry": {"type": "Point", "coordinates": [float(lon), float(lat)]},
        "properties": {key: _json_safe(value) for key, value in record.items()},
    }


# Towns without both coordinates cannot be drawn as points and are left out
# of the GeoJSON (they are still present in the CSV).
geojson_features = []
for record in town_risk_df.to_dict(orient="records"):
    point_lat, point_lon = record.get("latitude"), record.get("longitude")
    if not (pd.isna(point_lat) or pd.isna(point_lon)):
        geojson_features.append(_town_feature(record, point_lat, point_lon))

geojson_payload = {"type": "FeatureCollection", "features": geojson_features}
# default=str stringifies any remaining value json cannot encode natively.
geojson_out.write_text(
    json.dumps(geojson_payload, ensure_ascii=False, indent=2, default=str),
    encoding="utf-8",
)

print("Export complete:")
for exported_path in (municipio_out, zcta_out, town_out, geojson_out):
    print(f"- {exported_path}")

## Next Modeling Steps

- Add hazard intensity features from your USGS/NOAA/USACE notebooks (flood stage, alert density, event frequency).
- Use this notebook output as static/demographic vulnerability features.
- Train and compare candidate models (e.g., linear baseline, random forest, gradient boosting) on a shared event-labeled dataset.
- Keep all secrets in `.env` only; do not hardcode keys in notebooks.
