# 00_fetch_saipe
This notebook creates the **canonical SAIPE state-year panel** used by the rest of the project.

**Output (written to `data/raw/`)**
- `saipe_state_year.parquet`

**Expected columns**
- `state` (USPS two-letter code, includes DC)
- `state_name`
- `state_fips` (2-digit string)
- `year`
- `poverty_rate` (percent)
- `median_income` (USD, nominal)

> If `data/raw/saipe_state_year.parquet` already exists, this notebook will just validate it and stop.


In [None]:
from pathlib import Path
import pandas as pd

# =====================================================
# 00_fetch_saipe.ipynb — Locate repo + define paths
# =====================================================

# Absolute, resolved working directory of the kernel; the repo search below starts here.
START = Path.cwd().resolve()

def find_repo(start: Path) -> Path:
    """Locate the repository root, i.e. the directory that contains ``data/raw``.

    Search order:
      1. ``start`` itself and each of its ancestors. This covers the usual
         case of running from ``<repo>/notebooks`` on any operating system.
      2. Recursively below ``start``.
      3. Recursively below ``<drive>\\projects`` when ``start`` has a drive
         letter (Windows convenience fallback).

    Parameters
    ----------
    start : Path
        Directory to search from (typically the current working directory).

    Returns
    -------
    Path
        The directory that holds the first ``data/raw`` directory found.

    Raises
    ------
    RuntimeError
        If no ``data/raw`` directory can be found.
    """
    origin = start.resolve()

    # Walk upwards first: cheap, OS-independent, and it cannot pick up an
    # unrelated project's data/raw from a sibling directory.
    for candidate in (origin, *origin.parents):
        if (candidate / "data" / "raw").is_dir():
            return candidate

    # Fall back to searching downwards.
    roots = [origin]
    if origin.drive:
        roots.append(Path(origin.drive + "\\projects"))

    seen = set()
    for root in roots:
        root = root.resolve()
        if root in seen or not root.exists():
            continue
        seen.add(root)

        for raw_dir in root.rglob("data/raw"):
            # Only a real directory counts; a stray file named "raw" does not.
            if raw_dir.is_dir():
                return raw_dir.parent.parent  # .../data/raw → repo root

    raise RuntimeError(f"Could not find repo root from start={start}")

# Resolve the project layout once; later cells read REPO / RAW / OUT_PATH.
REPO = find_repo(START)
RAW = REPO.joinpath("data", "raw")
RAW.mkdir(parents=True, exist_ok=True)  # no-op when the directory is already present

# Canonical artifact that this notebook produces (or validates).
OUT_PATH = RAW.joinpath("saipe_state_year.parquet")

# Echo the resolved locations so a wrong repo guess is visible immediately.
print("CWD:", START)
print("Repo:", REPO)
print("Raw dir:", RAW)
print("Raw files:", list(map(lambda entry: entry.name, RAW.glob("*"))))
print("SAIPE output path:", OUT_PATH)

CWD: C:\projects\python-policy-project\notebooks
Repo: C:\projects\python-policy-project
Raw dir: C:\projects\python-policy-project\data\raw
Raw files: ['la.area.txt', 'la.series.txt', 'laus_allstates_u.txt', 'saipe_state_year.parquet']


In [None]:
import requests
from time import sleep

# -----------------------------
# If output already exists, validate + stop
# -----------------------------
# The file is only accepted if it has the documented columns and no duplicate
# state-year rows (the same rule enforced before a fresh panel is saved).
# A bad file raises ValueError; a good file raises SystemExit to skip the fetch.
if OUT_PATH.exists():
    saipe_existing = pd.read_parquet(OUT_PATH)
    print("✅ Found existing:", OUT_PATH)
    display(saipe_existing.head())
    print("Shape:", saipe_existing.shape)

    expected_cols = ["state", "state_name", "state_fips", "year", "poverty_rate", "median_income"]
    missing_cols = [col for col in expected_cols if col not in saipe_existing.columns]
    if missing_cols:
        raise ValueError(
            f"{OUT_PATH} is missing expected columns {missing_cols}; "
            "delete the file and re-run this notebook to rebuild it."
        )

    print("Years:", int(saipe_existing["year"].min()), "to", int(saipe_existing["year"].max()), "| n_years:", saipe_existing["year"].nunique())

    n_dup_existing = int(saipe_existing.duplicated(["state", "year"]).sum())
    print("Duplicate state-year rows:", n_dup_existing)
    if n_dup_existing:
        raise ValueError(
            f"{OUT_PATH} contains {n_dup_existing} duplicate state-year rows; "
            "delete the file and re-run this notebook to rebuild it."
        )

    raise SystemExit("SAIPE parquet already exists — skipping fetch.")

# -----------------------------
# SAIPE time-series availability (per the note this cell was written from):
#   1989, 1993, then every year from 1995.
# The last year requested (2023) is fixed by the range's exclusive upper bound.
# -----------------------------
SAIPE_YEARS = [1989, 1993, *range(1995, 2024)]

# Census timeseries endpoint queried once per year.
BASE_URL = "https://api.census.gov/data/timeseries/poverty/saipe"

# Variables requested from the endpoint:
#   NAME           -> state name
#   STABREV        -> USPS state code
#   SAEPOVRTALL_PT -> poverty rate (%), all ages
#   SAEMHI_PT      -> median household income
VARS = "NAME,STABREV,SAEPOVRTALL_PT,SAEMHI_PT"

SAIPE output path: C:\projects\python-policy-project\data\raw\saipe_state_year.parquet


In [None]:
# Fetch one state-level table per SAIPE year.
# NOTE: `rows` keeps its name because the cleaning cell concatenates it; each
# element is a per-year DataFrame, not a single row.
rows = []
failed_years = []  # years for which neither query style returned usable data

# A Session reuses the HTTPS connection across the per-year requests.
with requests.Session() as session:
    for year in SAIPE_YEARS:
        data = None
        attempts = []  # human-readable outcome of each failed attempt

        # Primary attempt: time=YYYY (standard for Census timeseries endpoints).
        # Fallback attempt: YEAR=YYYY, used if the time= query does not succeed.
        for year_param in ("time", "YEAR"):
            params = {
                "get": VARS,
                "for": "state:*",
                year_param: str(year),
            }

            try:
                r = session.get(BASE_URL, params=params, timeout=60)
            except requests.RequestException as exc:
                # Timeouts / connection errors are treated like a bad status:
                # record it and let the fallback (or the next year) proceed.
                attempts.append(f"{type(exc).__name__} ({year_param}=)")
                continue

            if r.status_code != 200:
                attempts.append(f"{r.status_code} ({year_param}=)")
                continue

            try:
                payload = r.json()
            except ValueError:
                # Response.json() raises a ValueError subclass on a non-JSON body.
                attempts.append(f"invalid JSON ({year_param}=)")
                continue

            # The code below expects a header row followed by data rows.
            if not isinstance(payload, list) or len(payload) < 2:
                attempts.append(f"no data rows ({year_param}=)")
                continue

            data = payload
            break

        sleep(0.15)  # be nice to Census API

        if data is None:
            print(f"⚠️  YEAR {year} failed: {' and '.join(attempts)}")
            failed_years.append(year)
            continue

        header, body = data[0], data[1:]
        df_y = pd.DataFrame(body, columns=header)

        # normalize year column name
        if "time" in df_y.columns:
            df_y["year"] = df_y["time"].astype(str).str.slice(0, 4).astype(int)
        else:
            df_y["year"] = int(year)

        rows.append(df_y)

print(f"Fetched {len(rows)} years")
if failed_years:
    # Surface gaps explicitly so an incomplete panel is not mistaken for a full one.
    print(f"⚠️  {len(failed_years)} year(s) could not be fetched: {failed_years}")

✅ Found existing: C:\projects\python-policy-project\data\raw\saipe_state_year.parquet
Shape: (1581, 6)
Columns: ['state', 'state_name', 'state_fips', 'year', 'poverty_rate', 'median_income']


Unnamed: 0,state,state_name,state_fips,year,poverty_rate,median_income
0,AL,Alabama,1,1989,17.7,22202
1,AK,Alaska,2,1989,10.6,33885
2,AZ,Arizona,4,1989,14.7,28924
3,AR,Arkansas,5,1989,17.9,20729
4,CA,California,6,1989,12.7,33474


SystemExit: All good — nothing to fetch.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Build from downloaded Census SAIPE Excel files (recommended, reproducible)
This project intentionally keeps data acquisition simple and auditable:

1. Download the **SAIPE state estimates** Excel files for each year you want (e.g., 1989, 1993, 1995–2023).
2. Place them in:
   - `data/raw/saipe_xls/`

Expected: one workbook per year. Filenames can be anything as long as the year appears somewhere in the name (e.g., `saipe_state_2007.xls`).

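Then run the next cells to parse and write `saipe_state_year.parquet`.

> **Note:** as written, the cells that follow do not read the workbooks in `data/raw/saipe_xls/`. They clean and validate the per-year tables fetched from the Census API in the cells above (the `rows` list) and write `saipe_state_year.parquet` from those. The Excel route described here needs its own parsing code.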

> If the Census changes file formats, you may need small tweaks to the header-detection logic below.


In [None]:
# Combine the per-year tables and standardise them to the canonical schema:
#   state, state_name, state_fips, year, poverty_rate, median_income
if len(rows) == 0:
    raise RuntimeError("No SAIPE years were fetched. Check your internet connection / API availability.")

saipe = pd.concat(rows, ignore_index=True)

print("Raw columns:", saipe.columns.tolist())
print("Raw shape:", saipe.shape)

# Built as one chain from `saipe`, so re-running this cell is idempotent and
# the raw concatenated frame stays available for inspection.
saipe_clean = (
    saipe
    .rename(columns={
        # Census "state" is the FIPS code; move it aside so the USPS code
        # can take the name "state" without a collision.
        "state": "state_fips",
        "STABREV": "state",
        "NAME": "state_name",
        "SAEPOVRTALL_PT": "poverty_rate",
        "SAEMHI_PT": "median_income",
    })
    .loc[:, ["state", "state_name", "state_fips", "year", "poverty_rate", "median_income"]]
    .assign(
        # Enforce the documented "2-digit string" contract for FIPS codes.
        state_fips=lambda d: d["state_fips"].astype(str).str.zfill(2),
        # Types (use plain int for year to avoid patsy/statsmodels dtype issues later)
        year=lambda d: pd.to_numeric(d["year"], errors="coerce").astype(int),
        # errors="coerce" turns non-numeric values into NaN.
        poverty_rate=lambda d: pd.to_numeric(d["poverty_rate"], errors="coerce"),
        median_income=lambda d: pd.to_numeric(d["median_income"], errors="coerce"),
    )
    # Drop Puerto Rico (keep 50 states + DC)
    .loc[lambda d: d["state"] != "PR"]
    .copy()
)

print("Clean shape:", saipe_clean.shape)
display(saipe_clean.head())

In [None]:
# Validate the cleaned panel, then persist it as the canonical raw artifact.
# Nothing is written unless every check below passes.

# No duplicate state-years
dup = int(saipe_clean.duplicated(["state", "year"]).sum())
print("Duplicate state-year rows:", dup)
assert dup == 0, f"Found {dup} duplicate state-year rows"

# Year coverage: every requested year must be present. Years skipped during
# the fetch would otherwise go unnoticed, because the per-year state count
# below only looks at years that made it into the frame.
missing_years = sorted(set(SAIPE_YEARS) - set(saipe_clean["year"].tolist()))
assert not missing_years, f"Missing SAIPE years (fetch failed or was skipped): {missing_years}"

# Coverage check (50 states + DC = 51) — must hold for every year, not just the minimum
counts = saipe_clean.groupby("year")["state"].nunique()
print("States per year (tail):")
print(counts.tail())

min_states = int(counts.min())
bad_years = counts[counts != 51]
assert bad_years.empty, (
    f"Expected 51 states per year (50 + DC). Got min={min_states}; "
    f"offending years: {bad_years.to_dict()}"
)

# Numeric coercion turns unparseable values into NaN; report them rather than
# letting them slip into the saved file unnoticed.
null_counts = saipe_clean[["poverty_rate", "median_income"]].isna().sum()
if null_counts.any():
    print("⚠️  Missing values after numeric coercion:", null_counts.to_dict())

print(
    "Years:",
    int(saipe_clean["year"].min()),
    "to",
    int(saipe_clean["year"].max()),
    "| n_years:",
    int(saipe_clean["year"].nunique()),
)

# Save canonical raw artifact
saipe_clean.to_parquet(OUT_PATH, index=False)
print("✅ Saved:", OUT_PATH)

saipe_clean.head()