# 02 Ingest Earthquake Hazard Feeds (USGS)

Stage: `01_ingest_hazard`
Discipline: earthquake hazard data generation.

Outputs:
- `JupyterNotebooks/outputs/index_pipeline/01_ingest/usgs_earthquake_events.csv`


In [None]:
# Cell 1: Setup
import importlib.util
import subprocess
import sys
import logging
import os
from datetime import datetime, timedelta, timezone
from pathlib import Path


def ensure_packages(packages):
    """Install any of *packages* that cannot currently be imported.

    Each name is checked with ``importlib.util.find_spec`` and is also passed
    unchanged to ``pip install``, so it must be valid both as an import name
    and as a distribution name.

    Args:
        packages: Iterable of top-level module names to check.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    missing = [p for p in packages if importlib.util.find_spec(p) is None]
    if not missing:
        return

    subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", *missing])
    # The install happened while this interpreter was already running; refresh
    # the finders' caches so the new modules are visible to the next import.
    importlib.invalidate_caches()


# Install pandas and requests first if they are not importable, so the imports
# that follow do not fail on a fresh environment.
ensure_packages(["pandas", "requests"])

import pandas as pd
import requests

# INFO-level logging with "timestamp | level | message" lines, plus a named
# logger for this notebook.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("index-pipeline-stage01-eq")


def find_repo_root():
    """Locate the repository root by searching upward from the working directory.

    Returns:
        The nearest directory (the resolved current working directory or one
        of its ancestors, closest first) that contains a ``JupyterNotebooks``
        entry. If none does, the resolved current working directory itself.
    """
    start = Path.cwd().resolve()
    candidates = (start, *start.parents)
    # Fall back to the starting directory when no ancestor holds the marker.
    return next(
        (folder for folder in candidates if (folder / "JupyterNotebooks").exists()),
        start,
    )


# Resolve the stage's output directory under the repository root and make sure
# it exists before anything is written to it.
REPO_ROOT = find_repo_root()
OUTPUT_DIR = REPO_ROOT.joinpath("JupyterNotebooks", "outputs", "index_pipeline", "01_ingest")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# When IPython is not installed, plain print stands in for display.
try:
    from IPython.display import display
except ImportError:
    display = print


In [None]:
# Cell 2: Configuration
# Query window length (days) and magnitude floor; both can be overridden
# through environment variables.
LOOKBACK_DAYS = int(os.getenv("EQ_LOOKBACK_DAYS", "30"))
MIN_MAG = float(os.getenv("EQ_MIN_MAG", "1.0"))

# Geographic bounding box expressed as min/max latitude and longitude.
PR_EXTENT = dict(
    minlatitude=17.4,
    maxlatitude=18.9,
    minlongitude=-68.6,
    maxlongitude=-65.0,
)

USGS_EQ_URL = "https://earthquake.usgs.gov/fdsnws/event/1/query"

# The window ends at the moment of this run (timezone-aware UTC) and starts
# LOOKBACK_DAYS earlier.
RUN_UTC = datetime.now(tz=timezone.utc)
START_UTC = RUN_UTC - timedelta(days=LOOKBACK_DAYS)

print(f"Lookback days: {LOOKBACK_DAYS} | Min magnitude: {MIN_MAG}")


In [None]:
# Cell 3: Fetch and export
# Query the USGS event service for the configured window, magnitude floor and
# bounding box, flatten the GeoJSON features into rows, and write them to CSV.
params = {
    "format": "geojson",
    # Date-only start: the query begins at 00:00 UTC of the first day.
    "starttime": START_UTC.strftime("%Y-%m-%d"),
    # A date-only end time is interpreted as 00:00:00 UTC of that day, which
    # would exclude every event from the run day itself. Send the full
    # timestamp so the window really ends at RUN_UTC.
    "endtime": RUN_UTC.strftime("%Y-%m-%dT%H:%M:%S"),
    "minmagnitude": MIN_MAG,
    **PR_EXTENT,
    "orderby": "time",
}

resp = requests.get(USGS_EQ_URL, params=params, timeout=120)
resp.raise_for_status()
payload = resp.json()

# Explicit column list: fixes the CSV column order and keeps the frame (and
# the sort below) valid when the query returns no events at all.
EQ_COLUMNS = [
    "event_id",
    "time_utc",
    "magnitude",
    "depth_km",
    "longitude",
    "latitude",
    "place",
    "status",
    "alert_level",
    "tsunami",
    "updated_utc",
    "run_utc",
]

rows = []
for feat in payload.get("features") or []:
    # `or {}` also covers keys that are present but explicitly null.
    props = feat.get("properties") or {}
    geom = feat.get("geometry") or {}
    # Coordinates are read as [longitude, latitude, depth]; pad with None so a
    # missing or short list yields empty cells instead of an IndexError.
    coords = list(geom.get("coordinates") or [])
    lon, lat, depth = (coords + [None, None, None])[:3]

    rows.append({
        "event_id": feat.get("id"),
        # Timestamps are parsed as epoch milliseconds; unparseable or missing
        # values become NaT rather than raising.
        "time_utc": pd.to_datetime(props.get("time"), unit="ms", utc=True, errors="coerce"),
        "magnitude": props.get("mag"),
        "depth_km": depth,
        "longitude": lon,
        "latitude": lat,
        "place": props.get("place"),
        "status": props.get("status"),
        "alert_level": props.get("alert"),
        "tsunami": props.get("tsunami"),
        "updated_utc": pd.to_datetime(props.get("updated"), unit="ms", utc=True, errors="coerce"),
        "run_utc": RUN_UTC,
    })

# Newest events first.
eq_df = (
    pd.DataFrame(rows, columns=EQ_COLUMNS)
    .sort_values("time_utc", ascending=False)
    .reset_index(drop=True)
)

eq_out = OUTPUT_DIR / "usgs_earthquake_events.csv"
eq_df.to_csv(eq_out, index=False)

print(f"Earthquake rows: {len(eq_df)}")
print(f"Output: {eq_out}")
display(eq_df.head(10))
