In [5]:
import json
import re
from collections import defaultdict

import requests
import pandas as pd
import geopandas as gpd
import networkx as nx
from shapely.geometry import LineString, Point
from arcgis2geojson import arcgis2geojson

# -----------------------------
# SETTINGS
# -----------------------------
AFFECTED_PATH = "Miami-Permits-Affected-Streets_attributes-only.geojson"

CENTERLINES_LAYER_URL = (
    "https://services.arcgis.com/8Pc9XBTAsYuxx9Ny/arcgis/rest/services/GeoStreets_gdb/FeatureServer/0"
)

METERS_CRS = 26917  # UTM 17N (Miami area)
BUFFER_FEET = 50.0
BUFFER_METERS = BUFFER_FEET * 0.3048

ENDPOINT_SNAP_M = 0.25
NODE_SNAP_TOL_M = 200.0  # increase to 250-300 if still not snapping


# -----------------------------
# ARCGIS DOWNLOAD (EsriJSON)
# -----------------------------
def arcgis_download_all_esrijson(layer_url: str, where="1=1", out_fields="*", chunk=2000):
    feats = []
    offset = 0
    while True:
        params = {
            "f": "json",
            "where": where,
            "outFields": out_fields,
            "returnGeometry": "true",
            "outSR": 4326,
            "resultOffset": offset,
            "resultRecordCount": chunk,
        }
        r = requests.get(f"{layer_url}/query", params=params, timeout=180)
        r.raise_for_status()
        js = r.json()
        batch = js.get("features", [])
        if not batch:
            break
        feats.extend(batch)
        offset += len(batch)
        if len(batch) < chunk:
            break

    return {
        "displayFieldName": "",
        "fieldAliases": {},
        "geometryType": "esriGeometryPolyline",
        "spatialReference": {"wkid": 4326},
        "features": feats,
    }


# -----------------------------
# STRING NORMALIZATION + EXTRACTION
# -----------------------------
_DIR_MAP = {
    "N": "N", "NORTH": "N",
    "S": "S", "SOUTH": "S",
    "E": "E", "EAST": "E",
    "W": "W", "WEST": "W",
    "NE": "NE", "NORTHEAST": "NE",
    "NW": "NW", "NORTHWEST": "NW",
    "SE": "SE", "SOUTHEAST": "SE",
    "SW": "SW", "SOUTHWEST": "SW",
}

_TYPE_MAP = {
    "ST": "ST", "STREET": "ST",
    "AVE": "AVE", "AV": "AVE", "AVENUE": "AVE",
    "BLVD": "BLVD", "BOULEVARD": "BLVD",
    "RD": "RD", "ROAD": "RD",
    "DR": "DR", "DRIVE": "DR",
    "CT": "CT", "COURT": "CT",
    "LN": "LN", "LANE": "LN",
    "TER": "TER", "TERRACE": "TER",
    "PL": "PL", "PLACE": "PL",
    "CIR": "CIR", "CIRCLE": "CIR",
    "HWY": "HWY", "HIGHWAY": "HWY",
    "PKWY": "PKWY", "PARKWAY": "PKWY",
}

SHORT_DIRS = {"N", "S", "E", "W", "NE", "NW", "SE", "SW"}

# Strip ordinal suffixes: 5TH -> 5 (helps align naming conventions)
_ORDINAL_RE = re.compile(r"\b(\d+)(ST|ND|RD|TH)\b", re.IGNORECASE)

# Strip leading “North,” / “North and South,” style junk in some descriptions
_LEADING_DIRECTIONAL_PHRASE = re.compile(
    r"^\s*(north and south|south and north|east and west|west and east|north|south|east|west)\s*,?\s+",
    re.IGNORECASE
)

def strip_leading_directional_phrase(s: str) -> str:
    return _LEADING_DIRECTIONAL_PHRASE.sub("", s).strip()

def clean_raw_name(raw: str) -> str:
    if raw is None or (isinstance(raw, float) and pd.isna(raw)):
        return ""
    s = str(raw).strip()
    # remove parentheticals: "Coral Way (SW 22nd St)" -> "Coral Way"
    s = re.sub(r"\s*\(.*?\)\s*", " ", s)
    s = s.replace(",", " ")
    s = " ".join(s.split())
    return s

def strip_ordinals(s: str) -> str:
    return _ORDINAL_RE.sub(r"\1", s)

def expand_slash_variants(raw: str) -> list[str]:
    raw = clean_raw_name(raw)
    if not raw or "/" not in raw:
        return [raw]
    toks = raw.split()
    for i, t in enumerate(toks):
        if "/" in t:
            out = []
            for p in t.split("/"):
                tt = toks[:]
                tt[i] = p
                out.append(" ".join(tt))
            return out
    return [raw]

def normalize_street_name(raw: str, collapse_cardinals: bool = True) -> str:
    raw = strip_leading_directional_phrase(strip_ordinals(clean_raw_name(raw)))
    if not raw:
        return ""

    s = re.sub(r"\s*&\s*", " & ", raw)
    toks = [t.upper().replace(".", "") for t in s.split()]

    out = []
    for t in toks:
        if t in SHORT_DIRS:
            out.append(t)
            continue
        if collapse_cardinals and t in _DIR_MAP:
            out.append(_DIR_MAP[t])
            continue
        if t in _TYPE_MAP:
            out.append(_TYPE_MAP[t])
        else:
            out.append(t)
    return " ".join(out).strip()

def key_variants(raw: str) -> list[str]:
    variants: list[str] = []

    def add(v: str):
        v = v.strip()
        if v and v not in variants:
            variants.append(v)

    for raw2 in expand_slash_variants(raw):
        for collapse in (True, False):
            s = normalize_street_name(raw2, collapse_cardinals=collapse)
            if not s:
                continue
            toks = s.split()

            add(s)

            # drop trailing type
            if toks and toks[-1] in set(_TYPE_MAP.values()):
                add(" ".join(toks[:-1]))

            # drop leading short dir
            if toks and toks[0] in SHORT_DIRS:
                add(" ".join(toks[1:]))

            # drop both
            tt = toks[:]
            if tt and tt[0] in SHORT_DIRS:
                tt = tt[1:]
            if tt and tt[-1] in set(_TYPE_MAP.values()):
                tt = tt[:-1]
            if tt:
                add(" ".join(tt))

    return variants


# Treat these as “open-ended boundary” rather than matchable streets
def boundary_kind(raw: str):
    s = clean_raw_name(raw).lower()
    if not s:
        return None

    # railroads / tracks
    if any(k in s for k in ["railroad", "railway", "rr", "tracks", "track", "spur", "f.e.c", "fec"]):
        if "west" in s or "westerly" in s: return "WEST"
        if "east" in s or "easterly" in s: return "EAST"
        if "north" in s or "northerly" in s: return "NORTH"
        if "south" in s or "southerly" in s: return "SOUTH"
        return "WATER"

    # legal/plat/prose boundaries
    if any(k in s for k in ["production", "extension", "right-of-way", "right of way", "tract", "plat"]):
        if "west" in s or "westerly" in s: return "WEST"
        if "east" in s or "easterly" in s: return "EAST"
        if "north" in s or "northerly" in s: return "NORTH"
        if "south" in s or "southerly" in s: return "SOUTH"
        return "WATER"

    if "city limit" in s or "city limits" in s:
        if "west" in s: return "WEST"
        if "east" in s: return "EAST"
        if "north" in s: return "NORTH"
        if "south" in s: return "SOUTH"
        return "WATER"

    if any(w in s for w in ["river", "bay", "ocean", "canal", "lagoon"]):
        return "WATER"

    return None


# Extract streets from phrases (supports multiple matches)
_STREET_EXTRACT_RE = re.compile(
    r"(?P<street>"
    r"(?:N|S|E|W|NE|NW|SE|SW|NORTH|SOUTH|EAST|WEST)?\.?\s*"
    r"(?:[A-Z0-9][A-Z0-9'\-]*\s+){0,8}"
    r"(?:ST|STREET|AVE|AVENUE|BLVD|BOULEVARD|RD|ROAD|DR|DRIVE|CT|COURT|LN|LANE|TER|TERRACE|PL|PLACE|CIR|CIRCLE|HWY|HIGHWAY|PKWY|PARKWAY)"
    r"(?:\s+(?:N|S|E|W|NE|NW|SE|SW|NORTH|SOUTH|EAST|WEST))?"
    r")",
    re.IGNORECASE
)

def extract_streets_from_phrase(raw: str) -> list[str]:
    s = strip_leading_directional_phrase(clean_raw_name(raw))
    if not s:
        return []
    matches = [clean_raw_name(m.group("street")) for m in _STREET_EXTRACT_RE.finditer(s)]
    out = []
    for m in matches:
        if m and m not in out:
            out.append(m)
    return out

def extract_street_from_phrase(raw: str) -> str:
    cands = extract_streets_from_phrase(raw)
    return cands[0] if cands else ""

def canonical_cross_candidates(raw: str):
    """
    Returns (boundary_kind, [street_candidate_1, street_candidate_2, ...])
    """
    b = boundary_kind(raw)
    if b:
        return (b, [])
    cands = extract_streets_from_phrase(raw)
    if cands:
        return (None, cands)
    raw2 = strip_leading_directional_phrase(clean_raw_name(raw))
    return (None, [raw2] if raw2 else [])


# -----------------------------
# CENTERLINE NAME FIELD DISCOVERY
# -----------------------------
def pick_centerline_name_field(streets: gpd.GeoDataFrame) -> str:
    candidates = [
        "FULLNAME", "FULL_NAME", "FULL_STREET_NAME",
        "STREETNAME", "STREET_NAME", "RDNAME", "RD_NAME", "NAME"
    ]
    cols_upper = {c.upper(): c for c in streets.columns}
    for want in candidates:
        if want in cols_upper:
            return cols_upper[want]
    return ""

def build_centerline_fullname(streets: gpd.GeoDataFrame) -> pd.Series:
    component_candidates = [
        ("PRE_DIR", "PREDIR", "PREFIXDIR", "PRE_DIRECTION"),
        ("ST_NAME", "NAME", "STREETNAME", "RDNAME"),
        ("ST_TYPE", "TYPE", "STREETTYPE", "RD_TYPE"),
        ("SUF_DIR", "SUFDIR", "SUFFIXDIR", "SUF_DIRECTION"),
    ]

    def find_col(options):
        for o in options:
            if o in streets.columns:
                return o
        upper_map = {c.upper(): c for c in streets.columns}
        for o in options:
            if o.upper() in upper_map:
                return upper_map[o.upper()]
        return ""

    pre = find_col(component_candidates[0])
    name = find_col(component_candidates[1])
    typ = find_col(component_candidates[2])
    suf = find_col(component_candidates[3])

    if not name:
        raise ValueError(
            "Could not identify a usable street-name field in centerlines.\n"
            f"Columns include: {list(streets.columns)[:60]} ..."
        )

    parts = []
    for col in [pre, name, typ, suf]:
        if col:
            parts.append(streets[col].fillna("").astype(str))

    full = parts[0]
    for p in parts[1:]:
        full = full.str.cat(p, sep=" ")
    return full


# -----------------------------
# GEOMETRY HELPERS
# -----------------------------
def endpoints(line: LineString):
    coords = list(line.coords)
    return Point(coords[0]), Point(coords[-1])

def node_key(pt: Point, snap_m=ENDPOINT_SNAP_M):
    return (round(pt.x / snap_m) * snap_m, round(pt.y / snap_m) * snap_m)

def build_graph(main_gdf_m: gpd.GeoDataFrame) -> nx.Graph:
    G = nx.Graph()
    for idx, row in main_gdf_m.iterrows():
        geom = row.geometry
        if geom is None or geom.is_empty:
            continue
        a, b = endpoints(geom)
        na, nb = node_key(a), node_key(b)
        G.add_edge(na, nb, fid=idx, weight=float(geom.length))
    return G

def intersection_points(main_gdf_m: gpd.GeoDataFrame, cross_gdf_m: gpd.GeoDataFrame):
    if main_gdf_m.empty or cross_gdf_m.empty:
        return []
    inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
    if inter.is_empty:
        return []
    if inter.geom_type == "Point":
        return [inter]
    if inter.geom_type == "MultiPoint":
        return list(inter.geoms)

    pts = []
    if hasattr(inter, "geoms"):
        for g in inter.geoms:
            if g.is_empty:
                continue
            if g.geom_type == "Point":
                pts.append(g)
            elif g.geom_type in ("LineString", "MultiLineString"):
                pts.append(g.representative_point())
    elif inter.geom_type in ("LineString", "MultiLineString"):
        pts.append(inter.representative_point())
    return pts

def nearest_points_on_main(main_gdf_m: gpd.GeoDataFrame, cross_gdf_m: gpd.GeoDataFrame):
    if main_gdf_m.empty or cross_gdf_m.empty:
        return []
    cross_union = cross_gdf_m.unary_union
    rep = cross_union.representative_point()
    pts = []
    for _, row in main_gdf_m.iterrows():
        g = row.geometry
        if g is None or g.is_empty:
            continue
        pts.append(g.interpolate(g.project(rep)))
    return pts

def snap_points_to_nodes(G: nx.Graph, pts, tol_m=NODE_SNAP_TOL_M):
    nodes = list(G.nodes)
    if not nodes:
        return set()
    out = set()
    for p in pts:
        best_d2 = None
        best_n = None
        for n in nodes:
            d2 = (p.x - n[0])**2 + (p.y - n[1])**2
            if best_d2 is None or d2 < best_d2:
                best_d2 = d2
                best_n = n
        if best_d2 is not None and (best_d2 ** 0.5) <= tol_m:
            out.add(best_n)
    return out

def extreme_node(G: nx.Graph, mode: str):
    nodes = list(G.nodes)
    if not nodes:
        return None
    if mode == "WEST":
        return min(nodes, key=lambda n: n[0])
    if mode == "EAST":
        return max(nodes, key=lambda n: n[0])
    if mode == "SOUTH":
        return min(nodes, key=lambda n: n[1])
    if mode == "NORTH":
        return max(nodes, key=lambda n: n[1])
    return min(nodes, key=lambda n: n[0])

def best_path_or_component_fallback(G: nx.Graph, A_nodes, B_nodes):
    # Try shortest path first
    best = None
    for a in A_nodes:
        for b in B_nodes:
            if a == b:
                continue
            try:
                path = nx.shortest_path(G, a, b, weight="weight")
                dist = nx.shortest_path_length(G, a, b, weight="weight")
                if best is None or dist < best[0]:
                    best = (dist, path)
            except nx.NetworkXNoPath:
                continue

    if best is not None:
        _, path = best
        fids = []
        for u, v in zip(path[:-1], path[1:]):
            fids.append(G[u][v]["fid"])
        return ("path", fids)

    # Component fallback: choose component most related to A/B nodes, else largest
    comps = list(nx.connected_components(G))
    if not comps:
        return ("none", None)

    comp_id = {}
    for i, comp in enumerate(comps):
        for n in comp:
            comp_id[n] = i

    score = defaultdict(int)
    for n in list(A_nodes) + list(B_nodes):
        if n in comp_id:
            score[comp_id[n]] += 1

    if score:
        best_comp = max(score.items(), key=lambda kv: kv[1])[0]
    else:
        best_comp = max(range(len(comps)), key=lambda i: len(comps[i]))

    keep_nodes = set(comps[best_comp])
    fids = []
    for u, v, data in G.edges(data=True):
        if u in keep_nodes and v in keep_nodes:
            fids.append(data["fid"])
    return ("component", list(set(fids)))


# -----------------------------
# MAIN
# -----------------------------
def main():
    with open(AFFECTED_PATH, "r", encoding="utf-8") as f:
        affected_gj = json.load(f)
    affected_df = pd.DataFrame([ft.get("properties", {}) for ft in affected_gj.get("features", [])])

    col_street = "street_full_name"
    col_from = "from_cross_street"
    col_to = "to_cross_street"
    col_id = "id" if "id" in affected_df.columns else None

    print("Downloading Miami-Dade centerlines...")
    esri = arcgis_download_all_esrijson(CENTERLINES_LAYER_URL)
    gj = arcgis2geojson(esri)
    streets = gpd.GeoDataFrame.from_features(gj["features"], crs=4326)

    name_field = pick_centerline_name_field(streets)
    if name_field:
        streets["FULLNAME_RAW"] = streets[name_field].fillna("").astype(str)
        print(f"Using centerline name field: {name_field}")
    else:
        streets["FULLNAME_RAW"] = build_centerline_fullname(streets)
        print("Assembled FULLNAME_RAW from parts")

    streets_m = streets.to_crs(METERS_CRS)

    # Build variant index: key -> indices
    idx = defaultdict(list)
    for i, val in streets_m["FULLNAME_RAW"].fillna("").astype(str).items():
        for k in key_variants(val):
            idx[k].append(i)

    def best_match_gdf(raw_name: str) -> gpd.GeoDataFrame:
        # normal variant lookup
        for k in key_variants(raw_name):
            hits = idx.get(k, [])
            if hits:
                return streets_m.loc[hits].copy()

        # last-ditch substring search on a core token
        core = strip_leading_directional_phrase(strip_ordinals(clean_raw_name(raw_name))).upper()
        core = re.sub(r"\b(DRIVE|DR|STREET|ST|AVENUE|AVE|ROAD|RD|BOULEVARD|BLVD)\b", "", core).strip()
        if core and len(core) >= 6:
            mask = streets_m["FULLNAME_RAW"].fillna("").astype(str).str.upper().str.contains(core, na=False)
            if mask.any():
                return streets_m.loc[mask].copy()

        return streets_m.iloc[0:0].copy()

    qa_rows = []
    out_segments = []

    for _, rec in affected_df.iterrows():
        rid = rec[col_id] if col_id else rec.name
        raw_main = rec.get(col_street, "")
        raw_from = rec.get(col_from, "")
        raw_to = rec.get(col_to, "")

        # --- main street match ---
        main_gdf = best_match_gdf(raw_main)

        # If main doesn't match, try extracting a street from the phrase/parenthetical
        if main_gdf.empty:
            alt = extract_street_from_phrase(raw_main)
            if alt:
                main_gdf = best_match_gdf(alt)

        if main_gdf.empty:
            qa_rows.append({"id": rid, "status": "no_main_match", "main": raw_main, "from": raw_from, "to": raw_to})
            continue

        # --- cross endpoints ---
        from_b, from_cands = canonical_cross_candidates(raw_from)
        to_b, to_cands = canonical_cross_candidates(raw_to)

        # if missing from/to, treat as extremes
        if not clean_raw_name(raw_from):
            from_b = from_b or "WEST"
        if not clean_raw_name(raw_to):
            to_b = to_b or "EAST"

        from_gdf = streets_m.iloc[0:0].copy()
        if from_b is None:
            for cand in from_cands:
                from_gdf = best_match_gdf(cand)
                if not from_gdf.empty:
                    break

        to_gdf = streets_m.iloc[0:0].copy()
        if to_b is None:
            for cand in to_cands:
                to_gdf = best_match_gdf(cand)
                if not to_gdf.empty:
                    break

        if (from_b is None and from_gdf.empty) or (to_b is None and to_gdf.empty):
            qa_rows.append({
                "id": rid,
                "status": "cross_name_no_match",
                "main": raw_main,
                "from": raw_from,
                "to": raw_to,
                "from_match": (from_b is not None) or (not from_gdf.empty),
                "to_match": (to_b is not None) or (not to_gdf.empty),
            })
            continue

        G = build_graph(main_gdf)
        if len(G.nodes) == 0:
            qa_rows.append({"id": rid, "status": "empty_graph", "main": raw_main, "from": raw_from, "to": raw_to})
            continue

        # A_nodes
        if from_b is not None:
            A_nodes = {extreme_node(G, from_b)}
            A_nodes = {n for n in A_nodes if n is not None}
        else:
            A_pts = intersection_points(main_gdf, from_gdf)
            if not A_pts:
                A_pts = nearest_points_on_main(main_gdf, from_gdf)
            A_nodes = snap_points_to_nodes(G, A_pts)

        # B_nodes
        if to_b is not None:
            B_nodes = {extreme_node(G, to_b)}
            B_nodes = {n for n in B_nodes if n is not None}
        else:
            B_pts = intersection_points(main_gdf, to_gdf)
            if not B_pts:
                B_pts = nearest_points_on_main(main_gdf, to_gdf)
            B_nodes = snap_points_to_nodes(G, B_pts)

        if not A_nodes or not B_nodes:
            qa_rows.append({
                "id": rid,
                "status": "endpoints_not_found",
                "main": raw_main,
                "from": raw_from,
                "to": raw_to,
                "A_nodes": len(A_nodes),
                "B_nodes": len(B_nodes),
            })
            continue

        mode, edge_fids = best_path_or_component_fallback(G, A_nodes, B_nodes)
        if not edge_fids:
            qa_rows.append({"id": rid, "status": "no_path", "main": raw_main, "from": raw_from, "to": raw_to})
            continue

        segs = main_gdf.loc[edge_fids].copy()
        segs["affected_id"] = rid
        segs["street_full_name_src"] = raw_main
        segs["from_cross_src"] = raw_from
        segs["to_cross_src"] = raw_to
        segs["match_mode"] = mode  # "path" or "component"
        out_segments.append(segs)

        qa_rows.append({
            "id": rid,
            "status": "matched",
            "main": raw_main,
            "from": raw_from,
            "to": raw_to,
            "segments": len(segs),
            "length_m": float(segs.length.sum()),
            "match_mode": mode,
        })

    qa = pd.DataFrame(qa_rows)
    qa.to_csv("qa_matches.csv", index=False)

    if not out_segments:
        print("No segments matched. Check qa_matches.csv.")
        return

    affected_lines_m = gpd.GeoDataFrame(pd.concat(out_segments, ignore_index=True), crs=streets_m.crs)
    affected_lines_m = affected_lines_m.drop_duplicates(subset=["geometry"])

    buffer_geom_m = affected_lines_m.buffer(BUFFER_METERS).unary_union
    buffer_gdf_m = gpd.GeoDataFrame([{"geometry": buffer_geom_m}], crs=affected_lines_m.crs)

    affected_lines = affected_lines_m.to_crs(4326)
    buffer_gdf = buffer_gdf_m.to_crs(4326)

    affected_lines.to_file("affected_streets_lines.geojson", driver="GeoJSON")
    buffer_gdf.to_file("affected_50ft_buffer.geojson", driver="GeoJSON")

    print("Wrote:")
    print("  - affected_streets_lines.geojson")
    print("  - affected_50ft_buffer.geojson")
    print("  - qa_matches.csv")


if __name__ == "__main__":
    main()


Downloading Miami-Dade centerlines...
Assembled FULLNAME_RAW from parts


  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
  cross_union = cross_gdf_m.unary_union
  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_union)
  cross_union = cross_gdf_m.unary_union
  inter = main_gdf_m.unary_union.intersection(cross_gdf_m.unary_unio

Wrote:
  - affected_streets_lines.geojson
  - affected_50ft_buffer.geojson
  - qa_matches.csv
