In [8]:
# ---- Load Google Maps API key: your %store approach, with env fallback ----
try:
    get_ipython().run_line_magic("store", "-r google_maps_API_Key")
except Exception:
    pass

import googlemaps

if 'google_maps_API_Key' not in globals() or not google_maps_API_Key:
    google_maps_API_Key = os.getenv("GOOGLE_MAPS_API_KEY", "")

if not google_maps_API_Key:
    raise ValueError("No Google Maps API key found. Set `%store google_maps_API_Key` or env var GOOGLE_MAPS_API_KEY.")

google_maps_API_Key = googlemaps.Client(key=google_maps_API_Key)

2025-09-25 01:14:00 | INFO | API queries_quota: 60


In [11]:
#!/usr/bin/env python3
"""
Build a GeoJSON of ALL South Florida transactions from CSVs (Miami-Dade, Palm Beach, Broward)
— preserving every valid row (no date or doc-type filtering) — and enriching
DEED rows by bundling the matching MORTGAGE (Lender + Loan Amount) while
keeping mortgage rows in the dataset. Geocoding is cached, Florida-biased, and
address-normalized. Rows that fail geocoding remain with geometry=null. Also
exports an `ungeocoded.csv` triage file. Includes a tqdm progress bar.

Usage (script):
  pip install geopandas googlemaps tqdm shapely
  export GOOGLE_MAPS_API_KEY=...  # or use Jupyter %store google_maps_API_Key
  python transactions_geojson_from_csv.py

Outputs:
  - map_data.geojson
  - geocode_cache.json (on-disk cache)
  - ungeocoded.csv (rows missing lat/lon)

Notes:
  - Never drops rows for dates, doc types, or geocode failures.
  - "Bundling" = add Lender/Loan Amount onto matching DEED rows; mortgages stay too.
"""
from __future__ import annotations

import json
import logging
import os
import re
from typing import Dict, Optional, Tuple

import geopandas as gpd
import googlemaps
import numpy as np
import pandas as pd
from shapely.geometry import Point
from tqdm import tqdm

# -----------------------------------------------------------------------------
# Google Maps API key (works in scripts AND notebooks)
# -----------------------------------------------------------------------------
GOOGLE_MAPS_API_KEY = os.getenv("GOOGLE_MAPS_API_KEY", "")
try:
    # If running in a notebook and the user stored `%store google_maps_API_Key`
    ip = get_ipython()  # type: ignore[name-defined]
    try:
        ip.run_line_magic("store", "-r google_maps_API_Key")  # loads into local namespace if present
        if 'google_maps_API_Key' in globals() and google_maps_API_Key and not GOOGLE_MAPS_API_KEY:  # noqa: F821
            GOOGLE_MAPS_API_KEY = google_maps_API_Key  # noqa: F821
    except Exception:
        pass
except Exception:
    pass

if not GOOGLE_MAPS_API_KEY:
    raise ValueError(
        "No Google Maps API key found. Set env var GOOGLE_MAPS_API_KEY or `%store google_maps_API_Key`."
    )

GMAPS_CLIENT = googlemaps.Client(key=GOOGLE_MAPS_API_KEY)

# -----------------------------------------------------------------------------
# Settings
# -----------------------------------------------------------------------------
INPUT_MD   = "SoFlaRecordsScraper - MiamiDade.csv"
INPUT_PBC  = "SoFlaRecordsScraper - PalmBeachCounty.csv"
INPUT_BRD  = "SoFlaRecordsScraper - Broward.csv"
OUTPUT_GEOJSON = "map_data.geojson"
GEO_CACHE_PATH = "geocode_cache.json"
DEFAULT_CRS = "EPSG:4326"

TEXT_TO_DROP = "Scrape Attempt Date:"  # sentinel rows to drop
COLUMNS_TO_KEEP = [  # used when present; we won’t fail if some are missing
    "ScrapeDate","Doc Type","Instrument_Num","Record Date","Record Date Search",
    "Seller","Buyer","Consideration","Folio","Use Code Description","Building Sq. Ft",
    "Lot Size","Date of Previous Sale","Previous Owner Name","Previous Sale Price",
    "Physical Address","Mailing Address","Municipality","PropAppraiserURL",
    "Sunbiz Doc URL First Party","Sunbiz Doc URL Second Party",
    "First Party Registered Agent Name & Address","First Party Document Number",
    "First Party FEI/EIN Number","First Party Mailing Address",
    "First Party Principal Address","First Party State","First Party Date Filed",
    "Second Party Registered Agent Name & Address","Second Party Status",
    "Second Party Document Number","Second Party FEI/EIN Number",
    "Second Party Mailing Address","Second Party Principal Address",
    "Second Party State","Second Party Date Filed",
]

RES_CLASSES = ["RESIDENTIAL","CONDOMINIUM","CONDO","FAMILY","RV PARK"]
COM_CLASSES = [
    "OFFICE","MANUFACTURING","COMMERCIAL","HOTEL","MOTEL","INDUSTRIAL","HEAVY IND",
    "GOLF COURSE","RETAIL","WAREH/DIST TERM","WAREHOUSE","STORAGE","MULTIFAMILY",
    "SCHOOL","RESTAURANTS","SHOPPING CENTER","MULTI-FAMILY","SERVICE STATION",
    "DRUG STORE","RELIGIOUS","WAREHOUSING","NIGHTCLUBS","PARKING LOT",
]

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger("transactions-geojson")

# -----------------------------------------------------------------------------
# Helpers
# -----------------------------------------------------------------------------

def _drop_rows_with_text(df: pd.DataFrame, text: str) -> pd.DataFrame:
    if df.empty:
        return df
    mask_keep = ~df.astype(str).apply(
        lambda col: col.str.contains(text, na=False)
    ).any(axis=1)
    return df.loc[mask_keep]


def format_md_folio(folio: str) -> str:
    s = str(folio)
    return f"{s[0:2]}-{s[2:6]}-{s[6:9]}-{s[9:13]}" if (s.isdigit() and len(s) == 13) else s


def extract_broward_folio(url: str) -> Optional[str]:
    if isinstance(url, str) and "Folio=" in url:
        return url.split("Folio=")[1]
    return None


def make_anchor(text: str, url: Optional[str]) -> str:
    if pd.isna(url) or url in {"No matching document found","No Prop Appraiser URL Found","Data Not Found"}:
        return str(text)
    return f'<a href="{url}" target="_blank">{text}</a>'


def normalize_pbc_addresses(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for col in ["Physical Address","Municipality"]:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip().replace({"Data Not Found": ""}).fillna("")
    has_addr = df.get("Physical Address", "") != ""
    is_uninc = df.get("Municipality", "").eq("UNINCORPORATED") if "Municipality" in df.columns else False
    df.loc[has_addr & is_uninc, "Physical Address"] = (
        df.loc[has_addr & is_uninc, "Physical Address"] + " IN UNINCORPORATED PALM BEACH COUNTY"
    )
    df.loc[has_addr & ~is_uninc, "Physical Address"] = (
        df.loc[has_addr & ~is_uninc, "Physical Address"].str.cat(
            df.loc[has_addr & ~is_uninc, "Municipality"], sep=" IN "
        )
    )
    if "Municipality" in df.columns:
        df = df.drop(columns="Municipality")
    df["Physical Address"] = df["Physical Address"].str.replace(
        r" in $", "Data Not Found", regex=True
    ).str.strip(", ")
    return df


def split_zip_from_address(df: pd.DataFrame, addr_col: str) -> pd.DataFrame:
    df = df.copy()
    z = df[addr_col].astype(str).str.extract(r"(\b\d{5}(?:-\d{4})?\b)", expand=False)
    df["ZipCode"] = z
    df[addr_col] = df[addr_col].astype(str).str.replace(
        r"\b\d{5}(?:-\d{4})?\b", "", regex=True
    ).str.strip().str.strip(",")
    return df


def classify_use_code(series: pd.Series) -> pd.Series:
    s = series.fillna("").astype(str).str.upper()
    res_pat = "|".join(map(re.escape, RES_CLASSES))
    com_pat = "|".join(map(re.escape, COM_CLASSES))
    out = pd.Series("OTHER", index=s.index)
    out[s.str.contains(res_pat, na=False)] = "RESIDENTIAL"
    out[s.str.contains(com_pat, na=False)] = "COMMERCIAL"
    return out


def normalize_doc_types(s: pd.Series) -> pd.Series:
    s = s.fillna("").astype(str)
    replacements = {
        r"\bDEE\b": "DEED",
        r"\bDeed Transfers of Real Property\b": "DEED",
        r"\bMOR\b": "MORTGAGE",
        r"Mortgage/ Modifications & Assumptions": "MORTGAGE",
    }
    for pat, repl in replacements.items():
        s = s.str.replace(pat, repl, regex=True)
    return s


def clean_sale_price(s: pd.Series) -> pd.Series:
    s = s.replace(["None","No price found!","Data Not Found","No results."], np.nan)
    s = s.astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False)
    return pd.to_numeric(s, errors="coerce")


def add_anchors(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if {"Sunbiz Doc URL First Party","Seller"}.issubset(df.columns):
        df["Seller"] = [make_anchor(t, u) for t, u in zip(df["Seller"].astype(str), df["Sunbiz Doc URL First Party"])]
    if {"Sunbiz Doc URL Second Party","Buyer"}.issubset(df.columns):
        df["Buyer"] = [make_anchor(t, u) for t, u in zip(df["Buyer"].astype(str), df["Sunbiz Doc URL Second Party"])]
    if {"PropAppraiserURL","Folio"}.issubset(df.columns):
        df["Folio"] = [make_anchor(str(t), u) for t, u in zip(df["Folio"], df["PropAppraiserURL"])]
    return df

# -----------------------------------------------------------------------------
# Geocoding (cache + normalization + FL/US bias)
# -----------------------------------------------------------------------------

def load_cache(path: str) -> Dict[str, Tuple[float, float]]:
    if os.path.exists(path):
        try:
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            return {}
    return {}


def save_cache(path: str, cache: Dict[str, Tuple[float, float]]) -> None:
    tmp = path + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(cache, f)
    os.replace(tmp, path)

county_suffix_re = re.compile(r"\s+(MIAMI-DADE|PALM BEACH COUNTY|BROWARD COUNTY)\b", re.I)
zip4_zeros_re    = re.compile(r"(\b\d{5})-0000\b")
zipcode_re       = re.compile(r"\b\d{5}(?:-\d{4})?\b")

def normalize_for_geocode(a: str) -> str:
    if not isinstance(a, str):
        return ""
    s = a.strip()
    s = county_suffix_re.sub("", s)                               # drop appended county
    s = re.sub(r",?\s*UNINCORPORATED COUNTY\b", "", s, flags=re.I)
    s = zip4_zeros_re.sub(r"\1", s)                               # 33178-0000 -> 33178
    s = re.sub(r"\s{2,}", " ", s)
    s = re.sub(r",\s*,", ", ", s)
    return s.strip(", ").strip()


def geocode_addresses(
    df: pd.DataFrame,
    addr_col: str,
    cache_path: str,
    gmaps_client: Optional[googlemaps.Client],
    show_progress: bool = True,
) -> pd.DataFrame:
    df = df.copy()
    cache: Dict[str, Tuple[float, float]] = load_cache(cache_path)
    gmaps = gmaps_client

    def _geo_one(addr: str) -> Tuple[Optional[float], Optional[float]]:
        if not addr:
            return (None, None)
        if addr in cache:
            return cache[addr]
        if gmaps is None:
            return (None, None)
        try:
            # Primary attempt with FL/US bias
            result = gmaps.geocode(
                addr,
                components={"administrative_area": "FL", "country": "US"},
                region="us",
            )
            # Fallback: drop zip entirely
            if not result:
                no_zip = zipcode_re.sub("", addr).replace("  ", " ").strip(", ")
                if no_zip:
                    result = gmaps.geocode(
                        f"{no_zip}, FL, USA",
                        components={"administrative_area": "FL", "country": "US"},
                        region="us",
                    )
            if result:
                lat = result[0]["geometry"]["location"]["lat"]
                lng = result[0]["geometry"]["location"]["lng"]
                cache[addr] = (lat, lng)
                return (lat, lng)
        except Exception as e:
            log.warning("Geocode failed for '%s': %s", addr, e)
        return (None, None)

    norm = df[addr_col].fillna("").astype(str).map(normalize_for_geocode)
    unique_addrs = norm.unique().tolist()
    log.info("Unique addresses to resolve (normalized): %d", len(unique_addrs))

    iterator = tqdm(unique_addrs, desc="Geocoding", unit="addr") if show_progress else unique_addrs
    for a in iterator:
        if a and a not in cache:
            _geo_one(a)
    save_cache(cache_path, cache)

    coords = norm.map(lambda a: cache.get(a, (None, None)))
    df["latitude"] = coords.map(lambda t: t[0])
    df["longitude"] = coords.map(lambda t: t[1])
    return df

# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------

def main() -> None:
    # Remove old geojson if present (optional, harmless if missing)
    if os.path.exists(OUTPUT_GEOJSON):
        os.remove(OUTPUT_GEOJSON)
        log.info("Deleted old %s", OUTPUT_GEOJSON)

    # 1) Load CSVs (full sheets)
    md_df  = pd.read_csv(INPUT_MD, encoding="utf-8-sig")
    pbc_df = pd.read_csv(INPUT_PBC, encoding="utf-8-sig")
    brd_df = pd.read_csv(INPUT_BRD, encoding="utf-8-sig")

    # Keep only columns we recognize (don’t error if others are present/missing)
    def _keep(df: pd.DataFrame) -> pd.DataFrame:
        avail = [c for c in COLUMNS_TO_KEEP if c in df.columns]
        return df[avail].copy() if avail else df.copy()

    md_df, pbc_df, brd_df = _keep(md_df), _keep(pbc_df), _keep(brd_df)

    # Drop any “scrape attempt” rows (content rows, not transactions)
    md_df  = _drop_rows_with_text(md_df,  TEXT_TO_DROP)
    pbc_df = _drop_rows_with_text(pbc_df, TEXT_TO_DROP)
    brd_df = _drop_rows_with_text(brd_df, TEXT_TO_DROP)

    # 2) County-specific tweaks
    if "Folio" in md_df.columns:
        md_df["Folio"] = md_df["Folio"].astype(str).map(format_md_folio)
    if "PropAppraiserURL" in brd_df.columns:
        brd_df["Folio"] = brd_df["PropAppraiserURL"].map(extract_broward_folio)

    # Full Address + County (for display only)
    if "Physical Address" in md_df.columns:
        md_df["Full Address"] = md_df["Physical Address"].astype(str) + " MIAMI-DADE"
        md_df["County"] = "Miami-Dade"
    if "Physical Address" in pbc_df.columns:
        pbc_df["Full Address"] = pbc_df["Physical Address"].astype(str) + " PALM BEACH COUNTY"
        pbc_df["County"] = "Palm Beach"
    if "Physical Address" in brd_df.columns:
        brd_df["Full Address"] = brd_df["Physical Address"].astype(str) + " BROWARD COUNTY"
        brd_df["County"] = "Broward"

    # PBC: merge municipality into address & drop Municipality column
    pbc_df = normalize_pbc_addresses(pbc_df)

    # Split ZIP from Physical Address for MD & Broward (optional, keeps ZipCode col)
    if "Physical Address" in md_df.columns:
        md_df  = split_zip_from_address(md_df,  "Physical Address")
    if "Physical Address" in brd_df.columns:
        brd_df = split_zip_from_address(brd_df, "Physical Address")

    # 3) Concatenate — DO NOT drop anything
    df = pd.concat([md_df, pbc_df, brd_df], ignore_index=True, sort=False)

    # 4) Cleaning / normalization — DO NOT drop anything
    if "Consideration" in df.columns:
        df = df.rename(columns={"Consideration": "Sale Price"})
        df["Sale Price"] = clean_sale_price(df["Sale Price"])  # float

    if "Physical Address" in df.columns:
        df["Physical Address"] = df["Physical Address"].astype(str).str.upper()
    if "Use Code Description" in df.columns:
        df["Use Code Description"] = df["Use Code Description"].astype(str).str.upper()
    if "Doc Type" in df.columns:
        df["Doc Type"] = normalize_doc_types(df["Doc Type"])  # normalize deed/mortgage labels

    # Remove "NOT FOUND" noise inside cells (keep the row)
    df = df.apply(lambda col: col.mask(col.astype(str).str.contains("NOT FOUND", case=False, na=False), ""))

    # Classification
    if "Use Code Description" in df.columns:
        df["Simple Classification"] = classify_use_code(df["Use Code Description"])

    # Anchors (Sunbiz/Appraiser)
    df = add_anchors(df)

    # 5) Geocoding (keep all rows; geometry can be null)
    geocode_input_col = "Physical Address" if "Physical Address" in df.columns else "Full Address"
    df = geocode_addresses(df, addr_col=geocode_input_col, cache_path=GEO_CACHE_PATH, gmaps_client=GMAPS_CLIENT, show_progress=True)

    # 6) Bundle mortgages with deeds (enrich deeds) — KEEP ALL ROWS
    if {"Doc Type","Seller","Buyer","Record Date"}.issubset(df.columns):
        deeds = df[df["Doc Type"].eq("DEED")].copy()
        mortgages = df[df["Doc Type"].eq("MORTGAGE")].copy()
        if not deeds.empty and not mortgages.empty:
            mortgages["TransactionID"] = mortgages["Seller"].astype(str) + mortgages["Record Date"].astype(str)
            deeds["TransactionID"] = deeds["Buyer"].astype(str) + deeds["Record Date"].astype(str)
            if "Buyer" in mortgages.columns:
                mortgages = mortgages.rename(columns={"Buyer": "Lender"})
            if "Sale Price" in mortgages.columns:
                mortgages = mortgages.rename(columns={"Sale Price": "Loan Amount"})
            # Enrich deeds with Lender/Loan Amount; keep mortgages as-is
            deeds = deeds.merge(
                mortgages[["TransactionID","Lender","Loan Amount"]],
                on="TransactionID",
                how="left",
            )
            # Put enriched deeds back with all other rows (mortgages & others)
            df = pd.concat([deeds, df[~df["Doc Type"].eq("DEED")]], ignore_index=True, sort=False)

    # 6.5) Export ungeocoded rows for cleanup (but keep them in GeoJSON)
    if {"latitude","longitude"}.issubset(df.columns):
        ungeocoded_mask = df[["latitude","longitude"]].isna().any(axis=1)
        try:
            df.loc[ungeocoded_mask].to_csv("ungeocoded.csv", index=False)
            log.info("Wrote %d ungeocoded rows to ungeocoded.csv", int(ungeocoded_mask.sum()))
        except Exception as e:
            log.warning("Could not write ungeocoded.csv: %s", e)

    # 7) GeoDataFrame (geometry can be None)
    geometry = [
        Point(lon, lat) if pd.notna(lat) and pd.notna(lon) else None
        for lon, lat in zip(df.get("longitude"), df.get("latitude"))
    ]
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=DEFAULT_CRS)

    # Fill NaNs for readability (properties only; geometry stays None if ungeocoded)
    for col in gdf.columns:
        if col == "geometry":
            continue
        if pd.api.types.is_numeric_dtype(gdf[col]):
            gdf[col] = gdf[col].fillna(0)
        else:
            gdf[col] = gdf[col].fillna("Data Not Found")

    # 8) Write GeoJSON
    gdf.to_file(OUTPUT_GEOJSON, driver="GeoJSON")
    log.info("Wrote %d rows to %s (geometries may be null)", len(gdf), OUTPUT_GEOJSON)


if __name__ == "__main__":
    main()


2025-09-25 01:34:43 | INFO | API queries_quota: 60
2025-09-25 01:34:43 | INFO | Deleted old map_data.geojson
2025-09-25 01:34:44 | INFO | Unique addresses to resolve (normalized): 2635
Geocoding: 100%|█████████████████████| 2635/2635 [00:00<00:00, 1526728.97addr/s]
2025-09-25 01:34:44 | INFO | Wrote 2327 ungeocoded rows to ungeocoded.csv
2025-09-25 01:34:44 | INFO | Created 5,640 records
2025-09-25 01:34:44 | INFO | Wrote 5640 rows to map_data.geojson (geometries may be null)
