Cleaning OSM data


step1

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np

def _ensure_osmid_column(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Make sure we have an 'osmid' column (OSMnx can put it in the index)."""
    if 'osmid' in gdf.columns:
        return gdf
    if isinstance(gdf.index, pd.MultiIndex):
        if 'osmid' in gdf.index.names:
            return gdf.reset_index()
        gdf = gdf.reset_index()
    else:
        gdf = gdf.reset_index()
    # Fallbacks
    if 'osmid' not in gdf.columns:
        if 'id' in gdf.columns:
            gdf = gdf.rename(columns={'id': 'osmid'})
        else:
            gdf['osmid'] = gdf.index.astype(str)
    return gdf

def _coalesce_cols(df: pd.DataFrame, cols):
    """Row-wise first-non-null across a list of optional columns."""
    out = pd.Series(pd.NA, index=df.index, dtype="object")
    for c in cols:
        if c in df.columns:
            out = out.fillna(df[c])
    return out

def normalize_clinic_geometries(gdf: gpd.GeoDataFrame, keep_footprint=True) -> gpd.GeoDataFrame:
    """Return a copy of ``gdf`` in EPSG:4326 with one point per row.

    - A frame without CRS is *labelled* EPSG:4326; any other CRS is reprojected.
    - Point geometries are kept; every other geometry is replaced by its
      centroid.
    - With ``keep_footprint=True`` the original (reprojected) geometry is
      preserved in a ``geometry_footprint`` column.
    - ``latitude`` / ``longitude`` columns are derived from the final points.
    """
    # CRS → 4326
    if gdf.crs is None:
        gdf = gdf.set_crs(4326, allow_override=True)
    else:
        gdf = gdf.to_crs(4326)

    if keep_footprint:
        gdf['geometry_footprint'] = gdf.geometry

    shapes = gdf.geometry
    measurable = shapes.notna() & ~shapes.is_empty
    if measurable.any():
        # Centroids computed directly on lon/lat degrees are geometrically
        # skewed (geopandas warns about it), so compute them in a metric UTM
        # zone fitted to the data and project the result back.
        metric_crs = shapes[measurable].estimate_utm_crs()
        centroids = shapes.to_crs(metric_crs).centroid.to_crs(4326)
    else:
        # Nothing to measure (empty frame / only missing geometries).
        centroids = shapes.centroid

    # Point or centroid
    new_geom = np.where(shapes.geom_type.eq('Point'), shapes, centroids)
    gdf = gdf.set_geometry(new_geom)
    gdf = gdf.set_crs(4326, allow_override=True)

    # Lat/Lon from point
    gdf['latitude'] = gdf.geometry.y
    gdf['longitude'] = gdf.geometry.x
    return gdf

def prune_and_rename_clinics(gdf: gpd.GeoDataFrame, keep_footprint=True) -> gpd.GeoDataFrame:
    """Reduce raw OSM veterinary features to the clinic target schema.

    Steps: guarantee an ``osmid`` column, keep ``amenity=veterinary`` rows,
    normalise geometries to EPSG:4326 points with lat/lon, coalesce the
    ``contact:*`` tags with their plain counterparts, build ``full_address``
    ("street housenumber, postcode city"), rename OSM tags to target names and
    return only the target columns that exist.

    ``keep_footprint`` is forwarded to ``normalize_clinic_geometries``; note
    that ``geometry_footprint`` is not part of the final column list, so it is
    not present in the returned frame.
    """
    def _clean_text(value):
        # Missing values (None / NaN / pd.NA) must become "", never "nan".
        if isinstance(value, str):
            return value.strip()
        if value is None or value is pd.NA:
            return ""
        if isinstance(value, float) and np.isnan(value):
            return ""
        return str(value).strip()

    def _text_column(frame, name):
        # Column as cleaned text; an absent column behaves like all-empty.
        if name not in frame.columns:
            return pd.Series("", index=frame.index, dtype="object")
        return frame[name].map(_clean_text)

    gdf = _ensure_osmid_column(gdf).copy()

    # Filter to amenity=veterinary (if available)
    if 'amenity' in gdf.columns:
        gdf = gdf[gdf['amenity'].astype(str).str.lower().eq('veterinary')].copy()

    # Geometry normalization (point + lat/lon, keep footprint optionally)
    gdf = normalize_clinic_geometries(gdf, keep_footprint=keep_footprint)

    # Coalesce contact fields
    phone = _coalesce_cols(gdf, ['contact:phone', 'phone'])
    website = _coalesce_cols(gdf, ['contact:website', 'website'])
    email = _coalesce_cols(gdf, ['contact:email', 'email'])

    # Build full address: "street housenumber, postcode city"
    street_part = (
        _text_column(gdf, 'addr:street') + " " + _text_column(gdf, 'addr:housenumber')
    ).str.strip()
    locality_part = (
        _text_column(gdf, 'addr:postcode') + " " + _text_column(gdf, 'addr:city')
    ).str.strip()
    gdf['full_address'] = [
        ", ".join(part for part in pair if part) or pd.NA
        for pair in zip(street_part, locality_part)
    ]

    # Target schema mapping (rename/pass-through); absent tags are ignored
    rename_map = {
        'osmid': 'clinic_id',
        'name': 'clinic_name',
        'addr:street': 'street',
        'addr:housenumber': 'house_number',
        'addr:postcode': 'postcode',
        'addr:city': 'city',
        'veterinary:speciality': 'speciality',
        'wheelchair': 'wheelchair_acces',  # as requested (one 's')
        # keep: amenity, opening_hours, operator, brand, geometry
    }
    gdf = gdf.rename(columns=rename_map)

    # Inject coalesced contact fields to target names
    gdf['phone_number'] = phone
    gdf['website'] = website
    gdf['email'] = email

    # Add district/neighbourhood placeholders (to be enriched later)
    gdf['district_id'] = pd.NA
    gdf['neighbourhood_id'] = pd.NA

    # Final column order
    final_cols = [
        'clinic_id', 'clinic_name',
        'street', 'house_number', 'postcode', 'city',
        'district_id', 'neighbourhood_id',
        'phone_number', 'website', 'email',
        'opening_hours', 'operator', 'brand',
        'speciality', 'wheelchair_acces',
        'full_address', 'latitude', 'longitude',
        'amenity', 'geometry'
    ]

    # Keep only requested columns that actually exist
    existing = [c for c in final_cols if c in gdf.columns]
    out = gdf[existing].copy()

    # Ids/postcodes are kept as text; missing postcodes stay missing instead of
    # turning into the literal string "nan".
    if 'clinic_id' in out.columns:
        out['clinic_id'] = out['clinic_id'].astype(str)
    if 'postcode' in out.columns:
        out['postcode'] = out['postcode'].map(lambda v: _clean_text(v) or pd.NA)

    return out


In [None]:
# 1) getting raw OSM features (point/polygon geometries)
from osmnx.features import features_from_place as get_features  # OSMnx ≥2
gdf_raw = get_features("Berlin, Germany", {"amenity": "veterinary"}).reset_index()

# 2) running the pruning/renaming + geometry normalization
clinics_gdf = prune_and_rename_clinics(gdf_raw, keep_footprint=True)

# 3) quick sanity checks
# A bare expression is only echoed by the notebook when it is the last
# statement of a cell, so print explicitly.
print(clinics_gdf.head())
print(clinics_gdf.crs, clinics_gdf.shape)

# 4) persist: GeoJSON with geometry, CSV without it
clinics_gdf.to_file("berlin_vet_clinics_normalized.geojson", driver="GeoJSON")
clinics_gdf.drop(columns="geometry").to_csv("berlin_vet_clinics_normalized.csv", index=False, encoding="utf-8")


step2

In [None]:
# --- Apply cleaning helpers to the target-schema GeoDataFrame ---

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

def _geo_latlon(g):
    """Return (lat, lon) from geometry; centroid fallback; None if missing."""
    if g is None or g.is_empty:
        return (None, None)
    p = g
    if not isinstance(p, Point):
        try:
            p = g.centroid
        except Exception:
            return (None, None)
    return (float(p.y), float(p.x))

def clean_clinics_target_schema(clinics_gdf, country="DE", postcode_as_text=True):
    """
    Clean a target-schema clinics frame with ``clean_vet_df``.

    ``clean_vet_df`` expects OSM-style column names, so the relevant columns
    are copied into a temporary OSM-style frame, cleaned, and the cleaned
    values are written back onto a copy of ``clinics_gdf``. Afterwards
    ``full_address`` is rebuilt, the CRS is brought to EPSG:4326, lat/lon are
    derived when missing, and only ``amenity == "veterinary"`` rows are kept
    (when an ``amenity`` column exists).

    ``country`` and ``postcode_as_text`` are passed through to ``clean_vet_df``.
    Returns a new frame; the input is not modified.
    """
    gdf = clinics_gdf.copy()

    # 1) Build a temporary OSM-style frame the cleaner expects
    tmp = pd.DataFrame({
        "name": gdf.get("clinic_name"),
        "addr:street": gdf.get("street"),
        "addr:housenumber": gdf.get("house_number"),
        "addr:city": gdf.get("city"),
        "addr:postcode": gdf.get("postcode"),
        "phone": gdf.get("phone_number"),
        "website": gdf.get("website"),
        "email": gdf.get("email"),
        "opening_hours": gdf.get("opening_hours"),
        "operator": gdf.get("operator"),
        "brand": gdf.get("brand"),
        "wheelchair": gdf.get("wheelchair_acces", gdf.get("wheelchair_access")),
        "emergency": gdf.get("emergency"),
    })

    # 2) Clean using helper
    tmp_clean = clean_vet_df(tmp, country=country, postcode_as_text=postcode_as_text)

    # 3) Write cleaned values back to target-schema columns.
    #    Name/address columns are required from the cleaner (KeyError if it
    #    drops them); the remaining ones are optional.
    gdf["clinic_name"] = tmp_clean["name"]
    gdf["street"] = tmp_clean["addr:street"]
    gdf["house_number"] = tmp_clean["addr:housenumber"]
    gdf["city"] = tmp_clean["addr:city"]
    gdf["postcode"] = tmp_clean["addr:postcode"]
    gdf["phone_number"] = tmp_clean.get("phone")
    gdf["website"] = tmp_clean.get("website")
    gdf["email"] = tmp_clean.get("email")
    gdf["opening_hours"] = tmp_clean.get("opening_hours")
    gdf["operator"] = tmp_clean.get("operator")
    gdf["wheelchair_acces"] = tmp_clean.get("wheelchair")  # keep your column name
    gdf["emergency"] = tmp_clean.get("emergency")

    # Optional: bring over the validity flag from your helper
    if "opening_hours_valid" in tmp_clean.columns:
        gdf["opening_hours_valid"] = tmp_clean["opening_hours_valid"]

    # 4) Rebuild full_address (street + house + postcode + city) when available
    def _addr_row(r):
        parts = [r.get("street"), r.get("house_number"), r.get("postcode"), r.get("city")]
        parts = [p for p in parts if isinstance(p, str) and p.strip() != ""]
        return ", ".join(parts) if parts else None
    gdf["full_address"] = gdf.apply(_addr_row, axis=1)

    # 5) Ensure geometry CRS and lat/lon columns exist (point or centroid)
    if getattr(gdf, "crs", None) is None:
        try:
            gdf.set_crs(epsg=4326, inplace=True, allow_override=True)
        except AttributeError:
            # Best effort: a plain DataFrame (or a frame without an active
            # geometry column) has nothing to tag with a CRS.
            pass
    elif gdf.crs.to_epsg() != 4326:
        gdf = gdf.to_crs(4326)

    if "latitude" not in gdf.columns or "longitude" not in gdf.columns:
        # Built as two lists rather than `zip(*...)` so an empty frame does
        # not fail on tuple unpacking.
        coords = [_geo_latlon(g) for g in gdf.geometry]
        gdf["latitude"] = [lat for lat, _ in coords]
        gdf["longitude"] = [lon for _, lon in coords]

    # 6) (Safety) keep only veterinary amenity rows if column present
    if "amenity" in gdf.columns:
        is_vet = gdf["amenity"].fillna("").astype(str).str.lower().eq("veterinary")
        # .copy() so callers can assign to the result without chained-assignment warnings
        gdf = gdf[is_vet].copy()

    return gdf

# --- run it ---
# Cleans the normalized OSM clinics produced by the step-1 cell (`clinics_gdf`).
clinics_clean = clean_clinics_target_schema(clinics_gdf, country="DE", postcode_as_text=True)

# quick peek / save
# `display` is provided by IPython/Jupyter; the CSV is written without the geometry column.
display(clinics_clean.head())
clinics_clean.drop(columns="geometry").to_csv("berlin_vet_clinics_clean.csv", index=False, encoding="utf-8")


In [None]:
# dedupe on raw OSM features
# Guarantees an `osmid` column first, then keeps the first row per osmid and
# reports how many rows were removed.
gdf_raw = _ensure_osmid_column(gdf_raw)
before = len(gdf_raw)
gdf_raw = gdf_raw.drop_duplicates(subset=["osmid"], keep="first").copy()
print(f"Dropped {before - len(gdf_raw)} exact duplicates on osmid")


In [None]:
# Keep the first row per clinic_id in the normalized clinics frame.
# NOTE(review): in cell order this runs after `clinics_clean` was derived from
# `clinics_gdf` and saved, so it does not affect that output — confirm intent.
before = len(clinics_gdf)
clinics_gdf = clinics_gdf.drop_duplicates(subset=["clinic_id"], keep="first").copy()
print(f"Dropped {before - len(clinics_gdf)} exact duplicates on clinic_id")


Aligning BPT and TAEK sources

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point

# ---------- light utilities ----------
def pick_first_col(df: pd.DataFrame, candidates):
    """Return the column of ``df`` named by the first matching candidate.

    ``candidates`` is searched in order. When none of the names is a column of
    ``df``, an all-``pd.NA`` object Series sharing ``df``'s index is returned.
    """
    present = [name for name in candidates if name in df.columns]
    if present:
        return df[present[0]]
    return pd.Series(pd.NA, index=df.index, dtype="object")

# Target-schema column -> candidate source column names, in priority order.
# `standardize_from_csv` takes the first candidate that exists in the input
# (exact, case-sensitive match on the column name).
SYNONYMS = {
    "clinic_id":        ["clinic_id","osmid","osm_id","id","uid","facility_id"],
    "clinic_name":      ["clinic_name","name","practice","praxis","facility_name","betrieb"],
    "street":           ["street","addr:street","addr_street","address_street","strasse","straße"],
    "house_number":     ["house_number","addr:housenumber","housenumber","hnr","nr"],
    "postcode":         ["postcode","addr:postcode","zip","plz","postleitzahl"],
    "city":             ["city","addr:city","ort","stadt","gemeinde"],
    "phone_number":     ["phone_number","contact:phone","phone","telefon","tel"],
    "website":          ["website","contact:website","url","homepage","web"],
    "email":            ["email","contact:email","mail","e-mail"],
    "opening_hours":    ["opening_hours","hours","sprechzeiten","openinghours"],
    "operator":         ["operator","betreiber","owner"],
    "brand":            ["brand","kette"],
    "speciality":       ["speciality","specialty","veterinary:speciality","fachrichtung"],
    "wheelchair_acces": ["wheelchair_acces","wheelchair_access","wheelchair","barrierefrei","barrierefreiheit"],
    "amenity":          ["amenity","type","category"],
    "latitude":         ["latitude","lat","y"],
    "longitude":        ["longitude","lon","lng","x"],
}

def standardize_from_csv(df: pd.DataFrame, source_name: str, explicit_map: dict | None = None,
                         default_amenity: str = "veterinary") -> gpd.GeoDataFrame:
    """
    Map arbitrary CSV columns into the target clinic schema.

    - ``explicit_map``: optional dict ``{target_col: "source_col"}`` overriding
      the ``SYNONYMS`` auto-mapping for that target (ignored when the named
      source column does not exist).
    - ``amenity`` is set to ``default_amenity`` when no amenity value is found
      at all.
    - Rows without a ``clinic_id`` get ``"<source_name>_<row position>"``.
    - ``latitude``/``longitude`` are coerced to numbers (unparseable values
      become missing); geometry is a Point where both are present, else None.
      CRS = EPSG:4326.
    - Adds ``full_address``, empty ``district_id``/``neighbourhood_id``
      placeholders and a ``source`` column to track provenance.
    """
    explicit_map = explicit_map or {}
    out = pd.DataFrame(index=df.index)

    # 1) map all target fields
    for target, candidates in SYNONYMS.items():
        override = explicit_map.get(target)
        if override is not None and override in df.columns:
            out[target] = df[override]
        else:
            out[target] = pick_first_col(df, candidates)

    # Defaults / fixes
    if out["amenity"].isna().all():
        out["amenity"] = default_amenity

    # Guarantee an id (if missing); generated ids carry the source prefix so
    # they cannot collide across sources
    if out["clinic_id"].isna().any():
        fallback_ids = pd.Series(
            [f"{source_name}_{i}" for i in range(len(out))], index=out.index
        )
        out["clinic_id"] = out["clinic_id"].fillna(fallback_ids)

    # Build full_address (simple concatenation; will be rebuilt again after cleaning)
    def _addr_part(value):
        # CSV readers deliver postcodes/house numbers as numbers when every
        # value is numeric, so non-string values are rendered instead of dropped.
        if isinstance(value, str):
            return value.strip()
        if pd.isna(value):
            return ""
        if isinstance(value, float) and value.is_integer():
            value = int(value)  # 10115.0 -> "10115"
        return str(value)

    address_rows = zip(out["street"], out["house_number"], out["postcode"], out["city"])
    out["full_address"] = [
        ", ".join(part for part in map(_addr_part, row) if part) or pd.NA
        for row in address_rows
    ]

    # District placeholders
    out["district_id"] = pd.NA
    out["neighbourhood_id"] = pd.NA

    # 2) to GeoDataFrame (geometry from lat/lon if available)
    # Coerce first so one malformed coordinate does not abort the whole import.
    out["latitude"] = pd.to_numeric(out["latitude"], errors="coerce")
    out["longitude"] = pd.to_numeric(out["longitude"], errors="coerce")
    geom = [
        Point(float(lon), float(lat)) if (pd.notna(lat) and pd.notna(lon)) else None
        for lat, lon in zip(out["latitude"], out["longitude"])
    ]
    gdf = gpd.GeoDataFrame(out, geometry=geom, crs="EPSG:4326")

    # provenance
    gdf["source"] = source_name

    return gdf

# ---------- ALIGN TWO CSVs ----------
csv_a = "/Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/sources/taek_berlin_emergency_parsed_v3.csv"
csv_b = "/Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/sources/bpt_tierarztsuche_berlin.csv"

df_a = pd.read_csv(csv_a)
df_b = pd.read_csv(csv_b)

# Optional explicit {target_col: source_col} overrides per source. They are
# picked up when an earlier cell defined them; otherwise the SYNONYMS
# auto-mapping is used (previously an undefined name raised NameError here).
colmap_a = globals().get("colmap_a") or {}
colmap_b = globals().get("colmap_b") or {}

gdf_a_raw = standardize_from_csv(df_a, source_name="A", explicit_map=colmap_a)
gdf_b_raw = standardize_from_csv(df_b, source_name="B", explicit_map=colmap_b)

# ---------- CLEAN  ----------
gdf_a = clean_clinics_target_schema(gdf_a_raw, country="DE", postcode_as_text=True)
gdf_b = clean_clinics_target_schema(gdf_b_raw, country="DE", postcode_as_text=True)

# ---------- DEDUPE WITHIN EACH SOURCE (exact ID) ----------
gdf_a = gdf_a.drop_duplicates(subset=["clinic_id"], keep="first").copy()
gdf_b = gdf_b.drop_duplicates(subset=["clinic_id"], keep="first").copy()

# ---------- MERGE SOURCES ----------
combined = pd.concat([gdf_a, gdf_b], ignore_index=True)

# If both sources may share the same ids, keep first occurrence:
combined = combined.drop_duplicates(subset=["clinic_id"], keep="first").copy()

# ---------- SOFT DEDUPE (no/weak IDs): name + address key ----------
def _norm(s):
    """Lower-cased, NFKC-normalised, whitespace-collapsed text; "" for missing."""
    from unicodedata import normalize
    import re
    if pd.isna(s):
        return ""
    s = normalize("NFKC", str(s)).strip().lower()
    s = re.sub(r"\s+", " ", s)
    return s

street_key = combined["street"].map(_norm)
combined["__name_key"] = combined["clinic_name"].map(_norm)
combined["__addr_key"] = (street_key + " " +
                          combined["house_number"].map(_norm) + "|" +
                          combined["postcode"].map(_norm))

# Only rows that actually have a name AND a street can be identified by this
# key. Without the guard every row lacking both would share the same empty key
# and all but the first would be dropped.
has_full_key = combined["__name_key"].ne("") & street_key.ne("")
is_soft_dupe = has_full_key & combined.duplicated(subset=["__name_key", "__addr_key"], keep="first")

before_soft = len(combined)
combined = combined[~is_soft_dupe].copy()
print(f"Soft de-dupe (name+address): {before_soft - len(combined)} rows dropped")

# ---------- OPTIONAL: SPATIAL + NAME MERGE (node vs polygon–like) ----------
# Reuse your `dedupe_clinics` if you have (it prefers polygons, keeps point geom).
# Only run if we have some geometries. Any failure (including `dedupe_clinics`
# not being defined in this session) is reported and the step is skipped.
if "geometry" in combined and combined.geometry.notna().any():
    try:
        combined = dedupe_clinics(combined, distance_m=30, name_col="clinic_name", id_col="clinic_id")
    except Exception as e:
        print("Spatial dedupe skipped (error):", e)

# ---------- FINALIZE ----------
# Clean up helper keys
combined = combined.drop(columns=[c for c in ["__name_key", "__addr_key"] if c in combined.columns])

# Ensure CRS and lat/lon present
if getattr(combined, "crs", None) is None:
    combined = combined.set_crs(4326, allow_override=True)
elif combined.crs.to_epsg() != 4326:
    combined = combined.to_crs(4326)

if "latitude" not in combined.columns or "longitude" not in combined.columns:
    combined["latitude"] = combined.geometry.y
    combined["longitude"] = combined.geometry.x

# Save
combined.to_file("vet_clinics_aligned_merged.geojson", driver="GeoJSON")
combined.drop(columns="geometry").to_csv("vet_clinics_aligned_merged.csv", index=False, encoding="utf-8")

combined.head()


In [None]:
One more step in cleaning

In [None]:
import re
import pandas as pd

# --- Load only if the variable doesn't already exist ---
# Referencing the bare name raises NameError when it has not been defined in
# this session; in that case the merged CSV is read from the working directory.
try:
    vet_clinics_aligned_merged
except NameError:
    vet_clinics_aligned_merged = pd.read_csv("vet_clinics_aligned_merged.csv")

# --- Split helpers ---
_HN_TRAIL_RE = re.compile(r"""
    ^\s*(?P<street>.*?\S)[,\s]*(?P<hn>\d{1,5}(?:\s?[A-Za-z])?(?:\s?[\/\-–—]\s?\d{1,5}\s?[A-Za-z]?)?)\s*$
""", re.X)

_HN_LEAD_RE = re.compile(r"""
    ^\s*(?P<hn>\d{1,5}(?:\s?[A-Za-z])?(?:\s?[\/\-–—]\s?\d{1,5}\s?[A-Za-z]?)?)[,\s]+(?P<street>.+?)\s*$
""", re.X)

def _split_street_hn(s: str):
    if not isinstance(s, str) or not s.strip():
        return (s, None)
    t = re.sub(r"[–—−]", "-", s.strip())
    t = re.sub(r"\s+", " ", t)
    m = _HN_TRAIL_RE.match(t) or _HN_LEAD_RE.match(t)
    if m:
        street = m.group("street").strip().rstrip(",")
        hn = re.sub(r"\s+", "", m.group("hn"))
        return (street, hn)
    return (t, None)

def clean_street_and_house_number(df: pd.DataFrame,
                                  street_col="street",
                                  house_col="house_number") -> pd.DataFrame:
    if street_col not in df.columns:
        raise KeyError(f"'{street_col}' column not found")

    # split
    split_series = df[street_col].map(_split_street_hn)
    new_street   = split_series.map(lambda t: t[0])
    extracted_hn = split_series.map(lambda t: t[1])

    # ensure house_number exists
    if house_col not in df.columns:
        df[house_col] = pd.NA

    # fill only where current house_number is empty
    has_existing = df[house_col].astype(str).str.strip().ne("").fillna(False)
    df.loc[~has_existing, house_col] = extracted_hn

    # update street
    df[street_col] = new_street
    return df

# --- Apply ONLY to this dataframe ---
# `clean_street_and_house_number` edits the frame it is given and returns it.
vet_clinics_aligned_merged = clean_street_and_house_number(
    vet_clinics_aligned_merged,
    street_col="street",
    house_col="house_number"
)

# (Optional) save a cleaned copy
# This file is the RIGHT dataset read by the join cells further down.
vet_clinics_aligned_merged.to_csv("vet_clinics_aligned_merged_cleaned.csv", index=False, encoding="utf-8")

# quick peek
vet_clinics_aligned_merged[["street", "house_number"]].head(20)


Join together for the final file

In [None]:
import os
import pandas as pd
import geopandas as gpd

# --- Pick where to load LEFT from (tries a few common filenames) ---
# Tried in order; the first path that exists and loads is used.
CANDIDATES = [
    "berlin_vet_clinics_clean.csv",
    "berlin_vet_clinics_clean.geojson",
]

def _load_any(path):
    if path.lower().endswith((".geojson", ".gpkg", ".shp", ".json")):
        return gpd.read_file(path)
    return pd.read_csv(path)

# Use in-memory variable if it already exists
# NOTE(review): no visible cell assigns `berlin_vet_clinics_clean` (the cleaned
# frame above is called `clinics_clean`), so this presumably always falls
# through to the file candidates — confirm.
left = globals().get("berlin_vet_clinics_clean", None)
if left is None:
    for p in CANDIDATES:
        if os.path.exists(p):
            try:
                left = _load_any(p)
                print(f"Loaded LEFT from: {p} → {type(left).__name__}, shape={left.shape}")
                break
            except Exception as e:
                # A candidate that exists but cannot be read is reported and
                # the next one is tried.
                print(f"Failed to load {p}: {e}")

# Neither the in-memory variable nor any candidate file was usable.
if left is None:
    raise FileNotFoundError(
        "Couldn't load LEFT dataset. Put one of these files next to the notebook: "
        + ", ".join(CANDIDATES)
    )

# RIGHT dataset (your merged CSV)
right_path = "vet_clinics_aligned_merged_cleaned.csv"
right = pd.read_csv(right_path)
print(f"Loaded RIGHT from: {right_path} → DataFrame, shape={right.shape}")


In [None]:
%pip install rapidfuzz

import pandas as pd
import geopandas as gpd
import unicodedata, re, importlib.util as iu

# optional fuzzy matcher (no try/except)
if iu.find_spec("rapidfuzz") is not None:
    from rapidfuzz import fuzz, process
else:
    fuzz = process = None

# ---------- helpers ----------
def normalize_name(s):
    """Robust key for clinic names (case/accents/punct/stopwords stripped).

    Returns ``None`` for missing or blank input and for names that consist of
    filler words only.

    Filler words and legal forms are removed as whole words only, so e.g. the
    "ag" inside "Hagen" is left alone. Matching happens after accents have
    been stripped, therefore the filler list contains the de-accented
    spellings, with longer alternatives ahead of their prefixes
    ("veterinaer" before "vet").
    """
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return None
    s = str(s).strip()
    if not s:
        return None
    # de-accent → lowercase
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch)).lower()
    # remove common fillers/legal forms (whole words only)
    s = re.sub(
        r"\b(?:tierarztpraxis|tierklinik|tieraerztliche praxis|tierarztliche praxis"
        r"|praxis|veterinaer|veterinar|vet|gmbh|ag|kg|ug|e\.k\.)(?!\w)",
        " ",
        s,
    )
    s = re.sub(r"[\s\-/_,.&+()'’`]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s or None

def _as_df(path):
    """Load GeoJSON/GeoPackage as GeoDataFrame, else CSV as DataFrame."""
    try:
        return gpd.read_file(path)
    except Exception:
        return pd.read_csv(path)

# ---------- load data ----------
# LEFT: prefer a frame that is already in memory, else the CSV written by the
# cleaning step. (Previously only `berlin_vet_clinics_clean` was looked up;
# when that name is not defined `left` stayed None and the code below failed.)
left = globals().get("berlin_vet_clinics_clean")
if left is None:
    left = globals().get("clinics_clean")
if left is None:
    left = pd.read_csv("berlin_vet_clinics_clean.csv")
left = left.copy()  # the audit adds helper columns; keep the source frame untouched

right = pd.read_csv("vet_clinics_aligned_merged_cleaned.csv")

# ensure clinic_name column
for df in (left, right):
    if "clinic_name" not in df.columns and "name" in df.columns:
        df.rename(columns={"name": "clinic_name"}, inplace=True)

if "clinic_name" not in left.columns or "clinic_name" not in right.columns:
    raise KeyError("Both datasets must have a 'clinic_name' (or 'name') column.")

# ---------- normalized name keys ----------
left["name_key"]  = left["clinic_name"].map(normalize_name)
right["name_key"] = right["clinic_name"].map(normalize_name)

l_key = left.dropna(subset=["name_key"]).copy()
r_key = right.dropna(subset=["name_key"]).copy()

# ---------- schema diffs ----------
cols_left, cols_right = set(left.columns), set(right.columns)
new_cols_in_right     = sorted(cols_right - cols_left)
missing_cols_in_right = sorted(cols_left - cols_right)

print("=== Column differences ===")
print("New columns in RIGHT:", new_cols_in_right)
print("Columns missing in RIGHT (present in LEFT):", missing_cols_in_right)

# ---------- duplicates by normalized name ----------
dups_left  = l_key[l_key.duplicated("name_key", keep=False)].sort_values(["name_key","clinic_name"])
dups_right = r_key[r_key.duplicated("name_key", keep=False)].sort_values(["name_key","clinic_name"])

print("\n=== Duplicates (by normalized clinic_name) ===")
print(f"LEFT duplicates: {dups_left['name_key'].nunique()} names, {len(dups_left)} rows")
print(f"RIGHT duplicates: {dups_right['name_key'].nunique()} names, {len(dups_right)} rows")

# ---------- missing clinics (exact name_key match) ----------
missing_on_left  = r_key[~r_key["name_key"].isin(l_key["name_key"])]
missing_on_right = l_key[~l_key["name_key"].isin(r_key["name_key"])]

print("\n=== Missing clinics (by name) ===")
print(f"In RIGHT not in LEFT:  {missing_on_left['name_key'].nunique()} clinics")
print(f"In LEFT not in RIGHT:  {missing_on_right['name_key'].nunique()} clinics")

# ---------- write audit CSVs ----------
dups_left.to_csv("audit_left_duplicates_by_name.csv", index=False, encoding="utf-8")
dups_right.to_csv("audit_right_duplicates_by_name.csv", index=False, encoding="utf-8")
missing_on_left.to_csv("audit_missing_on_left_by_name.csv", index=False, encoding="utf-8")
missing_on_right.to_csv("audit_missing_on_right_by_name.csv", index=False, encoding="utf-8")

# ---------- optional fuzzy suggestions ----------
# Runs only when rapidfuzz could be imported (`process` is None otherwise).
if process is not None:
    print("\n=== Fuzzy suggestions (RIGHT → LEFT) ===")
    left_keys  = l_key[["name_key","clinic_name"]].drop_duplicates()
    right_keys = r_key[["name_key","clinic_name"]].drop_duplicates()
    right_unmatched = right_keys[~right_keys["name_key"].isin(left_keys["name_key"])]

    lookup = dict(left_keys.values)  # name_key -> clinic_name on the LEFT side
    left_key_list = list(left_keys["name_key"])

    suggestions = []
    for _, row in right_unmatched.iterrows():
        rk = row["name_key"]
        # extractOne returns (choice, score, index), or None when there is
        # nothing to compare against.
        best = process.extractOne(rk, left_key_list, scorer=fuzz.token_set_ratio)
        if best is None:
            continue
        match_key, score = best[0], best[1]
        suggestions.append({
            "right_clinic_name": row["clinic_name"],
            "left_clinic_name": lookup.get(match_key),
            "score": score,
        })

    suggestions = pd.DataFrame(
        suggestions, columns=["right_clinic_name", "left_clinic_name", "score"]
    ).sort_values("score", ascending=False)
    if suggestions.empty:
        print("(no suggestions)")
    else:
        print(suggestions.head(25).to_string(index=False))
          

In [None]:
import os, re, unicodedata
import pandas as pd
import geopandas as gpd

# --------- CONFIG: set your file paths here ----------
# Both files are produced by earlier cells of this notebook.
LEFT_PATH  = "berlin_vet_clinics_clean.csv"                # your main dataset
RIGHT_PATH = "vet_clinics_aligned_merged_cleaned.csv"      # the aligned/merged CSV
# ------------------------------------------------------

def _load_table(path):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing file: {path}")
    low = path.lower()
    if low.endswith((".geojson", ".gpkg", ".shp", ".json")):
        return gpd.read_file(path)
    return pd.read_csv(path)

def _norm_name(s):
    if pd.isna(s): return None
    s = unicodedata.normalize("NFKC", str(s)).strip()
    s = re.sub(r"\s+", " ", s).lower()
    s = re.sub(r"[^\w &]", "", s)  # strip punctuation except word chars/space/&
    return s or None

# 1) Load both datasets (LEFT can be GeoDataFrame or DataFrame)
left  = _load_table(LEFT_PATH)
right = _load_table(RIGHT_PATH)

# 2) Ensure `clinic_name` exists
for df in (left, right):
    if df is None:
        raise ValueError("A dataset failed to load and is None.")
    if "clinic_name" not in df.columns and "name" in df.columns:
        df.rename(columns={"name": "clinic_name"}, inplace=True)

if "clinic_name" not in left.columns or "clinic_name" not in right.columns:
    raise KeyError("Both datasets must have a `clinic_name` column (rename `name` -> `clinic_name`).")

# 3) Preserve geometry if LEFT is a GeoDataFrame
left_is_gdf = isinstance(left, gpd.GeoDataFrame)
left_geom = left.geometry if left_is_gdf else None
if left_is_gdf:
    left = pd.DataFrame(left.drop(columns=left.geometry.name))

# 4) Build normalized join key
left["name_key"]  = left["clinic_name"].map(_norm_name)
right["name_key"] = right["clinic_name"].map(_norm_name)
left  = left[~left["name_key"].isna()].copy()
right = right[~right["name_key"].isna()].copy()

# 5) Quick audit
cols_new_in_right = sorted(set(right.columns) - set(left.columns) - {"name_key"})
dup_left  = left[left.duplicated("name_key", keep=False)].sort_values("name_key")
dup_right = right[right.duplicated("name_key", keep=False)].sort_values("name_key")
missing_in_left = sorted(set(right["name_key"]) - set(left["name_key"]))

print(f"New columns in RIGHT not in LEFT: {cols_new_in_right}")
print(f"Duplicate rows in LEFT (by clinic_name): {dup_left.shape[0]}")
print(f"Duplicate rows in RIGHT (by clinic_name): {dup_right.shape[0]}")
print(f"Names present in RIGHT but missing in LEFT: {len(missing_in_left)}")

# 6) Bring columns from RIGHT → LEFT
to_add = cols_new_in_right.copy()
overlap_fill = [c for c in right.columns
                if c in left.columns and c not in ("clinic_id","clinic_name","name_key")]

# One RIGHT row per name (first wins). Without this, every duplicated name in
# RIGHT would multiply the matching LEFT rows in the join.
r_small = right[["name_key"] + to_add + overlap_fill].drop_duplicates("name_key", keep="first")
merged = left.merge(r_small, on="name_key", how="left", suffixes=("", "_r"))
# The join now yields exactly one output row per LEFT row, in LEFT order, so
# LEFT's labels can be restored; the geometry lookup in step 7 relies on them.
merged.index = left.index

# Fill NULLs in overlapping columns with RIGHT values
for c in overlap_fill:
    merged[c] = merged[c].fillna(merged[c + "_r"])
    merged.drop(columns=[c + "_r"], inplace=True)

# Remove helper key
merged.drop(columns=["name_key"], inplace=True)

# 7) Reattach geometry if LEFT was GeoDataFrame (keeping LEFT's own CRS)
if left_is_gdf:
    geom_crs = left_geom.crs if left_geom.crs is not None else "EPSG:4326"
    merged = gpd.GeoDataFrame(merged, geometry=left_geom.loc[merged.index], crs=geom_crs)

# 8) Save
out_csv = "berlin_vet_clinics_clean_enriched_from_aligned.csv"
merged.to_csv(out_csv, index=False, encoding="utf-8")
print("Saved:", out_csv, merged.shape)



In [None]:
last fixes

In [None]:
import pandas as pd, geopandas as gpd
from shapely.geometry import Point

def _ensure_geodf(df, crs="EPSG:4326"):
    """Return a GeoDataFrame copy of ``df`` in ``crs``.

    A GeoDataFrame is copied and labelled with ``crs`` when it has none, or
    reprojected when its CRS string differs. A plain DataFrame gets point
    geometries built from its ``longitude``/``latitude`` columns.

    Raises:
        ValueError: if a plain DataFrame lacks latitude/longitude columns.
    """
    if not isinstance(df, gpd.GeoDataFrame):
        # build geometry from lat/lon
        if not {"latitude", "longitude"}.issubset(df.columns):
            raise ValueError("Need latitude/longitude columns if input is a plain DataFrame.")
        frame = df.copy()
        points = gpd.points_from_xy(df["longitude"], df["latitude"])
        return gpd.GeoDataFrame(frame, geometry=points, crs=crs)

    result = df.copy()
    if result.crs is None:
        result.set_crs(crs, inplace=True, allow_override=True)
    elif result.crs.to_string() != crs:
        result = result.to_crs(crs)
    return result

def attach_ids_from_geojson(clinics, geojson_path, id_cols, out_cols):
    """
    Spatially join polygon attributes onto clinics (point-within-polygon).

    - clinics: DataFrame with latitude/longitude, or a GeoDataFrame.
    - geojson_path: polygon file readable by geopandas.
    - id_cols: columns in polygons to copy (e.g., ["district_id", "neighbourhood_id"])
    - out_cols: destination columns in clinics (same length as id_cols)

    Values already present in a destination column are kept; only missing
    entries are filled from the polygon. Returns a GeoDataFrame in EPSG:4326
    with one row per input clinic.

    Raises:
        ValueError: if ``id_cols`` and ``out_cols`` differ in length.
        KeyError: if a requested id column is missing from the polygons.
    """
    if len(id_cols) != len(out_cols):
        raise ValueError("id_cols and out_cols must have the same length.")

    gdf = _ensure_geodf(clinics, "EPSG:4326")
    polys = gpd.read_file(geojson_path)
    if polys.crs is None:
        polys = polys.set_crs("EPSG:4326", allow_override=True)
    else:
        polys = polys.to_crs("EPSG:4326")

    # If the polygon properties use different names, rename here once:
    # polys = polys.rename(columns={"bezirk_id":"district_id","ortsteil_id":"neighbourhood_id"})

    missing = [c for c in id_cols if c not in polys.columns]
    if missing:
        raise KeyError(f"Missing columns in polygons: {missing}")

    # Give the polygon attributes private names for the join. When clinics
    # already carries a column of the same name (e.g. an empty `district_id`
    # placeholder), sjoin suffixes both sides and the plain name is gone.
    temp_names = {src: f"__poly_attr_{i}" for i, src in enumerate(id_cols)}
    lookup = polys[list(temp_names) + [polys.geometry.name]].rename(columns=temp_names)

    # Left-overs from an earlier join would clash with sjoin's own columns.
    stale = [c for c in ("index_right", *temp_names.values()) if c in gdf.columns]
    gdf = gdf.drop(columns=stale)

    joined = gpd.sjoin(gdf, lookup, how="left", predicate="within")

    # Overlapping polygons produce several matches per clinic; keep the first
    # (only safe to detect via the index when the input labels were unique).
    if gdf.index.is_unique:
        joined = joined[~joined.index.duplicated(keep="first")]

    for src, dst in zip(id_cols, out_cols):
        found = joined[temp_names[src]]
        joined[dst] = joined[dst].fillna(found) if dst in joined.columns else found

    return joined.drop(columns=[*temp_names.values(), "index_right"], errors="ignore")

# --- Example usage (edit paths & property names to your files) ---
# If your clinics are in CSV:
clinics = pd.read_csv("berlin_vet_clinics_clean_enriched_from_aligned.csv")
# If they’re already a GeoDataFrame:
# clinics = gpd.read_file("berlin_vet_clinics_normalized.geojson")

# Fill from districts polygons
# NOTE(review): both calls read the same "lor_ortsteile.geojson"; confirm that
# this file carries a `district_id` property as well as `neighbourhood_id`.
clinics = attach_ids_from_geojson(
    clinics,
    "lor_ortsteile.geojson",
    id_cols=["district_id"],           # or ["bezirk_id"] then use a rename above
    out_cols=["district_id"]
)

# Then fill from neighbourhood polygons
clinics = attach_ids_from_geojson(
    clinics,
    "lor_ortsteile.geojson",
    id_cols=["neighbourhood_id"],      # or ["ortsteil_id"]
    out_cols=["neighbourhood_id"]
)

# Save back out
# NOTE: both writers are commented out, so the enriched frame only lives in
# memory until one of them is enabled.
# clinics.to_file("berlin_vet_clinics_with_admin.gpkg", layer="clinics", driver="GPKG")
# clinics.drop(columns="geometry").to_csv("berlin_vet_clinics_with_admin.csv", index=False, encoding="utf-8")
