cleaning osm data


step1

In [19]:
import pandas as pd
import geopandas as gpd
import numpy as np

def _ensure_osmid_column(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Make sure we have an 'osmid' column (OSMnx can put it in the index)."""
    if 'osmid' in gdf.columns:
        return gdf
    if isinstance(gdf.index, pd.MultiIndex):
        if 'osmid' in gdf.index.names:
            return gdf.reset_index()
        gdf = gdf.reset_index()
    else:
        gdf = gdf.reset_index()
    # Fallbacks
    if 'osmid' not in gdf.columns:
        if 'id' in gdf.columns:
            gdf = gdf.rename(columns={'id': 'osmid'})
        else:
            gdf['osmid'] = gdf.index.astype(str)
    return gdf

def _coalesce_cols(df: pd.DataFrame, cols):
    """Row-wise first-non-null across a list of optional columns."""
    out = pd.Series(pd.NA, index=df.index, dtype="object")
    for c in cols:
        if c in df.columns:
            out = out.fillna(df[c])
    return out

def normalize_clinic_geometries(gdf: gpd.GeoDataFrame, keep_footprint=True) -> gpd.GeoDataFrame:
    """Set CRS to EPSG:4326, convert non-points to centroids; optionally keep original footprint."""
    # CRS → 4326
    if gdf.crs is None:
        gdf = gdf.set_crs(4326, allow_override=True)
    else:
        gdf = gdf.to_crs(4326)

    if keep_footprint:
        gdf['geometry_footprint'] = gdf.geometry

    # Point or centroid
    geomtype = gdf.geometry.geom_type
    centroids = gdf.geometry.centroid
    new_geom = np.where(geomtype.eq('Point'), gdf.geometry, centroids)
    gdf = gdf.set_geometry(new_geom)
    gdf = gdf.set_crs(4326, allow_override=True)

    # Lat/Lon from point
    gdf['latitude']  = gdf.geometry.y
    gdf['longitude'] = gdf.geometry.x
    return gdf

def prune_and_rename_clinics(gdf: gpd.GeoDataFrame, keep_footprint=True) -> gpd.GeoDataFrame:
    gdf = _ensure_osmid_column(gdf).copy()

    # Filter to amenity=veterinary (if available)
    if 'amenity' in gdf.columns:
        gdf = gdf[gdf['amenity'].astype(str).str.lower().eq('veterinary')].copy()

    # Geometry normalization (point + lat/lon, keep footprint optionally)
    gdf = normalize_clinic_geometries(gdf, keep_footprint=keep_footprint)

    # Coalesce contact fields
    phone   = _coalesce_cols(gdf, ['contact:phone', 'phone'])
    website = _coalesce_cols(gdf, ['contact:website', 'website'])
    email   = _coalesce_cols(gdf, ['contact:email', 'email'])

    # Build full address
    def _full_address(row):
        a = " ".join([str(row.get('addr:street') or "").strip(),
                      str(row.get('addr:housenumber') or "").strip()]).strip()
        b = " ".join([str(row.get('addr:postcode') or "").strip(),
                      str(row.get('addr:city') or "").strip()]).strip()
        parts = [p for p in [a, b] if p]
        return ", ".join(parts) if parts else pd.NA

    gdf['full_address'] = gdf.apply(_full_address, axis=1)

    # Target schema mapping (rename/pass-through)
    rename_map = {
        'osmid': 'clinic_id',
        'name': 'clinic_name',
        'addr:street': 'street',
        'addr:housenumber': 'house_number',
        'addr:postcode': 'postcode',
        'addr:city': 'city',
        'veterinary:speciality': 'speciality',
        'wheelchair': 'wheelchair_acces',  # as requested (one 's')
        # keep: amenity, opening_hours, operator, brand, geometry
    }
    for old, new in rename_map.items():
        if old in gdf.columns:
            gdf = gdf.rename(columns={old: new})

    # Inject coalesced contact fields to target names
    gdf['phone_number'] = phone
    gdf['website']      = website
    gdf['email']        = email

    # Add district/neighbourhood placeholders (to be enriched later)
    gdf['district_id']      = pd.NA
    gdf['neighbourhood_id'] = pd.NA

    # Final column order
    final_cols = [
        'clinic_id', 'clinic_name',
        'street', 'house_number', 'postcode', 'city',
        'district_id', 'neighbourhood_id',
        'phone_number', 'website', 'email',
        'opening_hours', 'operator', 'brand',
        'speciality', 'wheelchair_acces',
        'full_address', 'latitude', 'longitude',
        'amenity', 'geometry'
    ]

    # Keep only requested columns that actually exist
    existing = [c for c in final_cols if c in gdf.columns]
    out = gdf[existing].copy()

    # Ensure proper dtypes for id/postcode if desired (keep as text here)
    if 'clinic_id' in out.columns:
        out['clinic_id'] = out['clinic_id'].astype(str)
    if 'postcode' in out.columns:
        out['postcode'] = out['postcode'].astype(str)

    return out


In [20]:
# 1) geting raw OSM features (point/polygon geometries)
from osmnx.features import features_from_place as get_features  # OSMnx ≥2
gdf_raw = get_features("Berlin, Germany", {"amenity": "veterinary"}).reset_index()

# 2) running the pruning/renaming + geometry normalization
clinics_gdf = prune_and_rename_clinics(gdf_raw, keep_footprint=True)

# 3) quick sanity checks
clinics_gdf.head(), clinics_gdf.crs, clinics_gdf.shape

clinics_gdf.to_file("berlin_vet_clinics_normalized.geojson", driver="GeoJSON")
clinics_gdf.drop(columns="geometry").to_csv("berlin_vet_clinics_normalized.csv", index=False, encoding="utf-8")



  centroids = gdf.geometry.centroid


step2

In [21]:
# --- Apply cleaning helpers to the target-schema GeoDataFrame ---

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

def _geo_latlon(g):
    """Return (lat, lon) from geometry; centroid fallback; None if missing."""
    if g is None or g.is_empty:
        return (None, None)
    p = g
    if not isinstance(p, Point):
        try:
            p = g.centroid
        except Exception:
            return (None, None)
    return (float(p.y), float(p.x))

def clean_clinics_target_schema(clinics_gdf, country="DE", postcode_as_text=True):
    """
    Use your clean_vet_df() (which expects OSM-style column names),
    then map cleaned values back to the target schema on clinics_gdf.
    """
    gdf = clinics_gdf.copy()

    # 1) Build a temporary OSM-style frame the cleaner expects
    tmp = pd.DataFrame({
        "name": gdf.get("clinic_name"),
        "addr:street": gdf.get("street"),
        "addr:housenumber": gdf.get("house_number"),
        "addr:city": gdf.get("city"),
        "addr:postcode": gdf.get("postcode"),
        "phone": gdf.get("phone_number"),
        "website": gdf.get("website"),
        "email": gdf.get("email"),
        "opening_hours": gdf.get("opening_hours"),
        "operator": gdf.get("operator"),
        "brand": gdf.get("brand"),
        "wheelchair": gdf.get("wheelchair_acces", gdf.get("wheelchair_access")),
        "emergency": gdf.get("emergency"),
    })

    # 2) Clean using helper
    tmp_clean = clean_vet_df(tmp, country=country, postcode_as_text=postcode_as_text)

    # 3) Write cleaned values back to target-schema columns
    gdf["clinic_name"]   = tmp_clean["name"]
    gdf["street"]        = tmp_clean["addr:street"]
    gdf["house_number"]  = tmp_clean["addr:housenumber"]
    gdf["city"]          = tmp_clean["addr:city"]
    gdf["postcode"]      = tmp_clean["addr:postcode"]
    gdf["phone_number"]  = tmp_clean.get("phone")
    gdf["website"]       = tmp_clean.get("website")
    gdf["email"]         = tmp_clean.get("email")
    gdf["opening_hours"] = tmp_clean.get("opening_hours")
    gdf["operator"]      = tmp_clean.get("operator")
    gdf["wheelchair_acces"] = tmp_clean.get("wheelchair")  # keep your column name
    gdf["emergency"]        = tmp_clean.get("emergency")

    # Optional: bring over the validity flag from your helper
    if "opening_hours_valid" in tmp_clean.columns:
        gdf["opening_hours_valid"] = tmp_clean["opening_hours_valid"]

    # 4) Rebuild full_address (street + house + postcode + city) when available
    def _addr_row(r):
        parts = [r.get("street"), r.get("house_number"), r.get("postcode"), r.get("city")]
        parts = [p for p in parts if isinstance(p, str) and p.strip() != ""]
        return ", ".join(parts) if parts else None
    gdf["full_address"] = gdf.apply(_addr_row, axis=1)

    # 5) Ensure geometry CRS and lat/lon columns exist (point or centroid)
    if getattr(gdf, "crs", None) is None:
        try:
            gdf.set_crs(epsg=4326, inplace=True, allow_override=True)
        except Exception:
            pass
    elif gdf.crs.to_epsg() != 4326:
        gdf = gdf.to_crs(4326)

    if "latitude" not in gdf.columns or "longitude" not in gdf.columns:
        lats, lons = zip(*gdf.geometry.map(_geo_latlon))
        gdf["latitude"] = lats
        gdf["longitude"] = lons

    # 6) (Safety) keep only veterinary amenity rows if column present
    if "amenity" in gdf.columns:
        gdf = gdf[gdf["amenity"].fillna("").astype(str).str.lower().eq("veterinary")]

    return gdf

# --- run it ---
clinics_clean = clean_clinics_target_schema(clinics_gdf, country="DE", postcode_as_text=True)

# quick peek / save
display(clinics_clean.head())
clinics_clean.drop(columns="geometry").to_csv("berlin_vet_clinics_clean.csv", index=False, encoding="utf-8")


Unnamed: 0,clinic_id,clinic_name,street,house_number,postcode,city,district_id,neighbourhood_id,phone_number,website,...,opening_hours,operator,wheelchair_acces,full_address,latitude,longitude,amenity,geometry,emergency,opening_hours_valid
0,268917040,Tierarztpraxis am Urban,Baerwaldstraße,69.0,10961.0,Berlin,,,,,...,"Mo-Sa 10:00-12:00, Mo 17:00-19:00, Tu,We,Fr 16...",,no,"Baerwaldstraße, 69, 10961, Berlin",52.495684,13.405233,veterinary,POINT (13.40523 52.49568),,
1,299795048,Dr. med. vet. Elke Hartwig,Straße 48,67.0,13125.0,Berlin,,,49309437820.0,http://www.tierarztpraxis-hartwig.de/,...,"Mo,Tu,Th,Fr 10:00-12:00, Mo-Fr 15:00-18:00",,limited,"Straße 48, 67, 13125, Berlin",52.606286,13.479555,veterinary,POINT (13.47955 52.60629),,
2,347294456,Tierarztpraxis Dr. Bernhard Sörensen,Königsberger Straße,36.0,12207.0,Berlin,,,49307738321.0,https://www.tierarztpraxis-soerensen.de/,...,"Mo-Fr 09:00-20:00; Sa, Su 10:00-18:00",,yes,"Königsberger Straße, 36, 12207, Berlin",52.429722,13.320133,veterinary,POINT (13.32013 52.42972),,
3,394867279,Tierarztpraxis Jeanette Koepsel,,,,,,,,,...,,,,,52.535199,13.270573,veterinary,POINT (13.27057 52.5352),,
4,411550894,Kleintierarztpraxis Berlin Kaulsdorf,Planitzstraße,19.0,12621.0,Berlin,,,493053018585.0,https://www.tierarzt-kaulsdorf.de/,...,"Mo-Fr 09:00-19:00 open ""tel. Terminvereinbarun...",Dr. Berit Miels;Dr. Mathias Kochert,,"Planitzstraße, 19, 12621, Berlin",52.509511,13.589635,veterinary,POINT (13.58964 52.50951),,


In [22]:
# dedupe on raw OSM features
gdf_raw = _ensure_osmid_column(gdf_raw)
before = len(gdf_raw)
gdf_raw = gdf_raw.drop_duplicates(subset=["osmid"], keep="first").copy()
print(f"Dropped {before - len(gdf_raw)} exact duplicates on osmid")


Dropped 0 exact duplicates on osmid


In [23]:
before = len(clinics_gdf)
clinics_gdf = clinics_gdf.drop_duplicates(subset=["clinic_id"], keep="first").copy()
print(f"Dropped {before - len(clinics_gdf)} exact duplicates on clinic_id")


Dropped 0 exact duplicates on clinic_id


alligning bpt and taek

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point

# ---------- light utilities ----------
def pick_first_col(df: pd.DataFrame, candidates):
    """Return the first existing column (Series) from candidates or an NA Series."""
    for c in candidates:
        if c in df.columns:
            return df[c]
    return pd.Series(pd.NA, index=df.index, dtype="object")

SYNONYMS = {
    "clinic_id":        ["clinic_id","osmid","osm_id","id","uid","facility_id"],
    "clinic_name":      ["clinic_name","name","practice","praxis","facility_name","betrieb"],
    "street":           ["street","addr:street","addr_street","address_street","strasse","straße"],
    "house_number":     ["house_number","addr:housenumber","housenumber","hnr","nr"],
    "postcode":         ["postcode","addr:postcode","zip","plz","postleitzahl"],
    "city":             ["city","addr:city","ort","stadt","gemeinde"],
    "phone_number":     ["phone_number","contact:phone","phone","telefon","tel"],
    "website":          ["website","contact:website","url","homepage","web"],
    "email":            ["email","contact:email","mail","e-mail"],
    "opening_hours":    ["opening_hours","hours","sprechzeiten","openinghours"],
    "operator":         ["operator","betreiber","owner"],
    "brand":            ["brand","kette"],
    "speciality":       ["speciality","specialty","veterinary:speciality","fachrichtung"],
    "wheelchair_acces": ["wheelchair_acces","wheelchair_access","wheelchair","barrierefrei","barrierefreiheit"],
    "amenity":          ["amenity","type","category"],
    "latitude":         ["latitude","lat","y"],
    "longitude":        ["longitude","lon","lng","x"],
}

def standardize_from_csv(df: pd.DataFrame, source_name: str, explicit_map: dict | None = None,
                         default_amenity: str = "veterinary") -> gpd.GeoDataFrame:
    """
    Map arbitrary CSV columns into your target schema.
    - `explicit_map`: optional dict {target_col: "source_col"} to override auto-mapping.
    - Adds a `source` column to track provenance.
    - Creates geometry from lat/lon if present; CRS = EPSG:4326.
    """
    explicit_map = explicit_map or {}
    out = pd.DataFrame(index=df.index)

    # 1) map all target fields
    for target, candidates in SYNONYMS.items():
        if target in explicit_map and explicit_map[target] in df.columns:
            out[target] = df[explicit_map[target]]
        else:
            out[target] = pick_first_col(df, candidates)

    # Defaults / fixes
    if out["amenity"].isna().all():
        out["amenity"] = default_amenity

    # Guarantee an id (if missing), prefix with source to avoid cross-source collisions
    if out["clinic_id"].isna().any():
        out["clinic_id"] = out["clinic_id"].fillna(pd.Series([f"{source_name}_{i}" for i in range(len(out))], index=out.index))

    # Build full_address (simple concatenation; will be rebuilt again after cleaning)
    def _addr_row(r):
        parts = [r.get("street"), r.get("house_number"), r.get("postcode"), r.get("city")]
        parts = [p for p in parts if isinstance(p, str) and p.strip() != ""]
        return ", ".join(parts) if parts else pd.NA
    out["full_address"] = out.apply(_addr_row, axis=1)

    # District placeholders
    out["district_id"] = pd.NA
    out["neighbourhood_id"] = pd.NA

    # 2) to GeoDataFrame (geometry from lat/lon if available)
    has_latlon = out["latitude"].notna() & out["longitude"].notna()
    geom = [Point(float(lon), float(lat)) if (pd.notna(lat) and pd.notna(lon)) else None
            for lat, lon in zip(out["latitude"], out["longitude"])]
    gdf = gpd.GeoDataFrame(out, geometry=geom, crs="EPSG:4326")

    # provenance
    gdf["source"] = source_name

    return gdf

# ---------- ALIGN TWO CSVs ----------
csv_a = "/Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/sources/taek_berlin_emergency_parsed_v3.csv"
csv_b = "/Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/sources/bpt_tierarztsuche_berlin.csv"

df_a = pd.read_csv(csv_a)
df_b = pd.read_csv(csv_b)



gdf_a_raw = standardize_from_csv(df_a, source_name="A", explicit_map=colmap_a)
gdf_b_raw = standardize_from_csv(df_b, source_name="B", explicit_map=colmap_b)

# ---------- CLEAN  ----------
gdf_a = clean_clinics_target_schema(gdf_a_raw, country="DE", postcode_as_text=True)
gdf_b = clean_clinics_target_schema(gdf_b_raw, country="DE", postcode_as_text=True)

# ---------- DEDUPE WITHIN EACH SOURCE (exact ID) ----------
gdf_a = gdf_a.drop_duplicates(subset=["clinic_id"], keep="first").copy()
gdf_b = gdf_b.drop_duplicates(subset=["clinic_id"], keep="first").copy()

# ---------- MERGE SOURCES ----------
combined = pd.concat([gdf_a, gdf_b], ignore_index=True)

# If both sources may share the same ids, keep first occurrence:
combined = combined.drop_duplicates(subset=["clinic_id"], keep="first").copy()

# ---------- SOFT DEDUPE (no/weak IDs): name + address key ----------
def _norm(s):
    from unicodedata import normalize
    import re
    if pd.isna(s): return ""
    s = normalize("NFKC", str(s)).strip().lower()
    s = re.sub(r"\s+", " ", s)
    return s

combined["__name_key"] = combined["clinic_name"].map(_norm)
combined["__addr_key"] = (combined["street"].map(_norm) + " " +
                          combined["house_number"].map(_norm) + "|" +
                          combined["postcode"].map(_norm))

before_soft = len(combined)
combined = combined.drop_duplicates(subset=["__name_key","__addr_key"], keep="first").copy()
print(f"Soft de-dupe (name+address): {before_soft - len(combined)} rows dropped")

# ---------- OPTIONAL: SPATIAL + NAME MERGE (node vs polygon–like) ----------
# Reuse your `dedupe_clinics` if you have (it prefers polygons, keeps point geom).
# Only run if we have some geometries.
if "geometry" in combined and combined.geometry.notna().any():
    try:
        combined = dedupe_clinics(combined, distance_m=30, name_col="clinic_name", id_col="clinic_id")
    except Exception as e:
        print("Spatial dedupe skipped (error):", e)

# ---------- FINALIZE ----------
# Clean up helper keys
combined = combined.drop(columns=[c for c in ["__name_key","__addr_key"] if c in combined.columns])

# Ensure CRS and lat/lon present
if getattr(combined, "crs", None) is None:
    combined = combined.set_crs(4326, allow_override=True)
elif combined.crs.to_epsg() != 4326:
    combined = combined.to_crs(4326)

if "latitude" not in combined.columns or "longitude" not in combined.columns:
    combined["latitude"]  = combined.geometry.y
    combined["longitude"] = combined.geometry.x

# Save
combined.to_file("vet_clinics_aligned_merged.geojson", driver="GeoJSON")
combined.drop(columns="geometry").to_csv("vet_clinics_aligned_merged.csv", index=False, encoding="utf-8")

combined.head()


Soft de-dupe (name+address): 0 rows dropped
Exact-ID dedup: 0 dropped


  combined = pd.concat([gdf_a, gdf_b], ignore_index=True)


Unnamed: 0,clinic_id,clinic_name,street,house_number,postcode,city,phone_number,website,email,opening_hours,...,latitude,longitude,full_address,district_id,neighbourhood_id,geometry,source,emergency,opening_hours_valid,footprint
0,A_0,,Alt-Biesdorf,22,12683,Berlin,49305143760,,,,...,,,"Alt-Biesdorf, 22, 12683, Berlin",,,,A,,,
1,A_1,,Märkische Allee,258,12679,Berlin,49309322093,,,,...,,,"Märkische Allee, 258, 12679, Berlin",,,,A,,,
2,A_2,,Potsdamer Str.,23,14163,Berlin,4930201805750,,,,...,,,"Potsdamer Str., 23, 14163, Berlin",,,,A,,,
3,A_3,,Uhlandstr,151,10719,Berlin,493023362627,,,,...,,,"Uhlandstr, 151, 10719, Berlin",,,,A,,,
4,A_4,,Scharnweberstr.,136,13405,Berlin,49304127357,,,,...,,,"Scharnweberstr., 136, 13405, Berlin",,,,A,,,


In [None]:
one mopre step in cleaning

In [26]:
import re
import pandas as pd

# --- Load only if the variable doesn't already exist ---
try:
    vet_clinics_aligned_merged
except NameError:
    vet_clinics_aligned_merged = pd.read_csv("vet_clinics_aligned_merged.csv")

# --- Split helpers ---
_HN_TRAIL_RE = re.compile(r"""
    ^\s*(?P<street>.*?\S)[,\s]*(?P<hn>\d{1,5}(?:\s?[A-Za-z])?(?:\s?[\/\-–—]\s?\d{1,5}\s?[A-Za-z]?)?)\s*$
""", re.X)

_HN_LEAD_RE = re.compile(r"""
    ^\s*(?P<hn>\d{1,5}(?:\s?[A-Za-z])?(?:\s?[\/\-–—]\s?\d{1,5}\s?[A-Za-z]?)?)[,\s]+(?P<street>.+?)\s*$
""", re.X)

def _split_street_hn(s: str):
    if not isinstance(s, str) or not s.strip():
        return (s, None)
    t = re.sub(r"[–—−]", "-", s.strip())
    t = re.sub(r"\s+", " ", t)
    m = _HN_TRAIL_RE.match(t) or _HN_LEAD_RE.match(t)
    if m:
        street = m.group("street").strip().rstrip(",")
        hn = re.sub(r"\s+", "", m.group("hn"))
        return (street, hn)
    return (t, None)

def clean_street_and_house_number(df: pd.DataFrame,
                                  street_col="street",
                                  house_col="house_number") -> pd.DataFrame:
    if street_col not in df.columns:
        raise KeyError(f"'{street_col}' column not found")

    # split
    split_series = df[street_col].map(_split_street_hn)
    new_street   = split_series.map(lambda t: t[0])
    extracted_hn = split_series.map(lambda t: t[1])

    # ensure house_number exists
    if house_col not in df.columns:
        df[house_col] = pd.NA

    # fill only where current house_number is empty
    has_existing = df[house_col].astype(str).str.strip().ne("").fillna(False)
    df.loc[~has_existing, house_col] = extracted_hn

    # update street
    df[street_col] = new_street
    return df

# --- Apply ONLY to this dataframe ---
vet_clinics_aligned_merged = clean_street_and_house_number(
    vet_clinics_aligned_merged,
    street_col="street",
    house_col="house_number"
)

# (Optional) save a cleaned copy
vet_clinics_aligned_merged.to_csv("vet_clinics_aligned_merged_cleaned.csv", index=False, encoding="utf-8")

# quick peek
vet_clinics_aligned_merged[["street", "house_number"]].head(20)


Unnamed: 0,street,house_number
0,Alt-Biesdorf,22
1,Märkische Allee,258
2,Potsdamer Str.,23
3,Uhlandstr,151
4,Scharnweberstr.,136
5,Königsweg,65
6,Oertzenweg,19b
7,Königsweg,63
8,Königsberger Str.,36
9,Kanada Allee,12
