Cleaning OSM data


Step 1

In [None]:
# transform_osm_vets.py
# Clean + map OSM veterinary clinics (Berlin) for DB import

import pandas as pd
import numpy as np
from pathlib import Path

# === Step 1: Load exported OSM data ===
# (Use the stable file you saved with your fetch script.)
raw_file = "./vet_clinics/sources/berlin_vet_clinics_osm.csv"

# Read every column as text. With pandas' default type inference, identifier-like
# columns are parsed as numbers: a postcode column containing gaps becomes float
# and later stringifies as "10115.0", and an all-digit phone number loses its
# leading zero. The only truly numeric fields (lat/lon) are converted explicitly
# with pd.to_numeric further down, so nothing is lost by loading as str here.
df = pd.read_csv(raw_file, dtype=str)

# === Step 2: Field mapping (OSM -> target schema) ===
# Keys are the column names found in the OSM export, values are the column
# names used from here on. Entries that map a name to itself are kept so the
# dict doubles as the list of OSM fields this script knows about.
rename_map = {
    # identity
    "osmid": "clinic_id",
    "name": "clinic_name",
    "amenity": "amenity",
    # address
    "addr:street": "street",
    "addr:housenumber": "house_number",
    "addr:postcode": "postcode",
    "addr:city": "city",
    # contact
    "phone": "phone_number",
    "website": "website",
    "email": "email",
    # operation
    "opening_hours": "opening_hours",
    "operator": "operator",
    "brand": "brand",
    "veterinary:speciality": "speciality",
    # NOTE(review): "wheelchair_acces" (single "s") is the spelling used
    # everywhere in this cell; confirm against the DB schema before changing it.
    "wheelchair": "wheelchair_acces",
    "emergency": "emergency",
    # coordinates
    "lat": "latitude",
    "lon": "longitude",
}

# Columns absent from the export are simply ignored by rename().
df = df.rename(rename_map, axis="columns")

# Target schema, in the order the columns are written to the cleaned file.
target_cols = [
    # identity
    "clinic_id",
    "clinic_name",
    "amenity",
    # address
    "street",
    "house_number",
    "postcode",
    "city",
    # contact
    "phone_number",
    "website",
    "email",
    # operation
    "opening_hours",
    "operator",
    "brand",
    "speciality",
    "wheelchair_acces",
    "emergency",
    # coordinates
    "latitude",
    "longitude",
    # placeholders to be filled later:
    "district_id",
    "neighbourhood_id",
    # convenience:
    "full_address",
]

# Any target column the export did not provide is created as an empty-string
# column so the later cleaning steps can address every column unconditionally.
missing_cols = [col for col in target_cols if col not in df.columns]
for col in missing_cols:
    df[col] = ""

# === Step 3: Light cleaning ===

# Keep only amenity=veterinary (safety filter)
df["amenity"] = df["amenity"].astype(str)
df = df[df["amenity"].str.lower().eq("veterinary")].copy()

# Drop rows that repeat an already-seen clinic_id.
# Rows WITHOUT an id must not take part in the comparison: after astype(str)
# every missing id would read "nan" (or "" for a synthesized column), so all
# id-less rows would look like duplicates of each other and all but the first
# would be silently discarded. Missing ids are stored as "" instead of "nan".
if "clinic_id" in df.columns:
    raw_ids = df["clinic_id"]
    has_id = raw_ids.notna() & raw_ids.astype(str).str.strip().ne("")
    df["clinic_id"] = raw_ids.astype(str).where(has_id, "")
    before = len(df)
    # Only rows that carry a real id can be flagged as repeats.
    repeated = has_id & df.duplicated(subset=["clinic_id"], keep="first")
    df = df[~repeated].copy()
    print(f"Dropped {before - len(df)} exact duplicate rows by clinic_id")

def _clean_text(series, default=""):
    """Return ``series`` as stripped strings, with blanks replaced by ``default``.

    Missing values (NaN/None) become "" first, every value is cast to ``str``
    and stripped of surrounding whitespace, and values that end up empty are
    replaced by ``default`` when a non-empty default is given.
    """
    text = series.fillna("").astype(str).str.strip()
    if default:
        text = text.replace("", default)
    return text


# Fill text fields.
# All text columns go through the same helper so that whitespace-only values
# are treated as empty everywhere (previously clinic_name, city and
# wheelchair_acces were not stripped, so a value like " " escaped its default).
df["clinic_name"] = _clean_text(df["clinic_name"], default="Unknown Veterinary Clinic")
df["street"] = _clean_text(df["street"])
df["house_number"] = _clean_text(df["house_number"])
df["postcode"] = _clean_text(df["postcode"])
df["city"] = _clean_text(df["city"], default="Berlin")

# Contacts
df["website"] = _clean_text(df["website"])
df["phone_number"] = _clean_text(df["phone_number"])
df["email"] = _clean_text(df["email"])

# Opening hours / flags
df["opening_hours"] = _clean_text(df["opening_hours"])
df["wheelchair_acces"] = _clean_text(df["wheelchair_acces"], default="unknown")
df["emergency"] = _clean_text(df["emergency"])

# Coordinates: anything that does not parse as a number becomes NaN.
df["latitude"] = pd.to_numeric(df["latitude"], errors="coerce")
df["longitude"] = pd.to_numeric(df["longitude"], errors="coerce")

# Placeholders for later spatial enrichment
df["district_id"] = df["district_id"].fillna("")
df["neighbourhood_id"] = df["neighbourhood_id"].fillna("")

# Full address convenience field
def _full_addr(r):
    a = " ".join([str(r.get("street", "")).strip(), str(r.get("house_number", "")).strip()]).strip()
    b = " ".join([str(r.get("postcode", "")).strip(), str(r.get("city", "")).strip()]).strip()
    return ", ".join([p for p in (a, b) if p])
# Compose the convenience address for every clinic, one row at a time.
df["full_address"] = df.apply(_full_addr, axis="columns")

# Keep exactly the target-schema columns, in schema order.
df = df.loc[:, target_cols].copy()

# === Step 4: Save cleaned file ===
# NOTE(review): this path is relative to the current working directory; the
# recorded cell output shows it resolving to a doubled
# ".../vet_clinics/sources/vet_clinics/sources/" folder - confirm that is intended.
out_dir = Path("./vet_clinics/sources")
out_dir.mkdir(parents=True, exist_ok=True)
cleaned_file = out_dir.joinpath("vets_osm_berlin_cleaned_for_db.csv")
df.to_csv(path_or_buf=cleaned_file, encoding="utf-8", index=False)

n_rows = len(df)
print(f"Cleaned data saved to {cleaned_file.resolve()} ({n_rows} rows)")


Dropped 0 exact duplicate rows by clinic_id
Cleaned data saved to /Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/sources/vet_clinics/sources/vets_osm_berlin_cleaned_for_db.csv (178 rows)


In [59]:
# Diagnostic: show the kernel's working directory, which is what the relative
# "./vet_clinics/sources" paths in the cleaning cell are resolved against.
import os

print(os.getcwd())




/Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/sources
