In [None]:
import functools
import logging
import time

import pandas as pd
import requests

In [None]:
# Logging setup

# Every record is rendered as "<timestamp> - <level> - <message>".
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Endpoint the Overpass query is POSTed to.
OVERPASS_URL = "https://overpass-api.de/api/interpreter"

In [None]:
# Overpass Query

# The query asks the server for a 60 s budget ([timeout:60]); the client waits
# somewhat longer so the server's own timeout answer can still be received,
# but never hangs forever on a stalled connection.
OVERPASS_CLIENT_TIMEOUT_S = 90

# Selects nodes, ways and relations tagged as restaurant/cafe/bar inside the
# search area. `out center` adds a representative "center" coordinate to ways
# and relations, which have no lat/lon of their own.
# NOTE(review): area[name="Berlin"] presumably matches every OSM area named
# "Berlin", not only the German city — confirm and narrow if needed.
# NOTE(review): the regex is unanchored, so values merely containing one of
# the words (e.g. "internet_cafe") presumably match as well — confirm intent.
query = """
[out:json][timeout:60];
area[name="Berlin"]->.searchArea;
(
  node["amenity"~"restaurant|cafe|bar"](area.searchArea);
  way["amenity"~"restaurant|cafe|bar"](area.searchArea);
  relation["amenity"~"restaurant|cafe|bar"](area.searchArea);
);
out center;
"""

logging.info("Fetching venues from Overpass API...")
response = requests.post(
    OVERPASS_URL,
    data={"data": query},
    headers={"User-Agent": "berlin-venues-scraper/1.0"},
    timeout=OVERPASS_CLIENT_TIMEOUT_S,
)
response.raise_for_status()
data = response.json()

# Overpass can answer HTTP 200 and still report a runtime problem (such as a
# server-side timeout) in a "remark" field; surface it instead of silently
# continuing with a possibly truncated result.
remark = data.get("remark")
if remark:
    logging.warning(f"Overpass returned a remark (result may be incomplete): {remark}")

elements = data.get("elements", [])
logging.info(f"Retrieved {len(elements)} elements from Overpass")

In [None]:
# Parse Results

def _join_nonempty(parts, sep):
    """Join the truthy items of ``parts`` with ``sep``, skipping missing or empty ones."""
    return sep.join(str(part) for part in parts if part)


def _format_address(tags):
    """Build "street housenumber, postcode city" from the OSM ``addr:*`` tags.

    Missing components are dropped together with their separator, so a
    partially tagged venue never yields stray spaces or commas. Returns an
    empty string when no address tag is present.
    """
    street_line = _join_nonempty(
        (tags.get("addr:street"), tags.get("addr:housenumber")), " "
    )
    city_line = _join_nonempty(
        (tags.get("addr:postcode"), tags.get("addr:city")), " "
    )
    return _join_nonempty((street_line, city_line), ", ")


def _coordinate(el, axis):
    """Return ``el[axis]`` ("lat" or "lon"), falling back to ``el["center"][axis]``.

    The fallback is used only when the element has no direct value; a
    legitimate coordinate of 0 is kept instead of being treated as missing.
    """
    value = el.get(axis)
    if value is None:
        value = el.get("center", {}).get(axis)
    return value


# One flat record per Overpass element; tags that are absent become None
# (or "Unknown" for name and category).
records = []
for el in elements:
    tags = el.get("tags", {})
    records.append({
        "name": tags.get("name", "Unknown"),
        "category": tags.get("amenity", "Unknown"),
        "cuisine": tags.get("cuisine"),
        "address": _format_address(tags),
        "latitude": _coordinate(el, "lat"),
        "longitude": _coordinate(el, "lon"),
        "website": tags.get("website"),
        "phone": tags.get("phone"),
        "opening_hours": tags.get("opening_hours"),
        "takeaway": tags.get("takeaway"),
        "wheelchair": tags.get("wheelchair")
    })

df = pd.DataFrame(records)
logging.info(f"Parsed {len(df)} records into DataFrame")

In [None]:
# Reverse Geolocation

_NOMINATIM_REVERSE_URL = "https://nominatim.openstreetmap.org/reverse"
_NOMINATIM_HEADERS = {"User-Agent": "berlin-venues-scraper/1.0"}


@functools.lru_cache(maxsize=1024)
def _reverse_geocode(lat, lon):
    """Fetch the Nominatim reverse-geocoding payload for one coordinate pair.

    Successful responses are memoised per ``(lat, lon)``, so asking for both
    the district and the formatted address of the same venue costs a single
    HTTP request instead of two. Failures raise and are therefore not cached;
    a later call retries the request.

    The returned dict is shared between callers and must be treated as
    read-only.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts
            or a non-2xx status.
        ValueError: if the response body is not valid JSON.
    """
    params = {"lat": lat, "lon": lon, "format": "json", "addressdetails": 1}
    reply = requests.get(
        _NOMINATIM_REVERSE_URL,
        params=params,
        headers=_NOMINATIM_HEADERS,
        timeout=10,
    )
    reply.raise_for_status()
    return reply.json()


def get_bezirk(lat, lon):
    """Retrieve district from Nominatim reverse geocoding.

    Returns the first non-empty of the ``suburb``, ``city_district`` and
    ``borough`` address fields, or None when none is present or the lookup
    fails (a warning is logged in that case).
    """
    try:
        address = _reverse_geocode(lat, lon).get("address", {})
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers undecodable JSON on requests versions whose
        # decode error does not derive from RequestException.
        logging.warning(f"Error fetching district for ({lat}, {lon}): {e}")
        return None
    return (
        address.get("suburb")
        or address.get("city_district")
        or address.get("borough")
    )


def get_address(lat, lon):
    """Retrieve full formatted address from Nominatim.

    Returns the ``display_name`` field of the reverse-geocoding response, or
    None when it is absent or the lookup fails (a warning is logged in that
    case).
    """
    try:
        return _reverse_geocode(lat, lon).get("display_name")
    except (requests.exceptions.RequestException, ValueError) as e:
        logging.warning(f"Error fetching address for ({lat}, {lon}): {e}")
        return None

# Reverse-geocode every venue that has coordinates. Each geocoded venue is
# followed by a one-second pause as a courtesy delay towards Nominatim;
# venues without coordinates get None for both fields and cost no wait.
geocoded = []
for _, venue in df.iterrows():
    lat = venue["latitude"]
    lon = venue["longitude"]
    if pd.isna(lat) or pd.isna(lon):
        geocoded.append((None, None))
        continue
    district = get_bezirk(lat, lon)
    full_address = get_address(lat, lon)
    geocoded.append((district, full_address))
    time.sleep(1)  # polite delay between venues

# Split the per-venue pairs back into two columns aligned with df's row order.
districts = [pair[0] for pair in geocoded]
full_addresses = [pair[1] for pair in geocoded]

df["district"] = districts
df["full_address"] = full_addresses

In [None]:
# Save to CSV

# Single source of truth for the output path, so the log message always names
# the file that was actually written.
OUTPUT_CSV = "berlin_venues_raw.csv"

df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
logging.info(f"Saved results to {OUTPUT_CSV}")