In [32]:
# Use magic commands for shell operations in Jupyter Notebook
!python -m venv .venv
!source .venv/bin/activate
!pip install requests pydantic rapidfuzz geopy pandas




In [33]:
FOURSQUARE_API_KEY = "fsq0xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
YELP_API_KEY = "abcdefghijklmnopqrstuvwxyz1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ-_abcdefghijklmnopqrstuvwxyz1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ-_"


In [36]:
import os
GOOGLE_API_KEY = os.getenv("AIzaSyCJi4AYz3XqrSUQVGD6QCk2wVH4FCqT8xI")


In [37]:
headers = {"Authorization": FOURSQUARE_API_KEY}


In [38]:
from datetime import datetime, timezone
datetime.now(timezone.utc).isoformat()


'2025-08-13T09:50:40.696371+00:00'

In [43]:
import os
import json
import requests
from datetime import datetime, timezone
from typing import Optional
from pydantic import BaseModel
from dotenv import load_dotenv

# -------------------
# LOAD API KEYS
# -------------------
# Set API keys directly for testing if not already set
if 'FOURSQUARE_API_KEY' not in globals() or FOURSQUARE_API_KEY is None:
    FOURSQUARE_API_KEY = "fsq0xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"  # <-- Replace with your actual Foursquare API key

if 'YELP_API_KEY' not in globals() or YELP_API_KEY is None:
    YELP_API_KEY = "yelp0xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"  # <-- Replace with your actual Yelp API key

if not FOURSQUARE_API_KEY or not YELP_API_KEY:
    raise ValueError("❌ Missing API keys. Check your .env file or set them in the notebook.")

# -------------------
# BERLIN BOUNDS
# -------------------
BERLIN_BOUNDS = [13.0884, 52.3383, 13.7611, 52.6755]  # min_lon, min_lat, max_lon, max_lat

# -------------------
# DATA SCHEMA
# -------------------
class Venue(BaseModel):
    name: str
    latitude: float
    longitude: float
    cuisine_type: Optional[str]
    address: Optional[str]
    contact_info: Optional[str]
    source: str
    retrieved_at: str

# -------------------
# OPENSTREETMAP SCRAPER
# -------------------
def fetch_osm_venues():
    query = f"""
    [out:json];
    (
      node["amenity"~"restaurant|cafe|bar"]({BERLIN_BOUNDS[1]},{BERLIN_BOUNDS[0]},{BERLIN_BOUNDS[3]},{BERLIN_BOUNDS[2]});
    );
    out;
    """
    url = "https://overpass-api.de/api/interpreter"
    r = requests.post(url, data={"data": query})
    r.raise_for_status()
    data = r.json()

    venues = []
    for element in data.get("elements", []):
        venues.append(Venue(
            name=element["tags"].get("name", "Unknown"),
            latitude=element["lat"],
            longitude=element["lon"],
            cuisine_type=element["tags"].get("cuisine"),
            address=element["tags"].get("addr:full"),
            contact_info=element["tags"].get("contact:phone"),
            source="OpenStreetMap",
            retrieved_at=datetime.now(timezone.utc).isoformat()
        ))
    return venues

# -------------------
# FOURSQUARE SCRAPER
# -------------------
def fetch_foursquare_venues():
    headers = {"Authorization": FOURSQUARE_API_KEY}
    params = {
        "ll": "52.5200,13.4050",  # Berlin center
        "radius": 10000,
        "categories": "13065,13032,13023",  # restaurants, cafes, bars
        "limit": 50
    }
    r = requests.get("https://api.foursquare.com/v3/places/search", headers=headers, params=params)
    if r.status_code != 200:
        print("Foursquare error:", r.text)
        return []

    data = r.json()
    venues = []
    for item in data.get("results", []):
        venues.append(Venue(
            name=item["name"],
            latitude=item["geocodes"]["main"]["latitude"],
            longitude=item["geocodes"]["main"]["longitude"],
            cuisine_type=None,
            address=item.get("location", {}).get("formatted_address"),
            contact_info=None,
            source="Foursquare",
            retrieved_at=datetime.now(timezone.utc).isoformat()
        ))
    return venues

# -------------------
# YELP SCRAPER
# -------------------
def fetch_yelp_venues():
    headers = {"Authorization": f"Bearer {YELP_API_KEY}"}
    params = {
        "term": "restaurant, cafe, bar",
        "latitude": 52.5200,
        "longitude": 13.4050,
        "limit": 50
    }
    r = requests.get("https://api.yelp.com/v3/businesses/search", headers=headers, params=params)
    if r.status_code != 200:
        print("Yelp error:", r.text)
        return []

    data = r.json()
    venues = []
    for item in data.get("businesses", []):
        venues.append(Venue(
            name=item["name"],
            latitude=item["coordinates"]["latitude"],
            longitude=item["coordinates"]["longitude"],
            cuisine_type=", ".join([c.get("title", "") for c in item.get("categories", [])]),
            address=", ".join(item.get("location", {}).get("display_address", [])),
            contact_info=item.get("phone"),
            source="Yelp",
            retrieved_at=datetime.now(timezone.utc).isoformat()
        ))
    return venues

# -------------------
# MAIN
# -------------------
if __name__ == "__main__":
    all_venues = []
    all_venues.extend(fetch_osm_venues())
    all_venues.extend(fetch_foursquare_venues())
    all_venues.extend(fetch_yelp_venues())

    print(f"✅ Collected {len(all_venues)} venues")

    with open("venues.jsonl", "w", encoding="utf-8") as f:
        for v in all_venues:
            f.write(json.dumps(v.model_dump(), ensure_ascii=False) + "\n")

Foursquare error: {"message":"Invalid request token."}
Yelp error: {"error": {"code": "VALIDATION_ERROR", "description": "'Bearer yelp0xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' does not match '^(?i)Bearer [A-Za-z0-9\\\\-\\\\_]{128}$'", "field": "Authorization", "instance": "Bearer yelp0xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}}
✅ Collected 8216 venues


In [44]:
!head -n 10 venues.jsonl


{"name": "Aida", "latitude": 52.5068638, "longitude": 13.3228585, "cuisine_type": "italian;pizza", "address": null, "contact_info": null, "source": "OpenStreetMap", "retrieved_at": "2025-08-13T09:52:49.269880+00:00"}
{"name": "Madame Ngo", "latitude": 52.5062119, "longitude": 13.3180811, "cuisine_type": "asian", "address": null, "contact_info": null, "source": "OpenStreetMap", "retrieved_at": "2025-08-13T09:52:49.271455+00:00"}
{"name": "Nam Thuân", "latitude": 52.5073199, "longitude": 13.3207804, "cuisine_type": "vietnamese", "address": null, "contact_info": null, "source": "OpenStreetMap", "retrieved_at": "2025-08-13T09:52:49.271466+00:00"}
{"name": "La Rose", "latitude": 52.5063184, "longitude": 13.2846256, "cuisine_type": "italian", "address": null, "contact_info": null, "source": "OpenStreetMap", "retrieved_at": "2025-08-13T09:52:49.271470+00:00"}
{"name": "Eiscafe Eisberg", "latitude": 52.5388551, "longitude": 13.3960965, "cuisine_type": null, "address": null, "contact_info": "+4