In [6]:
import sys
print(sys.getdefaultencoding()) 

utf-8


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, mean_squared_error
import xgboost as xgb
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.api import add_constant
from scipy.stats import zscore
import chardet
import requests
import json
from datetime import datetime

In [18]:
import requests
import json

# Overpass API endpoint
OVERPASS_URL = "http://overpass-api.de/api/interpreter"

# Bounding box for NYC: (south, west, north, east)
OVERPASS_QUERY = """
[out:json][timeout:60];
(
  node["amenity"="restaurant"](40.4774,-74.2589,40.9176,-73.7004);
  node["amenity"="cafe"](40.4774,-74.2589,40.9176,-73.7004);
);
out body;
"""

# Function to fetch data from Overpass API
def fetch_osm_data():
    print("Sending request to Overpass API...")
    try:
        response = requests.post(OVERPASS_URL, data={"data": OVERPASS_QUERY})
        response.raise_for_status()
        data = response.json()
        print(f"Raw response contains {len(data.get('elements', []))} elements")
        return data
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

# Function to parse OSM data into the desired format
def parse_osm_business(node):
    tags = node.get("tags", {})
    
    # Map OSM tags to categories
    categories = []
    if tags.get("restaurant"):
        categories.append(tags["restaurant"].capitalize())
    if tags.get("cafe"):
        categories.append(tags["cafe"].capitalize())
    if not categories:
        categories = ["Unknown"]

    # Filter: only include if "Restaurant" or "Cafe" is in categories
    if not any(cat in ["Restaurant", "Cafe"] for cat in categories):
        return None

    # Build business data structure
    business_data = {
        "id": node.get("id"),
        "uuid": str(node.get("id")),
        "name": tags.get("name", "Unnamed Business"),
        "address": tags.get("addr:street", ""),
        "city": tags.get("addr:city", "New York"),
        "state": tags.get("addr:state", "NY"),
        "postal_code": tags.get("addr:postcode", ""),
        "latitude": node.get("lat"),
        "longitude": node.get("lon"),
        "stars": 0,
        "review_count": 0,
        "is_open": 1,
        "attributes": {
            "RestaurantsDelivery": "False",
            "OutdoorSeating": "False",
            "BusinessAcceptsCreditCards": "True",
            "BusinessParking": "{'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}",
            "BikeParking": "True",
            "RestaurantsPriceRange2": "1",
            "RestaurantsTakeOut": "True",
            "ByAppointmentOnly": "False",
            "WiFi": "u'no'",
            "Alcohol": "u'none'",
            "Caters": "False"
        },
        "categories": categories,
        "hours": {
            "Monday": "7:0-20:0",
            "Tuesday": "7:0-20:0",
            "Wednesday": "7:0-20:0",
            "Thursday": "7:0-20:0",
            "Friday": "7:0-21:0",
            "Saturday": "7:0-21:0",
            "Sunday": "7:0-21:0"
        },
        "summary": f"{tags.get('name', 'Unnamed Business')} is an establishment in New York City, categorized as {', '.join(categories)}."
    }
    return business_data

# Main function to collect and save data
def collect_nyc_data(filename="nyc_osm_businesses.json"):
    data = fetch_osm_data()
    if not data:
        print("No data returned. Check your internet connection or the Overpass API status.")
        return
    
    elements = data.get("elements", [])
    print(f"Found {len(elements)} raw elements before filtering")
    
    if not elements:
        print("No businesses found. The query returned no results for the NYC bounding box.")
        return
    
    # Print raw node data for the first 10 items
    print("\nRaw node data for the first 10 items:")
    for i, node in enumerate(elements[:2], 1):
        print(f"\nItem {i}:")
        print(json.dumps(node, indent=4))
    
    # Process and filter businesses
    all_businesses = []
    for node in elements:
        business = parse_osm_business(node)
        if business:
            all_businesses.append(business)
    
    if not all_businesses:
        print("No restaurants or cafes found in the data.")
        return
    
    # Save to JSON file
    with open(filename, "w") as f:
        json.dump(all_businesses, f, indent=4)
    
    print(f"Saved {len(all_businesses)} restaurants and cafes to {filename}")

# Run the script
if __name__ == "__main__":
    collect_nyc_data()

Sending request to Overpass API...
Raw response contains 10820 elements
Found 10820 raw elements before filtering

Raw node data for the first 10 items:

Item 1:
{
    "type": "node",
    "id": 296568074,
    "lat": 40.7385968,
    "lon": -74.0303486,
    "tags": {
        "amenity": "restaurant",
        "name": "The Brass Rail",
        "wikidata": "Q7719863"
    }
}

Item 2:
{
    "type": "node",
    "id": 305499273,
    "lat": 40.7433179,
    "lon": -74.0285824,
    "tags": {
        "addr:city": "Hoboken",
        "addr:housenumber": "61",
        "addr:street": "6th St",
        "amenity": "restaurant",
        "name": "Court Street",
        "opening_hours": "Mo-Sa 16:30-23:00; Su 11:00-15:00,16:30-22:00",
        "phone": "+1-201-795-4515",
        "website": "https://www.courtstreet.com/"
    }
}
No restaurants or cafes found in the data.
