In [6]:
import sys
# Show the interpreter's default string encoding (the recorded output is "utf-8").
print(sys.getdefaultencoding()) 

utf-8


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, mean_squared_error
import xgboost as xgb
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.api import add_constant
from scipy.stats import zscore
import chardet
import requests
import json
from datetime import datetime

In [17]:
import requests
import json

# Overpass API endpoint (HTTPS, so the query and the response are not sent in clear text)
OVERPASS_URL = "https://overpass-api.de/api/interpreter"

# Overpass QL query: nodes tagged amenity=restaurant or amenity=cafe inside one
# bounding box, written as (south, west, north, east). The query asks for JSON
# output and declares a 60-second timeout.
# NOTE(review): the recorded results include Hoboken nodes, so the box covers
# more than New York City proper.
OVERPASS_QUERY = """
[out:json][timeout:60];
(
  node["amenity"="restaurant"](40.4774,-74.2589,40.9176,-73.7004);
  node["amenity"="cafe"](40.4774,-74.2589,40.9176,-73.7004);
);
out body;
"""

# Function to fetch data from Overpass API
def fetch_osm_data():
    """POST the Overpass query and return the decoded JSON response.

    Returns:
        The parsed JSON payload on success, or None when the request fails,
        the server answers with an HTTP error status, or the body is not
        valid JSON. Failures are reported with a printed message rather than
        raised.
    """
    print("Sending request to Overpass API...")
    try:
        # (connect, read) timeout: without one, requests waits indefinitely on
        # a stalled server. The read budget is longer than the 60 s the query
        # itself declares.
        response = requests.post(
            OVERPASS_URL,
            data={"data": OVERPASS_QUERY},
            timeout=(10, 90),
        )
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body (e.g. an HTML error page) on
        # requests versions whose JSON decode error is not a RequestException.
        print(f"Error fetching data: {e}")
        return None

    print(f"Raw response contains {len(data.get('elements', []))} elements")
    return data

# Function to parse OSM data into the desired format
def parse_osm_business(node):
    """Convert one OSM node into a business record, or None if it is not a restaurant/cafe.

    The venue type is taken from the node's ``amenity`` tag, which is the key
    the Overpass query filters on. Values stored under the ``restaurant`` and
    ``cafe`` tag keys are still appended as additional categories when present.

    Args:
        node: Dict describing one OSM element. The keys ``id``, ``lat``,
            ``lon`` and ``tags`` are read; each may be missing.

    Returns:
        A dict with the business fields, or None when none of the node's
        categories is "Restaurant" or "Cafe". ``stars``, ``review_count``,
        ``is_open``, ``attributes`` and ``hours`` are fixed placeholder values,
        not data read from the node. ``city`` and ``state`` fall back to
        "New York" / "NY" when the address tags are absent.
    """
    tags = node.get("tags") or {}
    accepted = ("Restaurant", "Cafe")

    # Map OSM tags to categories
    categories = []
    for key in ("amenity", "restaurant", "cafe"):
        value = tags.get(key)
        if not value:
            continue
        label = value.capitalize()
        # Only a restaurant/cafe amenity counts as a category; any other
        # amenity value (e.g. "bar") is ignored rather than kept.
        if key == "amenity" and label not in accepted:
            continue
        if label not in categories:
            categories.append(label)
    if not categories:
        categories = ["Unknown"]

    # Filter: only include if "Restaurant" or "Cafe" is in categories
    if not any(cat in accepted for cat in categories):
        return None

    name = tags.get("name", "Unnamed Business")
    # House number and street are separate OSM tags; join whichever are present.
    address = " ".join(
        part for part in (tags.get("addr:housenumber"), tags.get("addr:street")) if part
    )

    # Build business data structure
    business_data = {
        "id": node.get("id"),
        "uuid": str(node.get("id")),
        "name": name,
        "address": address,
        "city": tags.get("addr:city", "New York"),
        "state": tags.get("addr:state", "NY"),
        "postal_code": tags.get("addr:postcode", ""),
        "latitude": node.get("lat"),
        "longitude": node.get("lon"),
        "stars": 0,
        "review_count": 0,
        "is_open": 1,
        "attributes": {
            "RestaurantsDelivery": "False",
            "OutdoorSeating": "False",
            "BusinessAcceptsCreditCards": "True",
            "BusinessParking": "{'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}",
            "BikeParking": "True",
            "RestaurantsPriceRange2": "1",
            "RestaurantsTakeOut": "True",
            "ByAppointmentOnly": "False",
            "WiFi": "u'no'",
            "Alcohol": "u'none'",
            "Caters": "False"
        },
        "categories": categories,
        "hours": {
            "Monday": "7:0-20:0",
            "Tuesday": "7:0-20:0",
            "Wednesday": "7:0-20:0",
            "Thursday": "7:0-20:0",
            "Friday": "7:0-21:0",
            "Saturday": "7:0-21:0",
            "Sunday": "7:0-21:0"
        },
        "summary": f"{name} is an establishment in New York City, categorized as {', '.join(categories)}."
    }
    return business_data

# Main function to collect and save data
def collect_nyc_data(filename="nyc_osm_businesses.json"):
    """Fetch OSM nodes, convert them to business records and write them to a JSON file.

    Progress and failure messages are printed. The function returns None in
    every case and writes ``filename`` only when at least one node was
    converted into a business record.

    Args:
        filename: Path of the JSON file to write.
    """
    payload = fetch_osm_data()
    if not payload:
        print("No data returned. Check your internet connection or the Overpass API status.")
        return

    raw_nodes = payload.get("elements", [])
    print(f"Found {len(raw_nodes)} raw elements before filtering")

    if not raw_nodes:
        print("No businesses found. The query returned no results for the NYC bounding box.")
        return

    # Dump the first ten raw nodes so the tag structure can be inspected.
    print("\nRaw node data for the first 10 items:")
    for position, raw in enumerate(raw_nodes[:10], start=1):
        print(f"\nItem {position}:")
        print(json.dumps(raw, indent=4))

    # Convert every node; the parser returns None for nodes it rejects.
    converted = (parse_osm_business(raw) for raw in raw_nodes)
    kept = [record for record in converted if record]

    if not kept:
        print("No restaurants or cafes found in the data.")
        return

    # Persist the surviving records as indented JSON.
    with open(filename, "w") as out_file:
        json.dump(kept, out_file, indent=4)

    print(f"Saved {len(kept)} restaurants and cafes to {filename}")

# Run the script
# Entry point: when this code executes as the main module, run the collection
# with the default output filename.
if __name__ == "__main__":
    collect_nyc_data()

Sending request to Overpass API...
Raw response contains 10819 elements
Found 10819 raw elements before filtering

Raw node data for the first 10 items:

Item 1:
{
    "type": "node",
    "id": 296568074,
    "lat": 40.7385968,
    "lon": -74.0303486,
    "tags": {
        "amenity": "restaurant",
        "name": "The Brass Rail",
        "wikidata": "Q7719863"
    }
}

Item 2:
{
    "type": "node",
    "id": 305499273,
    "lat": 40.7433179,
    "lon": -74.0285824,
    "tags": {
        "addr:city": "Hoboken",
        "addr:housenumber": "61",
        "addr:street": "6th St",
        "amenity": "restaurant",
        "name": "Court Street",
        "opening_hours": "Mo-Sa 16:30-23:00; Su 11:00-15:00,16:30-22:00",
        "phone": "+1-201-795-4515",
        "website": "https://www.courtstreet.com/"
    }
}

Item 3:
{
    "type": "node",
    "id": 357618253,
    "lat": 40.7408217,
    "lon": -73.784295,
    "tags": {
        "addr:city": "Fresh Meadows",
        "addr:housenumber": "61-0

In [4]:
# Preview the first rows of the business table.
# NOTE(review): `business` is not defined in any visible cell; it must be loaded
# by a cell outside this view, so this cell fails on a fresh kernel.
business.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [5]:
 # Keep only the rows whose `state` column equals 'NY'.
 # NOTE(review): the preview recorded for the next cell shows column headers
 # only, i.e. this filter matched no rows in the data that was loaded.
 filtered = business[business['state'] == 'NY']

In [6]:
# Preview the NY subset (the recorded output below is an empty frame).
filtered.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours


In [2]:
import psycopg, os
from PIL import Image
from IPython.display import display
import pandas as pd
from tabulate import tabulate
from pymongo import MongoClient


['"CBT",NA,2021-06-30,"2021","Q3","pretaxincomemargin",0.134133,"calculations","percentage","Pre Tax Income Margin",NA,"Cabot Corporation"\n', '"CBT",NA,2021-06-30,"2021","Q3","adjweightedavebasicsharesos",56700000,"calculations","shares","Adjusted Weighted Average Basic Shares Outstanding",NA,"Cabot Corporation"\n', '"CBT",NA,2021-06-30,"2021","Q3","adjbasiceps",1.48,"calculations","usdpershare","Adjusted Basic Earnings per Share",NA,"Cabot Corporation"\n', '"CBT",NA,2021-06-30,"2021","Q3","adjweightedavedilutedsharesos",5.7e+07,"calculations","shares","Adjusted Weighted Average Diluted Shares Outstanding",NA,"Cabot Corporation"\n', '"CBT",NA,2021-06-30,"2021","Q3","adjdilutedeps",1.48,"calculations","usdpershare","Adjusted Diluted Earnings per Share",NA,"Cabot Corporation"\n']


        ticker start_date   end_date fiscal_year fiscal_period  \
2736129    CBT        NaT 2021-06-30        2021            Q3   
2736130    CBT        NaT 2021-06-30        2021            Q3   
2736131    CBT        NaT 2021-06-30        2021            Q3   
2736132    CBT        NaT 2021-06-30        2021            Q3   
2736133    CBT        NaT 2021-06-30        2021            Q3   

                        intrinio_tag         value financial_statement  \
2736129                     nwctorev  2.104930e-01        calculations   
2736130              normalizednopat  1.013171e+08        calculations   
2736131        normalizednopatmargin  1.104880e-01        calculations   
2736132           pretaxincomemargin  1.341330e-01        calculations   
2736133  adjweightedavebasicsharesos  5.670000e+07        calculations   

              units                                          line_item  \
2736129  percentage                     Net Working Capital to Revenue   
2736130   

Data insertion completed3.


In [7]:
# Open the PostgreSQL connection. Settings are read from the standard libpq
# environment variables; the non-secret ones fall back to the values this
# notebook used before. The password is deliberately NOT given a fallback so
# that no credential is stored in the notebook: export PGPASSWORD before
# starting the kernel.
# NOTE(review): when PGPASSWORD is unset, None is passed as the password;
# confirm the driver then defers to libpq's own lookup (e.g. ~/.pgpass).
conn = psycopg.connect(
        host=os.environ.get("PGHOST", "apan-postgres"),
        port=os.environ.get("PGPORT", "5432"),
        dbname=os.environ.get("PGDATABASE", "db"),
        user=os.environ.get("PGUSER", "admin"),
        password=os.environ.get("PGPASSWORD"))

Data insertion completed4.
