In [3]:
import requests, pandas as pd, urllib.parse, time

## Using OpenStreetMap and Google Maps to get a sample of Singapore reviews

We use OpenStreetMap to get the name, coordinates and address of different places in Singapore (food, retail, fitness, tourism and health). With these metadata, we search Google Maps for these places and get the exact URL for 100 of these places for each category. 

After obtaining the URL to these 500 places, we further sample 50 reviews from each place using Apify.

In [2]:
# Singapore bounding box (approx): south, west, north, east
SG_BBOX = (1.130, 103.590, 1.470, 104.090)

CATEGORIES = [
    # (overpass filter, label)
    ('amenity~"restaurant|cafe|fast_food|food_court|marketplace|bar"', 'food'),
    ('shop~"supermarket|convenience|mall|department_store"', 'retail'),
    ('leisure~"fitness_centre|sports_centre"', 'fitness'),
    ('tourism~"attraction|museum|hotel|guest_house|hostel"', 'tourism'),
    ('amenity~"clinic|hospital|pharmacy"', 'health'),
]

def overpass_query(filter_expr, bbox):
    s,w,n,e = bbox
    return f"""
    [out:json][timeout:60];
    (
      node[{filter_expr}]({s},{w},{n},{e});
      way[{filter_expr}]({s},{w},{n},{e});
      relation[{filter_expr}]({s},{w},{n},{e});
    );
    out center tags 2000;
    """

def fetch_osm(filter_expr, label):
    q = overpass_query(filter_expr, SG_BBOX)
    r = requests.post("https://overpass-api.de/api/interpreter", data={'data': q})
    r.raise_for_status()
    data = r.json().get('elements', [])
    rows = []
    for el in data:
        tags = el.get('tags', {})
        name = tags.get('name') or ''
        lat = el.get('lat') or (el.get('center', {}) or {}).get('lat')
        lon = el.get('lon') or (el.get('center', {}) or {}).get('lon')
        if not name or lat is None or lon is None:
            continue
        addr = ", ".join(filter(None, [
            tags.get('addr:housenumber'),
            tags.get('addr:street'),
            tags.get('addr:postcode'),
            tags.get('addr:city') or tags.get('addr:suburb') or 'Singapore'
        ]))
        rows.append({
            'source':'osm','category_label':label,
            'name':name, 'address':addr, 'lat':lat, 'lng':lon,
            'osm_id':f"{el.get('type','node')}/{el.get('id')}",
            'raw_tags':tags
        })
    return pd.DataFrame(rows)

# 1) Pull OSM POIs across categories
dfs = []
for fexpr, label in CATEGORIES:
    df = fetch_osm(fexpr, label)
    print(label, len(df))
    dfs.append(df)
    time.sleep(1.2)  # be nice to Overpass

poi = pd.concat(dfs, ignore_index=True)

# 2) Deduplicate by (name, ~50m rounded lat/lng)
poi['lat50'] = (poi['lat']*2000).round().astype(int)
poi['lng50'] = (poi['lng']*2000).round().astype(int)
poi = poi.sort_values(['name','lat50','lng50']).drop_duplicates(['name','lat50','lng50'])

# 3) Build Google Maps search URLs (no API key needed)
def gm_search_url(name, lat, lng, address='Singapore'):
    q = urllib.parse.quote_plus(f"{name} {address}")
    return f"https://www.google.com/maps/search/?api=1&query={q}&query_place_id="  # place_id blank for now

poi['gmaps_search_url'] = poi.apply(lambda r: gm_search_url(r['name'], r['lat'], r['lng'], r['address'] or 'Singapore'), axis=1)

# Optional: sample evenly across categories to avoid bias
SAMPLE_PER_CAT = 100  # tune to your needs
sampled = (
    poi.groupby('category_label', group_keys=False)
       .apply(lambda g: g.sample(min(SAMPLE_PER_CAT, len(g)), random_state=42))
       .reset_index(drop=True)
)

# Save candidate links
sampled[['category_label','name','address','lat','lng','gmaps_search_url']].to_csv("sg_place_candidates.csv", index=False)
print("Saved sg_place_candidates.csv with", len(sampled), "rows")

food 1903
retail 1658
fitness 408
tourism 1259
health 755
Saved sg_place_candidates.csv with 500 rows


  .apply(lambda g: g.sample(min(SAMPLE_PER_CAT, len(g)), random_state=42))
