In [None]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
def scrape_livingcost_tables(city: str, country: str, region: str = None, groups: list = None,
                             timeout: float = 30) -> pd.DataFrame:
    """Scrape the price tables of one livingcost.org city page into a DataFrame.

    The page URL is built from the lower-cased, hyphenated ``country``,
    optional ``region`` and ``city``. Every ``<table>`` that has a
    ``<caption>`` is treated as one price group; each two-cell row of such a
    table becomes one record.

    Parameters
    ----------
    city : str
        City name; spaces are turned into hyphens for the URL.
    country : str
        Country name; spaces are turned into hyphens for the URL.
    region : str, optional
        Extra URL path segment placed between country and city. ``None`` or
        ``""`` builds a URL without it.
    groups : list, optional
        Table captions to keep (exact match). ``None`` or an empty list keeps
        every captioned table.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pd.DataFrame
        Columns ``Group``, ``Category``, ``Price`` and ``City``. ``Price`` is
        the text of the first ``<span data-usd=...>`` in the second cell, or
        ``"N/A"`` when the cell has no such span. ``City`` is
        ``"<City>, <Country>"`` in title case.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (e.g. a misspelled city).
    requests.RequestException
        For connection problems or when ``timeout`` is exceeded.
    """
    city_url = city.lower().replace(" ", "-")
    country_url = country.lower().replace(" ", "-")
    region_url = region.lower().replace(" ", "-") if region else ""
    label = f"{city.title()}, {country.title()}"

    if region_url:
        url = f"https://livingcost.org/cost/{country_url}/{region_url}/{city_url}"
    else:
        url = f"https://livingcost.org/cost/{country_url}/{city_url}"

    # Without a timeout a stalled connection would block the kernel forever.
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 404/5xx: an error page has no price tables and would
    # otherwise come back as an empty, valid-looking DataFrame.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    target_groups = set(groups) if groups else set()
    records = []

    for table in soup.find_all("table"):
        caption = table.find("caption")
        if not caption:
            continue  # Tables without a caption carry no group name
        group = caption.get_text(strip=True)
        if target_groups and group not in target_groups:
            continue  # Skip if not a desired group

        for row in table.find_all("tr"):
            cols = row.find_all(["td", "th"])
            # Only (item, price) rows are data; anything else is skipped.
            if len(cols) != 2:
                continue
            item = cols[0].get_text(strip=True)
            price_span = cols[1].find("span", attrs={"data-usd": True})
            price = price_span.get_text(strip=True) if price_span else "N/A"
            records.append([group, item, price, label])

    return pd.DataFrame(records, columns=["Group", "Category", "Price", "City"])


In [None]:
# Cities to scrape. "region" is the extra URL path segment placed between
# country and city ("" builds a URL without it); "filename" is the CSV written
# for that city inside OUTPUT_DIR.
cities = [
    {"city": "Berlin", "country": "Germany", "region": "bb", "filename": "berlin_living_cost.csv"},
    {"city": "Hamburg", "country": "Germany", "region": "hh", "filename": "hamburg_living_cost.csv"},
    {"city": "Munich", "country": "Germany", "region": "by", "filename": "munich_living_cost.csv"},
    {"city": "Cologne", "country": "Germany", "region": "nw", "filename": "cologne_living_cost.csv"},
    {"city": "Frankfurt", "country": "Germany", "region": "he", "filename": "frankfurt_living_cost.csv"},
    {"city": "Cape-Town", "country": "South-Africa", "region": "", "filename": "capeTown_living_cost.csv"},
    {"city": "Johannesburg", "country": "South-Africa", "region": "", "filename": "johannesburg_living_cost.csv"},
    {"city": "Durban", "country": "South-Africa", "region": "", "filename": "durban_living_cost.csv"},
    {"city": "Pretoria", "country": "South-Africa", "region": "", "filename": "pretoria_living_cost.csv"},
    # NOTE(review): "Ggeberha" looks like a typo for "Gqeberha" — confirm the
    # slug livingcost.org actually uses before relying on this file.
    {"city": "Ggeberha", "country": "South-Africa", "region": "", "filename": "geberha_living_cost.csv"}
]

# Only tables whose caption exactly matches one of these names are kept.
groups_to_extract = ["Eating Out", "Rent & Utilities", "Groceries", "Transportation", "Other"]

# Create the output folder up front: to_csv does not create missing
# directories and would fail on a fresh checkout.
OUTPUT_DIR = "../data"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

failed_cities = []

for location in cities:
    try:
        df = scrape_livingcost_tables(
            city=location["city"],
            country=location["country"],
            region=location["region"],
            groups=groups_to_extract
        )
    except requests.RequestException as exc:
        # One unreachable or missing page should not abort the other cities;
        # report it and move on.
        failed_cities.append(location["city"])
        print(f"Failed: {location['city']} ({exc})")
        continue

    if df.empty:
        # No matching rows may mean a wrong URL or a changed page layout;
        # the file is still written, but flagged here so it is not mistaken
        # for real data.
        print(f"Warning: no rows scraped for {location['city']}")

    path = f"{OUTPUT_DIR}/{location['filename']}"
    df.to_csv(path, index=False)
    print(f"Saved: {path}")

if failed_cities:
    print(f"Cities not scraped: {', '.join(failed_cities)}")
