In [64]:
# ================================
#  DATA PREPARATION PIPELINE (IMPROVED)
# ================================

import pandas as pd
import numpy as np
import re
import unicodedata

# -------------------------
# 1) LOAD
# -------------------------
# Read the raw export; the path is relative to the notebook's working directory.
df = pd.read_csv("zurich.csv")

# Baseline diagnostics, printed before any cleaning so later steps can be
# compared against them.
print("Shape before cleaning:", df.shape)
print("\nColumns:\n", df.columns)
print("\nMissing values before cleaning:")
print(df.isna().sum())  # per-column count of missing values

# -------------------------
# 2) CLEAN RATING (ONCE)
# -------------------------
# Ratings may be written with a decimal comma: switch it to a dot, trim
# surrounding whitespace and parse. Anything that cannot be parsed
# (including originally missing values) ends up as NaN.
df["rating"] = pd.to_numeric(
    df["rating"].astype(str).str.replace(",", ".", regex=False).str.strip(),
    errors="coerce",
)

# The analysis is rating-based, so rows without a usable rating are dropped.
df = df.dropna(subset=["rating"]).copy()

# -------------------------
# 3) CLEAN MINIMUM (ONCE)
# -------------------------
# Build the numeric minimum-order column: strip the "Min." / "CHF" decorations,
# switch decimal comma to dot, trim, and parse. Values that still fail to
# parse are replaced by 0.
df = df.assign(
    minimum_clean=lambda frame: pd.to_numeric(
        frame["minimum"]
        .astype(str)
        .str.replace("Min.", "", regex=False)
        .str.replace("CHF", "", regex=False)
        .str.replace(",", ".", regex=False)
        .str.strip(),
        errors="coerce",
    ).fillna(0)
)

# -------------------------
# 4) REMOVE DUPLICATES (ONCE)
# -------------------------
# Keep only the first row for every (name, start URL) pair.
df = df.loc[~df.duplicated(subset=["name", "web_scraper_start_url"])].copy()

Shape before cleaning: (201, 6)

Columns:
 Index(['web_scraper_order', 'web_scraper_start_url', 'name', 'rating', 'description', 'minimum'], dtype='object')

Missing values before cleaning:
web_scraper_order        0
web_scraper_start_url    0
name                     0
rating                   1
description              0
minimum                  0
dtype: int64


In [65]:
# ==========================================================
# 5) CUISINE TAXONOMY ENRICHMENT (NO DIETARY FLAGS / NO DRINKS / NO DESSERTS)
# ==========================================================

def normalize_text(s: str) -> str:
    """Return a canonical lower-case form of ``s``.

    Missing values (``None`` / NaN) yield an empty string. Otherwise the value
    is stringified, trimmed, lower-cased, NFKC-normalized, has ``/`` and ``&``
    turned into spaces, and has every whitespace run collapsed to one space.
    """
    if pd.isna(s):
        return ""
    text = unicodedata.normalize("NFKC", str(s).strip().lower())
    # Treat "/" and "&" as plain word separators.
    text = text.translate(str.maketrans({"/": " ", "&": " "}))
    return re.sub(r"\s+", " ", text).strip()


def split_tags(description: str) -> list:
    """Turn a comma-separated description into a list of unique, normalized tags.

    Empty pieces are discarded and the first occurrence of each tag keeps its
    position. A missing or empty description gives ``[]``.
    """
    desc = normalize_text(description)
    if not desc:
        return []

    cleaned = (normalize_text(piece) for piece in desc.split(","))
    # dict keys remember insertion order, so this de-duplicates while keeping
    # the first-seen order of the tags.
    return list(dict.fromkeys(tag for tag in cleaned if tag))


# Marketing / platform labels that do not describe a cuisine.
STOP_TAGS = {
    "angebote",
    "geschäfte",
    "lokale geheimtipps",
    "traditionell",
}

def filter_stop_tags(tags: list) -> list:
    """Return ``tags`` without any STOP_TAGS entry; non-list input gives ``[]``."""
    if isinstance(tags, list):
        return [tag for tag in tags if tag not in STOP_TAGS]
    return []


# --- Derive the cleaned description, the raw / filtered tag lists and their sizes
df = df.assign(
    description_clean=df["description"].apply(normalize_text),
    cuisine_tags_raw=df["description"].apply(split_tags),
    # The callables receive the frame with the columns above already attached.
    cuisine_tags=lambda frame: frame["cuisine_tags_raw"].apply(filter_stop_tags),
    num_cuisines_raw=lambda frame: frame["cuisine_tags_raw"].apply(len),
    num_cuisines=lambda frame: frame["cuisine_tags"].apply(len),
)

# ----------------------------------------------------------
# TAXONOMY RULES (ONLY REAL CUISINES + grocery_retail)
#   - beverages_snacks removed
#   - desserts removed
# ----------------------------------------------------------
# Ordered list of (group, regex patterns). A tag is assigned to a group when
# any of that group's patterns matches it. All patterns are lower-case, so
# they are meant to be applied to lower-cased tags; "(a|ä)"-style
# alternations accept both the umlaut and the plain-ASCII spelling.
TAXONOMY_RULES = [
    ("grocery_retail", [
        r"\blebensmittel\b", r"\bsupermarkt\b", r"\bkiosk\b", r"\bshop\b",
        r"\bmetzgerei\b", r"\bb(a|ä)ckerei\b"
    ]),
    ("asian", [
        r"\basiatisch\b", r"\bthail(a|ä)ndisch\b", r"\bchinesisch\b",
        r"\bvietnamesisch\b", r"\bjapanisch\b", r"\bsushi\b",
        # The German adjective is "koreanisch". A bare r"\bkorean\b" cannot
        # match it (no word boundary between "korean" and "isch"), so the
        # suffix is optional to cover both the German and English spelling.
        r"\bkorean(isch)?\b",
        r"\bindisch\b", r"\bmomos?\b", r"\bdumplings\b", r"\bpho\b"
    ]),
    ("middle_eastern", [
        r"\bkebab\b", r"\bd(o|ö)ner\b", r"\bfalafel\b",
        r"\barabisch\b", r"\blibanesisch\b", r"\bt(u|ü)rkisch\b",
        r"\bpersisch\b", r"\bburek\b"
    ]),
    ("american", [
        r"\bamerikanisch\b", r"\bburger\b", r"\bsandwich(es)?\b",
        r"\bwraps?\b", r"\bh(u|ü)hnchen\b", r"\bfried chicken\b"
    ]),
    ("european", [
        r"\bitalienisch\b", r"\bpizza\b", r"\bpasta\b",
        r"\bgriechisch\b", r"\bbalkank(u|ü)che\b"
    ]),
    ("latin", [r"\bmexikanisch\b", r"\btacos?\b"]),
    ("african", [r"\bafrikanisch\b", r"\bafro\b"]),
]

# Drink / snack / dessert labels: treated as generic, never mapped to a group.
GENERIC_TAGS = {
    # snacks and drinks
    "snacks",
    "getränke",
    "getränke snacks",
    "getranke snacks",
    "kaffee",
    "coffee",
    "tee",
    "bubble tea",
    "smoothie",
    "smoothies",
    # sweets
    "nachspeisen",
    "dessert",
    "desserts",
    "donut",
    "donuts",
    "eis",
}

def tag_to_groups(tag: str) -> list:
    """Return the sorted taxonomy groups whose patterns match a single tag.

    An empty tag, or one listed in GENERIC_TAGS, belongs to no group. A group
    is reported at most once, however many of its patterns match.
    """
    if not tag or tag in GENERIC_TAGS:
        return []
    matched = {
        group
        for group, patterns in TAXONOMY_RULES
        if any(re.search(pattern, tag) for pattern in patterns)
    }
    return sorted(matched)

def tags_to_groups(tags: list) -> list:
    """Return the sorted, de-duplicated union of groups over all ``tags``.

    Anything that is not a non-empty list yields ``[]``.
    """
    if not (isinstance(tags, list) and tags):
        return []
    return sorted({group for tag in tags for group in tag_to_groups(tag)})

# Collapse each row's tags into its distinct taxonomy groups, then count them.
df = df.assign(
    cuisine_groups=df["cuisine_tags"].apply(tags_to_groups),
    num_cuisine_groups=lambda frame: frame["cuisine_groups"].apply(len),
)

# ----------------------------------------------------------
# PRIMARY CUISINE SELECTION (NO beverages_snacks / desserts)
# ----------------------------------------------------------
# Tie-break order used when a row matches several groups: earlier wins.
# NOTE(review): "american" outranks "european", so a row matching both
# (e.g. pizza + burger) is reported as american — confirm this is intended.
PRIMARY_PRIORITY = [
    "asian", "middle_eastern", "american", "european",
    "latin", "african", "grocery_retail",
]

def choose_primary_group(groups: list) -> str:
    """Pick the highest-priority group from ``groups``.

    Returns ``"unknown_other"`` when ``groups`` is not a non-empty list or
    contains none of the PRIMARY_PRIORITY entries.
    """
    if isinstance(groups, list) and groups:
        return next(
            (candidate for candidate in PRIMARY_PRIORITY if candidate in groups),
            "unknown_other",
        )
    return "unknown_other"

# One reporting label per row, derived from its list of cuisine groups.
df = df.assign(
    primary_cuisine_group=df["cuisine_groups"].apply(choose_primary_group)
)

In [66]:
# ==========================================================
# 6) SHOW RESULTS (CLEAN + ORDERED OUTPUT)
#   -> only keep primary_cuisine_group for reporting
# ==========================================================

# Work on a copy so the presentation tweaks below never touch `df`.
pretty = df.copy()

# Flatten the list-valued column into a comma-separated string (debug aid).
pretty["cuisine_groups"] = pretty["cuisine_groups"].apply(
    lambda groups: "" if not isinstance(groups, list) else ", ".join(groups)
)

# Columns shown in the report, in display order.
cols_show = ["name", "rating", "minimum_clean", "primary_cuisine_group"]

# Group label ascending, and within each group the best rating first.
pretty_sorted = pretty.sort_values(
    ["primary_cuisine_group", "rating"], ascending=[True, False]
)

pd.set_option("display.width", 200)
pd.set_option("display.max_colwidth", 40)

print("\n--- CLEAN REPORT VIEW (sample 20 rows) ---")
print(pretty_sorted[cols_show].head(20).to_string(index=False))

# How many rows fall into each primary group (NaN labels would be counted too).
print("\n--- primary_cuisine_group distribution ---")
print(pretty["primary_cuisine_group"].value_counts(dropna=False))

# Per-group size plus rating / minimum-order statistics, largest group first.
summary = (
    pretty.groupby("primary_cuisine_group", dropna=False)
          .agg(**{
              "n": ("name", "count"),
              "avg_rating": ("rating", "mean"),
              "avg_minimum": ("minimum_clean", "mean"),
              "median_minimum": ("minimum_clean", "median"),
          })
          .sort_values(by="n", ascending=False)
)

print("\n--- summary by primary_cuisine_group ---")
print(summary.to_string())


--- CLEAN REPORT VIEW (sample 20 rows) ---
                                                name  rating  minimum_clean primary_cuisine_group
                              Happen Afro Restaurant     5.0           20.0               african
                                    Restaurant Mesob     4.9           20.0               african
                                 S-Family Restaurant     4.7           20.0               african
                                  Habesha Restaurant     4.5           20.0               african
                           Cindys Bistro - Afro Deli     4.3           20.0               african
                                     Taste of Africa     4.3           20.0               african
                                     Cafe Bar Balkan     5.0           35.0              american
                           Restaurant Pizzeria Monti     4.9           60.0              american
                                Kana (fried chicken)     4.8           20.

In [67]:
# ==========================================================
# 7) FINAL CLEAN SAVE (minimal enriched dataset)
# ==========================================================

# Columns that make up the minimal enriched dataset, in output order.
final_cols = [
    "name", "rating", "minimum_clean",
    "description_clean", "primary_cuisine_group",
]

# Project to the required columns, keep the first row per restaurant name,
# and renumber the index from zero.
df_final = (
    df[final_cols]
    .drop_duplicates(subset=["name"])
    .reset_index(drop=True)
)

# Persist the result without the index column, then echo a short preview.
df_final.to_csv("zurich_enriched_taxonomy.csv", index=False)

print("Saved clean file: zurich_enriched_taxonomy.csv")
print(df_final.head(10).to_string(index=False))


Saved clean file: zurich_enriched_taxonomy.csv
                                                name  rating  minimum_clean         description_clean primary_cuisine_group
                                 mit&ohne kebab - HB     4.0           15.0 kebab, lokale geheimtipps        middle_eastern
               Black Tap Craft Burgers & Beer Zurich     4.3           18.0      amerikanisch, burger              american
                                      Bierhalle Wolf     4.1           18.0   snacks, getränke snacks         unknown_other
                              MrBeast Burger© Zürich     3.3           30.0      snacks, amerikanisch              american
                             Starbucks Limmatstrasse     3.8            0.0         frühstück, kaffee         unknown_other
                     McDonald's® Zürich Löwenstrasse     4.1           18.0      amerikanisch, burger              american
Holy Cow! Gourmet Burger Co. ZÜRICH ZÄHRINGERSTRASSE     4.7           18.0          