In [4]:
# ================================
#  DATA PREPARATION PIPELINE
# ================================

import pandas as pd

# 1) Load dataset
# The path is relative to the notebook's working directory.
df = pd.read_csv("zurich.csv")

# -------------------------
# BASIC OVERVIEW
# -------------------------
# Snapshot of the raw frame before any cleaning step touches it.

print("Shape before cleaning:", df.shape)
print("\nColumns:\n", df.columns)

print("\nInfo before cleaning:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

print("\nMissing values before cleaning:")
print(df.isna().sum())

# -------------------------
# CLEAN RATING COLUMN
# -------------------------
# Ratings like '4,3' → 4.3 (numeric)

# Swap the decimal comma for a point on the text form of the column, then
# parse; anything that still is not a number ends up as NaN (errors="coerce").
df["rating"] = (
    df["rating"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .pipe(pd.to_numeric, errors="coerce")
)

# -------------------------
# CLEAN MINIMUM ORDER VALUE
# -------------------------
# "Min. 15,00CHF" → 15.00

# Strip the "Min." prefix and the "CHF" suffix, switch the decimal comma to a
# point, trim whitespace, then parse. Text that is not a number afterwards
# becomes NaN (errors="coerce"). The raw "minimum" column is left as it is.
df["minimum_clean"] = (
    df["minimum"]
    .astype(str)
    .str.replace("Min.", "", regex=False)
    .str.replace("CHF", "", regex=False)
    .str.replace(",", ".", regex=False)
    .str.strip()
    .pipe(pd.to_numeric, errors="coerce")
)

# -------------------------
# REMOVE DUPLICATES
# -------------------------

# Keep only the first row for each (name, web_scraper_start_url) pair.
df = df.drop_duplicates(
    subset=["name", "web_scraper_start_url"],
    keep="first",
)

# -------------------------
# EXTRACT MAIN FOOD CATEGORY
# from description column
# -------------------------
# Examp


Shape before cleaning: (201, 6)

Columns:
 Index(['web_scraper_order', 'web_scraper_start_url', 'name', 'rating',
       'description', 'minimum'],
      dtype='object')

Info before cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   web_scraper_order      201 non-null    object
 1   web_scraper_start_url  201 non-null    object
 2   name                   201 non-null    object
 3   rating                 200 non-null    object
 4   description            201 non-null    object
 5   minimum                201 non-null    object
dtypes: object(6)
memory usage: 9.6+ KB
None

Missing values before cleaning:
web_scraper_order        0
web_scraper_start_url    0
name                     0
rating                   1
description              0
minimum                  0
dtype: int64


In [5]:
# Discard every row whose rating is missing; other columns are not inspected.
df = df.dropna(axis="index", how="any", subset=["rating"])


In [6]:
# Same decimal-comma conversion as in the first cell, re-applied after the
# rows without a rating were dropped: text form -> "," to "." -> numeric,
# with unparseable entries coerced to NaN.
df["rating"] = (
    df["rating"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .pipe(pd.to_numeric, errors="coerce")
)


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) of the rating.
df.loc[:, "rating"].describe()


count    195.000000
mean       4.264615
std        0.497032
min        1.700000
25%        4.000000
50%        4.300000
75%        4.600000
max        5.000000
Name: rating, dtype: float64

In [8]:
# Keep only the first row for each (name, web_scraper_start_url) pair.
df = df.drop_duplicates(
    subset=["name", "web_scraper_start_url"],
    keep="first",
)


In [9]:
# Rebuild the numeric minimum-order column from the raw "minimum" text:
# drop the "Min." prefix and "CHF" suffix, use a decimal point, trim, parse.
# Entries without a parseable number become NaN (errors="coerce").
df["minimum_clean"] = (
    df["minimum"]
    .astype(str)
    .str.replace("Min.", "", regex=False)
    .str.replace("CHF", "", regex=False)
    .str.replace(",", ".", regex=False)
    .str.strip()
    .pipe(pd.to_numeric, errors="coerce")
)


In [10]:
# Binary flag: 1 when the rating is at least 4.5, otherwise 0.
df["high_rating"] = df["rating"].ge(4.5).astype(int)


In [11]:
# Sanity check after cleaning: schema, remaining gaps, and a sample of rows.
df.info()
# A notebook cell only echoes its final expression, so the missing-value
# counts must be printed explicitly or they are silently discarded.
print(df.isna().sum())
df.head()


<class 'pandas.core.frame.DataFrame'>
Index: 195 entries, 0 to 200
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   web_scraper_order      195 non-null    object 
 1   web_scraper_start_url  195 non-null    object 
 2   name                   195 non-null    object 
 3   rating                 195 non-null    float64
 4   description            195 non-null    object 
 5   minimum                195 non-null    object 
 6   minimum_clean          186 non-null    float64
 7   high_rating            195 non-null    int32  
dtypes: float64(2), int32(1), object(5)
memory usage: 12.9+ KB


Unnamed: 0,web_scraper_order,web_scraper_start_url,name,rating,description,minimum,minimum_clean,high_rating
0,1766595122-1,https://www.just-eat.ch/lieferservice/essen/zu...,mit&ohne kebab - HB,4.0,"Kebab, Lokale Geheimtipps","Min. 15,00CHF",15.0,0
1,1766595122-2,https://www.just-eat.ch/lieferservice/essen/zu...,Black Tap Craft Burgers & Beer Zurich,4.3,"Amerikanisch, Burger","Min. 18,00CHF",18.0,0
2,1766595122-3,https://www.just-eat.ch/lieferservice/essen/zu...,Bierhalle Wolf,4.1,"Snacks, Getränke/Snacks","Min. 18,00CHF",18.0,0
3,1766595122-4,https://www.just-eat.ch/lieferservice/essen/zu...,MrBeast Burger© Zürich,3.3,"Snacks, Amerikanisch","Min. 30,00CHF",30.0,0
4,1766595122-5,https://www.just-eat.ch/lieferservice/essen/zu...,Starbucks Limmatstrasse,3.8,"Frühstück, Kaffee",Kein Mindestbestellwert,,0


In [12]:
# "Kein Mindestbestellwert" ("no minimum order value") contains no number, so
# it parses to NaN; only those rows are set to 0. A blanket fillna(0) would
# also turn any other text that failed to parse into a 0 CHF minimum and hide
# the parsing problem. Rows that stay NaN after this step need a closer look.
df["minimum_clean"] = df["minimum_clean"].mask(
    df["minimum_clean"].isna()
    & df["minimum"].astype(str).str.contains("Kein Mindestbestellwert", regex=False),
    0,
)


In [13]:
# Summary statistics (count, mean, std, quartiles, min/max) of the parsed
# minimum order value.
df.loc[:, "minimum_clean"].describe()


count    195.000000
mean      29.333333
std       22.884665
min        0.000000
25%       20.000000
50%       20.000000
75%       30.000000
max      150.000000
Name: minimum_clean, dtype: float64

In [14]:
# Re-check the frame: schema, remaining gaps, and a sample of rows.
df.info()
# A notebook cell only echoes its final expression, so the missing-value
# counts must be printed explicitly or they are silently discarded.
print(df.isna().sum())
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 195 entries, 0 to 200
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   web_scraper_order      195 non-null    object 
 1   web_scraper_start_url  195 non-null    object 
 2   name                   195 non-null    object 
 3   rating                 195 non-null    float64
 4   description            195 non-null    object 
 5   minimum                195 non-null    object 
 6   minimum_clean          195 non-null    float64
 7   high_rating            195 non-null    int32  
dtypes: float64(2), int32(1), object(5)
memory usage: 12.9+ KB


Unnamed: 0,web_scraper_order,web_scraper_start_url,name,rating,description,minimum,minimum_clean,high_rating
0,1766595122-1,https://www.just-eat.ch/lieferservice/essen/zu...,mit&ohne kebab - HB,4.0,"Kebab, Lokale Geheimtipps","Min. 15,00CHF",15.0,0
1,1766595122-2,https://www.just-eat.ch/lieferservice/essen/zu...,Black Tap Craft Burgers & Beer Zurich,4.3,"Amerikanisch, Burger","Min. 18,00CHF",18.0,0
2,1766595122-3,https://www.just-eat.ch/lieferservice/essen/zu...,Bierhalle Wolf,4.1,"Snacks, Getränke/Snacks","Min. 18,00CHF",18.0,0
3,1766595122-4,https://www.just-eat.ch/lieferservice/essen/zu...,MrBeast Burger© Zürich,3.3,"Snacks, Amerikanisch","Min. 30,00CHF",30.0,0
4,1766595122-5,https://www.just-eat.ch/lieferservice/essen/zu...,Starbucks Limmatstrasse,3.8,"Frühstück, Kaffee",Kein Mindestbestellwert,0.0,0


In [15]:
# Remove the high_rating flag column from the frame again.
df = df.drop(labels=["high_rating"], axis="columns")
