## Cleaning & parsing
    * Cleans the data in berlin_venues_raw, which was created by the venues_scraper

In [21]:
# Read the raw scraped venues CSV into the working DataFrame 'df'
df = pd.read_csv(
    filepath_or_buffer=r'/Users/giovanigoltara/Documents/webeet/layered-populate-data-pool-da/venues/sources/berlin_venues_raw.csv',
)

In [22]:
# Remove rows that lack a 'name' or 'district' value, then discard rows that
# are exact duplicates across all columns.
df = df.dropna(subset=['name', 'district']).drop_duplicates()

In [23]:
# Opening Hours parser code
import re
from datetime import datetime

# Two-letter weekday codes, Monday first; the list order is what day ranges
# such as "Mo-Fr" are expanded against.
DAY_ORDER = [
    "Mo",
    "Tu",
    "We",
    "Th",
    "Fr",
    "Sa",
    "Su",
]

def _normalize_day_text(s: str) -> str:
    t = (s or "").strip()
    t = t.replace("–", "-").replace("—", "-").replace("−", "-").replace(" to ", "-")
    low = t.lower()
    repl = [
        (r'\bpublic\s*holidays?\b', 'PH'), (r'\bph\b', 'PH'),
        (r'\bmonday\b', 'Mo'), (r'\bmon\b', 'Mo'), (r'\bmo\b', 'Mo'),
        (r'\btuesday\b', 'Tu'), (r'\btues\b', 'Tu'), (r'\btue\b', 'Tu'), (r'\btu\b', 'Tu'),
        (r'\bwednesday\b', 'We'), (r'\bweds\b', 'We'), (r'\bwed\b', 'We'), (r'\bwe\b', 'We'),
        (r'\bthursday\b', 'Th'), (r'\bthurs\b', 'Th'), (r'\bthur\b', 'Th'), (r'\bthu\b', 'Th'),
        (r'\bfriday\b', 'Fr'), (r'\bfri\b', 'Fr'), (r'\bfr\b', 'Fr'),
        (r'\bsaturday\b', 'Sa'), (r'\bsat\b', 'Sa'), (r'\bsa\b', 'Sa'),
        (r'\bsunday\b', 'Su'), (r'\bsun\b', 'Su'), (r'\bsu\b', 'Su'),
    ]
    for pat, rep in repl:
        low = re.sub(pat, rep, low)
    return re.sub(r'\s+', ' ', low).strip()

def _expand_days_token(tok: str):
    """Expand one day token into a list of day codes.

    * ``"PH"`` yields ``["PH"]``.
    * A single code from ``DAY_ORDER`` yields a one-element list.
    * A range ``"A-B"`` yields every code from A through B in ``DAY_ORDER``
      order, wrapping past the end of the week when A comes after B
      (e.g. ``"Sa-Mo"``).
    * Anything else (empty token, unknown code) yields ``[]``.
    """
    token = tok.strip()
    if token == "PH":
        return ["PH"]

    start, dash, end = token.partition("-")
    if not dash:
        # Not a range: accept the token only if it is a known day code.
        return [token] if token in DAY_ORDER else []

    start, end = start.strip(), end.strip()
    if start not in DAY_ORDER or end not in DAY_ORDER:
        return []

    first, last = DAY_ORDER.index(start), DAY_ORDER.index(end)
    if first <= last:
        return DAY_ORDER[first:last + 1]
    # Range wraps around the end of the week.
    return DAY_ORDER[first:] + DAY_ORDER[:last + 1]

def _parse_time_value(t: str) -> str:
    t = t.strip()
    if t.lower() in {"midnight", "24", "24:00"}: return "00:00"
    if t.lower() in {"noon", "12pm"}: return "12:00"
    if re.match(r'^\d{1,2}:\d{1,2}$', t):
        h, m = t.split(":"); return f"{int(h):02d}:{int(m):02d}"
    if re.match(r'^\d{1,2}$', t):
        return f"{int(t):02d}:00"
    return t

def _parse_segment(seg: str):
    """Split one opening-hours segment into its days and its time entries.

    The segment is first normalised with ``_normalize_day_text``. The text
    before the start of the times is read as a comma-separated list of day
    tokens (each expanded with ``_expand_days_token``); if that part is
    empty, the times apply to every day in ``DAY_ORDER``.

    Returns:
        ``(days, times)`` where ``days`` is a list of day codes and ``times``
        is a list of two-element lists:
          * ``[start, end]`` for a ``start-end`` range,
          * ``[start, "late"]`` for an open-ended ``start+`` entry,
          * ``["closed", "closed"]`` for ``closed`` / ``off``,
          * ``[time, ""]`` for a single time without an end.
    """
    seg = _normalize_day_text(seg)

    # The times normally start at the first digit. A segment without any digit
    # (e.g. "Su closed") would otherwise keep its keyword inside the day part
    # and produce no times at all, so fall back to the first time keyword.
    boundary = (
        re.search(r'\d', seg)
        or re.search(r'\b(?:closed|off|noon|midnight)\b', seg)
    )
    if boundary:
        # Strip separators left before the times, e.g. "Mo-Fr:" or "Mo, Tu,".
        day_part = seg[:boundary.start()].strip().rstrip(",:").strip()
        times_part = seg[boundary.start():].strip()
    else:
        day_part, times_part = seg, ""

    if day_part:
        days = []
        for tok in day_part.split(","):
            # Empty or unknown tokens expand to [] and are thereby ignored.
            days += _expand_days_token(tok)
    else:
        days = DAY_ORDER[:]

    times = []
    if times_part:
        for tseg in [x.strip() for x in re.split(r',|\s*/\s*', times_part) if x.strip()]:
            if tseg.endswith("+"):
                times.append([_parse_time_value(tseg[:-1]), "late"])
            elif "-" in tseg:
                a, b = tseg.split("-", 1)
                times.append([_parse_time_value(a), _parse_time_value(b)])
            elif tseg.lower() in {"closed", "off"}:
                times.append(["closed", "closed"])
            else:
                times.append([_parse_time_value(tseg), ""])
    return days, times

def opening_hours_to_dict(text: str):
    """Convert a free-text opening-hours string into a per-day dictionary.

    Returns ``None`` when *text* is not a string, is blank, or contains the
    word "missing" (case-insensitive). Otherwise the text is split into
    segments on ``;``, ``|`` or ``·``; each segment is parsed with
    ``_parse_segment`` and its time entries are appended to the list stored
    under every day the segment names. Segments that yield no times are
    ignored, so the returned dict may be empty.
    """
    if not isinstance(text, str):
        return None
    stripped = text.strip()
    if stripped == "" or "missing" in text.lower():
        return None

    pieces = [part.strip() for part in re.split(r';|\||·', text) if part.strip()]
    # Text made up solely of separators is parsed as a single segment.
    segments = pieces or [stripped]

    result = {}
    for segment in segments:
        days, times = _parse_segment(segment)
        if times:
            for day in days:
                result.setdefault(day, []).extend(times)
    return result

# --- Parse every 'opening_hours' value and store the result in a new column ---
df["opening_hours_dict"] = df["opening_hours"].map(opening_hours_to_dict)

In [27]:
# Normalise whitespace in the phone column: trim both ends, then squeeze
# every run of whitespace down to a single space.
df["phone"] = df["phone"].str.strip().str.replace(r"\s+", " ", regex=True)

In [28]:
# Write the cleaned DataFrame to a new CSV file, leaving out the index column
df.to_csv(
    path_or_buf=r'/Users/giovanigoltara/Documents/webeet/layered-populate-data-pool-da/venues/sources/berlin_venues_cleaned.csv',
    index=False,
)