## Cleaning & parsing
    * Cleans data from berlin_venues_raw created by the venues_craper

In [254]:
import pandas as pd
import numpy as np
import psycopg2
from sqlalchemy import create_engine, text
import warnings

warnings.filterwarnings("ignore")

In [255]:
# Loaded variable 'df' from URI

df = pd.read_csv(r'/Users/giovanigoltara/Documents/webeet/layered-populate-data-pool-da/venues/sources/berlin_venues_raw.csv')

In [256]:
# Drop rows with missing data in columns: 'name', 'district'
df = df.dropna(subset=['name', 'district'])

# Drop duplicate rows across all columns
df = df.drop_duplicates()

In [257]:
# Opening Hours parser code
import re
from datetime import datetime

DAY_ORDER = ["Mo", "Tu", "We", "Th", "Fr", "Sa", "Su"]

def _normalize_day_text(s: str) -> str:
    t = (s or "").strip()
    t = t.replace("–", "-").replace("—", "-").replace("−", "-").replace(" to ", "-")
    low = t.lower()
    repl = [
        (r'\bpublic\s*holidays?\b', 'PH'), (r'\bph\b', 'PH'),
        (r'\bmonday\b', 'Mo'), (r'\bmon\b', 'Mo'), (r'\bmo\b', 'Mo'),
        (r'\btuesday\b', 'Tu'), (r'\btues\b', 'Tu'), (r'\btue\b', 'Tu'), (r'\btu\b', 'Tu'),
        (r'\bwednesday\b', 'We'), (r'\bweds\b', 'We'), (r'\bwed\b', 'We'), (r'\bwe\b', 'We'),
        (r'\bthursday\b', 'Th'), (r'\bthurs\b', 'Th'), (r'\bthur\b', 'Th'), (r'\bthu\b', 'Th'),
        (r'\bfriday\b', 'Fr'), (r'\bfri\b', 'Fr'), (r'\bfr\b', 'Fr'),
        (r'\bsaturday\b', 'Sa'), (r'\bsat\b', 'Sa'), (r'\bsa\b', 'Sa'),
        (r'\bsunday\b', 'Su'), (r'\bsun\b', 'Su'), (r'\bsu\b', 'Su'),
    ]
    for pat, rep in repl:
        low = re.sub(pat, rep, low)
    return re.sub(r'\s+', ' ', low).strip()

def _expand_days_token(tok: str):
    tok = tok.strip()
    if not tok: return []
    if tok == "PH": return ["PH"]
    if "-" in tok:
        a, b = [x.strip() for x in tok.split("-", 1)]
        if a in DAY_ORDER and b in DAY_ORDER:
            ai, bi = DAY_ORDER.index(a), DAY_ORDER.index(b)
            return DAY_ORDER[ai:bi+1] if ai <= bi else DAY_ORDER[ai:] + DAY_ORDER[:bi+1]
        return []
    return [tok] if tok in DAY_ORDER else []

def _parse_time_value(t: str) -> str:
    t = t.strip()
    if t.lower() in {"midnight", "24", "24:00"}: return "00:00"
    if t.lower() in {"noon", "12pm"}: return "12:00"
    if re.match(r'^\d{1,2}:\d{1,2}$', t):
        h, m = t.split(":"); return f"{int(h):02d}:{int(m):02d}"
    if re.match(r'^\d{1,2}$', t):
        return f"{int(t):02d}:00"
    return t

def _parse_segment(seg: str):
    seg = _normalize_day_text(seg)
    mdig = re.search(r'\d', seg)
    day_part = seg[:mdig.start()].strip().rstrip(",") if mdig else seg
    times_part = seg[mdig.start():].strip() if mdig else ""
    if day_part:
        day_tokens = [p.strip() for p in day_part.split(",") if p.strip()]
        days = []
        for tok in day_tokens:
            days += _expand_days_token(tok)
    else:
        days = DAY_ORDER[:]
    times = []
    if times_part:
        for tseg in [x.strip() for x in re.split(r',|\s*/\s*', times_part) if x.strip()]:
            if tseg.endswith("+"):
                times.append([_parse_time_value(tseg[:-1]), "late"])
            elif "-" in tseg:
                a, b = tseg.split("-", 1)
                times.append([_parse_time_value(a), _parse_time_value(b)])
            else:
                tok = tseg.lower()
                if tok in {"closed", "off"}:
                    times.append(["closed", "closed"])
                else:
                    times.append([_parse_time_value(tseg), ""])
    return days, times

def opening_hours_to_dict(text: str):
    if not isinstance(text, str) or text.strip() == "" or "missing" in text.lower():
        return None
    result = {}
    segments = [s.strip() for s in re.split(r';|\||·', text) if s.strip()]
    if not segments: segments = [text.strip()]
    for seg in segments:
        days, times = _parse_segment(seg)
        if not times: continue
        for d in days:
            result.setdefault(d, []).extend(times)
    return result

# --- APPLY transformation directly to df ---
df["opening_hours_dict"] = df["opening_hours"].apply(opening_hours_to_dict)

In [258]:
# Cleans spaces in the phone column
df["phone"] = df["phone"].apply(lambda x: str(x).replace(" ", "") if pd.notna(x) else np.nan)

In [259]:
# Changes the name of the actual district column to 'neighborhood'
df.rename(columns={"district": "neighborhood"}, inplace=True) 

In [260]:
# Creates a new column 'district' thorugh a lookup dictionary
# Berlin Ortsteil (neighborhood) to official 12 Bezirke mapping

neighborhood_to_district = {
    # Mitte
    "Mitte": "Mitte",
    "Moabit": "Mitte",
    "Tiergarten": "Mitte",
    "Wedding": "Mitte",
    "Gesundbrunnen": "Mitte",
    "Wedding-Mitte": "Mitte",
    "Hansaviertel" : "Mitte",


    # Friedrichshain-Kreuzberg
    "Friedrichshain": "Friedrichshain-Kreuzberg",
    "Kreuzberg": "Friedrichshain-Kreuzberg",
    "Alt-Treptow": "Friedrichshain-Kreuzberg",
    "Oberschöneweide": "Friedrichshain-Kreuzberg", 

    # Pankow
    "Prenzlauer Berg": "Pankow",
    "Weißensee": "Pankow",
    "Pankow": "Pankow",
    "Blankenburg": "Pankow",
    "Heinersdorf": "Pankow",
    "Karow": "Pankow",
    "Niederschönhausen": "Pankow",
    "Rosenthal": "Pankow",
    "Wilhelmsruh": "Pankow",
    "Buch": "Pankow",
    "Französisch Buchholz": "Pankow",
    "Blankenfelde": "Pankow",
    "Buchholz": "Pankow",
    "Stadtrandsiedlung Malchow": "Pankow",
    

    # Charlottenburg-Wilmersdorf
    "Charlottenburg": "Charlottenburg-Wilmersdorf",
    "Wilmersdorf": "Charlottenburg-Wilmersdorf",
    "Schmargendorf": "Charlottenburg-Wilmersdorf",
    "Grunewald": "Charlottenburg-Wilmersdorf",
    "Westend": "Charlottenburg-Wilmersdorf",
    "Halensee": "Charlottenburg-Wilmersdorf",
    "Charlottenburg-Nord": "Charlottenburg-Wilmersdorf",

    # Spandau
    "Spandau": "Spandau",
    "Haselhorst": "Spandau",
    "Siemensstadt": "Spandau",
    "Staaken": "Spandau",
    "Gatow": "Spandau",
    "Kladow": "Spandau",
    "Hakenfelde": "Spandau",
    "Falkenhagener Feld": "Spandau",
    "Wilhelmstadt": "Spandau",


    # Steglitz-Zehlendorf
    "Steglitz": "Steglitz-Zehlendorf",
    "Lichterfelde": "Steglitz-Zehlendorf",
    "Lankwitz": "Steglitz-Zehlendorf",
    "Zehlendorf": "Steglitz-Zehlendorf",
    "Dahlem": "Steglitz-Zehlendorf",
    "Nikolassee": "Steglitz-Zehlendorf",
    "Wannsee": "Steglitz-Zehlendorf",
    "Teltowkanal": "Steglitz-Zehlendorf",
    "Zehlendorf-Mitte": "Steglitz-Zehlendorf", 
    "Schlachtensee": "Steglitz-Zehlendorf",

    # Tempelhof-Schöneberg
    "Schöneberg": "Tempelhof-Schöneberg",
    "Friedenau": "Tempelhof-Schöneberg",
    "Tempelhof": "Tempelhof-Schöneberg",
    "Mariendorf": "Tempelhof-Schöneberg",
    "Marienfelde": "Tempelhof-Schöneberg",
    "Lichtenrade": "Tempelhof-Schöneberg",
    "Tempelhof-Süd": "Tempelhof-Schöneberg",


    # Neukölln
    "Neukölln": "Neukölln",
    "Britz": "Neukölln",
    "Buckow": "Neukölln",
    "Rudow": "Neukölln",
    "Gropiusstadt": "Neukölln",

    # Treptow-Köpenick
    "Alt-Treptow": "Treptow-Köpenick",
    "Plänterwald": "Treptow-Köpenick",
    "Baumschulenweg": "Treptow-Köpenick",
    "Johannisthal": "Treptow-Köpenick",
    "Niederschöneweide": "Treptow-Köpenick",
    "Altglienicke": "Treptow-Köpenick",
    "Adlershof": "Treptow-Köpenick",
    "Bohnsdorf": "Treptow-Köpenick",
    "Oberschöneweide": "Treptow-Köpenick",
    "Köpenick": "Treptow-Köpenick",
    "Friedrichshagen": "Treptow-Köpenick",
    "Rahnsdorf": "Treptow-Köpenick",
    "Grünau": "Treptow-Köpenick",
    "Müggelheim": "Treptow-Köpenick",
    "Schmöckwitz": "Treptow-Köpenick",
    "Königs Wusterhausen": "Treptow-Köpenick",


    # Marzahn-Hellersdorf
    "Marzahn": "Marzahn-Hellersdorf",
    "Biesdorf": "Marzahn-Hellersdorf",
    "Kaulsdorf": "Marzahn-Hellersdorf",
    "Mahlsdorf": "Marzahn-Hellersdorf",
    "Hellersdorf": "Marzahn-Hellersdorf",
    "Falkenberg": "Marzahn-Hellersdorf",


    # Lichtenberg
    "Fennpfuhl": "Lichtenberg",
    "Rummelsburg": "Lichtenberg",
    "Karlshorst": "Lichtenberg",
    "Friedrichsfelde": "Lichtenberg",
    "Lichtenberg": "Lichtenberg",
    "Falkenberg": "Lichtenberg",
    "Malchow": "Lichtenberg",
    "Wartenberg": "Lichtenberg",
    "Neu-Hohenschönhausen": "Lichtenberg",
    "Alt-Hohenschönhausen": "Lichtenberg",


    # Reinickendorf
    "Reinickendorf": "Reinickendorf",
    "Tegel": "Reinickendorf",
    "Konradshöhe": "Reinickendorf",
    "Heiligensee": "Reinickendorf",
    "Frohnau": "Reinickendorf",
    "Hermsdorf": "Reinickendorf",
    "Waidmannslust": "Reinickendorf",
    "Lübars": "Reinickendorf",
    "Wittenau": "Reinickendorf",
    "Märkisches Viertel": "Reinickendorf",
    "Borsigwalde": "Reinickendorf",

}

# Create new standardized 12-district column
df["district"] = df["neighborhood"].map(neighborhood_to_district)

# Find missing values (if any neighborhood not in mapping)
missing = df[df["district"].isna()]["neighborhood"].unique()
print("Unmapped neighborhoods:", missing)
    

Unmapped neighborhoods: ['Villavicencio' 'Kensington']


In [261]:
# delete rows with missing values in 'district' column
df = df.dropna(subset=['district']) 

In [262]:
# Extract the first 5-digit number (German postal code)
df["postal_code"] = (
    df["address"]
    .astype(str)
    .str.extract(r"(\d{5})")[0]
    .fillna("")
    .str.strip()  # simpler than apply(lambda x: x.strip())
)
# Change column type to string for column: 'postal_code'
df = df.astype({'postal_code': 'string'})

# Cleans the postal code column by removing trailing '.0' if present
df["postal_code"] = df["postal_code"].str.replace(r"\.0$", "", regex=True)

In [263]:
# Creates id for each row
df["id"] = ["V" + str(i).zfill(4) for i in range(1, len(df) + 1)]

In [264]:
# Creates column district_id 
# District ID mapping for new database
district_mapping = {
        'Mitte': '01',
        'Friedrichshain-Kreuzberg': '02',
        'Pankow': '03',
        'Charlottenburg-Wilmersdorf': '04',
        'Spandau': '05',
        'Steglitz-Zehlendorf': '06',
        'Tempelhof-Schöneberg': '07',
        'Neukölln': '08',
        'Treptow-Köpenick': '09',
        'Marzahn-Hellersdorf': '10',
        'Lichtenberg': '11',
        'Reinickendorf': '12'  
}
df["district_id"] = df["district"].map(district_mapping).astype(str).str.zfill(2)

# Strip spaces from district names first
df["district_clean"] = df["district"].astype(str).str.strip()

# Map using the dictionary
df["district_id"] = df["district_clean"].map(district_mapping)

# Force as string and ensure 2 digits
df["district_id"] = df["district_id"].astype(str).str.zfill(2)

In [265]:
# Save the cleaned DataFrame to a new CSV file
df.to_csv(r'/Users/giovanigoltara/Documents/webeet/layered-populate-data-pool-da/venues/sources/berlin_venues_cleaned.csv', index=False)

In [266]:
df.head()  # Display the first few rows of the cleaned DataFrame

Unnamed: 0,name,category,cuisine,address,lat,lon,website,phone,opening_hours,takeaway,wheelchair,neighborhood,opening_hours_dict,district,postal_code,id,district_id,district_clean
0,Aida,restaurant,italian;pizza,"Knesebeckstraße , 10623 Berlin",52.506864,13.322859,https://www.aida-restaurant.de/,493031806750.0,Mo-Sa 11:30-21:00; Su 15:00-21:00,,no,Charlottenburg,"{'Mo': [['11:30', '21:00']], 'Tu': [['11:30', ...",Charlottenburg-Wilmersdorf,10623,V0001,4,Charlottenburg-Wilmersdorf
1,Madame Ngo,restaurant,asian,"Kantstraße 30, 10623 Berlin",52.506212,13.318081,,4915753604089.0,"Mo-Sa 12:00-16:30, 18:00-22:00; Su 12:00-16:30...",,limited,Charlottenburg,"{'Mo': [['12:00', '16:30'], ['18:00', '22:00']...",Charlottenburg-Wilmersdorf,10623,V0002,4,Charlottenburg-Wilmersdorf
2,Nam Thuân,restaurant,vietnamese,"Pestalozzistraße 106, 10625 Berlin",52.50732,13.32078,,,,,no,Charlottenburg,,Charlottenburg-Wilmersdorf,10625,V0003,4,Charlottenburg-Wilmersdorf
3,La Rose,restaurant,italian,"Neue Kantstraße 19, 14057 Berlin",52.506318,13.284626,,,,,,Charlottenburg,,Charlottenburg-Wilmersdorf,14057,V0004,4,Charlottenburg-Wilmersdorf
4,Eiscafe Eisberg,cafe,,"Brunnenstraße 55, 13355 Berlin",52.538855,13.396097,http://www.eis-berg.de/,,,,yes,Gesundbrunnen,,Mitte,13355,V0005,1,Mitte


In [267]:
# Change the name of columns wheelchair, lat, lon, district_clean 
df.rename(columns={
    "wheelchair": "wheelchair_accessible",
    "lat": "latitude",
    "lon": "longitude",
    "district_clean": "district"
}, inplace=True)


In [277]:
engine = create_engine(
    "postgresql+psycopg2://neondb_owner:npg_CeS9fJg2azZD"
    "@ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech:5432/neondb"
    "?sslmode=require"
)

In [281]:
# Define the SQL CREATE TABLE statement
create_table_sql = """
CREATE TABLE IF NOT EXISTS venues (
    district_id VARCHAR(2),
    venue_id VARCHAR(10) PRIMARY KEY,
    name VARCHAR(200), 
    category VARCHAR(100),
    cuisine VARCHAR(100),
    phone VARCHAR(50),
    address VARCHAR(200),
    coordinates VARCHAR(200),
    latitude DECIMAL(9,6),
    longitude DECIMAL(9,6),
    website VARCHAR(200),
    opening_hours_dict JSONB,
    opening_hours VARCHAR(500),
    postal_code VARCHAR(10),
    neighborhood VARCHAR(100),
    district VARCHAR(100),
    takeaway VARCHAR(10),
    wheelchair_accessible VARCHAR(10),
    CONSTRAINT district_id_fk
            FOREIGN KEY (district)
            REFERENCES test_berlin_data.districts(district)
            ON DELETE RESTRICT
            ON UPDATE CASCADE
);
"""

# Execute the CREATE TABLE statement
with engine.connect() as conn:
    conn.execute(text(create_table_sql))

print("Table 'venues' created successfully (if it did not exist).")

Table 'venues' created successfully (if it did not exist).


In [282]:
df.to_sql(
    name='berlin_venues',
    con=engine,
    schema='test_berlin_data',
    if_exists='replace',
    index=False
)
print("venues data uploaded successfully!")

DuplicateColumnError: A column with name 'district' is already present in table 'berlin_venues'.