## Functions to parse raw listing info into type, price, rooms, surface area, floor, address, and postal code, with cleaning and normalization

In [None]:
# Imports
import pandas as pd
import re

# --- 1. Normalize Property Types ---
def normalize_type(raw_type):
    """
    Standardizes property type descriptions.

    Matching is case-insensitive and substring-based; the first rule that
    matches wins, in this order: Maisonette, Haus, Wohnung, Loft, Bungalow,
    Studio. Anything else is returned with only its first letter upper-cased
    (``str.capitalize``).

    Args:
        raw_type: Property type text, or a missing value (None / NaN).

    Returns:
        The normalized type label, or None when the input is missing,
        not a string, or blank.
    """
    # pandas represents missing values as float NaN, which is truthy and has
    # no .lower(); treat every non-string input as missing.
    if not isinstance(raw_type, str):
        return None
    raw_type = raw_type.strip().lower()
    if not raw_type:
        return None
    if "maisonette" in raw_type:
        return "Maisonette"
    # NOTE(review): "rheinhaus" looks like a typo for "reihenhaus" — confirm
    # against the data before changing the keyword list.
    elif any(x in raw_type for x in ["stadthaus", "doppelhaushälfte", "reihenmittelhaus", "einfamilienhaus", "rheinhaus"]):
        return "Haus"
    elif any(x in raw_type for x in ["terrassenwohnung", "penthouse", "wohnung"]):
        return "Wohnung"
    elif "loft" in raw_type:
        return "Loft"
    elif "bungalow" in raw_type:
        return "Bungalow"
    elif "studio" in raw_type:
        return "Studio"
    else:
        return raw_type.capitalize()

# --- 2. Regex Patterns ---
TYPE_PATTERN = r'^(Wohnung|Haus|Bungalow|Studio|WG|Maisonette|Stadthaus|Doppelhaushälfte|Reihenmittelhaus|Einfamilienhaus|Terrassenwohnung|Penthouse|Loft)'
# The optional ",<digits>" tail consumes a cents part, so "1.234,56 €" captures
# "1.234" instead of the search falling through to the trailing "56".
PRICE_PATTERN = r'(\d{1,3}(?:\.\d{3})*|\d+)(?:,\d+)?\s*€'
ROOMS_PATTERN = r'(\d+(?:,\d)?)\s*Zimmer'
SURFACE_PATTERN = r'(\d+(?:,\d+)?)\s*m²'
# Group 1 is the floor number. The keyword alternatives are word-bounded so
# that, under IGNORECASE, "eg" inside a word such as "Waldweg" is not matched.
FLOOR_PATTERN = r'(\d+)(?:/\d+)?\.?\s*Geschoss|\b(?:EG|Souterrain|Keller)\b'
FREI_AB_PATTERN = r'frei ab (sofort|\d{2}[./]\d{2}[./]\d{2,4})'

# --- 3. Parse a Single Row ---
def parse_row(text):
    """
    Extracts structured info from a listing's text block.

    Args:
        text: Raw listing text. Non-string values (None, NaN) are treated as
            an empty listing.

    Returns:
        pd.Series with the keys ``type``, ``price_euro`` (int),
        ``number_of_rooms`` (float), ``surface_m2`` (float), ``floor``
        (0 for "EG", the floor number as a string for "<n>. Geschoss",
        otherwise the matched keyword), ``street``, ``house_number``,
        ``city_region`` and ``postal_code`` (string). Fields that cannot be
        extracted are None.
    """
    type_val = price_val = rooms_val = surface_val = floor_val = None
    street = house_number = city_region = postal_code = None

    # Missing cells arrive as None / float NaN and have no string methods.
    if isinstance(text, str):
        text = text.replace('\xa0', ' ').strip()
        # Drop the availability note first so its date digits cannot be
        # mistaken for other numeric fields.
        text = re.sub(FREI_AB_PATTERN, '', text, flags=re.IGNORECASE).strip()

        # Property type (must be the first token of the text)
        type_match = re.search(TYPE_PATTERN, text)
        if type_match:
            type_val = type_match.group(1)

        # Price: German thousands separators are dots; cents are ignored
        price_match = re.search(PRICE_PATTERN, text)
        if price_match:
            price_val = int(price_match.group(1).replace('.', ''))

        # Number of rooms (decimal comma -> decimal point)
        rooms_match = re.search(ROOMS_PATTERN, text)
        if rooms_match:
            rooms_val = float(rooms_match.group(1).replace(',', '.'))

        # Surface area (decimal comma -> decimal point)
        surface_match = re.search(SURFACE_PATTERN, text)
        if surface_match:
            surface_val = float(surface_match.group(1).replace(',', '.'))

        # Floor: a standalone "EG" anywhere in the text takes priority and
        # maps to 0. It must be a whole word so names like "DEGEWO" don't count.
        if re.search(r'\bEG\b', text):
            floor_val = 0
        else:
            floor_match = re.search(FLOOR_PATTERN, text, flags=re.IGNORECASE)
            if floor_match:
                if floor_match.group(1):
                    # Use the captured number so "3/5. Geschoss" and
                    # "3.Geschoss" both yield "3".
                    floor_val = floor_match.group(1)
                else:
                    keyword = floor_match.group(0)
                    floor_val = 0 if keyword.upper() == "EG" else keyword

        # Postal code: the last standalone 5-digit number. The lookarounds
        # reject 5-digit slices of longer numbers, and scanning all matches
        # finds the last one even when the text spans several lines.
        postal_matches = list(re.finditer(r'(?<!\d)\d{5}(?!\d)', text))
        if postal_matches:
            last_postal = postal_matches[-1]
            postal_code = last_postal.group(0)
            before_postal = text[:last_postal.start()].strip()

            # Heuristically infer city/region (last 2 words before postal code)
            city_region = " ".join(before_postal.split()[-2:]) or None

            # Extract street + house number
            street_match = re.search(
                r'([A-Za-zÄÖÜäöüß\-\s]+(?:straße|str\.?|Str\.?))\s+(\d+\w*)',
                before_postal, flags=re.IGNORECASE
            )
            if street_match:
                street = street_match.group(1).strip()
                house_number = street_match.group(2).strip()

    return pd.Series({
        'type': type_val,
        'price_euro': price_val,
        'number_of_rooms': rooms_val,
        'surface_m2': surface_val,
        'floor': floor_val,
        'street': street,
        'house_number': house_number,
        'city_region': city_region,
        'postal_code': postal_code
    })

# --- 4. Main Cleaning Function ---
def clean_listings(df, raw_info_col):
    """
    Applies regex parsing to each row and splits out region/city.

    Args:
        df: DataFrame holding the raw listing text. It is not modified; a new
            frame is returned.
        raw_info_col: Name of the column containing the raw listing text.

    Returns:
        A new DataFrame with the columns produced by ``parse_row`` appended,
        plus ``region`` (all but the last word of ``city_region``) and
        ``city`` (its last word). ``street`` has any "Geschoss" residue removed.
    """
    parsed = df[raw_info_col].apply(parse_row)
    if not isinstance(parsed, pd.DataFrame):
        # Series.apply on an empty column returns an empty Series, not a
        # frame; build an empty frame with the parsed columns so the steps
        # below still find them.
        parsed = pd.DataFrame(columns=parse_row("").index, index=df.index)
    df = pd.concat([df, parsed], axis=1)

    def split_region_city(text):
        # Returns (region, city); both None when nothing usable is present.
        if pd.isna(text):
            return None, None
        parts = text.split()
        if not parts:
            return None, None
        if len(parts) == 1:
            return None, parts[0]
        return " ".join(parts[:-1]), parts[-1]

    # Plain lists (rather than apply -> DataFrame) also work for zero rows.
    region_city = [split_region_city(value) for value in df['city_region']]
    df['region'] = [region for region, _ in region_city]
    df['city'] = [city for _, city in region_city]

    # parse_row strips the street, so a floor remnant can sit at the very
    # start ("Geschoss Hauptstraße"); match it with or without leading
    # whitespace.
    df['street'] = df['street'].str.replace(r"\.?\s*\bGeschoss\b", "", regex=True).str.strip()
    return df

# --- 5. Apply Cleaning and Post-Processing ---
# Clean and parse raw listing info
cleaned = clean_listings(data, 'raw_info')

# Remove placeholder/image rows (case-insensitive; missing text is kept)
cleaned = cleaned.loc[
    ~cleaned['raw_info'].str.contains('image-placeholder', case=False, na=False)
].copy()

# Mark first tenant: "yes" when the raw text mentions "Erstbezug", else "no"
cleaned["first_tenant"] = (
    cleaned["raw_info"]
    .str.contains("Erstbezug", case=False, na=False)
    .map({True: "yes", False: "no"})
)

# Drop unnecessary columns (no-op when the column is absent)
cleaned = cleaned.drop(columns=["Unnamed: 0"], errors="ignore")

# Ensure city is set
def fix_city(df):
    """
    Fill missing ``city`` values with "Berlin" where the raw text mentions it.

    A row is updated only when its ``city`` is missing and its ``raw_info``
    contains "Berlin" (case-insensitive). The frame is modified in place and
    also returned.
    """
    city_missing = df['city'].isna()
    mentions_berlin = df['raw_info'].str.contains("Berlin", case=False, na=False)
    df.loc[city_missing & mentions_berlin, 'city'] = "Berlin"
    return df

cleaned = fix_city(cleaned)

# --- 6. Optionally, Normalize Property Types ---
# Element-wise mapping of the parsed type onto its normalized label.
cleaned['type_normalized'] = cleaned['type'].map(normalize_type)
