# Scrape apartment rental listings from wohnverdient.de

In [None]:
from bs4 import BeautifulSoup
from datetime import datetime
from functools import partial
from typing import Iterator
import pandas as pd
import re
import requests
import urllib.parse


In [None]:
# Build the search URL, fetch the result list, then download every linked
# ad page into `pages` as (url, html) tuples for later parsing.
base = "https://wohnverdient.de"
path = "/mietangebote/wohnungen/freiburg-im-breisgau-deutschland?"
# Search radius label -> value sent in the "mile" query parameter
# (presumably the site's internal IDs for each radius -- verify on the site).
radius_map = {
    "1km": 7729,
    "3km": 7730,
    "5km": 7731,
    "10km": 7732,
    "20km": 7733,
    "30km": 7734,
}
params = {
    "mile": radius_map["30km"],
    "basic_rent_min": 900,
    "basic_rent_max": 1500,
    "living_area_from_min": 45,
}
# Seconds to wait for the server; without a timeout `requests` can block
# forever on a stalled connection.
request_timeout = 30

url_params = urllib.parse.urlencode(params)
url = f"{base}{path}{url_params}"
response = requests.get(url, timeout=request_timeout)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
links = soup.find_all("a", class_="tricky-link")

pages = []
for link in links:
    print("Link text:", link.get_text())
    href = link.get("href")
    print("URL:", href)
    if not href:
        # An anchor without a target cannot be fetched.
        continue

    # urljoin copes with absolute hrefs and hrefs lacking a leading slash,
    # which plain string concatenation would turn into broken URLs.
    ad_url = urllib.parse.urljoin(base, href)
    try:
        r = requests.get(ad_url, timeout=request_timeout)
        r.raise_for_status()
    except requests.RequestException as err:
        # One unreachable ad should not abort the whole scrape, and an error
        # page must not be stored as if it were an ad.
        print("Skipping", ad_url, "-", err)
        continue
    pages.append((ad_url, r.text))

In [None]:
def get_div_strings_by_class(s: BeautifulSoup, class_: str) -> list[str]:
    """Collect the stripped text fragments of the first ``<div>`` with ``class_``.

    Args:
        s: Parsed document to search.
        class_: CSS class the ``<div>`` must carry.

    Returns:
        Every string found inside that ``<div>``, each with surrounding
        whitespace removed. Fragments that were whitespace-only come back as
        empty strings; they are not filtered out.

    Raises:
        AttributeError: If ``s.find`` yields ``None`` (no matching ``<div>``).
    """
    container = s.find("div", class_=class_)
    fragments = []
    for text_node in container.find_all(string=True):
        fragments.append(text_node.strip())
    return fragments

def get_re_groups(regex: str, raw_str: str) -> str:
    """Concatenate the capture groups of the first match of ``regex``.

    Args:
        regex: Pattern containing one or more capture groups.
        raw_str: Text to search with ``re.search``.

    Returns:
        All captured groups joined without a separator. Optional groups that
        did not take part in the match contribute nothing. If the pattern
        does not match at all, the empty string is returned.
    """
    match = re.search(regex, raw_str)
    if match is None:
        # No match is treated like "nothing captured" rather than crashing
        # with an AttributeError on None.
        return ""
    return "".join(group or "" for group in match.groups())

def parse_date(date_str: str) -> datetime:
    """Parse a day.month.year date string into a ``datetime``.

    Args:
        date_str: Date written as ``dd.mm.yy`` or ``dd.mm.yyyy``.

    Returns:
        The corresponding naive ``datetime`` at midnight.

    Raises:
        ValueError: If the string matches neither format.
    """
    try:
        return datetime.strptime(date_str, '%d.%m.%y')
    except ValueError:
        # Two-digit parsing rejects four-digit years ("unconverted data
        # remains"), so retry with the long form before giving up.
        return datetime.strptime(date_str, '%d.%m.%Y')

def cleanup(raw_ad: dict[str, list[str]]) -> dict[str, object]:
    """Convert the raw text fragments of one ad page into a flat record.

    Args:
        raw_ad: Mapping with the keys ``"title"``, ``"price"``, ``"seller"``
            and ``"info"``, each holding a list of strings. ``"title"`` must
            contain exactly two strings (title and location); ``"price"`` and
            ``"seller"`` must contain at least one.

    Returns:
        A dict with ``title``, ``location``, ``price`` and ``seller`` plus one
        entry for every label in ``raw_ad["info"]`` that is listed in
        ``info_map``. Values are strings, except for the date fields, which
        are converted by ``parse_date``.
    """
    ad = {}
    ad["title"], ad["location"] = raw_ad["title"]
    # Leading digits plus an optional dot-separated digit group, joined
    # together (presumably a thousands separator, e.g. "1.200" -> "1200").
    ad["price"] = get_re_groups(r'^(\d+)(?:\.(\d+))?', raw_ad["price"][0])
    ad["seller"] = raw_ad["seller"][0]

    # Label on the page -> output key, optionally paired with a handler:
    # a regex string is applied through get_re_groups, a callable is invoked
    # on the raw value, and a bare string key stores the value unchanged.
    info_map = {
        "Veröffentlicht am": ("publish_date", parse_date),
        "Letzte Aktualisierung": ("update_date", parse_date),
        "Wohnfläche": ("space", r'^(\d+)'),
        "Anzahl Zimmer": "rooms",
        "Kaltmiete": ("km", r'^(\d+)'),
        "Nebenkosten": ("nbk", r'^(\d+)'),
        "Warmmiete": ("wm", r'^(\d+)'),
        "Bezugsfrei ab": ("free_from", parse_date),
        "Beschreibung": "description",
        # NOTE(review): this overwrites the location taken from the title
        # block above whenever a "Lage" entry is present -- confirm intended.
        "Lage": "location",
        "Objekttyp": "object_type",
        "Etage": "floor",
        "Ausstattung": "equipment",
        "Ausstattungsmerkmale": "equipment_desc",
        "Baujahr": "construction_year",
    }
    # The info list alternates between "Label:" fragments and their values,
    # so a single shared iterator lets a label consume the fragment after it.
    info_iter = iter(raw_ad["info"])
    for item in info_iter:
        # skip non-keys
        if not item.endswith(':'):
            continue

        # skip keys not in info map
        raw_key = item.rstrip(':')
        if raw_key not in info_map:
            continue

        # process valid keys as needed
        key = info_map[raw_key]
        value = next(info_iter, None)
        if value is None:
            # A label at the very end has no value; stop instead of letting
            # StopIteration escape to the caller.
            break
        if isinstance(key, str):
            ad[key] = value
        elif isinstance(key, tuple):
            handler = key[1]
            if isinstance(handler, str):
                ad[key[0]] = get_re_groups(handler, value)
            elif callable(handler):
                ad[key[0]] = handler(value)
    return ad


# Output field -> CSS class of the <div> whose text fragments feed that field.
data_map = {
    "title": "dtl-title",
    "price": "dtl-price",
    "seller": "seller-name",
    "info": "item-info"
}

# Parse every downloaded ad page into a cleaned record tagged with its URL.
ads = []
for url, page in pages:
    s = BeautifulSoup(page, "html.parser")
    raw_ad = {
        field: get_div_strings_by_class(s, css_class)
        for field, css_class in data_map.items()
    }
    ad = {**cleanup(raw_ad), "url": url}
    ads.append(ad)

In [None]:
# Tabulate the scraped ads, newest first, and show the full table.
df = pd.DataFrame(ads)
df = df.sort_values(by=["publish_date", "update_date"], ascending=False)
# Columns shown first, in this order; "url" always goes last.
first_cols = [
    "publish_date",
    "update_date",
    "title",
    "price",
    "wm",
    "km",
    "nbk",
    "rooms",
    "space",
    "object_type",
    "location",
]
# Keep the remaining columns in their DataFrame order. A set difference here
# would make the column order vary between runs.
pinned_cols = {*first_cols, "url"}
other_cols = [col for col in df.columns if col not in pinned_cols]
cols = first_cols + other_cols + ["url"]
# Ads sharing warm rent, living space and location are treated as the same
# listing; only one row per such group is kept.
df = df[cols].drop_duplicates(subset=["wm", "space", "location"])
# Lift pandas' display limits so no row, column or cell text is truncated.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None
):
    display(df)