In [2]:
import requests
from bs4 import BeautifulSoup
from pypdf import PdfReader
import pandas as pd
from datetime import date, timedelta

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

In [11]:
def visit_page(url, day, hidden=True):
    """Run the search for a single day and return the rendered result page.

    Opens a fresh Chrome session, selects value "0" in the dropdown ``id3``
    (which makes the Wicket page expand the search form), types ``day`` into
    both the "Von" (``id7``) and "Bis" (``id8``) fields, submits the form and,
    when the search produced results, switches the list to 100 items per page.

    Args:
        url: Address of the search page to open.
        day: Date string typed into both date fields (the calling cell passes
            ``DD.MM.YYYY``).
        hidden: Run Chrome in headless mode when True (default).

    Returns:
        str: The page source after the search results have been rendered.

    Raises:
        selenium.common.exceptions.TimeoutException: if one of the required
            elements does not become available within the 10 second wait.
    """
    # Local import: only this function needs the exception class.
    from selenium.common.exceptions import TimeoutException

    options = Options()
    if hidden:
        options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    # try/finally guarantees the browser process is closed even when a wait
    # times out; otherwise every failed attempt would leak a Chrome instance.
    try:
        wait = WebDriverWait(driver, 10)
        driver.get(url)

        # 1. Select dropdown
        select = Select(wait.until(EC.element_to_be_clickable((By.ID, "id3"))))
        select.select_by_value("0")

        # 2. Wait until Wicket loads the expanded form
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#id1 form")))
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#id1 form")))

        # Set Von
        von = wait.until(EC.element_to_be_clickable((By.ID, "id7")))
        von.clear()
        von.send_keys(day)
        von.send_keys(Keys.TAB)  # Important: trigger onchange event

        # Set Bis
        bis = wait.until(EC.element_to_be_clickable((By.ID, "id8")))
        bis.clear()
        bis.send_keys(day)
        bis.send_keys(Keys.TAB)

        # 3. Re-find the **new** submit button created by Wicket
        search_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#id1 form button[type=submit]"))
        )

        # 4. Scroll the button into view and click it via JS
        #    (forces Wicket to receive the click)
        driver.execute_script("arguments[0].scrollIntoView(true);", search_button)
        driver.execute_script("arguments[0].click();", search_button)

        # 5. Wait until the results container is no longer hidden
        wait.until(lambda d: d.find_element(By.ID, "id2").get_attribute("hidden") is None)

        # Important: Wicket rewrote the DOM → re-locate the dropdown fresh
        items_per_page_dropdown = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "select.itemsperpage"))
        )

        # 6. Only when the search returned entries: change to 100 results so
        #    the list is not cut off after the first ten.
        if "Es wurden keine Einträge gefunden!" not in driver.page_source:
            Select(items_per_page_dropdown).select_by_value("3")  # = "100"

            # Wait for Wicket to reload the results again.
            try:
                wait.until(lambda d: "1 - 10 von" not in d.page_source)
            except TimeoutException:
                # The marker cannot disappear when the list holds exactly ten
                # entries (assumed pager text "1 - 10 von 10"), so a timeout
                # here is not treated as a failure.
                pass

        return driver.page_source
    finally:
        driver.quit()


In [12]:
def get_meta_data(soup_item):
    """Extract the name, key/value metadata and PDF link of one result item.

    Args:
        soup_item: BeautifulSoup tag of a single search result (the calling
            cell passes ``<li>`` elements of the result list).

    Returns:
        tuple: ``(name, data)``. ``data`` contains ``"name"``, one entry per
        complete key/value row (key without its trailing colon) and
        ``"document_link"``. When the item has no usable download link,
        ``"document_link"`` ends with the sentinel ``"No pdf found"``.

    Raises:
        AttributeError: if the item has no ``div.flex-grow-1`` title element.
    """
    data = {}

    # get name
    title_div = soup_item.find("div", class_="flex-grow-1")
    name = title_div.get_text(separator=" ", strip=True)
    data["name"] = name

    # get metadata
    for row in soup_item.find_all("div", class_="keyvalue-row"):
        key_div = row.find("div", class_="keyvalue-key")
        value_div = row.find("div", class_="keyvalue-value")
        if key_div is None or value_div is None:
            # Incomplete row: nothing sensible to store, skip instead of crashing.
            continue
        key = key_div.get_text(strip=True).rstrip(":")
        data[key] = value_div.get_text(" ", strip=True)

    # get pdf link
    base = "https://risi.muenchen.de/risi/"
    link_tag = soup_item.find("a", class_="downloadlink text-nohyphens")
    pdf_href = link_tag.get("href") if link_tag is not None else None
    if not pdf_href:
        # Sentinel text that download_pdf looks for to skip items without a PDF.
        pdf_href = "No pdf found"
    data["document_link"] = base + pdf_href

    return name, data

In [25]:
def download_pdf(name, entry, path):
    """Download the PDF linked in ``entry`` and store its text on the entry.

    The file is saved as ``<path>/<name>.pdf`` with ``/`` and spaces in
    ``name`` replaced by underscores. The text of all pages is joined with
    newlines and written to ``entry['document_content']``.

    Nothing is downloaded when the link contains the sentinel
    ``"No pdf found"``. When the server does not answer with HTTP 200 a
    warning is printed and the entry is left untouched.

    Args:
        name: Item name, used to build the file name.
        entry: Metadata dict of the item; must contain ``'document_link'``.
            Modified in place.
        path: Directory the PDF is written to; created when missing.

    Returns:
        None
    """
    import os  # local import: only needed for the path handling below

    link = entry['document_link']
    if "No pdf found" in link:
        return

    # A timeout keeps one unresponsive request from blocking the whole run.
    response = requests.get(link, timeout=60)
    if response.status_code != 200:
        # Without a fresh file there is nothing to parse, so stop here.
        print(f"WARNING: could not download {link} (HTTP {response.status_code}).")
        return

    os.makedirs(path, exist_ok=True)
    safe_name = name.replace('/', ' ').replace(' ', '_')
    file_path = os.path.join(path, f"{safe_name}.pdf")
    with open(file_path, 'wb') as file:
        file.write(response.content)

    reader = PdfReader(file_path)
    # Collect the text of every page, not just the last one.
    entry['document_content'] = "\n".join(
        page.extract_text() or "" for page in reader.pages
    )


In [26]:
# select dates to load items from
start_date = date(2024, 1, 1)
end_date = date(2024, 6, 1)
delta = timedelta(days=1)
MAX_RETRIES = 3  # extra attempts per day before the day is given up

# load html from all dates
# NOTE(review): the URL carries a jsessionid copied from one browser session;
# presumably the server ignores an expired id — confirm, or remove it.
url = "https://risi.muenchen.de/risi/erweitertesuche?15&objekt=SITZUNGSVORLAGE;jsessionid=A5933341DAEC20EAFAE1BDD677C67FD8?0"

html_pages = []
error_counter = 0
i = 1
current_date = start_date  # keep start_date itself unchanged
while current_date <= end_date:
    current_day = current_date.strftime("%d.%m.%Y")
    print(f"{i}: Downloading {current_day}...", end="")

    try:
        html_page = visit_page(url, current_day, hidden=True)
    except Exception as e:
        # Selenium reports timeouts and browser problems through its own
        # exception classes, so catch broadly and retry the same day.
        error_counter += 1
        print(f" ❌ ({type(e).__name__})")
        if error_counter > MAX_RETRIES:
            print(f"WARNING: {current_date} not fetched successfully.")
            current_date += delta
            i += 1
            error_counter = 0  # the next day starts with a fresh retry budget
        # Nothing was fetched in this pass, so nothing may be appended.
        continue

    print("✅")
    if "schränken Sie Ihre Suche ein" in html_page:
        print("WARNING: Not all items fetched successfully.")

    html_pages.append(html_page)
    current_date += delta
    i += 1
    error_counter = 0


1: Downloading 01.01.2024...✅
2: Downloading 02.01.2024...✅
3: Downloading 03.01.2024...✅
4: Downloading 04.01.2024...✅
5: Downloading 05.01.2024...✅
6: Downloading 06.01.2024...✅
7: Downloading 07.01.2024...✅
8: Downloading 08.01.2024...✅
9: Downloading 09.01.2024...✅
10: Downloading 10.01.2024...✅
11: Downloading 11.01.2024...✅
12: Downloading 12.01.2024...✅
13: Downloading 13.01.2024...✅
14: Downloading 14.01.2024...✅
15: Downloading 15.01.2024...✅
16: Downloading 16.01.2024...✅
17: Downloading 17.01.2024...✅
18: Downloading 18.01.2024...✅
19: Downloading 19.01.2024...✅
20: Downloading 20.01.2024...✅
21: Downloading 21.01.2024...✅
22: Downloading 22.01.2024...✅
23: Downloading 23.01.2024...✅
24: Downloading 24.01.2024...✅
25: Downloading 25.01.2024...✅
26: Downloading 26.01.2024...✅
27: Downloading 27.01.2024...✅
28: Downloading 28.01.2024...✅
29: Downloading 29.01.2024...✅
30: Downloading 30.01.2024...✅
31: Downloading 31.01.2024...✅
32: Downloading 01.02.2024...✅
33: Downloading 0

In [27]:

# Sanity check: number of HTML pages collected by the scraping loop.
len(html_pages)

153

In [28]:
# get all items and fill data dict
data = {}
for j, page in enumerate(html_pages, start=1):
    # Parse each page exactly once.
    soup = BeautifulSoup(page, "html.parser")

    # Result entries carry either the "even" or the "odd" row class.
    soup_items = soup.find_all("li", class_="list-group-item even")
    soup_items += soup.find_all("li", class_="list-group-item odd")

    print("Page", j, "Items:", len(soup_items))

    for item in soup_items:
        name, item_data = get_meta_data(item)
        # Keyed by name: an item whose name was seen before replaces the
        # earlier entry.
        data[name] = item_data

Page 1 Items: 0
Page 2 Items: 0
Page 3 Items: 1
Page 4 Items: 0
Page 5 Items: 0
Page 6 Items: 0
Page 7 Items: 0
Page 8 Items: 1
Page 9 Items: 9
Page 10 Items: 6
Page 11 Items: 2
Page 12 Items: 2
Page 13 Items: 0
Page 14 Items: 0
Page 15 Items: 5
Page 16 Items: 3
Page 17 Items: 4
Page 18 Items: 6
Page 19 Items: 3
Page 20 Items: 0
Page 21 Items: 0
Page 22 Items: 1
Page 23 Items: 7
Page 24 Items: 8
Page 25 Items: 10
Page 26 Items: 5
Page 27 Items: 0
Page 28 Items: 0
Page 29 Items: 2
Page 30 Items: 8
Page 31 Items: 4
Page 32 Items: 4
Page 33 Items: 7
Page 34 Items: 0
Page 35 Items: 0
Page 36 Items: 4
Page 37 Items: 10
Page 38 Items: 5
Page 39 Items: 4
Page 40 Items: 4
Page 41 Items: 0
Page 42 Items: 0
Page 43 Items: 3
Page 44 Items: 0
Page 45 Items: 5
Page 46 Items: 8
Page 47 Items: 2
Page 48 Items: 0
Page 49 Items: 0
Page 50 Items: 0
Page 51 Items: 5
Page 52 Items: 7
Page 53 Items: 6
Page 54 Items: 4
Page 55 Items: 0
Page 56 Items: 0
Page 57 Items: 2
Page 58 Items: 4
Page 59 Items: 6
Page

In [29]:
# Number of distinct items collected (data is keyed by item name).
print(len(data))

452


In [30]:
# Fetch the PDF of every collected item into the "antraege" folder;
# download_pdf stores the extracted text on the entry itself.
for name in data:
    entry = data[name]
    download_pdf(name, entry, "antraege")

In [31]:
# Build the table: one row per item, "Gestellt am" parsed from DD.MM.YYYY
# into datetimes, rows ordered by that date.
df = (
    pd.DataFrame.from_dict(data, orient="index")
    .assign(
        **{
            "Gestellt am": lambda frame: pd.to_datetime(
                frame["Gestellt am"], format="%d.%m.%Y"
            )
        }
    )
    .sort_values("Gestellt am")
)

In [32]:
# create csv
# index=False leaves the DataFrame index (the item names used as dict keys)
# out of the file; the name is still written because get_meta_data also
# stores it under the "name" key.
df.to_csv("data.csv", index=False)

In [33]:
# Re-check the item count after the downloads: download_pdf only adds fields
# to existing entries, so the number of entries should be unchanged.
print(len(data))

452
