In [None]:
import requests
from bs4 import BeautifulSoup
import json
from typing import Dict, List


# CSS class on an overall-legend <li> -> key used in the result dict.
# Checked in this order; the first matching class wins.
_OVERALL_LEGEND_KEYS = (
    ("bt-legend-ja", "yes"),
    ("bt-legend-nein", "no"),
    ("bt-legend-enthalten", "abstain"),
    ("bt-legend-na", "not_voted"),
)

# Substring of the (lower-cased) party legend label -> key used in the result dict.
# Checked in this order; the first matching substring wins.
_PARTY_LABEL_KEYS = (
    ("ja", "yes"),
    ("nein", "no"),
    ("enthalten", "abstain"),
    ("nicht", "not_voted"),  # "Nicht abg."
)


def _text_or_none(node):
    """Return the stripped text of a parsed element, or None if the element is missing."""
    return node.get_text(strip=True) if node else None


def _parse_detail_text(soup) -> str:
    """Join all non-empty paragraphs of the article body with blank lines ("" if no body)."""
    content_div = soup.select_one("article.bt-artikel .bt-standard-content")
    if not content_div:
        return ""
    texts = (p.get_text(strip=True) for p in content_div.find_all("p"))
    return "\n\n".join(text for text in texts if text)


def _parse_overall(soup) -> Dict:
    """Read the overall yes/no/abstain/not_voted counts and the member count."""
    overall = {}

    legend = soup.select_one(".bt-bar-chart .bt-chart-legend")
    if legend:
        for li in legend.find_all("li"):
            count_span = li.find("span")
            if count_span is None:
                # A legend entry without a <span> carries no count; skip it
                # instead of failing on None.get_text().
                continue
            count = count_span.get_text(strip=True)
            css_classes = li.get("class", [])
            for css_class, key in _OVERALL_LEGEND_KEYS:
                if css_class in css_classes:
                    overall[key] = count
                    break

    heading = soup.select_one(".bt-bar-chart .bt-teaser-text-chart h3")
    if heading:
        # The first purely numeric token of the heading is taken as the member count.
        numbers = [tok for tok in heading.get_text(strip=True).split() if tok.isdigit()]
        if numbers:
            overall["members"] = numbers[0]

    return overall


def _parse_by_party(soup) -> List:
    """Read one result dict per `.bt-teaser-chart-solo` block (one block per party)."""
    by_party = []
    for block in soup.select(".bt-teaser-chart-solo"):
        party_name = block.get("data-value", "").strip()

        # Member count: leading numeric token of the <span> inside the party heading.
        party_members = None
        heading = block.select_one("h4.bt-chart-fraktion")
        span = heading.select_one("span") if heading else None
        if span:
            tokens = span.get_text(strip=True).split()
            if tokens and tokens[0].isdigit():
                party_members = tokens[0]

        # Legend entries are parsed as "<count> <label ...>".
        # NOTE(review): get_text(strip=True) joins nested text nodes without a
        # separator, so entries whose count sits in its own child tag would not
        # split into two parts and are skipped -- confirm against the live markup.
        results = {}
        legend = block.select_one(".bt-chart-legend")
        if legend:
            for li in legend.find_all("li"):
                parts = li.get_text(strip=True).split()
                if len(parts) <= 1:
                    continue
                count, label = parts[0], parts[1].lower()
                for needle, key in _PARTY_LABEL_KEYS:
                    if needle in label:
                        results[key] = count
                        break

        by_party.append({"party": party_name, "members": party_members, **results})
    return by_party


def _parse_links(soup) -> List:
    """Collect {"url", "title"} for every anchor in the link list ([] if there is none)."""
    links = []
    link_list = soup.select_one(".bt-linkliste")
    if link_list:
        for li in link_list.find_all("li"):
            anchor = li.select_one("a")
            if anchor:
                links.append(
                    {"url": anchor.get("href", ""), "title": anchor.get("title", "")}
                )
    return links


def scrape_voting_behavior(vote_id: str, timeout: float = 30.0) -> Dict:
    """
    Scrapes voting behavior data from the Bundestag website for a specific vote ID,
    capturing date, title, descriptive text, overall results, and results by party.

    Args:
        vote_id: ID of the vote, inserted into the ``id`` query parameter of the URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block indefinitely.

    Returns:
        A dict with the keys ``id``, ``url``, ``date``, ``title``, ``subtitle``,
        ``detail_text``, ``links`` and ``voting_results``. ``date``, ``title`` and
        ``subtitle`` are ``None`` when the corresponding element is missing;
        ``detail_text`` is ``""`` and ``links`` is ``[]`` in that case.
        ``voting_results`` holds ``overall`` (a dict) and ``by_party`` (a list of
        dicts). All counts are kept as the strings found on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = (
        f"https://www.bundestag.de/parlament/plenum/abstimmung/abstimmung?id={vote_id}"
    )

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    return {
        "id": vote_id,
        "url": url,
        "date": _text_or_none(soup.select_one("span.bt-date")),
        "title": _text_or_none(soup.select_one("h1.bt-artikel__title")),
        # The subtitle is the first <p> found anywhere inside the article.
        "subtitle": _text_or_none(soup.select_one("article.bt-artikel p")),
        "detail_text": _parse_detail_text(soup),
        "links": _parse_links(soup),
        "voting_results": {
            "overall": _parse_overall(soup),
            "by_party": _parse_by_party(soup),
        },
    }

In [None]:
# Iterates from 939 down to 1 and scrapes each vote. Writes the results to a file.
import os
from time import sleep

# exist_ok makes this a no-op when the folder is already there, replacing the
# separate exists() check.
os.makedirs("../votes", exist_ok=True)

# range() excludes its stop value, so the stop must be 0 for vote 1 to be included.
for vote_id in range(939, 0, -1):
    data = scrape_voting_behavior(str(vote_id))
    # One JSON file per vote; ensure_ascii=False keeps non-ASCII characters readable.
    with open(f"../votes/vote_{vote_id}.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    # Wait 30 seconds before requesting the next vote.
    sleep(30)

In [None]:
# Load the stored votes 377..939 and collect the length of each detail text,
# remembering which vote has the longest one.
chars = []
longest = 0
longest_id = 0

for vote_id in range(377, 940):
    with open(f"../votes/vote_{vote_id}.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    text_length = len(data["detail_text"])
    chars.append(text_length)

    # Strict comparison: on a tie the earliest vote ID is kept.
    if text_length > longest:
        longest, longest_id = text_length, vote_id

# Summary statistics over the collected lengths.
print(f"Average number of characters per vote: {sum(chars) / len(chars)}")
print(f"Longest vote: {max(chars)}")
print(f"Shortest vote: {min(chars)}")

# Show the full detail text of the longest vote.
with open(f"../votes/vote_{longest_id}.json", "r", encoding="utf-8") as f:
    data = json.load(f)
print(data["detail_text"])