In [1]:
import os
from bs4 import BeautifulSoup

html_dir = "html"
offset = 19
amount = 1

# List to store BeautifulSoup objects
soups = dict()

# Loop through each file in the specified directory
for filename in sorted(os.listdir(html_dir))[offset:offset+amount]:
    # Check if the file is an HTML file
    if filename.endswith('.html'):
        # Construct full file path
        file_path = os.path.join(html_dir, filename)
        # Open and read the HTML file
        with open(file_path, 'r', encoding="latin-1") as file:
            # Create a BeautifulSoup object and append it to the list
            soup = BeautifulSoup(file, 'html.parser')
            soups[filename] = soup

In [2]:
len(soups)

1

# Capture the vote subjects

In [6]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import re
from copy import copy

def is_subject(tag):
    return (tag.name == "p" and "Titre2NL" in tag.get('class', [])) or \
        (tag.name == "h2" and tag.get('class', []) == [] and not tag.find('span', attrs={'lang': 'FR'}))

def is_section(tag):
    return (tag.name == "p" and ("Titre1NL" in tag.get('class', []) or "Titre1FR" in tag.get('class', []))) or \
        (tag.name == "h1" and tag.get('class', []))

@dataclass
class Vote:
    session_id: str
    nr_within_session: int = -1
    subject: str = ""

votes: Dict[Tuple[str, int], Vote] = dict()
def close_vote_and_start_new(Vote, votes, session_id, current_vote, tag_text):
    if current_vote.subject and current_vote.nr_within_session != -1:
        votes[(session_id, current_vote.nr_within_session)] = copy(current_vote)
    current_vote = Vote(session_id=session_id, subject=tag_text + "\n")
    return current_vote

for i, (filename, soup) in enumerate(soups.items()):
    naamstemmingen = soup.find_all(lambda tag: ((tag.name == "p" and tag.get("class") == ["Titre1NL"]) or (tag.name == "h1")) and "naamstemmingen" in tag.text.lower().strip())
    for naamstemming in naamstemmingen:
        session_id = f"{i+1+offset:04d}"
        print(f"{session_id}: {naamstemming}")
        print("-"*100)

        current_vote: Vote = Vote(session_id=session_id)
        last_tag_was_subject = False
        for j, tag in enumerate(naamstemming.next_siblings):
            tag_text = " ".join(tag.strings).lower()

            # Next siblings also produces a nonsense string (like \n) after every real tag
            # So skip all even next tags, which will be nonsense (but annoyingly, a different nonsense every document)
            if j % 2 == 0:
                continue

            # A bold (titre2) sentence in dutch signifies a subject of a vote. It can be a new subject
            # or a new line in the current subject. Depends on whether the previous sentence was also a subject
            # or not
            if is_subject(tag):
                # Every subtitle in between "Naamstemmingen" and the next H1 title is considered part of a vote
                # If the previous tag was a subject, this new line is part of the previous one
                print(f"\nFound SUBTITLE: {tag_text}")
                if last_tag_was_subject:
                    current_vote.subject += tag_text + "\n"
                else:
                    # If not: consider this to be a new vote, close off the old one and reset
                    current_vote = close_vote_and_start_new(Vote, votes, session_id, current_vote, tag_text)
                last_tag_was_subject = True
            elif is_section(tag):
                # If we hit the next Title, the voting section is done
                # This should never happen right after a subtitle, otherwise we won't have the vote nr anyway
                # However, we're searching for "naamstemmingen" so it might happen the french version is next immediately below
                # it's also a title, so skip that
                if "votes nominatifs" in tag_text:
                    last_tag_was_subject = False
                    continue
                # If this is truly a next title, close off the current vote and break the loop
                current_vote = close_vote_and_start_new(Vote, votes, session_id, current_vote, tag_text)
                break
            else:
                # In this case it's normal text
                # First, check if it contains the vote nr (aka we're in between vote headers)
                vote_match = re.search(r'\(stemming\/vote(?:\s|&nbsp;)* *\d+\)', tag_text, re.IGNORECASE)
                if vote_match:
                    # Extract the matched text
                    matched_text = vote_match.group()
                    print(f"Found VOTE: {matched_text}")
                    # Extract the number, assuming it's the last part of the matched string
                    number = int(re.search(r'\d+', matched_text).group())
                    # Set the current vote to that number
                    current_vote.nr_within_session = number
                last_tag_was_subject = False
            
        # If this is truly a next title, close off the current vote and break the loop
        close_vote_and_start_new(Vote, votes, session_id, current_vote, tag_text)

0020: <p class="Titre1NL"><a name="_Toc35851493"><span lang="FR" style="mso-ansi-language:
FR">Naamstemmingen</span></a><span lang="FR" style="mso-ansi-language:FR"><o:p></o:p></span></p>
----------------------------------------------------------------------------------------------------

Found SUBTITLE: 14 aangehouden amendementen op
het voorstel van resolutie over de toetreding van belgië tot het internationaal
verbodsverdrag op kernwapens (372/1-7)
Found VOTE: (stemming/vote 1)
Found VOTE: (stemming/vote 2)
Found VOTE: (stemming/vote 3)
Found VOTE: (stemming/vote 4)
Found VOTE: (stemming/vote 5)
Found VOTE: (stemming/vote 6)
Found VOTE: (stemming/vote 7)
Found VOTE: (stemming/vote 7)
0020: <p class="Titre1NL"><a name="_Toc35851499"><span lang="NL">Naamstemmingen <i>(voortzetting)</i></span></a></p>
----------------------------------------------------------------------------------------------------

Found SUBTITLE: 18 geheel van het voorstel van
resolutie over de toetreding van belgi

In [7]:
votes

{('0020',
  7): Vote(session_id='0020', nr_within_session=7, subject='14 aangehouden amendementen op\nhet voorstel van resolutie over de toetreding van belgië tot het internationaal\nverbodsverdrag op kernwapens (372/1-7)\n'),
 ('0020',
  8): Vote(session_id='0020', nr_within_session=8, subject='18 geheel van het voorstel van\nresolutie over de toetreding van belgië tot het internationaal verbods\xadverdrag\nop kernwapens, zoals geamendeerd (372/6+7)\n')}

# Capture the date of the session

In [8]:
re.search(
    r"([1-9]|[1-2][0-9]|3[0-1])[\s\\n]*(januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december)[\s\\n]*(\d{4})",
    "\n".join(list(soup.strings)),
    re.IGNORECASE
).group().replace("\n", "")

'16 januari 2020'

In [9]:
for soup in soups.values():
    print(re.search(
        r"([1-9]|[1-2][0-9]|3[0-1])[\s\\n]*(januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december)[\s\\n]*(\d{4})",
        "\n".join(list(soup.strings)),
        re.IGNORECASE
    ).group())
    print("\n")

16 
januari 
2020




# Capture the names from the vote details

In [11]:
import logging


for i, (filename, soup) in enumerate(soups.items()):
    session_id = f"{i+1+offset:04d}"
    vote_details = soup.find(lambda tag: ((tag.name == "p" and tag.get("class") == ["Titre1NL"]) or (tag.name == "h1")) and "detail" in tag.text.lower() and "naamstemmingen" in tag.text.lower())
    if vote_details:
        if vote_details.name == "p" and vote_details.get("class", [""])[0].startswith("Titre1"):
            vote_tag = vote_details
        else:
            # Other vote details are inside a table tag, that table tag is on the same level as the vote details, so to use next_siblings
            # we want to go up to the table tag
            vote_tag = None
            for parent in vote_details.parents:
                if parent.name == "table" or ():
                    vote_tag = parent
        
        if vote_tag is None:
            raise ValueError("The naamstemmingen details are not in the expected place!")
        
        # Now iterate over the tags on this level, this should include either a table of results (yes, no or abstention) as well as the list of names
        current_vote = None
        for tag in vote_tag.next_siblings:
            tag_text = " ".join(tag.strings).lower()
            stemming_match = re.search(r"naamstemming: (\d{3})", tag_text, re.IGNORECASE)

            if not stemming_match:
                continue

            stemming_nr = stemming_match.groups()[0]
            if stemming_nr:
                current_vote = votes.get((session_id, int(stemming_nr)))
                if not current_vote:
                    logging.warning(f"Did not find vote {session_id} -> {stemming_nr}")
                    continue
                print((session_id, int(stemming_nr)))

            # Parse the YES names, NO names and ABSTENTION names
            if current_vote:
                print(tag_text)
                if "ja" in tag_text and "oui" in tag_text:
                    print("ja")
                elif "nee" in tag_text and "non" in tag_text:
                    print("nee")
                elif "abstentions" and "onthoudingen" in tag_text:
                    print("onthouden")
                

            

In [51]:
stemming_nr

'002'