In [10]:
import os
from bs4 import BeautifulSoup

html_dir = "html"
offset = 82
amount = 1

# List to store BeautifulSoup objects
soups = dict()

# Loop through each file in the specified directory
for filename in sorted(os.listdir(html_dir))[offset:offset+amount]:
    # Check if the file is an HTML file
    if filename.endswith('.html'):
        # Construct full file path
        file_path = os.path.join(html_dir, filename)
        # Open and read the HTML file
        with open(file_path, 'r', encoding="latin-1") as file:
            # Create a BeautifulSoup object and append it to the list
            soup = BeautifulSoup(file, 'html.parser')
            soups[filename] = soup

In [11]:
len(soups)

309

In [12]:
from dataclasses import dataclass
from typing import List, Optional
import re
from copy import copy

def is_subject(tag):
    return (tag.name == "p" and "Titre2NL" in tag.get('class', [])) or \
        (tag.name == "h2" and tag.get('class', []) == [] and not tag.find('span', attrs={'lang': 'FR'}))

def is_section(tag):
    return (tag.name == "p" and ("Titre1NL" in tag.get('class', []) or "Titre1FR" in tag.get('class', []))) or \
        (tag.name == "h1" and tag.get('class', []))

@dataclass
class Vote:
    session_id: str
    nr_within_session: int = -1
    subject: str = ""

votes: List[Vote] = []
for i, (filename, soup) in enumerate(soups.items()):
    naamstemmingen = soup.find_all(lambda tag: tag.name == "p" and tag.get("class") == ["Titre1NL"] and "Naamstemmingen" in tag.text)
    for naamstemming in naamstemmingen:
        session_id = f"{i+1+offset:04d}"
        print(f"{session_id}: {naamstemming}")
        print("-"*100)

        current_vote: Vote = Vote(session_id=session_id)
        last_tag_was_subject = False
        for tag in naamstemming.next_siblings:
            tag_text = " ".join(tag.strings).lower()

            # A bold (titre2) sentence in dutch signifies a subject of a vote. It can be a new subject
            # or a new line in the current subject. Depends on whether the previous sentence was also a subject
            # or not
            if is_subject(tag):
                # Every subtitle in between "Naamstemmingen" and the next H1 title is considered part of a vote
                # If the previous tag was a subject, this new line is part of the previous one
                print(f"Found SUBTITLE: {tag_text}")
                if last_tag_was_subject:
                    current_vote.subject += tag_text + "\n"
                else:
                    # If not: consider this to be a new vote, close off the old one and reset
                    if current_vote.subject and current_vote.nr_within_session != -1:
                        votes.append(copy(current_vote))
                    current_vote = Vote(session_id=session_id, subject=tag_text + "\n")
                last_tag_was_subject = True
            elif is_section(tag):
                # If we hit the next Title, the voting section is done
                # This should never happen right after a subtitle, otherwise we won't have the vote nr anyway
                # However, we're searching for "naamstemmingen" so it might happen the french version is next immediately below
                # it's also a title, so skip that
                if "votes nominatifs" in tag_text:
                    last_tag_was_subject = False
                    continue
                # If this is truly a next title, close off the current vote and break the loop
                if current_vote.subject and current_vote.nr_within_session != -1:
                    votes.append(copy(current_vote))
                current_vote = Vote(session_id=session_id, subject=tag_text + "\n")
                break
            else:
                # In this case it's normal text
                # First, check if it contains the vote nr (aka we're in between vote headers)
                vote_match = re.search(r'\(stemming\/vote(?:\s|&nbsp;)* *\d+\)', tag_text, re.IGNORECASE)
                if vote_match:
                    # Extract the matched text
                    matched_text = vote_match.group()
                    print(f"Found VOTE: {matched_text}")
                    # Extract the number, assuming it's the last part of the matched string
                    number = int(re.search(r'\d+', matched_text).group())
                    # Set the current vote to that number
                    current_vote.nr_within_session = number
                last_tag_was_subject = False
            
        # If this is truly a next title, close off the current vote and break the loop
        if current_vote.subject and current_vote.nr_within_session != -1:
            votes.append(copy(current_vote))
        

        
        print("\n\n")
        if i == 10:
            break

0004: <p class="Titre1NL"><a name="_Toc21073224"><span lang="FR-BE" style="mso-ansi-language:
FR-BE">Naamstemmingen</span></a><span lang="FR-BE" style="mso-ansi-language:FR-BE"><o:p></o:p></span></p>
----------------------------------------------------------------------------------------------------
Found SUBTITLE: 09 aangehouden amendementen en
artikelen van het wetsontwerp tot opening van voorlopige kredieten voor de
maanden augustus, september en oktober 2019 (25/1-5)
Found VOTE: (stemming/vote 1)
Found VOTE: (stemming/vote 2)
Found VOTE: (stemming/vote 3)
Found VOTE: (stemming/vote 4)
Found VOTE: (stemming/vote 5)
Found VOTE: (stemming/vote 6)
Found VOTE: (stemming/vote 7)
Found VOTE: (stemming/vote 8)
Found VOTE: (stemming/vote 9)
Found VOTE: (stemming/vote 10)
Found VOTE: (stemming/vote 11)
Found VOTE: (stemming/vote 12)
Found SUBTITLE: 10 geheel van het wetsontwerp
tot opening van voorlopige kredieten voor de maanden augustus, september en
oktober 2019 (25/4+1)
Found VOTE: (stem

In [13]:
votes

[Vote(session_id='0004', nr_within_session=12, subject='09 aangehouden amendementen en\nartikelen van het wetsontwerp tot opening van voorlopige kredieten voor de\nmaanden augustus, september en oktober\xa02019 (25/1-5)\n'),
 Vote(session_id='0004', nr_within_session=13, subject='10 geheel van het wetsontwerp\ntot opening van voorlopige kredieten voor de maanden augustus, september en\noktober\xa02019 (25/4+1)\n'),
 Vote(session_id='0005', nr_within_session=1, subject='27 voorstel van resolutie\ninzake steun voor de beslissing van de federale regering de dato\n28\xa0juni\xa02019 waardoor belgië zich ertoe verbindt deel te nemen aan\nhet instex-mechanisme (395/2)\n'),
 Vote(session_id='0006', nr_within_session=1, subject='20 ovse \x96 benoeming van de\nplaatsvervangende leden  (voortzetting)\n'),
 Vote(session_id='0009', nr_within_session=1, subject='18 voorstel tot wijziging van\nartikel\xa0149 van het reglement van de kamer van volksvertegenwoordigers\nbetreffende de samenstelling van