In [23]:
import os
from bs4 import BeautifulSoup

html_dir = "html"
offset = 20
amount = 10

# Parsed documents, keyed by filename (a dict, so insertion order follows the sorted filenames)
soups = dict()

# Keep only the HTML files *before* applying the offset/amount window. Slicing the raw
# directory listing first would let any non-HTML entry consume a slot and shift the window,
# yielding fewer than `amount` documents.
html_filenames = sorted(name for name in os.listdir(html_dir) if name.endswith('.html'))

for filename in html_filenames[offset:offset+amount]:
    # Construct full file path
    file_path = os.path.join(html_dir, filename)
    # Open and parse the HTML file; the files are read as latin-1
    with open(file_path, 'r', encoding="latin-1") as file:
        soup = BeautifulSoup(file, 'html.parser')
        soups[filename] = soup

In [24]:
# Sanity check: number of parsed documents currently held in `soups`
len(soups)

10

In [42]:
from dataclasses import dataclass
from typing import List, Optional
import re
from copy import copy

def is_subtitle(tag):
    """Tell whether ``tag`` is a subtitle element.

    A tag counts as a subtitle when it is either
      * a ``<p>`` whose class list contains ``Titre2NL``, or
      * an ``<h2>`` with an empty class list that contains no
        ``<span lang="FR">`` descendant.

    Returns a bool.
    """
    if tag.name == "p":
        return "Titre2NL" in tag.get('class', [])

    if tag.name == "h2":
        # Only class-less h2 headings qualify
        if tag.get('class', []) != []:
            return False
        # ... and only when no French-language span is found inside
        french_span = tag.find('span', attrs={'lang': 'FR'})
        return not french_span

    return False

def is_title(tag):
    """Tell whether ``tag`` is a title element.

    A tag counts as a title when it is either
      * a ``<p>`` whose class list contains ``Titre1NL`` or ``Titre1FR``, or
      * an ``<h1>`` that has a non-empty class list.

    Always returns a strict bool (never the class list itself), so the
    result is safe to compare, store or serialise.
    """
    if tag.name == "p":
        classes = tag.get('class', [])
        return "Titre1NL" in classes or "Titre1FR" in classes

    if tag.name == "h1":
        # Normalise the class list to a bool instead of leaking the list to the caller
        return bool(tag.get('class', []))

    return False

@dataclass
class Vote:
    """One vote extracted from a session document."""
    # Identifier of the session the vote belongs to (built as a zero-padded number by the caller)
    session_id: str
    # Vote number within the session; -1 means "not assigned yet"
    nr_within_session: int = -1
    # Subject text of the vote; lines are appended by the caller, empty means "none seen yet"
    subject: str = ""

# Matches the per-vote marker, e.g. "(Stemming/vote 1)", and captures the vote number.
# The separator accepts any whitespace (including the non-breaking space that the parser
# produces when it decodes "&nbsp;", and line wraps) as well as a literal "&nbsp;".
vote_marker_re = re.compile(r'\(stemming\/vote(?:\s|&nbsp;)+(\d+)\)', re.IGNORECASE)

votes: List[Vote] = []
for i, (filename, soup) in enumerate(soups.items()):
    # Session ids are the 1-based position of the document, shifted by the load offset
    session_id = f"{i+1+offset:04d}"

    naamstemmingen = soup.find_all(string=lambda text: "Naamstemmingen" in text)
    for naamstemming in naamstemmingen:
        print(f"{session_id}: {naamstemming.parent}")
        print("-"*100)

        current_vote: Vote = Vote(session_id=session_id)
        for tag in naamstemming.parent.find_all_next():
            tag_text = " ".join(tag.strings)

            if is_subtitle(tag):
                # Every subtitle between "Naamstemmingen" and the next title is treated as
                # (part of) the subject of the vote currently being assembled
                current_vote.subject += tag_text + "\n"
            elif is_title(tag):
                # The next title ends the voting section. Because we searched for the Dutch
                # heading, the French counterpart may follow immediately: skip that one.
                if tag_text.lower() == "votes nominatifs":
                    continue
                break
            else:
                # Plain text: check whether it carries a vote number
                vote_match = vote_marker_re.search(tag_text)
                if vote_match:
                    first_line = tag_text.strip().split("\n")[0]
                    print(f"Found VOTE: {first_line}")
                    number = int(vote_match.group(1))
                    # find_all_next walks every nested tag, so the same marker is seen many
                    # times (table > tr > td > ...). Ignore it only when it repeats the vote
                    # recorded last *in this same session*; an equal number from a previous
                    # session is a different vote and must not be dropped.
                    is_repeat = (
                        bool(votes)
                        and votes[-1].session_id == session_id
                        and votes[-1].nr_within_session == number
                    )
                    if not is_repeat:
                        current_vote.nr_within_session = number
                # Once both a subject and a number are known, record the vote and start a new one
                if current_vote.subject and current_vote.nr_within_session != -1:
                    votes.append(current_vote)
                    current_vote = Vote(session_id=session_id)
                    print(f"Vote {votes[-1].nr_within_session}: {votes[-1].subject}")
                    print("\n")

        print("\n\n")

0021: <span lang="FR-BE" style="mso-ansi-language:
FR-BE">Naamstemmingen</span>
----------------------------------------------------------------------------------------------------
Found VOTE: (Stemming/vote 1) 
Vote 1: 17 Aangehouden amendementen en
artikel van het wetsvoorstel tot wijziging van de wet van 10 mei 2007
ter bestrijding van discriminatie tussen vrouwen en mannen wat het discrimi­natie­verbod
op vaderschap of meemoeder­schap, borstvoeding en medisch begeleide
voortplanting betreft (nieuw opschrift) (165/1-11)



Found VOTE: (Stemming/vote 1)
Found VOTE: (Stemming/vote 1)
Found VOTE: (Stemming/vote 1)
Found VOTE: (Stemming/vote 1)
Found VOTE: (Stemming/vote 1)
Found VOTE: (Stemming/vote 2) 
Found VOTE: (Stemming/vote 2)
Found VOTE: (Stemming/vote 2)
Found VOTE: (Stemming/vote 2)
Found VOTE: (Stemming/vote 2)
Found VOTE: (Stemming/vote 2)
Vote 2: 18 Goedkeuring van de agenda






0022: <span lang="NL">Naamstemmingen</span>
--------------------------------------------------

In [31]:
# NOTE(review): `first_target_tag` is not defined in any of the cells visible above, and this
# cell's execution count is lower than the previous cell's — confirm it survives Restart & Run All.
first_target_tag