In [None]:
import os
import warnings

from bs4 import BeautifulSoup

# Directory holding the session reports and the window of files to parse.
html_dir = "html"
offset = 0
amount = 309

# Parsed documents, keyed by file name
soups = {}

# Take the requested window out of the alphabetically sorted directory listing.
# NOTE: the window is cut before the ".html" filter below, so non-HTML entries
# in the directory count towards `amount`.
selected_files = sorted(os.listdir(html_dir))[offset:offset+amount]
for filename in selected_files:
    # Anything that is not an HTML file is ignored
    if not filename.endswith('.html'):
        continue
    file_path = os.path.join(html_dir, filename)
    # The files are decoded as latin-1 and parsed with the built-in html.parser
    with open(file_path, 'r', encoding="latin-1") as file:
        soup = BeautifulSoup(file, 'html.parser')
    soups[filename] = soup

In [None]:
# Number of HTML documents that were parsed into `soups`
len(soups)

# Capture the vote subjects

In [None]:
# Show only the first few file names: displaying `soups` itself renders every
# parsed document in full and floods the notebook output.
list(soups)[:5]

In [None]:
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
import re
from copy import copy

def is_subject(tag):
    """Tell whether ``tag`` is formatted as a (Dutch) vote subject line.

    Two layouts qualify:
    * a ``<p>`` whose class list contains ``Titre2NL``;
    * an ``<h2>`` without any class that holds no ``<span lang="FR">``.

    Returns a bool.
    """
    if tag.name == "p":
        return "Titre2NL" in tag.get('class', [])
    if tag.name == "h2":
        has_no_class = tag.get('class', []) == []
        # An h2 carrying a French span is not treated as a subject
        return has_no_class and not tag.find('span', attrs={'lang': 'FR'})
    return False

def is_section(tag):
    """Tell whether ``tag`` is a top-level section title.

    Two layouts qualify:
    * a ``<p>`` whose class list contains ``Titre1NL`` or ``Titre1FR``;
    * an ``<h1>`` that carries at least one class.

    Always returns a real ``bool``. The ``h1`` case is wrapped in ``bool()``
    so the predicate never hands the class list itself back to the caller.
    """
    if tag.name == "p":
        css_classes = tag.get('class', [])
        return "Titre1NL" in css_classes or "Titre1FR" in css_classes
    if tag.name == "h1":
        return bool(tag.get('class', []))
    return False

def is_votecounts_table(tag):
    """Return True when ``tag`` is a ``<table>`` with exactly 11 children.

    ``len(tag)`` is only evaluated for tables.
    NOTE(review): 11 presumably corresponds to the rows of a vote-count table
    interleaved with whitespace strings - confirm against the source HTML.
    """
    if tag.name != "table":
        return False
    return len(tag) == 11

def parse_votes_table(tag, debug=True):
    """Parse a vote-count table into a ``{"<label>/<last cell>": count}`` dict.

    Args:
        tag: element exposing ``find_all('tr')``; each row exposes
            ``find_all('td')`` and each cell ``get_text(strip=True)``
            (typically the tag for which ``is_votecounts_table`` was True).
        debug: when True, warn about rows that are not three cells starting
            with one of ``Ja``, ``Nee``, ``Totaal`` or ``Onthoudingen``.

    Returns:
        dict keyed by ``"<first cell>/<last cell>"``. The value is the second
        cell as an ``int``, or the raw text when it is not an integer.

    Warns:
        UserWarning (through ``warnings.warn``) for suspicious rows, rows that
        are too short to parse, and non-integer counts.
    """
    expected_labels = ['Ja', "Nee", "Totaal", "Onthoudingen"]
    parsed_data = {}

    for i, row in enumerate(tag.find_all('tr')):
        if i == 0:
            continue  # skip the first row which states the vote id within the subject

        # Stripped text of every cell in this row
        row_data = [cell.get_text(strip=True) for cell in row.find_all('td')]

        if len(row_data) < 2:
            # Neither a key nor a count can be read from such a row; skip it
            # instead of failing with an IndexError.
            warnings.warn(f"Skipping vote-count row with {len(row_data)} cell(s), expected 3")
            continue

        if debug:   # todo make check an assertion once all cases covered by is_votecounts_table
            check = (len(row_data) == 3) and (row_data[0] in expected_labels)
            if not check:
                warnings.warn("Possibly invalid table passed is_votecounts_table")

        key = '/'.join([row_data[0], row_data[-1]])
        try:
            parsed_data[key] = int(row_data[1])
        except ValueError:
            # Keep the raw text so the odd value stays visible downstream
            warnings.warn("Non-int votecount, possibly invalid format passing is_votecounts_table")
            parsed_data[key] = row_data[1]
    return parsed_data


@dataclass
class Vote:
    """One roll-call vote found in a session document."""

    # Identifier of the session the vote belongs to
    session_id: str
    # Vote number inside the session; -1 while the number is still unknown
    nr_within_session: int = -1
    # Subject text, accumulated line by line
    subject: str = ""
    # Parsed count tables (filled by the parsing loop, keyed by vote number)
    summary_ynd: dict = field(default_factory=dict)
    # Names per ballot choice (filled from the vote details section)
    yay: List[str] = field(default_factory=list)
    nay: List[str] = field(default_factory=list)
    dunno: List[str] = field(default_factory=list)

# Registry of finished votes, keyed by (session_id, nr_within_session)
votes: Dict[Tuple[str, int], Vote] = {}
def close_vote_and_start_new(Vote, votes, session_id, current_vote, tag_text):
    """Store ``current_vote`` when it is complete and return a fresh vote.

    A vote counts as complete when it has a non-empty subject and a vote
    number other than -1. A shallow copy is then stored in ``votes`` under
    ``(session_id, nr_within_session)``.

    Args:
        Vote: class (or factory) used to build the replacement vote.
        votes: registry that receives the finished vote (mutated in place).
        session_id: identifier of the session being parsed.
        current_vote: the vote collected so far.
        tag_text: first subject line of the replacement vote.

    Returns:
        A new ``Vote`` for ``session_id`` whose subject is ``tag_text`` plus a newline.
    """
    is_complete = current_vote.subject and current_vote.nr_within_session != -1
    if is_complete:
        registry_key = (session_id, current_vote.nr_within_session)
        votes[registry_key] = copy(current_vote)
    return Vote(session_id=session_id, subject=tag_text + "\n")


# Walk every parsed document and collect its votes into `votes`.
for i, (filename, soup) in enumerate(soups.items()):
    # Find the sections in the html file (contained in soup) which contain "naamstemmingen" (case insensitive) as a h1-header or the Titre1Nl-format:
    naamstemmingen = soup.find_all(
        lambda tag: ((tag.name == "p" and tag.get("class") == ["Titre1NL"]) or (tag.name == "h1")) and "naamstemmingen" in tag.text.lower().strip()
        )
    for naamstemming in naamstemmingen:
        # Session id is the 1-based position of the document, shifted by `offset`
        session_id = f"{i+1+offset:04d}"
        print(f"{session_id}: {naamstemming}")
        print("-"*100)

        current_vote: Vote = Vote(session_id=session_id)
        last_tag_was_subject = False
        # Pre-bind so the final close below never sees an unbound (or stale,
        # from a previous header) name when this header has no siblings at all.
        tag_text = ""
        # .next_siblings looks at the tag ahead of naamstemming in the corresponding soup.
        for j, tag in enumerate(naamstemming.next_siblings):
            tag_text = " ".join(tag.strings).lower()

            # Next siblings also produces a nonsense string (like \n) after every real tag
            # So skip all even next tags, which will be nonsense (but annoyingly, a different nonsense every document)
            # NOTE(review): this relies on strings and tags strictly alternating - confirm for all documents.
            if j % 2 == 0:
                continue

            # A bold (titre2) sentence in dutch signifies a subject of a vote. It can be a new subject
            # or a new line in the current subject. Depends on whether the previous sentence was also a subject
            # or not
            if is_subject(tag):
                # Every subtitle in between "Naamstemmingen" and the next H1 title is considered part of a vote
                # If the previous tag was a subject, this new line is part of the previous one
                print(f"\nFound SUBTITLE: {tag_text}")
                if last_tag_was_subject:
                    current_vote.subject += tag_text + "\n"
                else:
                    # If not: consider this to be a new vote, close off the old one and reset
                    current_vote = close_vote_and_start_new(Vote, votes, session_id, current_vote, tag_text)
                last_tag_was_subject = True
            elif is_section(tag):
                # If we hit the next Title, the voting section is done
                # This should never happen right after a subtitle, otherwise we won't have the vote nr anyway
                # However, we're searching for "naamstemmingen" so it might happen the french version is next immediately below
                # it's also a title, so skip that
                if "votes nominatifs" in tag_text:
                    last_tag_was_subject = False
                    continue
                # If this is truly a next title, close off the current vote and break the loop
                current_vote = close_vote_and_start_new(Vote, votes, session_id, current_vote, tag_text)
                break

            else:
                # In this case it's normal text
                # First, check if it contains the vote nr (aka we're in between vote headers)
                vote_match = re.search(r'\(stemming\/vote(?:\s|&nbsp;)* *\d+\)', tag_text, re.IGNORECASE)
                if vote_match:
                    # Extract the matched text
                    matched_text = vote_match.group()
                    print(f"Found VOTE: {matched_text}")
                    # Extract the number, assuming it's the last part of the matched string
                    number = int(re.search(r'\d+', matched_text).group())
                    # Set the current vote to that number
                    current_vote.nr_within_session = number

                    # todo turn check into assert once cases covered
                    if not is_votecounts_table(tag):  # matching stemming/vote should only find tables:
                        # warnings.warn actually reports the problem; a bare Warning(...) only builds an object
                        warnings.warn(f"Tag containing stemming/vote is not a votecountstable: {tag}")
                    # If we encounter a table with counted votes, we should not yet have a filled-in voting table in the current vote.
                    print ("[debug] number in session and parsed tables present: ", number, len(current_vote.summary_ynd))
                    current_vote.summary_ynd[number] = parse_votes_table(tag)

                last_tag_was_subject = False

        # End of the siblings: store the vote that is still open, if it is complete.
        # (After a `break` above the open vote has no number yet, so nothing is stored twice.)
        close_vote_and_start_new(Vote, votes, session_id, current_vote, tag_text)

In [None]:
def is_votecounts_table(tag):
    """Return True when ``tag`` is a ``<table>`` with exactly 11 children.

    NOTE: this re-defines the function of the same name from the earlier cell
    with the same logic. ``len(tag)`` is only evaluated for tables.
    """
    if tag.name != "table":
        return False
    return len(tag) == 11

def parse_votes_table(table):
    """Strictly parse a vote-count table into ``{"<label>/<last cell>": int}``.

    NOTE: this re-defines ``parse_votes_table`` from the earlier cell, without
    the ``debug`` parameter and without the lenient fallbacks.

    Args:
        table: element exposing ``find_all('tr')`` (the tag for which
            ``is_votecounts_table`` returned True).

    Returns:
        dict mapping ``"<first cell>/<last cell>"`` to the integer in the second cell.

    Raises:
        AssertionError: a data row is not three cells starting with
            ``Ja``, ``Nee``, ``Totaal`` or ``Onthoudingen``.
        ValueError: the second cell is not an integer.
    """
    counts_by_label = {}

    rows = iter(table.find_all('tr'))
    # The first row only states the vote id within the subject
    next(rows, None)

    for row in rows:
        # Stripped text of every cell in the row
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        assert (len(cells) == 3) and (cells[0] in ['Ja', "Nee", "Totaal", "Onthoudingen"])
        count = int(cells[1])
        counts_by_label[f"{cells[0]}/{cells[-1]}"] = count
    return counts_by_label


# Debug run: print the parsed vote-count tables of a single document (the 4th entry of `soups`).
# NOTE: this cell rebinds notebook-level names (`i`, `filename`, `soup`, `session_id`,
# `current_vote`, `tag_text`, ...) that other cells also use.
for i, (filename, soup) in enumerate(list(soups.items())[3:4]):
    # Find the sections in the html file (contained in soup) which contain "naamstemmingen" (case insensitive) as a h1-header or the Titre1Nl-format:
    naamstemmingen = soup.find_all(
        lambda tag: ((tag.name == "p" and tag.get("class") == ["Titre1NL"]) or (tag.name == "h1")) and "naamstemmingen" in tag.text.lower().strip()
        )
    for naamstemming in naamstemmingen:
        # `i` restarts at 0 for the slice, so this id is not the document's position in `soups`
        session_id = f"{i+1+offset:04d}"
        print(f"{session_id}: {naamstemming}")
        print("-"*100)

        # Assigned but not read anywhere else in this cell
        current_vote: Vote = Vote(session_id=session_id)
        last_tag_was_subject = False
        # .next_siblings looks at the tag ahead of naamstemming in the corresponding soup.
        for j, tag in enumerate(naamstemming.next_siblings):
            tag_text = " ".join(tag.strings).lower()

            # print(tag_text)
            # Even positions are skipped (same convention as the main parsing loop)
            if j % 2 == 0:
                continue

            # Only tags recognised as vote-count tables are parsed and printed
            if is_votecounts_table(tag):
                print(parse_votes_table(tag))


# Capture the date of the session

In [None]:
# Look for the first "<day> <Dutch month> <year>" date in the text of `soup`.
# NOTE(review): `soup` is whatever document an earlier loop left bound - confirm which one is intended.
# The separator is plain `\s*`: the former class `[\s\\n]` also accepted a literal
# backslash and the letter "n" between the date parts.
date_match = re.search(
    r"([1-9]|[1-2][0-9]|3[0-1])\s*(januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december)\s*(\d{4})",
    "\n".join(list(soup.strings)),
    re.IGNORECASE
)
# Show None instead of raising AttributeError when the document holds no date
date_match.group().replace("\n", "") if date_match else None

In [None]:
# Print the first "<day> <Dutch month> <year>" date found in every parsed document.
for soup in soups.values():
    # The separator is plain `\s*`: the former class `[\s\\n]` also accepted a literal
    # backslash and the letter "n" between the date parts.
    date_match = re.search(
        r"([1-9]|[1-2][0-9]|3[0-1])\s*(januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december)\s*(\d{4})",
        "\n".join(list(soup.strings)),
        re.IGNORECASE
    )
    # A document without a recognisable date must not abort the whole loop
    print(date_match.group() if date_match else "<no session date found>")
    print("\n")

# Capture the names from the vote details

In [None]:
import logging

from common.text_corrections import fix_name


# Every bilingual ballot label mapped to the Vote attribute that stores its voters
BALLOT_FIELDS = {
    "ja": "yay", "oui": "yay",
    "nee": "nay", "non": "nay",
    "abstentions": "dunno", "onthoudingen": "dunno",
}

# Attach the voter names from each document's details section to the votes parsed earlier.
for i, (filename, soup) in enumerate(soups.items()):
    full_text = soup.text
    # Same id scheme as the parsing loop: 1-based document position shifted by `offset`
    session_id = f"{i+1+offset:04d}"

    # Look for a line in the HTML where "details" and "naamstemmingen" occur, below which is listed who voted Y/N/eh
    vote_details = soup.find_all(lambda tag: ((tag.name == "p" and tag.get("class") == ["Titre1NL"]) or (tag.name == "h1")) and "detail" in tag.text.lower() and "naamstemmingen" in tag.text.lower())
    if vote_details:
        if len(vote_details) > 1:
            logging.warning(f"Found more than 1 occurence of Details of Naamstemmingen in document nr {session_id}")
            # With several candidates the last one is used
            vote_details = vote_details[-1]
        else:
            vote_details = vote_details[0]

        # Keep only the text from the details header onwards
        naamstemming_index = full_text.find(vote_details.text)
        detailed_text = full_text[naamstemming_index:]

        # Yields [preamble, number, body, number, body, ...]
        split_on_vote = re.split(r"Vote[\s| ]*nominatif[\s| ]*-[\s| ]*Naamstemming:[\s| ]*(\d*)", detailed_text)

        # Dedicated index name: the outer loop index `i` is no longer shadowed
        for part_idx in range(1, len(split_on_vote), 2):
            raw_number = split_on_vote[part_idx]
            if not raw_number:
                # `(\d*)` may match nothing; int('') would abort the whole cell
                logging.warning(f"Vote header without a number in document nr {session_id}, skipping it")
                continue
            nr_within_session = int(raw_number)
            print(nr_within_session)
            vote_body = split_on_vote[part_idx+1].strip()

            # Yields [preamble, label, count, names, label, count, names, ...]
            split_body = re.split(r"(Oui|Ja|Non|Nee|Abstentions|Onthoudingen)[\s| ]*(\d*)[\s| ]*(?:Oui|Ja|Non|Nee|Abstentions|Onthoudingen)", vote_body)

            for j in range(1, len(split_body), 3):
                vote = split_body[j].lower()
                amount_of_votes = split_body[j+1]
                # Comma-separated names; collapse inner whitespace, then normalise each name
                names = split_body[j+2].replace("\n", " ")
                names = [n.strip() for n in names.split(",") if n.strip()]
                names = [' '.join(n.split()) for n in names]
                names = [fix_name(name, swap_first_last_name=False) for name in names]

                print(f"{amount_of_votes} votes {vote}: {names}")

                target_field = BALLOT_FIELDS.get(vote)
                if target_field is None:
                    raise ValueError(f"Vote is not a valid value: {vote} for session: {session_id}")
                # Names are only attached to votes the parsing loop registered
                if (session_id, nr_within_session) in votes:
                    setattr(votes[(session_id, nr_within_session)], target_field, names)
        print("\n")
        

In [None]:
# Parsed votes, keyed by (session_id, nr_within_session)
votes

# Check the vote counts against the detailed naamstemmingen (WIP nnn)
The summarized vote counts and the per-member vote details should match.

In [None]:
# todo adapt from The Mighty PyCoder AKA Big Sonck's code to extract summarized voting data


# Some initial stats

In [None]:
import json

# Member metadata (name, party, ...) used to link voters to parties.
# JSON is UTF-8 by specification; the encoding is stated explicitly so accented
# names do not depend on the platform's default encoding.
with open("../../members.json", "r", encoding="utf-8") as fp:
    name_metadata = json.load(fp)
name_metadata

In [None]:
# `typing.Counter` is only a deprecated alias; the real class lives in `collections`
from collections import Counter, defaultdict

from common.text_corrections import fix_name


# Resolve every member's normalised name once, instead of rescanning (and re-normalising)
# the whole metadata list for every single voter.
# NOTE(review): assumes fix_name returns the same result for the same input - confirm.
parties_by_name = defaultdict(list)
for _member in name_metadata:
    parties_by_name[fix_name(_member['name'], swap_first_last_name=True)].append(_member['party'])

# How often each member (labelled with the matching parties) voted in favour
all_names = []
for vote in votes.values():
    all_names.extend(f"{name}: {parties_by_name.get(name, [])}" for name in vote.yay)

Counter(all_names)

In [None]:
# `typing.Counter` is only a deprecated alias; the real class lives in `collections`
from collections import Counter, defaultdict

from common.text_corrections import fix_name


# Resolve every member's normalised name once, instead of rescanning (and re-normalising)
# the whole metadata list for every single voter.
# NOTE(review): assumes fix_name returns the same result for the same input - confirm.
parties_by_name = defaultdict(list)
for _member in name_metadata:
    parties_by_name[fix_name(_member['name'], swap_first_last_name=True)].append(_member['party'])

# How often each member (labelled with the matching parties) voted against
all_names = []
for vote in votes.values():
    all_names.extend(f"{name}: {parties_by_name.get(name, [])}" for name in vote.nay)

Counter(all_names)

In [None]:
# `typing.Counter` is only a deprecated alias; the real class lives in `collections`
from collections import Counter, defaultdict

from common.text_corrections import fix_name


# Resolve every member's normalised name once, instead of rescanning (and re-normalising)
# the whole metadata list for every single voter.
# NOTE(review): assumes fix_name returns the same result for the same input - confirm.
parties_by_name = defaultdict(list)
for _member in name_metadata:
    parties_by_name[fix_name(_member['name'], swap_first_last_name=True)].append(_member['party'])

# How often each member (labelled with the matching parties) abstained
all_names = []
for vote in votes.values():
    all_names.extend(f"{name}: {parties_by_name.get(name, [])}" for name in vote.dunno)

Counter(all_names)

In [None]:
# Get all unique parties
from common.text_corrections import fix_name, fix_party


# Normalised member name -> normalised party; a later entry with the same name wins
name_to_party = {
    fix_name(member["name"]): fix_party(member["party"])
    for member in name_metadata
}

# The distinct parties that occur in the metadata
set(name_to_party.values())


In [None]:
def fix_name(name):
    """Normalise a member name so that spelling variants compare equal.

    Steps: apply the known correction "Flahaux André" -> "Flahaut André",
    drop dots and apostrophes, then transliterate to plain ASCII.

    NOTE: this definition shadows the ``fix_name`` imported from
    ``common.text_corrections`` in other cells and does not accept
    ``swap_first_last_name``.

    Args:
        name: the name as found in the source text.

    Returns:
        The normalised name.
    """
    # `unidecode` was used here without ever being imported. Import it locally and
    # fall back to a standard-library approximation when the package is missing.
    try:
        from unidecode import unidecode
    except ImportError:
        import unicodedata

        def unidecode(text):
            # Decompose accented characters and drop everything that is not ASCII
            return unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")

    name = name.replace("Flahaux André", "Flahaut André")
    name = name.replace(".", "")
    name = name.replace("'", "")
    name = unidecode(name)
    return name

In [None]:
# Mapping from person ID to party ID
from collections import defaultdict

# Imported again on purpose: an earlier cell defines a one-argument `fix_name` that
# would otherwise shadow this helper and reject `swap_first_last_name`.
from common.text_corrections import fix_name

# Whether abstentions count as dissenting votes
INCLUDE_DUNNO = False

person_to_party = {fix_name(m['name'], swap_first_last_name=True): m["party"] for m in name_metadata}
parties = set(person_to_party.values())

dissenting_party_votes = {party: 0 for party in parties}   # non-majority votes per party
party_votes = {party: 0 for party in parties}              # all recorded votes per party
lone_wolfs = defaultdict(int)                              # times a member dissented alone
dissenters = defaultdict(int)                              # times a member dissented at all

# For each vote, find the majority per party
for vote in votes.values():
    for party in parties:
        # A voter missing from the metadata raises KeyError here (kept strict on purpose)
        yay = [n for n in vote.yay if n and person_to_party[n] == party]
        nay = [n for n in vote.nay if n and person_to_party[n] == party]
        dunno = [n for n in vote.dunno if n and person_to_party[n] == party]

        votecounts = [len(yay), len(nay), len(dunno)]
        party_votes[party] += sum(votecounts)
        if votecounts.count(0) == 2:
            # No dissenting vote, skipping
            continue
        # This is where it gets interesting, at least one person voted against their party!
        # print(party, votecounts, vote.subject)
        # Let's keep track of which people dissent and in which party they are
        majority_idx = votecounts.index(max(votecounts))

        vote_sets_to_count = [yay, nay]
        if INCLUDE_DUNNO:
            vote_sets_to_count.append(dunno)

        for i, votelist in enumerate(vote_sets_to_count):
            if i == majority_idx:
                continue
            # We're adding the amount of non-majority votes to the dissenting party votes here
            dissenting_party_votes[party] += len(votelist)
            if len(votelist) == 1:
                lone_wolfs[votelist[0]] += 1
            for name in votelist:
                dissenters[name] += 1

print("Dissenting Parties:")
print("=" * 25)
print(dissenting_party_votes)
for party in parties:
    total_party_votes = party_votes[party]
    if total_party_votes == 0:
        # A party without any recorded vote would otherwise raise ZeroDivisionError
        print(f"{party}: n/a (no recorded votes)")
        continue
    percentage_dissenting = round(100 * (dissenting_party_votes[party] / total_party_votes), 2)
    print(f"{party}: {percentage_dissenting}%")
print("\n")

print("Dissenters:")
print("=" * 25)
for name, amount in sorted(dissenters.items(), key=lambda x: x[1], reverse=True):
    print(f"{name} ({person_to_party[name]}): {amount}")
print("\n")

print("Lone Wolfs (they were alone in their dissenting vote):")
print("=" * 25)
for name, amount in sorted(lone_wolfs.items(), key=lambda x: x[1], reverse=True):
    print(f"{name} ({person_to_party[name]}): {amount}")

In [None]:
# Total number of recorded votes (yay + nay + dunno) per party
party_votes

In [None]:
# Number of votes per party cast outside the party's majority choice
# (abstentions are only counted when INCLUDE_DUNNO is set)
dissenting_party_votes