In [None]:
# Installation cell (run once in Colab / Jupyter)
# Installs python-docx if it is not already present

import sys

try:
    # Import only to probe availability; the name itself is not used in this cell.
    import docx  # noqa: F401
    print('python-docx is al geïnstalleerd')
except ModuleNotFoundError:
    print('Installeer python-docx...')
    # IPython shell escape: run pip with the same interpreter that executes this cell.
    !{sys.executable} -m pip install python-docx


In [None]:
# Cel 2 — upload klassement (.txt of .docx), converteer en download Word-resultaat (.docx)

from pathlib import Path
import re
import base64
import json as _json
from io import BytesIO

from IPython.display import Javascript, display
from ipywidgets import FileUpload, VBox, HBox, Button, Text, Label

from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn


# ----------------------------
# Parsing (zelfde idee als de HTML-versie)
# ----------------------------
# Leading rank prefix: optional whitespace, digits, a dot, optional whitespace (e.g. "1. ").
NUMBER_RE = re.compile(r"^\s*\d+\.\s*")

# Trailing "- <N> doelpunt" / "- <N> doelpunten" suffix (case-insensitive); group 1 captures N.
GOALS_RE = re.compile(r"\s*-\s*(\d+)\s+doelpunt(?:en)?\s*$", re.IGNORECASE)

def looks_like_player_stat_line(line: str) -> bool:
    """Return True when *line* looks like a data line rather than a heading.

    Deliberately simple heuristic. A line qualifies when either:
    - it contains both an opening and a closing parenthesis, or
    - it contains a dash, a standalone number and the word "doelpunt"
      (matched case-insensitively).
    """
    text = line.strip()

    if "(" in text and ")" in text:
        return True

    # All three markers must be present for the goals pattern to count.
    has_goal_markers = "-" in text and "doelpunt" in text.lower()
    return has_goal_markers and re.search(r"\b\d+\b", text) is not None

def is_section_heading(line: str) -> bool:
    """Return True when *line* should be treated as a section title.

    A heading is a non-empty line that does not start with a rank number,
    mentions "KLASSE" or "DIVISIE" (in any letter case) and is not
    recognised as a player/statistics line.
    """
    text = line.strip()
    if not text or NUMBER_RE.match(text):
        return False

    shouted = text.upper()
    mentions_league = "KLASSE" in shouted or "DIVISIE" in shouted
    return mentions_league and not looks_like_player_stat_line(text)

def strip_source_rank_number(line: str) -> str:
    """Return *line* without a leading "<digits>." rank prefix, if it has one."""
    prefix = re.match(r"^\s*\d+\.\s*", line)
    if prefix is None:
        return line
    # Drop everything the prefix pattern consumed, including surrounding whitespace.
    return line[prefix.end():]

def parse_sections(text: str):
    """Split the input text into sections of grouped lines.

    Returns a list of ``(title, groups)`` tuples, where ``groups`` is a list
    of lists of lines. Source order is preserved exactly.

    Rules:
    - A section heading is any line accepted by ``is_section_heading``.
    - A line ending in "- <N> doelpunt(en)" opens a new group whenever N
      differs from the goal count of the current group (or none is set yet).
    - Lines without such a suffix, or repeating the current goal count, are
      added to the current group (they share its list number).
    - Legacy input: a line starting with a rank number ("1.") always opens a
      new group; the number itself is stripped.
    - Blank lines are skipped; groups collected before the first heading, and
      headings without any group, produce no section.
    """
    sections = []
    title = None
    groups = []   # finished groups of the section being read
    group = []    # lines of the group being read
    goals = None  # goal count of the current group, if known

    def close_group():
        nonlocal group
        if group:
            groups.append(group)
            group = []

    def close_section():
        nonlocal groups, goals
        if title and groups:
            sections.append((title, groups))
        groups = []
        goals = None

    for raw in text.splitlines():
        line = raw.strip()
        if not line:
            continue

        if is_section_heading(line):
            close_group()
            close_section()
            title = line
            continue

        # Legacy format: an explicit rank number always starts a new group.
        if NUMBER_RE.match(line):
            close_group()
            entry = strip_source_rank_number(line)
            group = [entry]
            found = GOALS_RE.search(entry)
            goals = int(found.group(1)) if found else None
            continue

        # A goal count that differs from the current one (or the first one
        # seen) starts a new group; everything else continues the group.
        found = GOALS_RE.search(line)
        if found and int(found.group(1)) != goals:
            close_group()
            goals = int(found.group(1))
        group.append(line)

    close_group()
    close_section()
    return sections

# ----------------------------
# DOCX numbering (robuust + herstart per sectie)
# ----------------------------
def _ensure_abstract_decimal_numbering(doc: Document, bold_number: bool = True) -> int:
    """Add a single-level decimal list definition ("%1.") to the document.

    A new ``w:abstractNum`` element is always created; its id is one higher
    than the largest existing abstractNumId (or 1 when there are none).

    Args:
        doc: The python-docx document whose numbering part is extended.
        bold_number: When True, the list number itself is rendered bold.

    Returns:
        The abstractNumId of the newly created definition.
    """
    numbering = doc.part.numbering_part.numbering_definitions._numbering  # CT_Numbering

    def make(tag, **attrs):
        # Build a w:* element and set the given w:* attributes on it.
        element = OxmlElement(tag)
        for key, value in attrs.items():
            element.set(qn(f"w:{key}"), value)
        return element

    existing_ids = [
        int(node.get(qn("w:abstractNumId")))
        for node in numbering.findall(qn("w:abstractNum"))
        if node.get(qn("w:abstractNumId")) is not None
    ]
    abstract_id = (max(existing_ids) + 1) if existing_ids else 1

    abstract = make("w:abstractNum", abstractNumId=str(abstract_id))
    abstract.append(make("w:multiLevelType", val="singleLevel"))

    # Children of w:lvl are added in the order the WordprocessingML schema
    # prescribes: start, numFmt, suff, lvlText, pPr, rPr.
    lvl = make("w:lvl", ilvl="0")
    lvl.append(make("w:start", val="1"))
    lvl.append(make("w:numFmt", val="decimal"))
    lvl.append(make("w:suff", val="space"))
    lvl.append(make("w:lvlText", val="%1."))

    # Default indentation.
    ppr = make("w:pPr")
    ppr.append(make("w:ind", left="720", hanging="360"))
    lvl.append(ppr)

    if bold_number:
        rpr = make("w:rPr")
        rpr.append(make("w:b", val="1"))
        lvl.append(rpr)

    abstract.append(lvl)

    # The schema requires every w:abstractNum to precede all w:num elements
    # (and w:numIdMacAtCleanup), so insert in front of the first of those
    # instead of appending at the end.
    anchor = numbering.find(qn("w:num"))
    if anchor is None:
        anchor = numbering.find(qn("w:numIdMacAtCleanup"))
    if anchor is None:
        numbering.append(abstract)
    else:
        anchor.addprevious(abstract)

    return abstract_id

def _new_numid_starting_at_1(doc: Document, abstract_id: int) -> int:
    """Create a new list instance (``w:num``) that restarts counting at 1.

    Args:
        doc: The python-docx document whose numbering part is extended.
        abstract_id: abstractNumId of the list definition to instantiate.

    Returns:
        The numId of the new list instance; one higher than the largest
        existing numId (or 1 when there are none).
    """
    numbering = doc.part.numbering_part.numbering_definitions._numbering

    existing_ids = [
        int(node.get(qn("w:numId")))
        for node in numbering.findall(qn("w:num"))
        if node.get(qn("w:numId")) is not None
    ]
    num_id = (max(existing_ids) + 1) if existing_ids else 1

    num = OxmlElement("w:num")
    num.set(qn("w:numId"), str(num_id))

    abstract_ref = OxmlElement("w:abstractNumId")
    abstract_ref.set(qn("w:val"), str(abstract_id))
    num.append(abstract_ref)

    # startOverride forces level 0 of this instance to begin at 1 again,
    # independent of other instances of the same abstract definition.
    lvl_override = OxmlElement("w:lvlOverride")
    lvl_override.set(qn("w:ilvl"), "0")
    start_override = OxmlElement("w:startOverride")
    start_override.set(qn("w:val"), "1")
    lvl_override.append(start_override)
    num.append(lvl_override)

    # w:numIdMacAtCleanup, when present, must stay the last child of
    # w:numbering; insert the new w:num in front of it rather than after it.
    cleanup = numbering.find(qn("w:numIdMacAtCleanup"))
    if cleanup is None:
        numbering.append(num)
    else:
        cleanup.addprevious(num)

    return num_id

def _apply_numid_to_paragraph(paragraph, num_id: int, ilvl: int = 0) -> None:
    """Attach *paragraph* to the numbered list *num_id* at level *ilvl*.

    Existing ``w:numPr`` / ``w:ilvl`` / ``w:numId`` elements are reused and
    overwritten; missing ones are created.

    Args:
        paragraph: A python-docx paragraph object.
        num_id: numId of the list instance to attach to.
        ilvl: Zero-based list level.
    """
    ppr = paragraph._p.get_or_add_pPr()

    # The get_or_add_* helpers insert each child at its schema-defined
    # position (numPr before later pPr children, ilvl before numId); a plain
    # append would misplace them when sibling elements already exist.
    numpr = ppr.get_or_add_numPr()
    numpr.get_or_add_ilvl().val = int(ilvl)
    numpr.get_or_add_numId().val = int(num_id)


# ----------------------------
# Input lezen (txt/docx upload)
# ----------------------------
def extract_text_from_upload(item: dict, filename: str) -> str:
    """Return the text of an uploaded .txt or .docx file.

    Args:
        item: Upload record; ``item["content"]`` holds the raw file data as
            ``bytes`` or any other bytes-like object (e.g. ``memoryview``).
        filename: Name of the uploaded file. A name ending in ".docx"
            (case-insensitive) is read as a Word document; anything else
            (including None or "") is treated as plain text.

    Returns:
        For .docx: the non-empty, stripped paragraph lines followed by the
        non-empty, stripped lines of all table cells, joined with newlines.
        For text: the decoded content, tried as UTF-8 first (a leading BOM is
        dropped) and as cp1252 when that fails.

    Raises:
        UnicodeDecodeError: If the text is valid in neither encoding.
    """
    # Normalise to bytes so both decoding and BytesIO work for memoryview input.
    content = bytes(item["content"])

    if (filename or "").lower().endswith(".docx"):
        # Read from memory; no temporary file is left behind in the working directory.
        doc = Document(BytesIO(content))

        lines = []
        for paragraph in doc.paragraphs:
            stripped = paragraph.text.strip()
            if stripped:
                lines.append(stripped)
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    for cell_line in cell.text.splitlines():
                        stripped = cell_line.strip()
                        if stripped:
                            lines.append(stripped)
        return "\n".join(lines)

    try:
        # "utf-8-sig" equals UTF-8 but removes a leading byte-order mark, which
        # would otherwise end up as an invisible character in the first line.
        return content.decode("utf-8-sig")
    except UnicodeDecodeError:
        return content.decode("cp1252")


# ----------------------------
# Conversie naar DOCX
# ----------------------------
def klassement_text_to_docx_bytes(klassement_text: str) -> bytes:
    """Convert ranking text into a Word document and return its bytes.

    Layout:
    - Section title: upper-cased and bold.
    - Numbered list: restarts at 1 for every section.
    - Each group becomes one paragraph, i.e. one list number.
    - Lines within a group are separated by line breaks inside that paragraph.
    """
    document = Document()
    parsed = parse_sections(klassement_text)

    # One shared list definition; every section gets its own instance of it.
    list_definition_id = _ensure_abstract_decimal_numbering(document, bold_number=True)

    for heading, entries in parsed:
        document.add_paragraph().add_run(heading.upper()).bold = True

        list_id = _new_numid_starting_at_1(document, list_definition_id)

        for entry in filter(None, entries):
            numbered = document.add_paragraph()
            _apply_numid_to_paragraph(numbered, list_id, ilvl=0)

            first_line, *continuations = entry
            numbered.add_run(first_line)
            for continuation in continuations:
                # An empty run carrying only the break keeps the text in the same paragraph.
                numbered.add_run().add_break()
                numbered.add_run(continuation)

    buffer = BytesIO()
    document.save(buffer)
    return buffer.getvalue()


# ----------------------------
# Download helper (DOCX)
# ----------------------------
def trigger_download_docx(path: Path):
    """Start a browser download of the file at *path*.

    The file is read, base64-encoded into a data URI and handed to a
    temporary anchor element that is clicked from JavaScript. The download
    uses the file's own name.
    """
    payload = base64.b64encode(path.read_bytes()).decode("ascii")
    # JSON-encode the name so it is a valid, quoted JavaScript string literal.
    quoted_name = _json.dumps(path.name)
    script = f"""
    (function() {{
      var a = document.createElement('a');
      a.href = 'data:application/vnd.openxmlformats-officedocument.wordprocessingml.document;base64,{payload}';
      a.download = {quoted_name};
      document.body.appendChild(a);
      a.click();
      document.body.removeChild(a);
    }})();
    """
    display(Javascript(script))


# ----------------------------
# UI
# ----------------------------
# Single-file upload restricted to .txt and .docx.
klassement_upload = FileUpload(accept=".txt,.docx", multiple=False, description="Upload (.txt/.docx)")
# Name of the generated Word file; also used as the download file name.
out_name = Text(value="klassement_output.docx", description="Output-bestand:")
# Starts the conversion (handled by on_run_clicked).
run_button = Button(description="Converteer", button_style="primary")
# Shows progress and error messages to the user.
status_label = Label(value="")

def on_run_clicked(_):
    """Handle a click on the convert button.

    Reads the uploaded file, converts it to a Word document, writes the
    result to the chosen output file and triggers a browser download.
    Every failure is reported through ``status_label`` instead of raising,
    because exceptions raised inside a widget callback are easy to miss.

    Args:
        _: The clicked button (unused).
    """
    uploads = klassement_upload.value
    if not uploads:
        status_label.value = "Upload eerst een klassementbestand (.txt of .docx)."
        return

    try:
        if isinstance(uploads, dict):
            # ipywidgets 7: {filename: {"metadata": ..., "content": bytes}}
            filename, item = next(iter(uploads.items()))
        else:
            # ipywidgets 8: sequence of records with "name" and "content" keys
            item = uploads[0]
            filename = item["name"]
        # ipywidgets 8 delivers the content as a memoryview; hand plain bytes on.
        item = {**item, "content": bytes(item["content"])}
        text = extract_text_from_upload(item, filename)
    except Exception as e:
        status_label.value = f"Fout bij inlezen bestand: {e}"
        return

    try:
        docx_bytes = klassement_text_to_docx_bytes(text)
    except Exception as e:
        status_label.value = f"Fout bij converteren: {e}"
        return

    out_path = Path((out_name.value or "").strip() or "klassement_output.docx")
    if out_path.suffix.lower() != ".docx":
        # Keep the entered name but make sure the result opens as a Word file.
        out_path = out_path.with_name(out_path.name + ".docx")

    try:
        out_path.write_bytes(docx_bytes)
    except OSError as e:
        status_label.value = f"Fout bij opslaan: {e}"
        return

    status_label.value = f"Gereed: {out_path.name}"
    trigger_download_docx(out_path)

# Run the conversion whenever the button is clicked.
run_button.on_click(on_run_clicked)

# Render the controls stacked vertically: upload, output name, button row, status.
display(VBox([
    klassement_upload,
    out_name,
    HBox([run_button]),
    status_label
]))


## Gebruik

1. Run **cel 1** één keer om `python-docx` te installeren.
2. Run **cel 2** om de tool te laden.
3. Upload een `.txt` of `.docx` klassement.
4. Klik op **Converteer** om het Word-bestand (`.docx`) te downloaden.
