# Cue Print tags → Word (.docx)

Dit Colab-notebook zet een `.txt` met tags om naar een Word-document (`.docx`).

**Regels (zoals afgesproken):**
- `<subhead_lead>` (divisie/klasse) komt **1x** als kop (BOLD + UPPERCASE).
- Daarna volgen alle wedstrijden (`<subhead>`) als **1 alinea per wedstrijd**.
- Facts (`<howto_facts>`) worden **altijd italic** als er tekst is, op een nieuwe regel met **Shift+Enter** (soft line break).
- Lege `<howto_facts>`: **overslaan**.
- Tussen competitieblokken: **1 lege alinea**.
- De uitvoer toont alleen **technische tellingen** (de inhoud zelf wordt niet gelogd).


In [None]:
# Install the python-docx package (needed for the `docx` imports used when
# building the Word document); -q keeps pip's output quiet.
!pip -q install python-docx

## 1) Upload je bronbestand (.txt)

In [None]:
from google.colab import files

# Ask the user for a file through Colab's upload helper.
uploaded = files.upload()
if not uploaded:
    # The upload call returned nothing usable, so there is no input to convert.
    raise ValueError("Geen bestand geüpload.")

# Use the first key of the upload result as the input path; any additional
# uploaded files are ignored.
# NOTE(review): presumably the keys are the uploaded file names — confirm
# against the google.colab documentation.
INPUT_PATH = next(iter(uploaded.keys()))
# Confirmation only: neither the file name nor its contents are printed.
print("Inputbestand ontvangen.")


## 2) Parse: tags → tokens → items

In [None]:
import re
from dataclasses import dataclass
from typing import List, Optional

# Matches one <subhead_lead>, <subhead> or <howto_facts> element.
# Group 1 is the tag name and group 2 the (non-greedy) inner text; the closing
# tag must repeat the opening tag's name through the \1 backreference.
# DOTALL lets the inner text span several lines; IGNORECASE accepts any casing
# of the tag names.
TAG_PATTERN = re.compile(r"<(subhead_lead|subhead|howto_facts)>(.*?)</\1>", re.DOTALL | re.IGNORECASE)

@dataclass
class Token:
    """One tagged element found in the input text."""

    # Tag name; extract_tokens stores it lower-cased.
    kind: str
    # Text between the opening and closing tag; extract_tokens stores it with
    # line endings normalised to "\n" and surrounding whitespace stripped.
    text: str

def extract_tokens(raw: str) -> List[Token]:
    """Collect every recognised tag in ``raw`` as a Token, in order of appearance.

    Args:
        raw: Full text of the input file.

    Returns:
        One Token per ``TAG_PATTERN`` match. ``kind`` is the lower-cased tag
        name; ``text`` is the tag's inner text with CRLF / lone CR line endings
        converted to LF and surrounding whitespace removed. Text outside the
        recognised tags is ignored.
    """

    def _normalise(body):
        # CRLF is replaced before lone CR so a Windows line ending becomes a
        # single "\n" instead of two.
        unified = (body or "").replace("\r\n", "\n").replace("\r", "\n")
        return unified.strip()

    return [
        Token(kind=match.group(1).lower().strip(), text=_normalise(match.group(2)))
        for match in TAG_PATTERN.finditer(raw)
    ]

@dataclass
class Item:
    """One output entry: a subhead together with the header it falls under."""

    # Text of the most recent <subhead_lead> seen before this subhead.
    header: str
    # Text of the <subhead> itself.
    subhead: str
    # Text of the <howto_facts> directly following the subhead, when non-empty.
    facts: Optional[str]  # None = no facts / empty

def tokens_to_items(tokens: List[Token]) -> List[Item]:
    """Group a flat token stream into header / subhead / facts items.

    Rules applied while walking the tokens in order:
      * a ``subhead_lead`` token sets the header for all following subheads;
      * a ``subhead`` token becomes one Item, unless no non-empty header has
        been seen yet, in which case it is dropped;
      * a ``howto_facts`` token is used only when it directly follows a kept
        subhead; blank facts are stored as ``None``;
      * every other token is ignored.

    Args:
        tokens: Tokens in document order.

    Returns:
        The resulting items, in document order.
    """
    result: List[Item] = []
    active_header: Optional[str] = None
    # True when the upcoming token was already consumed as the facts of the
    # previous subhead and must not be looked at again.
    skip_next = False

    for pos, tok in enumerate(tokens):
        if skip_next:
            skip_next = False
            continue

        if tok.kind == "subhead_lead":
            active_header = tok.text
            continue

        # Only subheads that fall under a non-empty header produce an item.
        if tok.kind != "subhead" or not active_header:
            continue

        follower = tokens[pos + 1] if pos + 1 < len(tokens) else None
        facts: Optional[str] = None
        if follower is not None and follower.kind == "howto_facts":
            # The facts token belongs to this subhead even when it is blank.
            skip_next = True
            facts = (follower.text or "").strip() or None

        result.append(Item(header=active_header, subhead=tok.text, facts=facts))

    return result


## 3) Bouw Word-document (.docx)

In [None]:
from docx import Document
from docx.enum.text import WD_BREAK
from typing import Dict

def build_docx(items: List[Item], output_path: str) -> Dict[str, int]:
    """Write ``items`` to a Word document and return technical counters.

    Layout:
      * whenever the (stripped, non-empty) header differs from the previous
        one, the header is written once as a bold, upper-cased paragraph;
        every header block after the first is preceded by one empty paragraph;
      * each item is one paragraph containing the subhead text;
      * non-empty facts are appended to that paragraph in italics after a
        line break, so they start on a new line within the same paragraph.

    Args:
        items: Items in output order.
        output_path: Path the .docx file is saved to.

    Returns:
        Counters keyed ``headers_total``, ``items_total``,
        ``items_with_facts``, ``empty_facts_skipped`` and
        ``block_separators_added``.
    """
    document = Document()
    counts: Dict[str, int] = dict.fromkeys(
        (
            "headers_total",
            "items_total",
            "items_with_facts",
            "empty_facts_skipped",
            "block_separators_added",
        ),
        0,
    )
    last_header: Optional[str] = None

    for entry in items:
        heading = (entry.header or "").strip()
        starts_new_block = bool(heading) and heading != last_header

        if starts_new_block:
            # No separator before the very first header of the document.
            if last_header is not None:
                document.add_paragraph("")
                counts["block_separators_added"] += 1

            heading_run = document.add_paragraph().add_run(heading.upper())
            heading_run.bold = True
            counts["headers_total"] += 1
            last_header = heading

        paragraph = document.add_paragraph()
        subhead_run = paragraph.add_run((entry.subhead or "").strip())
        counts["items_total"] += 1

        facts_text = (entry.facts or "").strip()
        if not facts_text:
            counts["empty_facts_skipped"] += 1
            continue

        # Line break (Shift+Enter) keeps the facts in the same paragraph.
        subhead_run.add_break(WD_BREAK.LINE)
        paragraph.add_run(facts_text).italic = True
        counts["items_with_facts"] += 1

    document.save(output_path)
    return counts


## 4) Run + download

In [None]:
from google.colab import files
from collections import Counter
from datetime import datetime

# Read the uploaded file as UTF-8; undecodable bytes are replaced instead of
# raising, so a stray byte does not abort the run.
with open(INPUT_PATH, "r", encoding="utf-8", errors="replace") as f:
    raw = f.read()

# Pipeline: raw text -> tag tokens -> header/subhead/facts items.
tokens = extract_tokens(raw)
items = tokens_to_items(tokens)

# The output name starts with today's date (YYYYMMDD) taken from the
# runtime's clock.
date = format(datetime.now(), "%Y%m%d")
OUTPUT_PATH = f"{date}_cue_word_uitslagen_amateurs.docx"

stats = build_docx(items, OUTPUT_PATH)

# Report counts only; no document content is printed.
tag_counts = dict(Counter(token.kind for token in tokens))
print(
    "Klaar.",
    f"- Tokens gevonden: {len(tokens)}",
    f"- Tag verdeling: {tag_counts}",
    f"- Koppen geplaatst: {stats['headers_total']}",
    f"- Items gemaakt: {stats['items_total']}",
    f"- Items met facts: {stats['items_with_facts']}",
    f"- Items zonder facts (overgeslagen): {stats['empty_facts_skipped']}",
    f"- Lege alinea's tussen blokken: {stats['block_separators_added']}",
    f"- Outputbestand: {OUTPUT_PATH}",
    sep="\n",
)

# Hand the generated document to the browser for download.
files.download(OUTPUT_PATH)
