# Grundtvig Manuscript Dating & Page Extraction – Final Script

This notebook implements the **final, consolidated pipeline** for:

- parsing all XML files in the *Registranten*,
- extracting a suggested dating year (only years in the range 1798–1872 are accepted),
- extracting page and leaf counts (with support for ranges such as `1–10`),
- applying a fallback that derives the page count from the leaf count (pages = 2 × leaves) when no page count is recorded,
- extracting bibliographic numbers and fascicle numbers,
- mapping fascicle numbers to 14 archival categories,
- exporting the result to Excel for further analysis.


In [None]:
import os
import re
import logging
import xml.etree.ElementTree as ET
from pathlib import Path

import pandas as pd
from tqdm import tqdm
from openpyxl.styles import Alignment

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# EDIT THESE PATHS AS NEEDED:
# XML_FOLDER is the default directory that run_pipeline() scans for *.xml files
# (direct children only; sub-folders are not traversed).
XML_FOLDER = r"C:\Registranten\xml"
# OUTPUT_PATH is the default Excel workbook written by run_pipeline(); its
# parent directory is created on demand.
OUTPUT_PATH = Path(
    r"C:\Users\au468045\OneDrive - Aarhus universitet\Desktop\PhD\Artikler\Variants-ESTS-2025\python-test\grundtvig_datering_FINAL.xlsx"
)

# Logging configuration: INFO level and above, each line prefixed with a
# timestamp and the level name. This configures the root logger at import time.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)


# ---------------------------------------------------------------------------
# Helper functions – dating
# ---------------------------------------------------------------------------

def extract_text_recursive(elem):
    '''Return all text below an XML element as one flat string.

    Text from the element and every nested tag (e.g. <hi>) is collected in
    document order and joined with single spaces; any run of whitespace is
    then collapsed to one space and the ends are trimmed.

    Returns "" when `elem` is None.'''
    if elem is not None:
        fragments = list(elem.itertext())
        joined = " ".join(fragments)
        collapsed = re.sub(r"\s+", " ", joined)
        return collapsed.strip()
    return ""


def find_prioritised_year(text):
    '''Find a year in the range 1798–1872 close to a dating keyword.

    The text is split on whitespace. For every word that contains one of the
    keywords "affattelsestid" or "datering" (case-insensitive), a window of
    up to 10 words before and 19 words after that word is searched for a
    four-digit year. Keywords are tried in that order, so every
    "affattelsestid" window is examined before any "datering" window.

    A year only counts when it is not part of a longer digit run: "218450"
    does not yield 1845.

    Parameters:
      text: flattened field text; may be empty or None.

    Returns:
      (year, True) with `year` as int for the first in-range year found,
      otherwise (None, False).'''
    if not text:
        return None, False

    # Digit look-arounds keep year-like substrings of longer numbers
    # (shelf marks, running numbers) from being read as years.
    year_pattern = re.compile(r"(?<!\d)(1[78][0-9]{2})(?!\d)")
    context_keywords = ("affattelsestid", "datering")

    # Split and lowercase once; both keyword passes reuse the result.
    words = text.split()
    lowered_words = [word.lower() for word in words]

    for kw in context_keywords:
        for i, lowered in enumerate(lowered_words):
            if kw not in lowered:
                continue
            # Window: 10 words before the keyword up to 19 words after it.
            context = " ".join(words[max(0, i - 10): i + 20])
            for candidate in year_pattern.findall(context):
                year = int(candidate)
                if 1798 <= year <= 1872:
                    return year, True

    return None, False


def fallback_year(text):
    '''Fallback: return the first year in the range 1798–1872 found anywhere
    in `text`.

    A candidate must be a stand-alone four-digit number starting with 17 or
    18; digits embedded in a longer number (e.g. "218450") are ignored so
    that shelf marks and running numbers are not mistaken for years.

    Parameters:
      text: flattened field text; may be empty or None.

    Returns:
      The year as int, or the string "Ukendt" when no in-range year exists.'''
    if not text:
        return "Ukendt"

    # Look-arounds reject matches that are part of a longer digit run.
    for match in re.finditer(r"(?<!\d)(1[78][0-9]{2})(?!\d)", text):
        year = int(match.group(1))
        if 1798 <= year <= 1872:
            return year
    return "Ukendt"


def extract_dating(xml_root):
    '''Derive a suggested dating year from the record's descriptive fields.

    The fields are visited in a fixed priority order. For each matching
    element the keyword-based search is tried first; if it finds nothing,
    the first in-range year anywhere in that element's text is used. The
    first element that produces a year decides the result.

    Returns (year, source_tag, evidence_text), or ("Ukendt", "", "") when
    no field yields a year.'''
    fields_by_priority = (
        "Indholdsregest",
        "title_el_kort_indholdsangivelse",
        "Tidl_forskeres_skoensmaessige_detering_er",
        "Ms_forste_linie",
        "Ms_sidste_linie",
        "Tryk_el_benyttelse",
    )

    for field in fields_by_priority:
        for node in xml_root.findall(f".//{field}"):
            flattened = extract_text_recursive(node)
            # Keyword-anchored year wins; the plain scan is only run when
            # the keyword search came back empty.
            found = find_prioritised_year(flattened)[0] or fallback_year(flattened)
            if found != "Ukendt":
                return found, field, flattened

    return "Ukendt", "", ""


# ---------------------------------------------------------------------------
# Helper functions – numeric extraction from simple tags
# ---------------------------------------------------------------------------

def extract_number(root, tag):
    '''Read a number from the first <tag> element below `root`.

    Used for Antal_sider, Antal_blade and Nr_i_bibliografien.

    The text is taken from a direct <p> child when that child has text,
    otherwise from the element itself. It is then interpreted as follows:

      1) A leading interval such as "1-10" (also "1–10" / "1—10") whose end
         is not smaller than its start is treated as a count and yields
         end - start + 1.
      2) Otherwise the leading integer is returned, provided it is followed
         by whitespace, one of ".,;:" or the end of the string.

    Returns the number as a string, or "" when the element is missing, has
    no text, or neither rule applies.'''
    node = root.find(f".//{tag}")
    if node is None:
        return ""

    paragraph = node.find("p")
    if paragraph is not None and paragraph.text:
        raw = paragraph.text
    else:
        raw = node.text
    if not raw:
        return ""

    stripped = raw.strip()
    # En and em dashes are treated as plain hyphens for interval detection.
    hyphenated = stripped.replace("–", "-").replace("—", "-")

    interval = re.match(r"^(\d+)\s*-\s*(\d+)(?=[\s\.,;:]|$)", hyphenated)
    if interval is not None:
        first, last = (int(part) for part in interval.groups())
        if last >= first:
            return str(last - first + 1)
        # A reversed interval is not a count; try the single-number rule.

    leading = re.match(r"^(\d+)(?=[\s\.,;:]|$)", stripped)
    return leading.group(1) if leading is not None else ""


def extract_biblio_number(root):
    '''Extract the leading integer from <Nr_i_bibliografien>.

    A bibliography number is an identifier, not a quantity, so a span such
    as "123-125" yields its first number ("123") rather than the size of
    the interval.

    The text is taken from a direct <p> child when that child has text,
    otherwise from the element itself. The leading integer must be followed
    by whitespace, one of ".,;:", a dash (hyphen, en dash or em dash) or the
    end of the string.

    Returns the number as a string, or "" when the element is missing, has
    no text, or does not start with such an integer.'''
    el = root.find(".//Nr_i_bibliografien")
    if el is None:
        return ""

    p = el.find("p")
    text = (p.text if (p is not None and p.text) else el.text) or ""

    # Normalise en/em dashes so a span written with any dash is recognised.
    s = text.strip().replace("–", "-").replace("—", "-")

    m = re.match(r"^(\d+)(?=[\s\.,;:\-]|$)", s)
    return m.group(1) if m else ""


def extract_fasc_number(root):
    '''Return the leading fascicle number from <Grundtvig-arkivet_fasc>.

    Only the first run of digits at the start of the field is used, so
    anything after a "." or "," (or any other non-digit) is ignored.

    Examples:
      "<p>117.</p>"      -> "117"
      "<p>117, 118</p>"  -> "117"

    The text is taken from a direct <p> child when that child has text,
    otherwise from the element itself.
    Returns a string, or "" if the element is missing or does not start
    with a digit.'''
    node = root.find(".//Grundtvig-arkivet_fasc")
    if node is None:
        return ""

    paragraph = node.find("p")
    use_paragraph = paragraph is not None and paragraph.text
    raw = (paragraph.text if use_paragraph else node.text) or ""

    # A single match suffices: whether or not "." / "," follows, the result
    # is always the complete leading digit run.
    leading_digits = re.match(r"^\s*(\d+)", raw.strip())
    if leading_digits is None:
        return ""
    return leading_digits.group(1)


# ---------------------------------------------------------------------------
# Fascicle interval mapping (1–14 categories)
# ---------------------------------------------------------------------------

def map_fasc_interval(fasc_number_str):
    '''Translate a fascicle number (given as a string) into its archive
    category.

    The 14 categories cover the contiguous fascicle ranges
    1–74, 75–154, 155–210, 211–250, 251–287, 288–305, 306–331, 332–358,
    359–379, 380–385, 386–405, 406–482, 483–535 and 536–565.

    Returns the category as an integer in 1..14, or None when the input is
    empty, is not an integer, or lies outside 1..565.'''
    if not fasc_number_str:
        return None

    try:
        number = int(fasc_number_str)
    except ValueError:
        return None

    if number < 1:
        return None

    # Highest fascicle number of each category, in category order; the
    # ranges are contiguous, so the first bound that fits decides.
    last_fascicle_per_category = (
        74, 154, 210, 250, 287, 305, 331,
        358, 379, 385, 405, 482, 535, 565,
    )
    for category, upper in enumerate(last_fascicle_per_category, start=1):
        if number <= upper:
            return category

    return None


# ---------------------------------------------------------------------------
# Per-file processing
# ---------------------------------------------------------------------------

def process_single_xml_file(xml_path):
    '''Parse one XML record and collect every extracted field in a dict.

    The dict holds the file name, the suggested dating (as a string) and
    the tag it came from, page and leaf counts, a flag telling whether the
    page count was derived from the leaf count, the bibliography number,
    the fascicle number and its 1..14 category.

    Parsing errors are not caught here; they propagate to the caller.'''
    root = ET.parse(xml_path).getroot()

    # `evidence` (the text the year was read from) is currently not
    # exported; add it to the dict below as "evidence_excerpt" if needed.
    year, source_tag, evidence = extract_dating(root)
    page_count = extract_number(root, "Antal_sider")
    leaf_count = extract_number(root, "Antal_blade")
    biblio_number = extract_biblio_number(root)
    fascicle = extract_fasc_number(root)
    category = map_fasc_interval(fascicle)

    # No page count recorded: assume two pages per leaf when the leaf
    # count is a plain number.
    pages_derived = (not page_count) and leaf_count.isdigit()
    if pages_derived:
        page_count = str(2 * int(leaf_count))

    result = {}
    result["file"] = os.path.basename(xml_path)
    result["suggested_dating"] = str(year)
    result["source_tag"] = source_tag
    result["antal_sider"] = page_count
    result["antal_blade"] = leaf_count
    result["antal_sider_beregnet_fra_blade"] = "ja" if pages_derived else "nej"
    result["nr_i_bibliografien"] = biblio_number
    result["grundtvig_arkivet_fasc_nr"] = fascicle
    result["fasc_interval_1_14"] = category
    return result


# ---------------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------------

def run_pipeline(xml_folder: str = XML_FOLDER, output_path: Path = OUTPUT_PATH):
    '''Process every .xml file directly inside `xml_folder` and write the
    combined table to an Excel workbook at `output_path`.

    Files are handled in sorted order. A file that raises during processing
    is logged with its traceback and still gets a row, with
    "suggested_dating" set to "Fejl" and the remaining fields empty.
    The workbook has a single sheet named "Grundtvig" whose cells are all
    left-aligned; missing parent directories of `output_path` are created.

    If `xml_folder` is not a directory, an error is logged and nothing is
    written. Returns None.'''
    source_dir = Path(xml_folder)
    if not source_dir.is_dir():
        logging.error("XML folder does not exist: %s", source_dir)
        return

    xml_files = sorted(
        entry for entry in source_dir.iterdir() if entry.suffix.lower() == ".xml"
    )
    logging.info("Found %d XML files.", len(xml_files))

    rows = []
    failed = 0

    for xml_file in tqdm(xml_files, desc="Processing XML files"):
        try:
            row = process_single_xml_file(xml_file)
        except Exception:
            logging.exception("Error processing %s", xml_file)
            failed += 1
            # Placeholder row so the failing file still shows up in the sheet.
            row = {
                "file": xml_file.name,
                "suggested_dating": "Fejl",
                "source_tag": "",
                "antal_sider": "",
                "antal_blade": "",
                "antal_sider_beregnet_fra_blade": "nej",
                "nr_i_bibliografien": "",
                "grundtvig_arkivet_fasc_nr": "",
                "fasc_interval_1_14": None,
            }
        rows.append(row)

    table = pd.DataFrame(rows)

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # One shared alignment style is enough for every cell.
    left_aligned = Alignment(horizontal="left")
    with pd.ExcelWriter(destination, engine="openpyxl") as writer:
        table.to_excel(writer, index=False, sheet_name="Grundtvig")
        sheet = writer.sheets["Grundtvig"]
        for column in sheet.columns:
            for cell in column:
                cell.alignment = left_aligned

    logging.info("Saved output to %s", destination)
    if failed:
        logging.warning("Completed with %d file(s) raising errors (marked as 'Fejl').", failed)

# ---------------------------------------------------------------------------
# Execute pipeline
# ---------------------------------------------------------------------------

# Run the pipeline with the default XML_FOLDER / OUTPUT_PATH when this code is
# executed as the main module; nothing runs when it is merely imported.
if __name__ == "__main__":
    run_pipeline()
