<a href="https://colab.research.google.com/github/vdubya/criteria-assistant/blob/main/src/UFC%20-%20Parse%20Structure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Version 0.1.1  
# Chunk 1 - Setup and PDF Parsing
!pip install requests pdfplumber

import os, re, csv, json, zipfile, glob, itertools, string, traceback, requests, pdfplumber

# --- Run-time switches -------------------------------------------------------
# When true, the cleanup step below removes every *.csv / *.json file in the
# current working directory before anything is parsed.
CLEAN = True
# NOTE(review): not referenced anywhere in the visible code — confirm before removing.
LOOKAHEAD = 20
# Verbosity threshold consulted by dbg(): a message is printed when its level
# is less than or equal to this value.
DEBUG = 2
# Lines (compared stripped and upper-cased) that parse_pdf() skips outright.
BAD_TOP_LINES = {"CHAPTER 2 PRELIMINARY DESIGN DATA"}

# --- Inputs ------------------------------------------------------------------
# If ZIP_FILE exists it is extracted into ZIP_DIR and every PDF inside is used;
# otherwise the single PDF at SINGLE_URL is downloaded next to the notebook.
ZIP_FILE = "ufc.zip"
ZIP_DIR = "ufc_zip"
SINGLE_URL = "https://www.wbdg.org/FFC/DOD/UFC/ufc_1_300_01_2021_c1.pdf"
SINGLE_PDF = os.path.basename(SINGLE_URL)

# --- Combined output file names ------------------------------------------------
COMB_CSV = "combined_hierarchy_all.csv"
COMB_JSON = "combined_hierarchy_flat.json"
COMB_TREE = "combined_hierarchy_tree.json"

# --- Line classifiers (all anchored at the start of the line) ------------------
# "CHAPTER <digits> <title>"
CH_RE = re.compile(r"^CHAPTER\s+(\d+)\s+(.+)$", re.IGNORECASE)
# "APPENDIX <letter> <title>"
# NOTE(review): not referenced in the visible code.
AP_RE = re.compile(r"^APPENDIX\s+([A-Z])\s+(.+)$", re.IGNORECASE)
# "<n>-<n>[-<n>...][.<n>...] <title>"
SEC_RE = re.compile(r"^(\d+(?:-\d+)+(?:\.\d+)*)\s+(.+)$", re.IGNORECASE)  # strict 1-n hierarchy only
# "<1-2 digits> <3-9 letters> <4 digits>", e.g. a day, month name and year.
DATE_RE = re.compile(r"\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4}")
# Whitespace that directly follows '.', '!' or '?'; used as a sentence splitter.
SENT_RE = re.compile(r"(?<=[.!?])\s+")


def level_from(n):
    """Return the number of '.' characters in *n*, plus 2.

    NOTE(review): not called anywhere in the visible code.
    """
    return n.count(".") + 2

def dbg(level, msg):
    """Print *msg* unless *level* exceeds the module-wide DEBUG threshold."""
    if level > DEBUG:
        return
    print(msg)

if CLEAN:
    # Remove every CSV/JSON file in the current working directory so the run
    # starts without outputs left over from an earlier run.
    patterns_to_delete = ["*.csv", "*.json"]
    files_to_delete = list(
        itertools.chain.from_iterable(glob.glob(pattern) for pattern in patterns_to_delete)
    )
    print("🗑️ Files to delete:", files_to_delete)
    for stale_path in files_to_delete:
        os.remove(stale_path)
    print("🗑️  All CSV and JSON files deleted.")

def download(url, dst, timeout=60):
    """Download *url* to *dst* unless *dst* already exists.

    The payload is streamed into ``<dst>.part`` and only renamed to *dst* once
    the whole body has been written, so an interrupted transfer can never be
    mistaken for a valid cached file on the next run.

    Args:
        url: Address to fetch.
        dst: Local path to write. If it already exists nothing is fetched.
        timeout: Seconds passed to ``requests.get``; without it a stalled
            server would block the run indefinitely.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.RequestException: Connection problems or timeouts.
        OSError: The local file could not be written or renamed.
    """
    if os.path.exists(dst):
        print(f"✅ Using cached: {dst}")
        return

    print(f"⬇️  Downloading: {url}")
    partial = f"{dst}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_content(8192):
                    f.write(chunk)
        # Same directory as dst, so the rename replaces atomically.
        os.replace(partial, dst)
    finally:
        # Only still present when something above failed.
        if os.path.exists(partial):
            os.remove(partial)
    print(f"✅ Saved: {dst}")

def get_pdfs():
    """Return the list of PDF paths to process.

    If ZIP_FILE exists, it is extracted into ZIP_DIR (only when that directory
    is not already present) and every PDF below it is returned, except those
    whose path contains a ``Reference`` directory component. Files whose name
    starts with ``ufc_`` (case-insensitive) come first; each group is sorted
    by path. Without a zip file, the single PDF at SINGLE_URL is downloaded
    and returned as a one-element list.
    """
    if not os.path.exists(ZIP_FILE):
        download(SINGLE_URL, SINGLE_PDF)
        return [SINGLE_PDF]

    if not os.path.isdir(ZIP_DIR):
        print("📦 Extracting ufc.zip …")
        with zipfile.ZipFile(ZIP_FILE) as archive:
            archive.extractall(ZIP_DIR)

    def in_reference_dir(candidate):
        # Check both separator styles so the filter works on any platform.
        return "/Reference/" in candidate or "\\Reference\\" in candidate

    def is_ufc(candidate):
        return os.path.basename(candidate).lower().startswith("ufc_")

    candidates = [
        found
        for found in glob.glob(f"{ZIP_DIR}/**/*.pdf", recursive=True)
        if not in_reference_dir(found)
    ]
    # False sorts before True, so UFC files lead; ties are ordered by path.
    pdfs = sorted(candidates, key=lambda found: (not is_ufc(found), found))
    print(f"🗂️  Found {len(pdfs):,} PDFs to process (UFCs first).")
    return pdfs

def meta_from_pdf(path, url=""):
    """Build a small metadata dict from the first page of the PDF at *path*.

    The title is the first non-blank line that is entirely upper-case, and the
    issue date is the first DATE_RE match found on any line. Either falls back
    to an ``UNKNOWN ...`` placeholder when nothing qualifies.

    Returns:
        dict with keys ``file_name``, ``source_url``, ``ufc_title`` and
        ``issue_date``.
    """
    with pdfplumber.open(path) as pdf:
        cover_text = pdf.pages[0].extract_text() or ""

    cover_lines = [stripped for stripped in map(str.strip, cover_text.splitlines()) if stripped]

    title = "UNKNOWN TITLE"
    for candidate in cover_lines:
        if candidate.isupper():
            title = candidate
            break

    date = "UNKNOWN DATE"
    for candidate in cover_lines:
        found = DATE_RE.search(candidate)
        if found:
            date = found.group(0)
            break

    return {
        "file_name": os.path.basename(path),
        "source_url": url,
        "ufc_title": title,
        "issue_date": date,
    }

def pdf_lines(path):
    """Extract the text lines of every page of the PDF at *path*.

    Per page, the first two extracted lines are always dropped; then a leading
    line starting with "change" (case-insensitive) and a trailing line made
    only of digits are dropped when present.

    Returns:
        list of ``{"text": <right-stripped line>, "pdf_page": <1-based page>}``
        dicts in reading order.
    """
    collected = []
    with pdfplumber.open(path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            body = (page.extract_text() or "").splitlines()[2:]
            if body and body[0].strip().lower().startswith("change"):
                del body[0]
            if body and body[-1].strip().isdigit():
                body.pop()
            collected.extend({"text": ln.rstrip(), "pdf_page": page_no} for ln in body)
    dbg(2, f"✅ Processed {len(collected)} lines from {path}")
    return collected

def locate_body(ls):
    """Find the index of the line where the document body begins.

    The whole list is first searched for a line that reads exactly
    "CHAPTER 1 INTRODUCTION" (ignoring case and surrounding whitespace). Only
    if none exists is it searched again for one of the "1-1 ..." fallback
    headings. Lines containing "..." are never accepted.

    Args:
        ls: list of dicts with at least a ``"text"`` key.

    Returns:
        ``(index, is_fallback)`` — ``is_fallback`` is True when a fallback
        heading was used instead of the chapter heading.

    Raises:
        RuntimeError: neither kind of marker was found.
    """
    def first_hit(markers):
        # Return (index, normalized text) of the first acceptable line, or None.
        for pos, entry in enumerate(ls):
            raw = entry["text"]
            if "..." in raw:
                continue
            normalized = raw.strip().upper()
            if normalized in markers:
                return pos, normalized
        return None

    hit = first_hit(("CHAPTER 1 INTRODUCTION",))
    if hit is not None:
        print(f"✅ Found 'CHAPTER 1 INTRODUCTION' at line {hit[0]}")
        return hit[0], False

    hit = first_hit(("1-1 BACKGROUND.", "1-1 PURPOSE AND SCOPE."))
    if hit is not None:
        pos, line_text = hit
        print(f"⚠️ Using fallback marker '{line_text}' at line {pos}")
        return pos, True

    raise RuntimeError("Body start not found: Neither 'CHAPTER 1 INTRODUCTION' nor fallback 1-1 markers found outside TOC!")

def to_paragraphs(raw):
    """Group consecutive text lines into paragraphs.

    A new paragraph starts at a blank line, at a line whose first
    non-whitespace character is a bullet marker ("•", "-" or "—"), or at a
    line starting with an ASCII capital when the previous line ended in ".".
    Lines belonging to one paragraph are joined with single spaces.

    Args:
        raw: iterable of strings (one per line).

    Returns:
        list of paragraph strings; blank lines produce no paragraph.
    """
    bullet_marks = ("•", "-", "—")
    finished = []
    current = []
    previous = ""

    def emit():
        if current:
            finished.append(" ".join(current).strip())
            current.clear()

    for raw_line in raw:
        text = raw_line.rstrip()
        if not text:
            emit()
        else:
            starts_bullet = text.lstrip().startswith(bullet_marks)
            starts_sentence = previous.endswith(".") and text[0] in string.ascii_uppercase
            if starts_bullet or starts_sentence:
                emit()
            current.append(text)
        previous = text

    emit()
    return finished

def sentences(paras):
    """Split each paragraph in *paras* into a list of trimmed, non-empty sentences.

    Splitting happens wherever SENT_RE matches. The result has one inner list
    per input paragraph, in the same order.
    """
    split_paras = []
    for para in paras:
        trimmed = (piece.strip() for piece in SENT_RE.split(para))
        split_paras.append([piece for piece in trimmed if piece])
    return split_paras

def parse_pdf(path):
    """Parse the PDF at *path* into a flat list of heading rows.

    Processing starts at the body line reported by locate_body(). Each line
    matching CH_RE opens a level-1 row, each line matching SEC_RE opens a
    level-2 row whose parent is the most recent level-1 row, and every other
    non-blank line is body text belonging to the most recently opened row.

    Body text is buffered and appended to the owning row's ``sentences`` (a
    list of paragraphs, each a list of sentence strings) when a line ends in
    ".", when the next heading starts, and at the end of the document.
    Earlier revisions assigned instead of appended on every line, which kept
    only the last fragment of each section.

    Returns:
        list of dicts with keys ``level``, ``number``, ``title``, ``parent``,
        ``pdf_page_start``, ``logical_page_start`` and ``sentences``.

    Raises:
        RuntimeError: propagated from locate_body() when no body start exists.
    """
    dbg(1, f"🔍 Parsing: {path}")
    ls = pdf_lines(path)
    start, is_fallback = locate_body(ls)
    rows = []
    pending = []  # body lines collected since the last flush

    def flush_paragraphs():
        """Append the buffered body text to the latest row and empty the buffer."""
        if not pending:
            return
        if rows:
            rows[-1]["sentences"].extend(sentences(to_paragraphs(pending)))
        # Text seen before any heading has no row to belong to and is dropped.
        pending.clear()

    logical_offset = None

    if is_fallback:
        # The document has no explicit chapter heading, so synthesize one and
        # make the page holding the fallback marker logical page 1.
        logical_offset = ls[start]["pdf_page"] - 1
        rows.append({
            "level": 1,
            "number": "CHAPTER 1",
            "title": "INTRODUCTION",
            "parent": "",
            "pdf_page_start": ls[start]["pdf_page"],
            "logical_page_start": 1,
            "sentences": []
        })
        print("⚠️ Injected 'CHAPTER 1 INTRODUCTION' due to fallback marker. Logical offset set.")

    for idx, entry in enumerate(ls[start:], start):
        txt, pg = entry["text"], entry["pdf_page"]
        st = txt.strip()
        if logical_offset is None and st.upper() == "CHAPTER 1 INTRODUCTION":
            logical_offset = pg - 1
            dbg(2, f"✅ logical_offset set to {logical_offset} on line {idx}")
        lp = pg - logical_offset if logical_offset is not None else pg
        if st.upper() in BAD_TOP_LINES:
            continue

        if (m := CH_RE.match(txt)):
            flush_paragraphs()  # close out the previous section's text first
            rows.append({"level": 1, "number": f"CHAPTER {m.group(1)}", "title": m.group(2).strip(),
                         "parent": "", "pdf_page_start": pg, "logical_page_start": lp, "sentences": []})
            continue

        if (m := SEC_RE.match(txt)):
            flush_paragraphs()
            # NOTE(review): every section is recorded at level 2 under its
            # chapter regardless of how deep its number is — confirm intended.
            parent = next((r["number"] for r in reversed(rows) if r["level"] == 1), "")
            rows.append({"level": 2, "number": m.group(1).strip(), "title": m.group(2).strip(),
                         "parent": parent, "pdf_page_start": pg, "logical_page_start": lp, "sentences": []})
            continue

        if not st:
            continue
        pending.append(st)
        if st.endswith("."):
            # A line ending in a period is treated as the end of a paragraph.
            flush_paragraphs()

    flush_paragraphs()
    dbg(1, f"✅ Finished parsing {path}, structured hierarchy built.")
    return rows


In [None]:
# Version 0.1.0 - 
# Chunk 2 - Tree Building, Saving, and Output

def build_tree(rows):
    """Nest a flat, ordered list of rows into a tree by their ``level`` value.

    Each row is shallow-copied and given an empty ``children`` list. A node is
    attached to the nearest preceding node with a strictly smaller level; a
    node with no such predecessor becomes a root.

    Returns:
        list of root nodes.
    """
    roots = []
    open_path = []  # chain of nodes from a root down to the last node added
    for row in rows:
        node = {**row, "children": []}
        depth = node["level"]
        # Close every open node that cannot be an ancestor of this one.
        while open_path and open_path[-1]["level"] >= depth:
            open_path.pop()
        siblings = open_path[-1]["children"] if open_path else roots
        siblings.append(node)
        open_path.append(node)
    dbg(2, f"✅ Tree structure built with {len(roots)} root nodes.")
    return roots

def save_per_pdf(base, rows, meta):
    """Write the four per-document output files for one parsed PDF.

    Produces ``<base>_hierarchy_metadata.json``, ``<base>_hierarchy_flat.json``,
    ``<base>_hierarchy_tree.json`` and ``<base>_hierarchy.csv`` in the current
    working directory. Any exception is reported through dbg() and a printed
    traceback instead of being raised to the caller.
    """
    def dump_json(file_name, payload):
        with open(file_name, "w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)

    try:
        dump_json(f"{base}_hierarchy_metadata.json", meta)
        dump_json(f"{base}_hierarchy_flat.json", rows)
        dump_json(f"{base}_hierarchy_tree.json", {"chapters": build_tree(rows)})

        header = ["GlobalID", "FileName", "Level", "Number", "Title", "Parent",
                  "PDFPage", "LogicalPage", "SentencesJSON"]
        with open(f"{base}_hierarchy.csv", "w", newline="", encoding="utf-8") as fh:
            writer = csv.writer(fh, quoting=csv.QUOTE_ALL)
            writer.writerow(header)
            writer.writerows(
                [row.get("global_id", ""), meta["file_name"], row["level"], row["number"],
                 row["title"], row["parent"], row["pdf_page_start"], row["logical_page_start"],
                 json.dumps(row["sentences"], ensure_ascii=False)]
                for row in rows
            )
        dbg(1, f"✅ Saved hierarchy data for {base}")
    except Exception as e:
        dbg(1, f"❌ Error in save_per_pdf for {base}: {e}")
        print(traceback.format_exc())

# Driver: parse every PDF, write its per-document files, then write the
# combined CSV / flat JSON / tree JSON covering all successfully parsed PDFs.
combined, meta_list, trees, gid = [], [], [], 1
# global_id -> name of the PDF the row came from. The combined CSV needs this:
# `meta` only ever holds the metadata of the last PDF processed.
row_files = {}
for pdf in get_pdfs():
    try:
        dbg(1, f"\n▶️ Processing: {pdf}")
        src = SINGLE_URL if pdf == SINGLE_PDF else ""
        meta = meta_from_pdf(pdf, src)
        dbg(1, f"ℹ️  Extracted metadata: {meta}")
        rows = parse_pdf(pdf)
        if not rows:
            dbg(1, f"⚠️  Warning: No heading entries found for {pdf}")
        for r in rows:
            r["global_id"] = gid
            row_files[gid] = meta["file_name"]
            combined.append(r)
            gid += 1
        meta_list.append(meta)
        base = os.path.splitext(os.path.basename(pdf))[0]
        save_per_pdf(base, rows, meta)
        # save_per_pdf reports its own failures without raising; reading the
        # tree file back surfaces a failed save here as an exception.
        with open(f"{base}_hierarchy_tree.json", "r", encoding="utf-8") as f:
            trees.extend(json.load(f)["chapters"])
    except Exception as e:
        dbg(1, f"⚠️  Skipping {pdf} due to error: {e}")
        print(traceback.format_exc())

if combined:
    dbg(1, "\n✅ Writing combined outputs…")
    with open(COMB_CSV, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f, quoting=csv.QUOTE_ALL)
        w.writerow(["GlobalID", "FileName", "Level", "Number", "Title", "Parent",
                     "PDFPage", "LogicalPage", "SentencesJSON"])
        for r in combined:
            # Use the file each row was parsed from, not the last PDF's name.
            w.writerow([r.get("global_id", ""), row_files[r["global_id"]], r["level"], r["number"],
                         r["title"], r["parent"], r["pdf_page_start"], r["logical_page_start"],
                         json.dumps(r["sentences"], ensure_ascii=False)])
    with open(COMB_JSON, "w", encoding="utf-8") as jf:
        json.dump(combined, jf, ensure_ascii=False, indent=2)
    with open(COMB_TREE, "w", encoding="utf-8") as jf:
        nested_tree = build_tree(combined)
        json.dump({"bundle_metadata": meta_list, "chapters": nested_tree},
                  jf, ensure_ascii=False, indent=2)
    dbg(1, f"✅ Combined CSV → {COMB_CSV}")
    dbg(1, f"✅ Combined JSON → {COMB_JSON}")
    dbg(1, f"✅ Combined TREE → {COMB_TREE}")
else:
    dbg(1, "❌ No PDFs parsed successfully; no combined files created.")
