<a href="https://colab.research.google.com/github/vdubya/critria-assistant/blob/main/UFGS_TrainFromSECXML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Process XML and PDF files

In [12]:
# Install necessary libraries
!pip install pymupdf requests transformers lxml

import fitz  # PyMuPDF
import zipfile
import requests
import os
import json
from lxml import etree  # For parsing XML
from transformers import LayoutLMTokenizer

# Global switch: set to False to silence all debug_print output
DEBUG = True


# Helper used throughout the notebook for optional diagnostic output
def debug_print(message, separator="--- "):
    """Print ``message`` prefixed with ``separator`` when DEBUG is enabled.

    Args:
        message: Value to display; it is formatted the same way an f-string
            placeholder would format it.
        separator: Prefix written directly before the message.
    """
    if not DEBUG:
        return
    print("{}{}".format(separator, message))

# Step 1: Download and Extract the SEC File from the ZIP
def download_and_extract_sec(zip_url, target_file, dest_dir="/content"):
    """Download a ZIP archive and extract one SEC file from it.

    Args:
        zip_url: URL of the ZIP archive to download.
        target_file: File name to look for; an archive member matches when
            its path ends with this string.
        dest_dir: Directory that receives both the downloaded archive and the
            extracted file. Defaults to ``/content``.

    Returns:
        Path of the extracted file, or ``None`` when no archive member ends
        with ``target_file``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    print("\n****** Downloading and Extracting SEC File")

    zip_path = f"{dest_dir}/UFGS_M.zip"
    extracted_sec_path = None

    # Download the ZIP file
    debug_print(f"Downloading ZIP file from: {zip_url}")
    response = requests.get(zip_url, timeout=60)
    # Fail here instead of saving an HTML error page under a .zip name,
    # which would only surface later as an obscure BadZipFile error.
    response.raise_for_status()
    with open(zip_path, "wb") as f:
        f.write(response.content)
    debug_print(f"Downloaded ZIP file saved at: {zip_path}")

    # Extract the target SEC file
    debug_print(f"Extracting SEC file: {target_file}")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for member in zip_ref.namelist():
            if member.endswith(target_file):  # Focus on the specified SEC file
                extracted_sec_path = f"{dest_dir}/{target_file}"
                with zip_ref.open(member) as sec_file:
                    with open(extracted_sec_path, "wb") as f:
                        f.write(sec_file.read())

    if extracted_sec_path is None:
        # Say so explicitly rather than logging "saved at: None".
        print(f"WARNING: no archive member ends with {target_file!r}")
    else:
        debug_print(f"Extracted SEC file saved at: {extracted_sec_path}")

    return extracted_sec_path

# Step 2: Parse the XML File and Extract Hierarchical Structure
def parse_xml(xml_path):
    """Parse an XML file into a nested, JSON-serialisable dictionary.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        A dict for the root element with the keys ``"tag"``, ``"text"`` (the
        element's leading text, stripped) and ``"children"`` (a list of dicts
        of the same shape, one per child element).
    """
    print("\n****** Parsing XML File")
    root = etree.parse(xml_path).getroot()

    def recursive_parse(element):
        return {
            "tag": element.tag,
            "text": (element.text or "").strip(),
            "children": [
                recursive_parse(child)
                for child in element
                # lxml also yields comments and processing instructions as
                # children; their .tag is a function rather than a string,
                # which would make the result impossible to dump as JSON.
                if isinstance(child.tag, str)
            ],
        }

    parsed_hierarchy = recursive_parse(root)
    if DEBUG:
        # Only build the (potentially large) JSON dump when it will be shown.
        debug_print(f"Parsed XML hierarchy:\n{json.dumps(parsed_hierarchy, indent=4)}")
    return parsed_hierarchy

# Step 3: Download the PDF File
def download_pdf(pdf_url, pdf_name, dest_dir="/content"):
    """Download a PDF and store it under ``dest_dir``.

    Args:
        pdf_url: URL of the PDF to download.
        pdf_name: File name to save the download as.
        dest_dir: Directory that receives the file. Defaults to ``/content``.

    Returns:
        Path of the saved PDF file.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    print("\n****** Downloading PDF File")

    pdf_path = f"{dest_dir}/{pdf_name}"
    debug_print(f"Downloading PDF file from: {pdf_url}")
    response = requests.get(pdf_url, timeout=60)
    # Without this check an error page would be written to disk as a ".pdf"
    # and only fail later when it is opened.
    response.raise_for_status()
    with open(pdf_path, "wb") as f:
        f.write(response.content)
    debug_print(f"Downloaded PDF file saved at: {pdf_path}")

    return pdf_path

# Step 4: Extract Text and Bounding Boxes from the PDF
def extract_text_and_bboxes(pdf_path):
    """Collect every non-empty text block of a PDF with its bounding box.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of dicts, one per non-empty text block, with the keys
        ``"page"`` (1-based page number), ``"text"`` (stripped block text),
        ``"bbox"`` (``[x0, y0, x1, y1]`` as reported by PyMuPDF) and
        ``"normalized_bbox"`` (the result of ``normalize_bbox``).
    """
    print("\n****** Extracting Text and Bounding Boxes from PDF")

    data = []  # List to store extracted data

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)
        for page_number, page in enumerate(doc, start=1):
            debug_print(f"\n================\nProcessing page {page_number}/{page_count}")
            page_width, page_height = page.rect.width, page.rect.height  # Get page dimensions

            # Extract text blocks with bounding boxes
            blocks = page.get_text("blocks")
            debug_print(f"* Found {len(blocks)} text blocks on page {page_number}")
            for block in blocks:
                if len(block) < 5:  # Need at least the bounding box and the text
                    continue
                x0, y0, x1, y1, text = block[:5]  # Unpack first 5 elements
                text = text.strip()
                if not text:  # Ignore empty blocks
                    continue
                bbox = [x0, y0, x1, y1]
                normalized_bbox = normalize_bbox(bbox, page_width, page_height)
                data.append({
                    "page": page_number,
                    "text": text,
                    "bbox": bbox,  # Original bounding box
                    "normalized_bbox": normalized_bbox
                })
                debug_print(f"----\n* Extracted block: \n{text}\n"
                            f"* Original BBox: {bbox}\n"
                            f"* Normalized BBox: {normalized_bbox}")

    return data

# Step 5: Normalize Bounding Boxes to LayoutLM's Input Scale (0-1000)
def normalize_bbox(bbox, page_width, page_height):
    """Scale a bounding box to integer coordinates in the range 0-1000.

    Args:
        bbox: Sequence ``[x0, y0, x1, y1]`` in page units.
        page_width: Width of the page in the same units.
        page_height: Height of the page in the same units.

    Returns:
        A list ``[x0, y0, x1, y1]`` of ints, each clamped to ``0..1000``.
    """
    def scale(value, extent):
        # Truncate like before, then clamp: a block that overhangs the page
        # edge must not produce a coordinate outside the 0-1000 scale.
        return max(0, min(1000, int(1000 * value / extent)))

    return [
        scale(bbox[0], page_width),
        scale(bbox[1], page_height),
        scale(bbox[2], page_width),
        scale(bbox[3], page_height),
    ]

# Step 6: Align XML Content with PDF Text
def align_xml_with_pdf(xml_hierarchy, pdf_data):
    """Pair every XML element with the next unused PDF text block.

    The pairing is purely positional: the XML tree is walked with children
    before their parent, and each visited element takes the next item from
    ``pdf_data``. No text comparison is performed.

    Args:
        xml_hierarchy: Nested dict with the keys ``"tag"``, ``"text"`` and
            ``"children"``.
        pdf_data: Sequence of dicts with the keys ``"text"``, ``"bbox"`` and
            ``"normalized_bbox"`` (and optionally ``"page"``).

    Returns:
        A nested dict mirroring ``xml_hierarchy``. Each node carries
        ``"tag"``, ``"xml_text"``, ``"pdf_text"``, ``"bbox"``,
        ``"normalized_bbox"`` and ``"children"``; the PDF fields are ``None``
        once ``pdf_data`` is exhausted. A ``"page"`` key is added whenever
        the paired PDF block provides a page number.
    """
    print("\n****** Aligning XML Content with PDF Text")

    unmatched = {"text": None, "bbox": None, "normalized_bbox": None}

    def recursive_align(xml_element, pdf_data_iter):
        # Children are aligned first, so they consume PDF blocks before
        # their parent does.
        aligned_children = [
            recursive_align(child, pdf_data_iter)
            for child in xml_element["children"]
        ]

        # Attempt to align current XML text with PDF text
        pdf_item = next(pdf_data_iter, unmatched)

        aligned = {
            "tag": xml_element["tag"],
            "xml_text": xml_element["text"],
            "pdf_text": pdf_item["text"],
            "bbox": pdf_item["bbox"],
            "normalized_bbox": pdf_item["normalized_bbox"],
            "children": aligned_children
        }
        # Keep the page number so later steps can locate the block; the key
        # is left out entirely when unknown so that readers using
        # .get("page", default) still receive their default.
        page = pdf_item.get("page")
        if page is not None:
            aligned["page"] = page
        return aligned

    aligned_hierarchy = recursive_align(xml_hierarchy, iter(pdf_data))
    if DEBUG:
        # Only build the (potentially large) JSON dump when it will be shown.
        debug_print(f"Aligned XML-PDF hierarchy:\n{json.dumps(aligned_hierarchy, indent=4)}")
    return aligned_hierarchy

# Step 7: Save Aligned Data for Review
def save_aligned_data(data, output_path):
    """Write ``data`` to ``output_path`` as JSON indented by four spaces.

    Args:
        data: JSON-serialisable object to store.
        output_path: Destination file; it is created or overwritten.
    """
    print("\n****** Saving Aligned Data")
    debug_print("Saving aligned data to: {}".format(output_path))

    with open(output_path, mode="w") as out_file:
        json.dump(data, fp=out_file, indent=4)

    debug_print("Data saved successfully at: {}".format(output_path))

# Main Execution
# Inputs for this run: the ZIP archive URL, the SEC file name to pull out of
# it, and the URL / local file name of the matching PDF.
sec_zip_url = "https://www.wbdg.org/FFC/DOD/UFGS/UFGS_M.zip"
target_sec_file = "00 01 15.SEC"
pdf_url = "https://www.wbdg.org/FFC/DOD/UFGS/UFGS%2000%2001%2015.pdf"
pdf_name = "UFGS_00_01_15.pdf"

# Download the archive and extract the SEC file.
# NOTE: the result is None when no archive member matches target_sec_file.
extracted_sec_path = download_and_extract_sec(sec_zip_url, target_sec_file)

# Parse the SEC file as XML into a nested dict (tag / text / children)
xml_hierarchy = parse_xml(extracted_sec_path)

# Download the corresponding PDF file
pdf_path = download_pdf(pdf_url, pdf_name)

# Extract the non-empty text blocks and their bounding boxes from the PDF
pdf_data = extract_text_and_bboxes(pdf_path)

# Pair each XML element with a PDF text block (positional pairing)
aligned_data = align_xml_with_pdf(xml_hierarchy, pdf_data)

# Save the aligned data as JSON for review
output_json_path = "/content/aligned_data.json"
save_aligned_data(aligned_data, output_json_path)

# Print a summary
print("\n****** Summary")
print(f"Aligned XML-PDF data saved at: {output_json_path}")



****** Downloading and Extracting SEC File
--- Downloading ZIP file from: https://www.wbdg.org/FFC/DOD/UFGS/UFGS_M.zip
--- Downloaded ZIP file saved at: /content/UFGS_M.zip
--- Extracting SEC file: 00 01 15.SEC
--- Extracted SEC file saved at: /content/00 01 15.SEC

****** Parsing XML File
--- Parsed XML hierarchy:
{
    "tag": "SEC",
    "text": "",
    "children": [
        {
            "tag": "MTA",
            "text": "",
            "children": []
        },
        {
            "tag": "MTA",
            "text": "",
            "children": []
        },
        {
            "tag": "HDR",
            "text": "",
            "children": [
                {
                    "tag": "AST",
                    "text": "",
                    "children": []
                },
                {
                    "tag": "TAB",
                    "text": "",
                    "children": [
                        {
                            "tag": "WBK",
                      

# Visualize extracted sections in PDF

In [22]:
# Install necessary libraries
!pip install pymupdf

import fitz  # PyMuPDF
import json
import random

# Load the hierarchical JSON data
def load_hierarchical_json(json_path):
    """Read the file at ``json_path`` and return its decoded JSON content."""
    with open(json_path, "r") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Generate a consistent color for each unique tag
def generate_color_map(tags):
    """Assign a reproducible pseudo-random RGB colour to every tag.

    Args:
        tags: Iterable of hashable tags (duplicates are ignored).

    Returns:
        Dict mapping each tag to an ``(r, g, b)`` tuple of floats in
        ``[0, 1]``. The same collection of tags always yields the same
        mapping, whatever order the tags are supplied in.
    """
    # A private generator keeps the colours reproducible without reseeding
    # the module-level random state used by other code.
    rng = random.Random(42)
    color_map = {}
    # Sets iterate in hash order, which can differ between interpreter runs;
    # sorting fixes which tag receives which colour.
    for tag in sorted(set(tags), key=str):
        color_map[tag] = tuple(
            rng.randint(0, 255) / 255.0  # Normalize to range [0, 1]
            for _ in range(3)
        )  # RGB colors
    return color_map

# Recursive function to apply boxes and labels
def annotate_pdf_with_hierarchy(pdf_path, json_data, output_pdf_path):
    """Draw a coloured, labelled box on the PDF for every aligned node.

    Each node of ``json_data`` that has both a ``"bbox"`` and a
    ``"pdf_text"`` is outlined exactly once, on the page named by its
    ``"page"`` entry (page 1 when absent), and labelled with its tag path
    from the root (for example ``SEC/HDR/TAB``).

    Args:
        pdf_path: Path of the PDF to annotate.
        json_data: Root node of the hierarchy; nodes are dicts with ``"tag"``
            and optionally ``"bbox"``, ``"pdf_text"``, ``"page"`` and
            ``"children"``.
        output_pdf_path: Path the annotated copy is written to.
    """
    # Gather all unique tags
    def collect_tags(data, tags):
        tags.add(data["tag"])
        for child in data.get("children", []):
            collect_tags(child, tags)

    unique_tags = set()
    collect_tags(json_data, unique_tags)

    # Generate color map for tags
    color_map = generate_color_map(unique_tags)

    def annotate_node(doc, data, parent_tag=""):
        # Combine the current tag with the parent tag for a compound tag
        compound_tag = f"{parent_tag}/{data['tag']}" if parent_tag else data["tag"]

        # Draw a box and add a label if the item has a bounding box
        if data.get("bbox") and data.get("pdf_text"):
            page_number = data.get("page") or 1  # Default to page 1 if not specified
            if 1 <= page_number <= len(doc):
                page = doc[page_number - 1]  # Pages are 0-indexed in PyMuPDF
                bbox = fitz.Rect(data["bbox"])  # Convert to PyMuPDF Rect
                # Draw a rectangle around the bounding box
                page.draw_rect(
                    bbox,
                    color=color_map[data["tag"]],
                    width=1,  # Thickness of the rectangle
                )
                # Add the compound tag as a label above the box
                label_bbox = fitz.Rect(
                    bbox.x0, bbox.y0 - 10, bbox.x1, bbox.y0 - 2
                )  # Box above the current bounding box
                page.insert_textbox(
                    label_bbox,
                    compound_tag,
                    fontsize=6,
                    color=(0, 0, 0),  # Black text
                    align=fitz.TEXT_ALIGN_LEFT,
                )
            else:
                print(f"Skipping {compound_tag}: page {page_number} is outside the document")

        # A single traversal visits every node once, so no box is drawn
        # repeatedly and each node lands on its own page.
        for child in data.get("children", []):
            annotate_node(doc, child, compound_tag)

    # The context manager closes the document even if annotation fails.
    with fitz.open(pdf_path) as doc:
        annotate_node(doc, json_data)
        # Save the annotated PDF
        doc.save(output_pdf_path)
    print(f"Annotated PDF saved at: {output_pdf_path}")

# Main Execution
# Fixed locations of the inputs and the annotated output.
pdf_path = "/content/UFGS_00_01_15.pdf"  # Input PDF file
json_path = "/content/aligned_data.json"  # Hierarchical JSON file
output_pdf_path = "/content/annotated_UFGS_00_01_15.pdf"  # Output annotated PDF file

# Load the hierarchical JSON data from json_path
hierarchical_json = load_hierarchical_json(json_path)

# Draw boxes and tag labels onto the PDF and save the result to output_pdf_path
annotate_pdf_with_hierarchy(pdf_path, hierarchical_json, output_pdf_path)


Annotated PDF saved at: /content/annotated_UFGS_00_01_15.pdf
