### Architecture 

In [8]:
# pip install diagrams
# macOS: brew install graphviz   |  Ubuntu: sudo apt-get install -y graphviz
from diagrams import Diagram, Cluster, Edge
from diagrams.onprem.client import Users
from diagrams.generic.storage import Storage
from diagrams.generic.compute import Rack
from diagrams.programming.language import Python
from diagrams.custom import Custom

# --- shared colour palette so every diagram element is styled consistently ---
PAL = {
    # base colours: node outline ("ink") and canvas background ("bg")
    "ink": "#1f2937",
    "bg": "white",
    # edge colours, one per pipeline stage
    "ingest": "#2563eb",       # blue: ingestion
    "convert": "#fb923c",      # orange: convert → PDF
    "ours": "#10b981",         # green: our extractors
    "json": "#38bdf8",         # cyan: JSON output
    "csv": "#38bdf8",          # same cyan as "json"
    "docling": "#f43f5e",      # red: Docling
    "docling_out": "#fb7185",  # pink: Docling outputs
    "compare": "#6366f1",      # indigo: comparator
    "store": "#16a34a",        # green: to storage
    "report": "#0ea5e9",       # sky: to report
    # cluster colours: each border key is paired with a "<key>_bg" fill
    "c_blue": "#60a5fa",
    "c_blue_bg": "#e5f2ff",
    "c_green": "#34d399",
    "c_green_bg": "#ecfdf5",
    "c_pink": "#f472b6",
    "c_pink_bg": "#fff1f2",
    "c_indigo": "#818cf8",
    "c_indigo_bg": "#eef2ff",
}

import os
from pathlib import Path

# Directory that receives the rendered image.  The default is the path the
# notebook was originally written with; set PDF_PARSER_DOCS_DIR to render
# somewhere else (e.g. on another machine or in CI) without editing the cell.
_DOCS_DIR = Path(
    os.environ.get(
        "PDF_PARSER_DOCS_DIR",
        "/Users/divyanshmac/Documents/Big_Data_Assignment/pdf-parser/docs",
    )
)


def _cluster_style(border: str, fill: str) -> dict:
    """Return the Graphviz attributes shared by every cluster box.

    Args:
        border: Colour of the cluster outline (Graphviz ``pencolor``).
        fill: Colour of the cluster background (Graphviz ``bgcolor``).

    Returns:
        A fresh ``graph_attr`` dict: rounded filled box, label at top-left.
    """
    return {
        "style": "rounded,filled",
        "pencolor": border,
        "bgcolor": fill,
        "labelloc": "t",
        "labeljust": "l",
    }


# Detailed left-to-right architecture diagram: ingestion → notebooks →
# extraction engines → outputs → comparison → storage / reports.
# Node and edge creation order is kept stable because it influences layout.
with Diagram(
    "SEC EDGAR → Notebooks & Engines → Outputs → Comparison",
    # Base name only; the file extension comes from ``outformat``.
    filename=str(_DOCS_DIR / "sec_edgar_architecture_lr_pretty"),
    outformat="png",
    show=False,  # render to disk without opening a viewer
    graph_attr={
        "rankdir": "LR",
        "fontsize": "12",
        "bgcolor": PAL["bg"],
        "pad": "0.25",
        "splines": "spline",
        "fontname": "Helvetica",
    },
    node_attr={
        "shape": "box",
        "style": "rounded,filled",
        "fontname": "Helvetica",
        "fontsize": "11",
        "color": PAL["ink"],
        "fillcolor": "white",
        "penwidth": "1.6",
    },
    edge_attr={"penwidth": "2"},
):
    # NOTE(review): `dev` has no edges in this cell, so it is drawn as a
    # free-standing node — confirm that is intended.
    dev = Users("Developer")

    # ---------- DATA INGESTION ----------
    with Cluster(
        "DATA INGESTION",
        graph_attr=_cluster_style(PAL["c_blue"], PAL["c_blue_bg"]),
    ):
        edgar = Storage("SEC EDGAR Library\n(edgartools / sec-edgar-downloader)")
        raw = Storage("Raw Filings\nnotebooks/data_load.ipynb")
        converter = Rack("PDF Converter\n(HTML/TXT → PDF)")
        pdfdoc = Storage("PDF Documents")

        # Linear ingestion chain; the last hop uses the "convert" colour.
        (
            edgar
            >> Edge(color=PAL["ingest"], minlen="2")
            >> raw
            >> Edge(color=PAL["ingest"], minlen="2")
            >> converter
            >> Edge(color=PAL["convert"], minlen="2")
            >> pdfdoc
        )

    # ---------- NOTEBOOKS & SCRIPTS ----------
    with Cluster(
        "NOTEBOOKS & SCRIPTS",
        graph_attr=_cluster_style(PAL["c_green"], PAL["c_green_bg"]),
    ):
        # NOTE(review): `py` and `nb_eval` are never connected in this cell.
        py = Python("Python Runtime\n(main.py / src/pdf_parser)")
        nb_text = Python("Text Extractor\n(simple_text_extractor.ipynb)")
        nb_tabula = Python("Table Extractor\n(tabula_extraction.ipynb)")
        nb_generic = Python("Advanced Extractor\n(documentai_extraction.ipynb)")
        nb_eval = Python("Evaluation\n(evaluation_wer.ipynb)")

        # Every extractor notebook reads the converted PDFs.
        pdfdoc >> Edge(color=PAL["ours"]) >> nb_text
        pdfdoc >> Edge(color=PAL["ours"]) >> nb_tabula
        pdfdoc >> Edge(color=PAL["ours"]) >> nb_generic

    # ---------- EXTRACTION ENGINES ----------
    with Cluster(
        "EXTRACTION ENGINES",
        graph_attr=_cluster_style(PAL["c_green"], PAL["c_green_bg"]),
    ):
        pdfplumber_engine = Rack("pdfplumber\n(text + word boxes + layout)")
        tabula_engine = Rack("tabula-py\n(tables → CSV)")
        # NOTE(review): `tesseract_engine` has no outgoing edge in this cell.
        tesseract_engine = Rack("Tesseract OCR\n(fallback for scans)")
        generic_engine = Rack("Advanced Extractor\n(ML/NLP pipeline)")
        docling_engine = Rack("Docling Baseline")

        nb_text >> pdfplumber_engine
        nb_tabula >> tabula_engine
        nb_generic >> generic_engine
        nb_text >> tesseract_engine
        # Docling reads the PDFs directly; dashed marks it as the baseline path.
        pdfdoc >> Edge(color=PAL["docling"], style="dashed") >> docling_engine

    # ---------- OUTPUTS ----------
    with Cluster(
        "OUTPUTS",
        graph_attr=_cluster_style(PAL["c_pink"], PAL["c_pink_bg"]),
    ):
        json_out = Storage("parsed JSON\n(text/layout/metadata)")
        csv_out = Storage("parsed CSV (Tabula)")
        # NOTE(review): `adv_out` is not fed into any comparator in this cell.
        adv_out = Storage("advanced.json / tables.csv")
        docling_json = Storage("docling.json")
        docling_csv = Storage("docling_tables.csv")

        pdfplumber_engine >> Edge(color=PAL["json"]) >> json_out
        tabula_engine >> Edge(color=PAL["csv"]) >> csv_out
        generic_engine >> Edge(color=PAL["json"]) >> adv_out
        docling_engine >> Edge(color=PAL["docling_out"]) >> docling_json
        docling_engine >> Edge(color=PAL["docling_out"]) >> docling_csv

    # ---------- COMPARISON & EVALUATION ----------
    with Cluster(
        "COMPARISON & EVALUATION",
        graph_attr=_cluster_style(PAL["c_indigo"], PAL["c_indigo_bg"]),
    ):
        comparator = Rack("Compare OUR JSON/CSV vs DOCLING\n→ Diffs & Quality Metrics")
        xbrl_store = Storage("XBRL Ground Truth\n(SEC Inline XBRL)")
        xbrl_compare = Rack("XBRL vs Tabula CSV\n(mapping + WER/metrics)")

        # Our outputs use solid edges, Docling outputs dashed ones.
        json_out >> Edge(color=PAL["compare"], minlen="2") >> comparator
        csv_out >> Edge(color=PAL["compare"], minlen="2") >> comparator
        docling_json >> Edge(color=PAL["compare"], style="dashed") >> comparator
        docling_csv >> Edge(color=PAL["compare"], style="dashed") >> comparator

        xbrl_store >> Edge(color=PAL["compare"], minlen="2") >> xbrl_compare
        csv_out >> Edge(color=PAL["compare"], minlen="2") >> xbrl_compare

    # ---------- STORAGE & REPORTS ----------
    storage = Storage("Object Storage (S3 / GCS / Azure)\n• PDFs • JSON • CSV • XBRL • Reports")
    report = Storage("Comparison Reports (HTML/JSON)")

    # Both comparison nodes feed the object store and the report node.
    comparator >> Edge(color=PAL["store"], minlen="2") >> storage
    xbrl_compare >> Edge(color=PAL["store"], minlen="2") >> storage
    comparator >> Edge(color=PAL["report"], minlen="2") >> report
    xbrl_compare >> Edge(color=PAL["report"], minlen="2") >> report


In [9]:
# Minimal architecture diagram (with Docling & XBRL comparison)
from diagrams import Diagram, Cluster, Edge
from diagrams.generic.storage import Storage
from diagrams.generic.compute import Rack

import os
from pathlib import Path

# Directory that receives the rendered image.  The default is the path the
# notebook was originally written with; set PDF_PARSER_DOCS_DIR to render
# somewhere else (e.g. on another machine or in CI) without editing the cell.
_DOCS_DIR = Path(
    os.environ.get(
        "PDF_PARSER_DOCS_DIR",
        "/Users/divyanshmac/Documents/Big_Data_Assignment/pdf-parser/docs",
    )
)

# Simplified left-to-right overview: extraction paths, the separate Docling
# path, the LayoutLMv3 layout pipeline, and the XBRL vs Tabula comparison.
with Diagram(
    "PDF Parser - Simple Architecture",
    # Base name only; the file extension comes from ``outformat``.
    filename=str(_DOCS_DIR / "architecture_simple"),
    outformat="png",
    show=False,  # render to disk without opening a viewer
    graph_attr={"rankdir": "LR", "fontsize": "12", "fontname": "Helvetica"},
    node_attr={"shape": "box", "style": "rounded,filled", "fontname": "Helvetica", "fontsize": "11"},
    edge_attr={"penwidth": "2"},
):
    src_pdf = Storage("SEC EDGAR / PDFs")
    xbrl = Storage("SEC EDGAR / XBRL")

    # Our extraction
    with Cluster("Extraction"):
        text = Rack("Text (pdfplumber)")
        tables = Rack("Tables (tabula)")
        ocr = Rack("OCR (tesseract)")

    json_out = Storage("JSON")
    tabula_csv = Storage("Tabula CSV")

    # Open-source Docling path (separate)
    # NOTE(review): `docling_out` is not connected to the comparator below.
    docling = Rack("Docling Parser")
    docling_out = Storage("Docling CSV")

    # Layout analysis (layout.py)
    with Cluster("Layout Analysis (LayoutLMv3)"):
        load = Rack("load_data()")
        # Branch A: with parsing
        pre_parse = Rack("preprocess_data(…, use_layout_parsing=True)")
        infer_parse = Rack("perform_inference(…, layout_parsing=True)")
        # Branch B: direct (no parsing)
        pre_direct = Rack("preprocess_data(…, use_layout_parsing=False)")
        infer_direct = Rack("perform_inference(…, layout_parsing=False)")
        save = Rack("save_inference_results()")

    # Comparator focuses on XBRL vs Tabula CSV
    compare = Rack("Compare: XBRL vs Tabula CSV")

    # Flows.  A bare `a >> b` already creates a default forward edge, so an
    # explicit Edge(...) is used only where an attribute is actually set.
    src_pdf >> text >> json_out
    src_pdf >> tables >> tabula_csv
    src_pdf >> ocr >> json_out

    # Dashed edge marks the separate Docling path.
    src_pdf >> Edge(style="dashed") >> docling >> docling_out

    # Layout pipeline fed by extraction outputs; both branches end at `save`.
    json_out >> load
    load >> pre_parse >> infer_parse >> save
    load >> pre_direct >> infer_direct >> save

    # XBRL vs Tabula comparison
    xbrl >> compare
    tabula_csv >> compare

