<a href="https://colab.research.google.com/github/vdubya/criteria-assistant/blob/main/src/UFC_DownloadAllWithMetadata_vFINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# =======================
# Step 0: Install dependencies
# =======================
!pip install requests beautifulsoup4

# =======================
# Step 1: Imports and Config
# =======================
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import time
import zipfile
from urllib.parse import urlparse

DEBUG_LEVEL = 2         # 0 = no debug, 1 = light debug, 2 = full debug
PARTIAL_RUN = False      # True = process only first 10 rows per page
FORCE_DOWNLOAD = False  # True = force re-download even if file exists
METADATA_ONLY = False   # True = skip download, only metadata

BASE_URL = "https://www.wbdg.org"
URLS = {
    "active_page1": "https://www.wbdg.org/dod/ufc?field_status_value=1&field_series_value=All",
    "active_page2": "https://www.wbdg.org/dod/ufc?field_status_value=1&field_series_value=All&page=1",
    "inactive": "https://www.wbdg.org/dod/ufc?field_status_value=2&field_series_value=All",
    "archived": "https://www.wbdg.org/dod/ufc?field_status_value=3&field_series_value=All"
}

related_files = [
    ("DoD Directive 4270.5", "https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/427005p.pdf"),
    ("UFC Implementation Guide", "https://www.wbdg.org/FFC/DOD/ufc_implementation.pdf"),
    ("MIL-STD-3007", "https://www.wbdg.org/FFC/FEDMIL/milstd3007g.pdf"),
    ("UFC Word Template (2025)", "https://www.wbdg.org/FFC/DOD/ufc_word_template_05_06_2025.docx"),
    ("UFC UFGS Change Rev Policy", "https://www.wbdg.org/FFC/DOD/UFC/ufc_ufgs_chg_rev_policy.pdf")
]

ufc_complete_downloads = [
    ("UFC Complete Volume 1", "https://www.wbdg.org/FFC/DOD/UFC/UFC_Complete_1-200-01_thru_3-220-20.pdf"),
    ("UFC Complete Volume 2", "https://www.wbdg.org/FFC/DOD/UFC/UFC_Complete_3-230-01_thru_3-340-02.pdf"),
    ("UFC Complete Volume 3", "https://www.wbdg.org/FFC/DOD/UFC/UFC_Complete_3-400-02_thru_3-810-01N.pdf"),
    ("UFC Complete Volume 4", "https://www.wbdg.org/FFC/DOD/UFC/UFC_Complete_4-010-01_thru_4-159-03.pdf"),
    ("UFC Complete Volume 5", "https://www.wbdg.org/FFC/DOD/UFC/UFC_Complete_FC_4-171-06N_thru_4-860_03.pdf")
]

def debug_print(msg, level=1):
    if DEBUG_LEVEL >= level:
        print(msg)

def get_metadata_from_detail_page(detail_url):
    resp = requests.get(detail_url)
    soup = BeautifulSoup(resp.text, "html.parser")
    series_div = soup.find("div", class_="field--name-field-series")
    series_list = [s.strip() for s in series_div.stripped_strings if s.strip() != "Series"] if series_div else None
    download_tag = soup.find("a", href=re.compile(r'\.pdf$|\.docx$|\.zip$'))
    download_link = BASE_URL + download_tag["href"] if download_tag else None
    summary_div = soup.find("div", class_="field--name-field-summary")
    summary = re.sub(r"\s+", " ", summary_div.get_text(separator=" ").replace("Summary", "").strip()) if summary_div else None

    return {
        "status": soup.find("div", class_="field--name-field-status").get_text(strip=True).replace("Status", "").strip() if soup.find("div", class_="field--name-field-status") else None,
        "publish_date": soup.find("div", class_="field--name-field-publish-date").get_text(strip=True).replace("Publish Date", "").strip() if soup.find("div", class_="field--name-field-publish-date") else None,
        "archived / rescinded date": soup.find("div", class_="field--name-field-archived-date").get_text(strip=True).replace("Archived / Rescinded Date", "").strip() if soup.find("div", class_="field--name-field-archived-date") else None,
        "pages": soup.find("div", class_="field--name-field-pages").get_text(strip=True).replace("Pages", "").strip() if soup.find("div", class_="field--name-field-pages") else None,
        "series": series_list,
        "download_link": download_link,
        "summary": summary
    }

def parse_title_fields(raw_title):
    if "Replaced by" in raw_title:
        parts = raw_title.split("Replaced by")
        main_part, replaced_by = parts[0].strip().rstrip(","), "Replaced by " + parts[1].strip()
    else:
        main_part, replaced_by = raw_title, None
    match = re.match(r"^(UFC)\s+([\d\-A-Z]+)\s+(.*)", main_part)
    return {"ufc_prefix": match.group(1) if match else None, "ufc_number": match.group(2) if match else None, "title": match.group(3) if match else main_part, "replaced_by": replaced_by}

def scrape_ufc_list(url):
    debug_print(f"[INFO] Scraping UFC list from: {url}", level=1)
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    rows = soup.find_all("tr")
    entries = []
    for i, row in enumerate(rows[:10] if PARTIAL_RUN else rows):
        cols = row.find_all("td")
        if len(cols) < 4: continue
        raw_title = cols[0].get_text(strip=True)
        parsed = parse_title_fields(raw_title)
        detail_url = BASE_URL + cols[0].find("a")["href"]
        metadata = get_metadata_from_detail_page(detail_url)
        entry = {
            "ufc_full_name": raw_title,
            "ufc_number": parsed["ufc_number"],
            "ufc_prefix": parsed["ufc_prefix"],
            "ufc_title": parsed["title"],
            "pages": metadata["pages"],
            "series": metadata["series"],
            "status": metadata["status"],
            "publish_date": cols[1].get_text(strip=True) or None,
            "change_date": cols[3].get_text(strip=True) or None,
            "archived / rescinded date": metadata["archived / rescinded date"],
            "replaced_by": parsed["replaced_by"],
            "download_link": metadata["download_link"],
            "metadata_link": detail_url,
            "summary": metadata["summary"]
        }
        debug_print(f"[DEBUG] Final UFC entry:\n{json.dumps(entry, indent=2)}", level=2)
        entries.append(entry)
        time.sleep(1)
    return entries

all_ufcs = []
for url in URLS.values():
    all_ufcs += scrape_ufc_list(url)

for name, url in ufc_complete_downloads:
    all_ufcs.append({
        "ufc_full_name": name,
        "ufc_number": None,
        "ufc_prefix": None,
        "ufc_title": name,
        "pages": None,
        "series": None,
        "status": "Reference",
        "publish_date": "06/02/2025",
        "change_date": None,
        "archived / rescinded date": None,
        "replaced_by": None,
        "download_link": url,
        "metadata_link": "https://www.wbdg.org/dod/ufc/ufc-complete",
        "summary": "Active UFCs combined into five PDF documents"
    })

with open("wbdg_ufc_metadata_parsed_debug.json", "w", encoding="utf-8") as f:
    json.dump(all_ufcs, f, indent=2, ensure_ascii=False)

if not METADATA_ONLY:
    download_dir = "wbdg_ufc_downloads"
    os.makedirs(download_dir, exist_ok=True)
    file_counter = 1
    total_files = sum(1 for e in all_ufcs if e.get("download_link"))

    for entry in all_ufcs:
        url, status, full_name = entry["download_link"], entry["status"], entry["ufc_full_name"].lower()
        filename = os.path.basename(urlparse(url).path)
        if isinstance(filename, bytes):
            filename = filename.decode()
        if status is None and "ufc complete" in full_name:
            status_dir = os.path.join(download_dir, "Reference")
        else:
            status_lower = status.lower()
            if status_lower in ["active", "inactive"]:
                status_dir = os.path.join(download_dir, status.capitalize())
            elif status_lower in ["archived", "archive(s)"]:
                status_dir = os.path.join(download_dir, "Archived")
            elif status_lower == "reference":
                status_dir = os.path.join(download_dir, "Reference")
            else:
                raise ValueError(f"Unexpected status '{status}' for entry: {entry['ufc_full_name']}")
        os.makedirs(status_dir, exist_ok=True)
        filepath = os.path.join(status_dir, filename)
        if FORCE_DOWNLOAD or not os.path.exists(filepath):
            resp = requests.get(url)
            with open(filepath, "wb") as f:
                f.write(resp.content)
            debug_print(f"[DOWNLOAD {file_counter}/{total_files}] Downloaded: {filename}", level=1)
        else:
            debug_print(f"[DOWNLOAD {file_counter}/{total_files}] Skipped (exists): {filename}", level=1)
        file_counter += 1

    # 🟡 Zip entire download_dir tree
    with zipfile.ZipFile("wbdg_ufc_downloads.zip", "w") as zipf:
        for root, _, files in os.walk(download_dir):
            for file in files:
                zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), start=download_dir))
    debug_print("[DONE] All UFC and reference data zipped, including previously downloaded files.", level=1)
else:
    debug_print("[DONE] Metadata-only run complete!", level=1)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  "ufc_prefix": "UFC",
  "ufc_title": "Design: Small Craft Berthing Facilities, with Change 1",
  "pages": "79",
  "series": [
    "4 - MULTI-DISCIPLINARY AND FACILITY-SPECIFIC DESIGN",
    "4-100: OPERATIONAL AND TRAINING FACILITIES"
  ],
  "status": "Active",
  "publish_date": "07/14/2009",
  "change_date": "View/Download",
  "archived / rescinded date": null,
  "replaced_by": null,
  "download_link": "https://www.wbdg.org/FFC/DOD/UFC/ufc_4_152_07_2009_c1.pdf",
  "metadata_link": "https://www.wbdg.org/dod/ufc/ufc-4-152-07",
  "summary": "This UFC provides general criteria for the design of small craft berthing facilities."
}
[DEBUG] Final UFC entry:
{
  "ufc_full_name": "UFC 4-159-01N Design: Hyperbaric Facilities",
  "ufc_number": "4-159-01N",
  "ufc_prefix": "UFC",
  "ufc_title": "Design: Hyperbaric Facilities",
  "pages": "357",
  "series": [
    "4 - MULTI-DISCIPLINARY AND FACILITY-SPECIFIC DESIGN",
    "4-100: OPER