<a href="https://colab.research.google.com/github/vdubya/criteria-assistant/blob/main/src/UFC_DownloadAllWithMetadata_vFINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# =======================
# Step 0: Install dependencies
# =======================
!pip install requests beautifulsoup4

# =======================
# Step 1: Imports and Config
# =======================
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import time
import zipfile
from urllib.parse import urlparse

# Debug level: 0 = none, 1 = light, 2 = full
DEBUG_LEVEL = 1

# Partial run flag: True = first 10 rows only, False = full run
PARTIAL_RUN = True

# Force download: True = always download, False = skip if already exists
FORCE_DOWNLOAD = False

# Metadata only: True = only metadata, no downloads
METADATA_ONLY = False

# Configuration
BASE_URL = "https://www.wbdg.org"
URLS = {
    "active_page1": "https://www.wbdg.org/dod/ufc?field_status_value=1&field_series_value=All",
    "active_page2": "https://www.wbdg.org/dod/ufc?field_status_value=1&field_series_value=All&page=1",
    "inactive": "https://www.wbdg.org/dod/ufc?field_status_value=2&field_series_value=All",
    "archived": "https://www.wbdg.org/dod/ufc?field_status_value=3&field_series_value=All"
}

# Reference files
related_files = [
    ("DoD Directive 4270.5", "https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/427005p.pdf"),
    ("UFC Implementation Guide", "https://www.wbdg.org/FFC/DOD/ufc_implementation.pdf"),
    ("MIL-STD-3007", "https://www.wbdg.org/FFC/FEDMIL/milstd3007g.pdf"),
    ("UFC Word Template (2025)", "https://www.wbdg.org/FFC/DOD/ufc_word_template_05_06_2025.docx"),
    ("UFC UFGS Change Rev Policy", "https://www.wbdg.org/FFC/DOD/UFC/ufc_ufgs_chg_rev_policy.pdf")
]

# UFC Complete 5-volume downloads
ufc_complete_downloads = [
    ("UFC Complete Volume 1", "https://www.wbdg.org/FFC/DOD/UFC/UFC_Complete_1-200-01_thru_3-220-20.pdf"),
    ("UFC Complete Volume 2", "https://www.wbdg.org/FFC/DOD/UFC/UFC_Complete_3-230-01_thru_3-340-02.pdf"),
    ("UFC Complete Volume 3", "https://www.wbdg.org/FFC/DOD/UFC/UFC_Complete_3-400-02_thru_3-810-01N.pdf"),
    ("UFC Complete Volume 4", "https://www.wbdg.org/FFC/DOD/UFC/UFC_Complete_4-010-01_thru_4-159-03.pdf"),
    ("UFC Complete Volume 5", "https://www.wbdg.org/FFC/DOD/UFC/UFC_Complete_FC_4-171-06N_thru_4-860_03.pdf")
]

# Utility: controlled print
def debug_print(msg, level=1):
    if DEBUG_LEVEL >= level:
        print(msg)

# Metadata extraction
def get_metadata_from_detail_page(detail_url):
    debug_print(f"[DEBUG] Fetching metadata page: {detail_url}", level=2)
    resp = requests.get(detail_url)
    soup = BeautifulSoup(resp.text, "html.parser")

    # Extract series as separate lines, remove "Series" label
    series_div = soup.find("div", class_="field--name-field-series")
    series_list = None
    if series_div:
        series_items = series_div.find_all(string=True)
        series_list = [item.strip() for item in series_items if item.strip() and item.strip() != "Series"]

    download_link_tag = soup.find("a", href=re.compile(r'\.pdf$|\.docx$|\.zip$'))
    download_link = BASE_URL + download_link_tag["href"] if download_link_tag else None

    # Clean summary
    summary_div = soup.find("div", class_="field--name-field-summary")
    summary = None
    if summary_div:
        summary_text = summary_div.get_text(separator=" ").replace("Summary", "").strip()
        summary = re.sub(r"\s+", " ", summary_text)

    metadata = {
        "status": soup.find("div", class_="field--name-field-status").get_text(strip=True).replace("Status", "").strip() if soup.find("div", class_="field--name-field-status") else None,
        "publish_date": soup.find("div", class_="field--name-field-publish-date").get_text(strip=True).replace("Publish Date", "").strip() if soup.find("div", class_="field--name-field-publish-date") else None,
        "archived / rescinded date": soup.find("div", class_="field--name-field-archived-date").get_text(strip=True).replace("Archived / Rescinded Date", "").strip() if soup.find("div", class_="field--name-field-archived-date") else None,
        "pages": soup.find("div", class_="field--name-field-pages").get_text(strip=True).replace("Pages", "").strip() if soup.find("div", class_="field--name-field-pages") else None,
        "series": series_list,
        "download_link": download_link,
        "summary": summary
    }

    debug_print(f"[DEBUG] Metadata parsed: {metadata}", level=2)
    return metadata

def parse_title_fields(raw_title):
    replaced_by = None
    if "Replaced by" in raw_title:
        parts = raw_title.split("Replaced by")
        main_part = parts[0].strip().rstrip(",")
        replaced_by = "Replaced by " + parts[1].strip()
    else:
        main_part = raw_title

    match = re.match(r"^(UFC)\s+([\d\-A-Z]+)\s+(.*)", main_part)
    if match:
        ufc_prefix, ufc_number, title = match.groups()
    else:
        ufc_prefix, ufc_number, title = None, None, main_part

    return {"ufc_prefix": ufc_prefix, "ufc_number": ufc_number, "title": title, "replaced_by": replaced_by}

def scrape_ufc_list(url, status):
    debug_print(f"[INFO] Scraping UFC list from: {url}", level=1)
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")

    rows = soup.find_all("tr")
    debug_print(f"[DEBUG] Found {len(rows)} table rows.", level=2)

    row_limit = 10 if PARTIAL_RUN else len(rows)
    ufc_entries = []

    for i, row in enumerate(rows[:row_limit]):
        cols = row.find_all("td")
        debug_print(f"[DEBUG] Row {i} Columns:", level=2)
        for idx, col in enumerate(cols):
            debug_print(f"    Col {idx}: {col.get_text(strip=True)}", level=2)

        if len(cols) < 4:
            debug_print(f"[DEBUG] Row {i}: Skipped (not enough columns).", level=2)
            continue

        raw_title = cols[0].get_text(strip=True)
        parsed_title = parse_title_fields(raw_title)
        detail_url = BASE_URL + cols[0].find("a")["href"]

        metadata = get_metadata_from_detail_page(detail_url)

        ufc_entry = {
            "ufc_full_name": raw_title,
            "ufc_number": parsed_title["ufc_number"],
            "ufc_prefix": parsed_title["ufc_prefix"],
            "ufc_title": parsed_title["title"],
            "pages": metadata["pages"],
            "series": metadata["series"],
            "status": metadata["status"],
            "publish_date": cols[1].get_text(strip=True) or None,
            "change_date": cols[3].get_text(strip=True) or None,
            "archived / rescinded date": metadata["archived / rescinded date"],
            "replaced_by": parsed_title["replaced_by"],
            "download_link": metadata["download_link"],
            "metadata_link": detail_url,
            "summary": metadata["summary"]
        }
        debug_print(f"[DEBUG] Final UFC entry JSON:\n{json.dumps(ufc_entry, indent=2)}", level=2)
        ufc_entries.append(ufc_entry)
        time.sleep(1)

    return ufc_entries

# Scrape all UFCs
all_ufcs = []
for page_key in ["active_page1", "active_page2", "inactive", "archived"]:
    all_ufcs += scrape_ufc_list(URLS[page_key], page_key)

# Add UFC Complete entries
for name, url in ufc_complete_downloads:
    entry = {
        "ufc_full_name": name,
        "ufc_number": None,
        "ufc_prefix": None,
        "ufc_title": name,
        "pages": None,
        "series": None,
        "status": "Reference",
        "publish_date": "06/02/2025",
        "change_date": None,
        "archived / rescinded date": None,
        "replaced_by": None,
        "download_link": url,
        "metadata_link": "https://www.wbdg.org/dod/ufc/ufc-complete",
        "summary": "Active UFCs combined into five PDF documents"
    }
    all_ufcs.append(entry)

# Save JSON
debug_print("[INFO] Saving final JSON data...", level=1)
with open("wbdg_ufc_metadata_parsed_debug.json", "w", encoding="utf-8") as f:
    json.dump(all_ufcs, f, indent=2, ensure_ascii=False)

if not METADATA_ONLY:
    download_dir = "wbdg_ufc_downloads"
    os.makedirs(download_dir, exist_ok=True)

    downloaded_files = []
    total_files = sum(1 for entry in all_ufcs if entry.get("download_link"))
    file_counter = 1

    for entry in all_ufcs:
        download_url = entry.get("download_link")
        if download_url:
            filename = os.path.basename(urlparse(download_url).path)
            # Determine status
            status = entry.get("status")
            if status is None:
                raise ValueError(f"Missing status for entry: {entry.get('ufc_full_name')}")
            status_lower = status.lower()
            if status_lower in ["active", "inactive", "archived"]:
                status_dir = os.path.join(download_dir, status.capitalize())
            elif status_lower == "reference":
                status_dir = os.path.join(download_dir, "Reference")
            else:
                raise ValueError(f"Unexpected status '{status}' for entry: {entry.get('ufc_full_name')}")
            os.makedirs(status_dir, exist_ok=True)
            filepath = os.path.join(status_dir, filename)

            debug_print(f"[DOWNLOAD {file_counter}/{total_files}] {filename} (Status: {status})", level=1)
            try:
                if FORCE_DOWNLOAD or not os.path.exists(filepath):
                    resp = requests.get(download_url)
                    with open(filepath, "wb") as f:
                        f.write(resp.content)
                    downloaded_files.append(filepath)
                    debug_print(f"[DOWNLOAD {file_counter}/{total_files}] Downloaded: {filename}", level=1)
                else:
                    debug_print(f"[DOWNLOAD {file_counter}/{total_files}] Skipped (exists): {filename}", level=1)
            except Exception as e:
                debug_print(f"[DOWNLOAD {file_counter}/{total_files}] Failed for {filename}: {e}", level=1)
            file_counter += 1

    # Create zip archive
    zip_filename = "wbdg_ufc_downloads.zip"
    with zipfile.ZipFile(zip_filename, "w") as zipf:
        for file in downloaded_files:
            arcname = os.path.relpath(file, start=download_dir)
            zipf.write(file, arcname)
            debug_print(f"[ZIP] Added to zip: {arcname}", level=1)

    debug_print("[DONE] All UFC and reference data downloaded and zipped.", level=1)
else:
    debug_print("[DONE] Metadata-only run complete!", level=1)


[INFO] Scraping UFC list from: https://www.wbdg.org/dod/ufc?field_status_value=1&field_series_value=All
[INFO] Scraping UFC list from: https://www.wbdg.org/dod/ufc?field_status_value=1&field_series_value=All&page=1
[INFO] Scraping UFC list from: https://www.wbdg.org/dod/ufc?field_status_value=2&field_series_value=All
[INFO] Scraping UFC list from: https://www.wbdg.org/dod/ufc?field_status_value=3&field_series_value=All
[INFO] Saving final JSON data...


ValueError: Missing status for entry: UFC Complete