# In [1]:
import re

# Site origin that absolutize() prepends to site-relative links.
BASE = "https://www.supremecourt.gov"

def absolutize(u):
    """Return *u* as an absolute URL, resolving relative forms against BASE.

    ``&amp;`` entities are decoded and surrounding whitespace is removed
    before the value is classified:

    * ``http://`` / ``https://`` URLs (any letter case) are returned as-is;
    * protocol-relative ``//host/path`` URLs are given an ``https:`` scheme;
    * paths starting with ``/`` are appended directly to ``BASE``;
    * anything else is treated as relative to the site root, with a leading
      ``./`` removed.

    Returns ``None`` when *u* is ``None``, empty, or only whitespace.
    """
    if not u:
        return None
    u = u.replace("&amp;", "&").strip()
    # Re-check after stripping so a whitespace-only value yields None rather
    # than the bare site root.
    if not u:
        return None
    if u.lower().startswith(("http://", "https://")):
        return u
    # Must be tested before the single-slash case below.
    if u.startswith("//"):
        return "https:" + u
    if u.startswith("/"):
        return BASE + u
    if u.startswith("./"):
        u = u[2:]
    return BASE + "/" + u

# Anchor tags with a quoted href; group 2 is the href value.
_ANCHOR_RE = re.compile(
    r'<a\b[^>]*?href\s*=\s*(["\'])(.*?)\1[^>]*>(.*?)</a>', re.IGNORECASE
)
# href patterns, in order of preference.
_SLIP_OPINION_HREF_RE = re.compile(
    r"/opinions/\d{2}pdf/[^\"'#?]+\.pdf", re.IGNORECASE
)
_ANY_PDF_HREF_RE = re.compile(r"\.pdf($|[?#])", re.IGNORECASE)
# Fallbacks that scan the raw markup instead of anchor hrefs.
_PLAIN_SCOTUS_PDF_RE = re.compile(
    r"(?:https?:\/\/)?(?:www\.)?supremecourt\.gov\/[^\"'#\s<>]*\.pdf",
    re.IGNORECASE,
)
_PLAIN_SLIP_OPINION_RE = re.compile(
    r"(\/?opinions\/\d{2}pdf\/[^\"'#\s<>]+\.pdf)", re.IGNORECASE
)


def extract_first_pdf_anywhere(html):
    """Return the absolute URL of the first PDF link found in *html*.

    Candidates are tried in this order and the first hit wins:

    1. an ``<a href>`` whose target contains ``/opinions/NNpdf/....pdf``;
    2. any ``<a href>`` ending in ``.pdf`` (optionally followed by a query
       string or fragment);
    3. a ``supremecourt.gov/....pdf`` URL anywhere in the markup, with or
       without a scheme;
    4. an ``opinions/NNpdf/....pdf`` path anywhere in the markup.

    The result is passed through ``absolutize``.

    Raises:
        ValueError: if *html* is not a ``str`` or is shorter than 50
            characters ("Empty HTML"), or if no candidate matches
            ("No PDF link found").
    """
    if not isinstance(html, str) or len(html) < 50:
        raise ValueError("Empty HTML")

    # Collapse whitespace so tags split across lines match the patterns.
    norm = re.sub(r"\s+", " ", html)

    hrefs = [match.group(2) for match in _ANCHOR_RE.finditer(norm)]

    # Steps 1 and 2: every anchor is checked against the preferred pattern
    # before any anchor is checked against the generic one.
    for pattern in (_SLIP_OPINION_HREF_RE, _ANY_PDF_HREF_RE):
        for href in hrefs:
            if pattern.search(href):
                return absolutize(href)

    # Step 3: plain-text absolute SCOTUS PDF.
    m = _PLAIN_SCOTUS_PDF_RE.search(norm)
    if m:
        url = m.group(0)
        # The scheme is optional in the pattern. Without one, absolutize()
        # would treat the host name as a relative path and prepend the site
        # origin a second time.
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url
        return absolutize(url)

    # Step 4: plain-text slip-opinion relative path.
    m = _PLAIN_SLIP_OPINION_RE.search(norm)
    if m:
        return absolutize(m.group(1))

    raise ValueError("No PDF link found")

if __name__ == "__main__":
    # Script entry point: read a saved case page from the working directory
    # and print the first PDF link found in it.
    with open("html_case.html", encoding="utf-8") as f:
        html_content = f.read()
    try:
        pdf_url = extract_first_pdf_anywhere(html_content)
    except ValueError as e:
        # ValueError is what the extractor raises for unusable input or a
        # page without a PDF link; any other exception is a bug and should
        # surface with its traceback instead of being reported as "Error:".
        print("Error:", e)
    else:
        print("Found PDF URL:", pdf_url)


# Output: Found PDF URL: https://www.supremecourt.gov/opinions/24pdf/606us2r67_d18e.pdf
