In [1]:
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup, Tag, NavigableString

In [7]:
# ----------------- helpers -----------------
def fetch(url: str, timeout: float = 30) -> str:
    """Download ``url`` with a browser-like header set and return the body text.

    Args:
        url: Address of the page to retrieve.
        timeout: Seconds passed to ``Session.get`` as the request timeout.
            Defaults to 30, the value previously hard-coded here.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; DHCS-FAQ-Scraper/1.0)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    # Use the session as a context manager so its connection pool is closed
    # when we are done; previously every call left an open session behind.
    with requests.Session() as s:
        s.headers.update(headers)
        r = s.get(url, timeout=timeout)
        r.raise_for_status()
        return r.text

def norm(s: str) -> str:
    """Collapse every run of whitespace in ``s`` to one space and trim the ends.

    A falsy ``s`` (``None`` or ``""``) is treated as the empty string.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def strip_zw(s: str) -> str:
    """Delete zero-width characters (U+200B..U+200D and U+FEFF) from ``s``.

    A falsy ``s`` (``None`` or ``""``) is treated as the empty string.
    """
    text = s if s else ""
    # These invisible code points commonly show up in CMS-generated pages.
    return re.sub(r"[\u200B-\u200D\uFEFF]", "", text)

def nearest_category(tag: Tag) -> str:
    """Return the category heading that ``tag`` belongs to.

    The category is the text of the closest ``<h2>`` found before ``tag``
    (via ``find_previous``), with zero-width characters removed and
    whitespace normalised. When no such heading exists the result is
    ``"Uncategorized"``.
    """
    heading = tag.find_previous("h2")
    if not heading:
        return "Uncategorized"
    raw_title = heading.get_text(" ", strip=True)
    return norm(strip_zw(raw_title))

def is_hightlighted_question(tag: Tag) -> bool:
    """Tell whether ``tag`` is a "highlighted" question paragraph.

    A highlighted question is a ``<p>`` whose first ``<strong>`` contains an
    ``<a>`` carrying an ``id`` attribute that begins with ``'#'``.
    """
    if tag.name != "p":
        return False
    bold = tag.find("strong")
    # Only look for the anchor when the paragraph actually has a <strong>.
    anchor = bold.find("a", id=True) if bold else None
    return bool(anchor) and anchor["id"].startswith("#")

def collect_answer(start_tag: Tag, next_qnode: Tag | None, is_header_question: bool) -> str:
    """Gather the answer text that follows a question node.

    Walks the siblings after ``start_tag`` and collects the text of
    block-level elements until the next question or section begins.

    Args:
        start_tag: The question element whose answer is wanted.
        next_qnode: The following question element, or ``None`` for the last
            question; the walk stops if this exact node is reached.
        is_header_question: ``True`` when questions are ``<h3>``/``<h4>``
            headings, ``False`` when they are highlighted ``<p>`` elements
            (see ``is_hightlighted_question``).

    Returns:
        The collected text chunks joined with newlines, or ``""`` when
        nothing was collected. "Back to top" links are dropped.
    """
    parts = []

    # Iterate over ``next_siblings`` and compare by identity instead of testing
    # each sibling for truthiness: an empty NavigableString is a falsy ``str``
    # and would otherwise end the walk early, truncating the answer.
    for sib in start_tag.next_siblings:
        if sib is next_qnode:
            break

        # Only elements are considered; bare text/comment nodes between them
        # (typically whitespace) are skipped.
        if not isinstance(sib, Tag):
            continue

        # stop at a new section ...
        if sib.name == "h2":
            break

        # ... or at a new question, detected according to the page layout
        if is_header_question:
            starts_new_question = sib.name in ("h3", "h4")
        else:
            starts_new_question = is_hightlighted_question(sib)
        if starts_new_question:
            break

        if sib.name in ("p", "div", "ul", "ol", "table", "blockquote"):
            txt = norm(strip_zw(sib.get_text(" ", strip=True)))
            if txt and not re.fullmatch(r"(?i)back to top", txt):
                parts.append(txt)

    return "\n".join(parts)


In [8]:
# ----------------- main -----------------
def scrape_answer_page(url: str, is_header_question: bool) -> pd.DataFrame:
    """Scrape one FAQ page into a table of category / question / answer rows.

    Args:
        url: Page to download (via ``fetch``) and parse with the ``lxml`` parser.
        is_header_question: ``True`` if questions are ``<h3>``/``<h4>``
            headings; ``False`` if they are highlighted ``<p>`` elements as
            recognised by ``is_hightlighted_question``.

    Returns:
        A DataFrame with columns ``category``, ``question`` and ``answer``,
        one row per question in document order.
    """
    soup = BeautifulSoup(fetch(url), "lxml")

    # Pick the question detector that matches the page layout.
    if is_header_question:
        def looks_like_question(node):
            return node.name in ("h3", "h4")
    else:
        looks_like_question = is_hightlighted_question

    # Every tag is visited in document order and kept if it is a question.
    question_nodes: list[Tag] = [
        node for node in soup.find_all(True) if looks_like_question(node)
    ]

    # Pair each question with its successor (None after the last one).
    successors = question_nodes[1:] + [None]

    records = []
    for current, upcoming in zip(question_nodes, successors):
        question_text = norm(strip_zw(current.get_text(" ", strip=True)))
        category = nearest_category(current)
        answer = collect_answer(current, upcoming, is_header_question)
        records.append(
            {"category": category, "question": question_text, "answer": answer}
        )

    return pd.DataFrame(records, columns=["category", "question", "answer"])

In [9]:
# Example: scrape the DHCS Medi-Cal FAQ page at the URL below and preview the
# first rows. is_header_question=False selects the highlighted-<p> question
# detection rather than <h3>/<h4> headings.
# NOTE: running this cell performs a live HTTP request through fetch().
url = "https://www.dhcs.ca.gov/services/medi-cal/eligibility/Pages/Medi-CalFAQs2014b.aspx"
df = scrape_answer_page(url, is_header_question=False)
print(df.head())

                  category                                           question  \
0  Citizenship/Immigration  1 . What eligibility requirements will an undo...   
1  Citizenship/Immigration  5 . Where can I get information about becoming...   
2  Citizenship/Immigration  6 . Will the information I provide for health ...   
3  Citizenship/Immigration  7 . Can people on H-1 visas, including those w...   
4  Citizenship/Immigration  8 . Can green card holders, including those wh...   

                                              answer  
0  An undocumented person must meet the same elig...  
1  If you are not registered to vote where you li...  
2  Immigration status is verified for documented ...  
3  Yes. In general, anyone can apply for Medi-Cal...  
4  Yes. A lawful permanent resident (green card h...  
