In [1]:
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup, Tag

In [42]:
# ----------------- helpers -----------------
def fetch(url: str) -> str:
    """Download ``url`` with an HTTP GET and return the response body as text.

    The request is sent with browser-like ``User-Agent``, ``Accept`` and
    ``Accept-Language`` headers and a 30 second timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``). Other ``requests`` exceptions
            (e.g. timeouts) propagate unchanged.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; DHCS-FAQ-Scraper/1.0)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    # Use the session as a context manager so it is closed (and its pooled
    # connections released) even when the request raises.
    with requests.Session() as s:
        s.headers.update(headers)
        r = s.get(url, timeout=30)
        r.raise_for_status()
        return r.text

def norm(s: str) -> str:
    """Collapse each run of whitespace in ``s`` to a single space and trim both ends.

    A falsy ``s`` (``None`` or ``""``) yields ``""``.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def strip_zw(s: str) -> str:
    """Return ``s`` without zero-width characters (U+200B..U+200D and U+FEFF).

    A falsy ``s`` (``None`` or ``""``) yields ``""``.
    """
    if not s:
        return ""
    # Character class covers the zero-width range plus the byte-order mark.
    return re.sub(r"[\u200B-\u200D\uFEFF]", "", s)

def strip_ordered_list_marker(s: str) -> str:
    """Remove a leading ordered-list marker such as ``"1. "`` or ``"1 . "`` from ``s``.

    Only a marker at the very start of the string (after optional whitespace)
    is removed; a falsy ``s`` yields ``""``.
    """
    text = s or ""
    marker = re.match(r"^\s*\d+\s*\.\s*", text)
    if marker is None:
        return text
    # Keep everything after the matched "<digits>." prefix.
    return text[marker.end():]

def nearest_category(tag: Tag) -> str:
    """Return the cleaned text of the closest ``<h2>`` that precedes ``tag``.

    Falls back to ``"Uncategorized"`` when no earlier ``<h2>`` is found.
    """
    heading = tag.find_previous("h2")
    if not heading:
        return "Uncategorized"
    heading_text = heading.get_text(" ", strip=True)
    return norm(strip_zw(heading_text))

def is_hightlighted_question(tag: Tag) -> bool:
    """Tell whether ``tag`` is a "highlighted" question paragraph.

    That is a ``<p>`` whose first ``<strong>`` contains an ``<a>`` carrying an
    ``id`` attribute that starts with ``'#'``.
    """
    if tag.name == "p":
        strong = tag.find("strong")
        if strong:
            # Only anchors that actually have an id attribute are considered.
            anchor = strong.find("a", id=True)
            return bool(anchor) and anchor["id"].startswith("#")
    return False

def collect_answer(start_tag: Tag, next_qnode: Tag | None, is_header_question: bool) -> str:
    """Gather the answer text that follows a question node.

    Walks the siblings after ``start_tag`` and collects the text of block
    elements (``h4``, ``p``, ``div``, ``ul``, ``ol``, ``table``,
    ``blockquote``) until the next question or section is reached.

    Args:
        start_tag: The question element; collection starts at its next sibling.
        next_qnode: The following question element, or ``None`` for the last
            question. The walk stops if this exact node is reached.
        is_header_question: ``True`` when questions on the page are ``<h3>``
            headers; ``False`` when they are highlighted ``<p>`` elements
            (see ``is_hightlighted_question``).

    Returns:
        The collected pieces joined with newlines, each whitespace-normalised
        and stripped of zero-width characters; ``""`` if nothing was collected.
        "Back to top" links are left out.
    """
    parts = []
    sib = start_tag.next_sibling

    # Compare with None explicitly: bs4 text nodes subclass str, so an empty
    # text node is falsy and a plain truthiness test would end the walk early.
    while sib is not None and sib is not next_qnode:
        # Only elements are inspected; text nodes, comments, etc. are skipped.
        if isinstance(sib, Tag):
            # A new section always ends the current answer.
            if sib.name == "h2":
                break

            # Exact comparison: the former `in ("h3")` was a substring test
            # against the string "h3", not a tuple membership test.
            if is_header_question:
                hit_next_question = sib.name == "h3"
            else:
                hit_next_question = is_hightlighted_question(sib)
            if hit_next_question:
                break

            if sib.name in ("h4", "p", "div", "ul", "ol", "table", "blockquote"):
                txt = norm(strip_zw(sib.get_text(" ", strip=True)))
                if txt and not re.fullmatch(r"(?i)back to top", txt):
                    parts.append(txt)

        sib = sib.next_sibling

    return "\n".join(parts)


In [43]:
# ----------------- main -----------------
def scrape_answer_page(url: str, is_header_question: bool) -> pd.DataFrame:
    """Scrape one FAQ page into a table of category / question / answer rows.

    Args:
        url: Address of the FAQ page; it is downloaded with ``fetch``.
        is_header_question: ``True`` when the page marks questions with
            ``<h3>`` headers; ``False`` when questions are highlighted ``<p>``
            elements (see ``is_hightlighted_question``).

    Returns:
        A DataFrame with the columns ``category``, ``question`` and ``answer``,
        one row per question in document order. The frame is empty (but keeps
        the three columns) when no question is found.
    """
    soup = BeautifulSoup(fetch(url), "lxml")

    # Collect question nodes in document order:
    # - every <h3> on header-style pages
    # - every highlighted <p> (see is_hightlighted_question) on the other pages
    question_nodes: list[Tag] = []
    for node in soup.find_all(True):
        if is_header_question:
            # Exact comparison: the former `in ("h3")` was a substring test
            # against the string "h3", not a tuple membership test.
            if node.name == "h3":
                question_nodes.append(node)
        elif is_hightlighted_question(node):
            question_nodes.append(node)

    rows = []
    for i, qnode in enumerate(question_nodes):
        question_text = norm(strip_ordered_list_marker(strip_zw(qnode.get_text(" ", strip=True))))
        category = nearest_category(qnode)

        # The following question (if any) bounds where this answer may end.
        next_qnode = question_nodes[i + 1] if i + 1 < len(question_nodes) else None
        answer = collect_answer(qnode, next_qnode, is_header_question)

        rows.append({
            "category": category,
            "question": question_text,
            "answer": answer
        })

    # Explicit columns keep the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=["category", "question", "answer"])

In [44]:
# ----------------- run -----------------
# Each entry pairs a page suffix with the question style passed to
# scrape_answer_page (True: <h3> header questions, False: highlighted <p> questions).
base_url = "https://www.dhcs.ca.gov/services/medi-cal/eligibility/Pages/Medi-CalFAQs2014{page}.aspx"
pages = [('a', True), ('b', False), ('c', False),  ('d', False), ('e', True)]
urls = [base_url.format(page=suffix) for suffix, _style in pages]

# Scrape every page, keeping one DataFrame per page in page order.
dfs = []
for url, (page, is_header_question) in zip(urls, pages):
    df = scrape_answer_page(url, is_header_question=is_header_question)
    dfs.append(df)

# Preview the first page's rows as a quick check.
first_page_preview = dfs[0].head()
print(first_page_preview)


                   category  \
0  Medi-Cal Health Coverage   
1  Medi-Cal Health Coverage   
2  Medi-Cal Health Coverage   
3  Medi-Cal Health Coverage   
4  Medi-Cal Health Coverage   

                                            question  \
0                                  What is Medi-Cal?   
1                       How much does Medi-Cal cost?   
2  What is the difference in coverage between Med...   
3  How is the state and/or counties reaching out ...   
4  What health plans are available through Medi-Cal?   

                                              answer  
0  Medi-Cal offers free or low-cost health covera...  
1  For many individuals who enroll in Medi-Cal, t...  
2  Medi-Cal is health coverage, just like the cov...  
3  Outreach and Enrollment Grants for Targeted Po...  
4  Medi-Cal managed care offers a selection of 21...  


In [45]:
# Stack the per-page frames into one table with a fresh 0..n-1 index.
documents = pd.concat(dfs, ignore_index=True)
print(f"Scraped {len(documents)} Q&A pairs from {len(pages)} pages.")
# Write the combined table to CSV without the index column.
# NOTE(review): assumes the ../data directory already exists — confirm.
documents.to_csv("../data/cms_faq.csv", index=False)

Scraped 44 Q&A pairs from 5 pages.


In [46]:
# Display the combined Q&A table for a visual check.
documents

Unnamed: 0,category,question,answer
0,Medi-Cal Health Coverage,What is Medi-Cal?,Medi-Cal offers free or low-cost health covera...
1,Medi-Cal Health Coverage,How much does Medi-Cal cost?,"For many individuals who enroll in Medi-Cal, t..."
2,Medi-Cal Health Coverage,What is the difference in coverage between Med...,"Medi-Cal is health coverage, just like the cov..."
3,Medi-Cal Health Coverage,How is the state and/or counties reaching out ...,Outreach and Enrollment Grants for Targeted Po...
4,Medi-Cal Health Coverage,What health plans are available through Medi-Cal?,Medi-Cal managed care offers a selection of 21...
5,Medi-Cal Health Coverage,Will I be able to stay with my doctor?,"More than 400 hospitals and approximately 130,..."
6,Medi-Cal Health Coverage,Is it possible for the members of the same fam...,The Covered California application is a single...
7,Medi-Cal Health Coverage,What do I do if I have questions about medicat...,If you have questions about your coverage unde...
8,Medi-Cal Health Coverage,Can I decline Medi-Cal and enroll in a Covered...,"Under federal law, if you are currently enroll..."
9,Medi-Cal Health Coverage,Is there a deadline to enroll in Medi-Cal?,No. There is no deadline to enroll in Medi-Cal...


In [36]:
# Count answers shorter than 5 characters — presumably a check for questions
# whose answer text was not captured by the scraper (verify the flagged rows).
documents["answer"].apply(lambda s: len(s) < 5).sum()

np.int64(1)