In [11]:
!pip -q install playwright nest_asyncio tqdm
!playwright install firefox


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [12]:
import nest_asyncio
nest_asyncio.apply()

import asyncio
from playwright.async_api import async_playwright

LISTING_URL = "https://www.factually.gov.sg/corrections-and-clarifications/"

async def fetch_listing_html(headless=True):
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=headless)
        page = await browser.new_page()
        await page.goto(LISTING_URL, wait_until="networkidle")
        await page.wait_for_timeout(1000)  # small extra pause for JS rendering
        await page.screenshot(path="listing.png", full_page=True)
        html = await page.content()
        await browser.close()
        return html

listing_html = asyncio.get_event_loop().run_until_complete(fetch_listing_html(headless=True))
print("Downloaded HTML length:", len(listing_html))
print("Saved screenshot: listing.png")


Downloaded HTML length: 527146
Saved screenshot: listing.png


In [16]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE = "https://www.factually.gov.sg"

soup = BeautifulSoup(listing_html, "html.parser")

items = []
for a in soup.select('a[href^="/corrections-and-clarifications/"]'):
    href = a.get("href", "")
    # exclude the index page itself
    if href.rstrip("/") == "/corrections-and-clarifications":
        continue

    date_el = a.select_one('p.text-base-content-subtle')
    title_el = a.select_one("h3 span[title]") or a.select_one("h3 span")
    summary_el = a.select_one("p.prose-body-base")
    section_el = a.select_one("p.prose-label-sm-medium")
    img_el = a.select_one("img[src]")

    # keep only cards that look like real items (have a title + date)
    if not (date_el and title_el):
        continue

    items.append({
        "date": date_el.get_text(strip=True),
        "title": title_el.get_text(strip=True),
        "summary": summary_el.get_text(" ", strip=True) if summary_el else None,
        "category": section_el.get_text(strip=True) if section_el else None,
        "item_url": urljoin(BASE, href),
        "image_url": img_el.get("src") if img_el else None,
    })

df_page1 = pd.DataFrame(items).drop_duplicates(subset=["item_url"]).reset_index(drop=True)

print("Items found on page 1:", len(df_page1))
df_page1[["date","title","item_url"]].head(10)


Items found on page 1: 10


Unnamed: 0,date,title,item_url
0,15 November 2025,Corrections regarding false statements concern...,https://www.factually.gov.sg/corrections-and-c...
1,22 September 2025,Corrections regarding false statements of fact...,https://www.factually.gov.sg/corrections-and-c...
2,7 September 2025,Corrections regarding false statements of fact...,https://www.factually.gov.sg/corrections-and-c...
3,1 September 2025,Corrections regarding false statements by Samm...,https://www.factually.gov.sg/corrections-and-c...
4,20 May 2025,"What are CPF monies invested in, and how are C...",https://www.factually.gov.sg/corrections-and-c...
5,21 April 2025,Are our electricity prices one of the highest ...,https://www.factually.gov.sg/corrections-and-c...
6,15 April 2025,Date of Government Gazette Notification on Dis...,https://www.factually.gov.sg/corrections-and-c...
7,14 April 2025,Impact of Foreign Professionals on our Economy...,https://www.factually.gov.sg/corrections-and-c...
8,8 April 2025,Where does Government revenue come from?,https://www.factually.gov.sg/corrections-and-c...
9,8 April 2025,Why is it that 40% of all workers do not pay P...,https://www.factually.gov.sg/corrections-and-c...


In [17]:
import nest_asyncio, asyncio, re
nest_asyncio.apply()

import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm
from playwright.async_api import async_playwright

LISTING_URL = "https://www.factually.gov.sg/corrections-and-clarifications/"
BASE = "https://www.factually.gov.sg"

def parse_listing_items(html: str) -> list[dict]:
    soup = BeautifulSoup(html, "html.parser")
    items = []
    for a in soup.select('a[href^="/corrections-and-clarifications/"]'):
        href = a.get("href", "")
        if href.rstrip("/") == "/corrections-and-clarifications":
            continue

        date_el = a.select_one("p.text-base-content-subtle")
        title_el = a.select_one("h3 span[title]") or a.select_one("h3 span")
        if not (date_el and title_el):
            continue  # filters out non-card links

        summary_el = a.select_one("p.prose-body-base")
        section_el = a.select_one("p.prose-label-sm-medium")
        img_el = a.select_one("img[src]")

        items.append({
            "date": date_el.get_text(strip=True),
            "title": title_el.get_text(strip=True),
            "summary": summary_el.get_text(" ", strip=True) if summary_el else None,
            "category": section_el.get_text(strip=True) if section_el else None,
            "item_url": urljoin(BASE, href),
            "image_url": img_el.get("src") if img_el else None,
        })
    return items

async def scrape_all_listing_pages(headless=True, max_pages=200):
    all_rows = []
    seen_urls = set()

    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=headless)
        page = await browser.new_page()
        await page.goto(LISTING_URL, wait_until="networkidle")
        await page.wait_for_timeout(1000)

        page_num = 1
        while page_num <= max_pages:
            # parse current page
            html = await page.content()
            rows = parse_listing_items(html)

            new_count = 0
            for r in rows:
                if r["item_url"] not in seen_urls:
                    seen_urls.add(r["item_url"])
                    all_rows.append(r)
                    new_count += 1

            # (optional) screenshot each page for debugging/fun
            await page.screenshot(path=f"listing_page_{page_num:02d}.png", full_page=True)

            print(f"Page {page_num}: cards={len(rows)}, new={new_count}, total_unique={len(seen_urls)}")

            # find "Next" button and see if it's disabled
            next_btn = page.locator('button[aria-label="Go to next page"]')
            if await next_btn.count() == 0:
                print("No Next button found — stopping.")
                break

            is_disabled = await next_btn.is_disabled()
            if is_disabled:
                print("Next is disabled — reached last page.")
                break

            await next_btn.click()
            await page.wait_for_load_state("networkidle")
            await page.wait_for_timeout(800)
            page_num += 1

        await browser.close()

    return pd.DataFrame(all_rows).drop_duplicates(subset=["item_url"]).reset_index(drop=True)

df_items = asyncio.get_event_loop().run_until_complete(scrape_all_listing_pages(headless=True))

print("\nTOTAL unique items:", len(df_items))
df_items.head(10)


  attr_dict[key] = value


Page 1: cards=10, new=10, total_unique=10
Page 2: cards=10, new=10, total_unique=20
Page 3: cards=10, new=10, total_unique=30
Page 4: cards=10, new=10, total_unique=40
Page 5: cards=10, new=10, total_unique=50
Page 6: cards=10, new=10, total_unique=60
Page 7: cards=10, new=10, total_unique=70
Page 8: cards=10, new=10, total_unique=80
Page 9: cards=10, new=10, total_unique=90
Page 10: cards=10, new=10, total_unique=100
Page 11: cards=10, new=10, total_unique=110
Page 12: cards=10, new=10, total_unique=120
Page 13: cards=4, new=4, total_unique=124
Next is disabled — reached last page.

TOTAL unique items: 124


Unnamed: 0,date,title,summary,category,item_url,image_url
0,15 November 2025,Corrections regarding false statements concern...,MalaysiaNow had communicated falsehoods concer...,Corrections and Clarifications,https://www.factually.gov.sg/corrections-and-c...,https://isomer-user-content.by.gov.sg/35/bdbc9...
1,22 September 2025,Corrections regarding false statements of fact...,Mr Nicholas Tan communicated multiple false st...,Corrections and Clarifications,https://www.factually.gov.sg/corrections-and-c...,https://isomer-user-content.by.gov.sg/35/89fba...
2,7 September 2025,Corrections regarding false statements of fact...,Mr Jay Ish’haq Rajoo communicated false statem...,Corrections and Clarifications,https://www.factually.gov.sg/corrections-and-c...,https://isomer-user-content.by.gov.sg/35/9363a...
3,1 September 2025,Corrections regarding false statements by Samm...,Mr Sammy Obeid communicated false statements o...,Corrections and Clarifications,https://www.factually.gov.sg/corrections-and-c...,https://isomer-user-content.by.gov.sg/35/eb9ab...
4,20 May 2025,"What are CPF monies invested in, and how are C...",Your CPF savings are invested by the CPF Board...,Corrections and Clarifications,https://www.factually.gov.sg/corrections-and-c...,https://isomer-user-content.by.gov.sg/35/a6368...
5,21 April 2025,Are our electricity prices one of the highest ...,Some online articles and text messages have be...,Corrections and Clarifications,https://www.factually.gov.sg/corrections-and-c...,https://isomer-user-content.by.gov.sg/35/d2484...
6,15 April 2025,Date of Government Gazette Notification on Dis...,The President’s Proclamation to dissolve Parli...,Corrections and Clarifications,https://www.factually.gov.sg/corrections-and-c...,https://isomer-user-content.by.gov.sg/35/88869...
7,14 April 2025,Impact of Foreign Professionals on our Economy...,Foreign professionals in Singapore and their i...,Corrections and Clarifications,https://www.factually.gov.sg/corrections-and-c...,https://isomer-user-content.by.gov.sg/35/56c33...
8,8 April 2025,Where does Government revenue come from?,Find out more about the sources of Government ...,Corrections and Clarifications,https://www.factually.gov.sg/corrections-and-c...,https://isomer-user-content.by.gov.sg/35/e099e...
9,8 April 2025,Why is it that 40% of all workers do not pay P...,Does it mean 40% of our workers earn less than...,Corrections and Clarifications,https://www.factually.gov.sg/corrections-and-c...,https://isomer-user-content.by.gov.sg/35/f156d...


In [18]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def extract_article_text(html: str) -> str | None:
    soup = BeautifulSoup(html, "html.parser")

    # This matches the container you pasted:
    container = soup.select_one('div.w-full.overflow-x-auto.break-words.lg\\:max-w-\\[660px\\]')
    if not container:
        # fallback: try a broader guess (in case class changes slightly)
        container = soup.select_one('div.break-words')

    if not container:
        return None

    # Get readable text, keep bullet structure reasonably intact
    text = container.get_text("\n", strip=True)

    # de-duplicate excessive blank lines
    lines = [ln.strip() for ln in text.splitlines()]
    lines = [ln for ln in lines if ln]
    return "\n".join(lines)

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (compatible; scraper/1.0)"
})

article_texts = []
missing = []

for url in tqdm(df_items["item_url"], total=len(df_items)):
    try:
        r = session.get(url, timeout=30)
        r.raise_for_status()
        txt = extract_article_text(r.text)
        if not txt:
            missing.append(url)
        article_texts.append(txt)
    except Exception:
        missing.append(url)
        article_texts.append(None)

df_items["article_text"] = article_texts

print("Done. Missing article_text for:", len(missing))
if missing:
    print("First 10 missing:")
    for u in missing[:10]:
        print(" -", u)

df_items[["date","title","item_url","article_text"]].head(3)


100%|██████████| 124/124 [03:22<00:00,  1.63s/it]

Done. Missing article_text for: 0





Unnamed: 0,date,title,item_url,article_text
0,15 November 2025,Corrections regarding false statements concern...,https://www.factually.gov.sg/corrections-and-c...,The Ministry of Home Affairs (â\nMHA\nâ) i...
1,22 September 2025,Corrections regarding false statements of fact...,https://www.factually.gov.sg/corrections-and-c...,"On 9, 15 and 16 September 2025, Mr Tan publish..."
2,7 September 2025,Corrections regarding false statements of fact...,https://www.factually.gov.sg/corrections-and-c...,1.Â Â Â Â Mr Jay Ishâhaq Rajoo (â\nMr Jay...


In [19]:
# Preview a few rows (including long text)
pd.set_option("display.max_colwidth", 120)
display(df_items.sample(3, random_state=1)[["date","title","summary","item_url","article_text"]])

# Save
out_path = "factually_corrections_and_clarifications.csv"
df_items.to_csv(out_path, index=False)
print("Saved:", out_path, "| rows:", len(df_items), "| cols:", df_items.shape[1])


Unnamed: 0,date,title,summary,item_url,article_text
48,31 August 2023,Clarifications regarding falsehoods on disappearing ink used in new X-stamp,"When voting on Polling Day, voters can use either the X-stamps provided or bring their own pens to mark their choice...",https://www.factually.gov.sg/corrections-and-clarifications/clarifications-regarding-falsehoods-on-disappearing-ink-...,[Updated as of 25 March 2025]\nThe X-stamp was introduced to make it easier for voters to mark their choice on the b...
114,28 November 2019,Corrections and clarifications regarding falsehoods posted by the States Times Review,Misleading and false statements were made by the States Times Review,https://www.factually.gov.sg/corrections-and-clarifications/factually-corrections-on-falsehoods-posted-by-states-tim...,Corrections and clarifications regarding falsehoods posted by the States Times Review\nThe Facebook post by the Stat...
73,29 November 2021,Corrections and Clarifications regarding content about COVID-19 Vaccines in a blog post by Cheah Kit Sun,Misleading information on COVID-19 vaccines and reported adverse events,https://www.factually.gov.sg/corrections-and-clarifications/factually291121/,There are false and misleading statements about COVID-19 vaccines contained in a blog post by Cheah Kit Sun titled ...


Saved: factually_corrections_and_clarifications.csv | rows: 124 | cols: 7
