In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

## 2025 (14th)

In [2]:
def clean_text(text):
    """Clean text by removing extra whitespace and newlines"""
    return " ".join(text.strip().split())


def safe_extract(element, selector, class_name=None, attribute="text", color=None):
    """Safely extract data from HTML elements with error handling"""
    try:
        if class_name:
            found = element.find(selector, class_=class_name)
        else:
            found = element.find(selector)

        if found:
            if color:
                span = found.find("span", style=f"color:{color}")
                return span.text.strip() if span else ""
            if attribute == "text":
                return found.text.strip()
            else:
                return found.get(attribute, "").strip()
        return ""
    except (AttributeError, TypeError):
        return ""


def extract_mp_info(html_content):
    """Extract MP information from HTML content"""
    soup = BeautifulSoup(html_content, "html.parser")

    # Find all MP entries
    mp_items = soup.find_all("li", id=True)

    mp_data = []
    for item in mp_items:
        try:
            # Extract all information using safe_extract
            name = safe_extract(item, "div", "name")
            rank = safe_extract(item, "div", "rank")
            # Extract email specifically
            email_div = item.find("div", class_="email")
            email = ""
            if email_div:
                email_span = email_div.find("span", style="color:#d11212")
                if email_span:
                    email = email_span.text.strip()

            # Handle phone number specifically
            tel_div = item.find("div", class_="tel")
            phone = ""
            if tel_div:
                phone_div = tel_div.find("div")
                if phone_div:
                    phone = phone_div.text.strip()

            # Extract address
            detail_div = item.find("div", class_="detail")
            address = clean_text(detail_div.text) if detail_div else ""

            # Extract constituency from rank
            constituency = ""
            if rank:
                constituency_match = re.search(r"\((.*?)\*?\)", rank)
                constituency = constituency_match.group(1) if constituency_match else ""

            mp_data.append(
                {
                    "name": name,
                    "rank": rank,
                    "constituency": constituency,
                    "email": email,
                    "phone": phone,
                    "address": address,
                }
            )

        except Exception as e:
            print(f"Error processing MP entry: {e}")
            continue

    return pd.DataFrame(mp_data)


def scrape_mp_info(url):
    """Main function to scrape MP information from the given URL"""
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        # Extract information and create DataFrame
        df = extract_mp_info(response.text)
        return df

    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None

In [3]:
url2025 = "https://www.sgdi.gov.sg/organs-of-state/parl#ANG-Wei-Neng_-West-Coast-"
df2025 = (
    scrape_mp_info(url2025)
    .assign(leg_start_year=2021)
    .assign(email=lambda df_: df_["email"].str.split(";"))
    .explode("email", ignore_index=True)
)
df2025.head()

Unnamed: 0,name,rank,constituency,email,phone,address,leg_start_year
0,SEAH Kian Peng,Speaker of Parliament,,seah_kian_peng@parl.gov.sg,63325500,,2021
1,Ms TAN Zhi Yi Chloe,Personal Assistant to the Speaker of Parliament,,chloe_tan@parl.gov.sg,63325500,,2021
2,Lawrence WONG,"Prime Minister, Minister for Finance (Marsilin...",Marsiling-Yew Tee,Lawrence_wong@pmo.gov.sg,63327415,Ministry of Finance100 High Street #10-01The T...,2021
3,LEE Hsien Loong,Senior Minister (Ang Mo Kio*),Ang Mo Kio,lee_hsien_loong@pmo.gov.sg,63327200,Prime Minister's OfficeIstana AnnexeOrchard Ro...,2021
4,GAN Kim Yong,"Deputy Prime Minister, Minister for Trade and ...",Chua Chu Kang,GAN_Kim_Yong@mti.gov.sg,63327933,Ministry of Trade and Industry 100 High Street...,2021


## 2021 (14th)

In [5]:
url2021 = "https://web.archive.org/web/20211205013734/https://www.sgdi.gov.sg/organs-of-state/PARL"
df2021 = (
    scrape_mp_info(url2021)
    .assign(leg_start_year=2021)
    .replace(".gov.sg", ".gov.sg;", regex=True)
    .assign(email=lambda df_: df_["email"].str.rstrip(";"))
    .assign(email=lambda df_: df_["email"].str.split(";"))
    .explode("email", ignore_index=True)
)
df2021.head()

Unnamed: 0,name,rank,constituency,email,phone,address,leg_start_year
0,TAN Chuan-Jin,Speaker of Parliament(Marine Parade*),Marine Parade,tan_chuan-jin@parl.gov.sg,,Parliament House 1 Parliament Place Singapore ...,2021
1,LEE Hsien Loong,Prime Minister(Ang Mo Kio*),Ang Mo Kio,lee_hsien_loong@pmo.gov.sg,,Prime Minister's Office Istana Annexe Orchard ...,2021
2,HENG Swee Keat,"Deputy Prime Minister, Coordinating Minister f...",East Coast,HENG_Swee_Keat@pmo.gov.sg,,Prime Minister's Office Istana Annexe Orchard ...,2021
3,TEO Chee Hean,Senior Minister & Coordinating Minister for Na...,Pasir Ris-Punggol,teo_chee_hean@pmo.gov.sg,,New Phoenix Park 28 Irrawaddy Road Singapore 3...,2021
4,THARMAN Shanmugaratnam,Senior Minister & Coordinating Minister for So...,Jurong,tharman_s@pmo.gov.sg,,100 High Street #10-01 The Treasury Singapore ...,2021


## 2016 (13th)

In [6]:
def scrape_mp_info16(url):
    """
    Scrapes MP information (name, email, rank, constituency) from the given archived SGDI URL.

    Args:
        url (str): The URL of the archived SGDI page.

    Returns:
        pd.DataFrame: A DataFrame containing the extracted MP details.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    data_list = []

    rows = soup.find_all("tr")

    for row in rows:
        cols = row.find_all("td")

        if len(cols) < 4:
            continue  # Skip if there aren't enough columns

        # Extract rank/title and constituency (ensuring constituency comes from parentheses at the end)
        rank_info = cols[0].get_text(strip=True)

        # Regex to extract "Rank" and "Constituency" correctly
        rank_match = re.match(r"^(.*?)(?:\((.*?)\))?$", rank_info)

        if rank_match:
            rank = rank_match.group(1).strip()  # Everything before the parentheses
            constituency = (
                rank_match.group(2).strip() if rank_match.group(2) else np.nan
            )  # Inside parentheses
        else:
            rank = rank_info.strip()
            constituency = np.nan

        # Extract name from the <a> tag
        name_tag = cols[1].find("a")
        if name_tag and "name" in name_tag.attrs:
            name = name_tag["name"].replace("_", " ")  # Clean up the name format
        else:
            name = (
                cols[1].get_text(strip=True).split("\n")[0]
            )  # Fallback, only take the first line

        # Extract multiple emails from either <script> or plain text
        email_list = []

        # Check for JavaScript-obfuscated emails
        email_script = cols[3].find("script")
        if email_script:
            email_text = email_script.string
            email_match = re.search(
                r"fn_emailScramble\('([^']+)','([^']+)'\)", email_text
            )
            if email_match:
                email_list.append(f"{email_match.group(1)}@{email_match.group(2)}")

        # Check for plain-text emails within <td>
        email_text_tag = cols[3].find("font")
        if email_text_tag:
            email_text = email_text_tag.get_text(separator=" ", strip=True)
            extracted_emails = re.findall(r"[\w\.-]+@[\w\.-]+", email_text)
            email_list.extend(extracted_emails)

        # If no emails found, assign NaN
        if not email_list:
            email_list.append(np.nan)

        for email in email_list:
            data_list.append(
                {
                    "name": name.title(),
                    "email": email,
                    "rank": rank,
                    "constituency": constituency,
                }
            )

    return pd.DataFrame(data_list)

In [7]:
url2016 = "https://web.archive.org/web/20160402120613/http://app.sgdi.gov.sg/listing.asp?agency_subtype=dept&agency_id=0000004564"
df2016 = scrape_mp_info16(url2016).assign(leg_start_year=2015)
df2016.head()

Unnamed: 0,name,email,rank,constituency,leg_start_year
0,Mdm Halimah Yacob,halimah_yacob@parl.gov.sg,Speaker,Marsiling-Yew Tee*,2015
1,Lee Hsien Loong,lee_hsien_loong@pmo.gov.sg,Prime Minister,Ang Mo Kio*,2015
2,Teo Chee Hean,teo_chee_hean@pmo.gov.sg,Deputy Prime Minister & Coordinating Minister ...,Pasir Ris-Punggol*,2015
3,Tharman Shanmugaratnam,tharman_s@pmo.gov.sg,Deputy Prime Minister & Coordinating Minister ...,Jurong*,2015
4,Khaw Boon Wan,khaw_boon_wan@mot.gov.sg,Coordinating Minister for Infrastructure & Min...,Sembawang*,2015


## 2012 (12th)

In [8]:
url2012 = "https://web.archive.org/web/20130210025611fw_/http://app.sgdi.gov.sg/listing.asp?agency_subtype=dept&agency_id=0000004564"
df2012 = scrape_mp_info16(url2012).assign(leg_start_year=2011)
df2012.head()

Unnamed: 0,name,email,rank,constituency,leg_start_year
0,Mdm Halimah Yacob,halimah_yacob@parl.gov.sg,Speaker,Jurong*,2011
1,Lee Hsien Loong,lee_hsien_loong@pmo.gov.sg,Prime Minister,Ang Mo Kio*,2011
2,Teo Chee Hean,teo_chee_hean@mha.gov.sg,Deputy Prime Minister& CoordinatingMinister fo...,Pasir Ris-Punggol*,2011
3,Tharman Shanmugaratnam,tharman_s@mof.gov.sg,Deputy Prime Minister& Minister for Finance,Jurong*,2011
4,Lim Hng Kiang,lim_hng_kiang@mti.gov.sg,Minister for Trade and Industry,West Coast*,2011


## 2007 (11th)

In [9]:
def scrape_mp_info07(url):
    """
    Scrapes MP information (name, email, rank, constituency) from the given archived SGDI URL (2007 version).

    Args:
        url (str): The URL of the archived SGDI page.

    Returns:
        pd.DataFrame: A DataFrame containing the extracted MP details.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    data_list = []

    rows = soup.find_all("tr", valign="top")

    for row in rows:
        cols = row.find_all("td")

        if len(cols) < 5:
            continue  # Skip rows that do not contain complete MP data

        # Extract rank/title and constituency from the first <td> within <a> tag
        rank_info_tag = cols[0].find("a")
        if rank_info_tag:
            rank_info = rank_info_tag.get_text(separator=" ", strip=True)
            rank_match = re.match(r"^(.*?)(?:\((.*?)\))?$", rank_info)

            if rank_match:
                rank = rank_match.group(1).strip()
                constituency = (
                    rank_match.group(2).strip() if rank_match.group(2) else np.nan
                )
            else:
                rank = rank_info.strip()
                constituency = np.nan
        else:
            rank, constituency = np.nan, np.nan

        # Extract name correctly, stopping before <br>
        name_tag = cols[1].find("font")  # Find the font tag that contains name
        if name_tag:
            name_text = name_tag.get_text(separator="<br>", strip=True)
            name = name_text.split("<br>")[
                0
            ]  # Only take the first line before any <br> tag
        #             name = re.split(r"\b(?:Parliament|Prime Minister|Ministry)\b", name, maxsplit=1)[0].strip()
        else:
            name = np.nan

        # Extract email, handling both fn_emailScramble and fn_ScrambleText formats
        email_script = cols[4].find("script")
        emails = []  # List to hold multiple emails if needed

        if email_script:
            email_text = email_script.string
            # Extract emails from fn_emailScramble format
            email_match = re.search(
                r"fn_emailScramble\('([^']+)','([^']+)'\)", email_text
            )
            if email_match:
                emails.append(f"{email_match.group(1)}@{email_match.group(2)}")

            # Extract emails from fn_ScrambleText format
            scramble_match = re.search(r"fn_ScrambleText\('([^']+)'\)", email_text)
            if scramble_match:
                scrambled_text = scramble_match.group(1).lower()
                email_parts = scrambled_text.split("<br>")  # Split multiple emails
                for part in email_parts:
                    clean_email = part.replace("~~", "@")
                    emails.append(clean_email)

        if not emails:
            emails.append(np.nan)  # Ensure there's at least one email entry

        for email in emails:
            data_list.append(
                {
                    "name": name.title(),
                    "email": email,
                    "rank": rank,
                    "constituency": constituency,
                }
            )

    return pd.DataFrame(data_list)

In [10]:
url2007 = "https://web.archive.org/web/20070715044750fw_/http://app.sgdi.gov.sg/listing.asp?agency_subtype=dept&agency_id=0000004564"
df2007 = scrape_mp_info07(url2007).assign(leg_start_year=2006)
df2007.head()

Unnamed: 0,name,email,rank,constituency,leg_start_year
0,Printer Friendly Page,parl@parl.gov.sg,Printer friendly page,,2006
1,Abdullah Tarmugi,abdullah_tarmugi@parl.gov.sg,Speaker,East Coast*,2006
2,Lee Hsien Loong,lee_hsien_loong@pmo.gov.sg,Prime Minister and Minister for Finance,Ang Mo Kio*,2006
3,Goh Chok Tong,goh_chok_tong@pmo.gov.sg,Senior Minister,Marine Parade*,2006
4,Lee Kuan Yew,lee_kuan_yew@pmo.gov.sg,Minister Mentor,Tanjong Pagar*,2006


## 2002 (10th)

In [11]:
def scrape_mp_info02(url):
    """
    Scrapes MP information (name, email, rank, constituency) from the given archived SGDI URL (2002 version).

    Args:
        url (str): The URL of the archived SGDI page.

    Returns:
        pd.DataFrame: A DataFrame containing the extracted MP details.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    data_list = []

    rows = soup.find_all("tr", valign="top")

    for row in rows:
        cols = row.find_all("td")

        if len(cols) < 7:
            continue  # Skip if there aren't enough columns

        # Extract rank/title and constituency from the first <td> within <a> tag
        rank_info_tag = cols[0].find("a")
        if rank_info_tag:
            rank_info = rank_info_tag.get_text(separator=" ", strip=True)
            rank_match = re.match(r"^(.*?)(?:\((.*?)\))?$", rank_info)

            if rank_match:
                rank = rank_match.group(1).strip()
                constituency = (
                    rank_match.group(2).strip() if rank_match.group(2) else None
                )
            else:
                rank = rank_info.strip()
                constituency = None
        else:
            rank, constituency = None, None

        # Extract name from the third <td>
        name = cols[2].get_text(strip=True)

        # Extract multiple emails from:
        # - `<a href="mailto:...">` format
        # - Plain text inside `<td>` (if present)
        email_list = []

        # Check for mailto links
        email_tag = cols[6].find("a", href=True)
        if email_tag and "mailto:" in email_tag["href"]:
            email_candidate = email_tag["href"].replace("mailto:", "").strip()
            if "@" in email_candidate:
                email_list.append(email_candidate)

        # Check for plain-text emails inside <td>
        email_text_tag = cols[6].find("font")
        if email_text_tag:
            email_text = email_text_tag.get_text(separator=" ", strip=True)
            extracted_emails = [
                email
                for email in re.findall(r"[\w\.-]+@[\w\.-]+", email_text)
                if "@" in email
            ]
            email_list.extend(extracted_emails)

        # Store only legitimate emails (ignore non-emails)
        email_list = [email for email in email_list if "@" in email]

        if email_list:
            for email in email_list:
                data_list.append(
                    {
                        "name": name.title(),
                        "email": email,
                        "rank": rank,
                        "constituency": constituency,
                    }
                )

    return pd.DataFrame(data_list)

In [12]:
url2002 = "https://web.archive.org/web/20021229200050fw_/http://web9.internet.gov.sg/mita_sgdi/owa/pub_directory.ministrylst?agency_subtype=dept&agency_id=0000004564"
df2002 = (
    scrape_mp_info02(url2002)
    .query("~email.str.contains('https://web.archive.org/web/20021229200050')")
    .assign(leg_start_year=2001)
)
df2002.head()

Unnamed: 0,name,email,rank,constituency,leg_start_year
1,Abdullah Tarmugi,abdullah_tarmugi@parl.gov.sg,Speaker,East Coast*,2001
3,Goh Chok Tong,goh_chok_tong@pmo.gov.sg,Prime Minister,Marine Parade*,2001
5,Lee Kuan Yew,lee_kuan_yew@pmo.gov.sg,Senior Minister\n(Tanjong Pagar*),,2001
7,Lee Hsien Loong,lee_hsien_loong@mof.gov.sg,Deputy Prime Minister & Minister for Finance,Ang Mo Kio*,2001
9,Dr Tony Tan Keng Yam,tony_tan@mindef.gov.sg,Deputy Prime Minister \n& Minister for Defence...,,2001


## Combine

In [17]:
df = pd.concat([df2002, df2007, df2012, df2016, df2021, df2025], ignore_index=True)
print(df["email"].nunique())
df.to_csv("sg_mp.csv", index=False)
df.head()

423


Unnamed: 0,name,email,rank,constituency,leg_start_year,phone,address
0,Abdullah Tarmugi,abdullah_tarmugi@parl.gov.sg,Speaker,East Coast*,2001,,
1,Goh Chok Tong,goh_chok_tong@pmo.gov.sg,Prime Minister,Marine Parade*,2001,,
2,Lee Kuan Yew,lee_kuan_yew@pmo.gov.sg,Senior Minister\n(Tanjong Pagar*),,2001,,
3,Lee Hsien Loong,lee_hsien_loong@mof.gov.sg,Deputy Prime Minister & Minister for Finance,Ang Mo Kio*,2001,,
4,Dr Tony Tan Keng Yam,tony_tan@mindef.gov.sg,Deputy Prime Minister \n& Minister for Defence...,,2001,,
