In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

https://www.sgdi.gov.sg/organs-of-state/parl#ANG-Wei-Neng_-West-Coast-

In [8]:
def clean_text(text):
    """Clean text by removing extra whitespace and newlines"""
    return ' '.join(text.strip().split())

def safe_extract(element, selector, class_name=None, attribute='text', color=None):
    """Safely extract data from HTML elements with error handling"""
    try:
        if class_name:
            found = element.find(selector, class_=class_name)
        else:
            found = element.find(selector)
            
        if found:
            if color:
                span = found.find('span', style=f'color:{color}')
                return span.text.strip() if span else ''
            if attribute == 'text':
                return found.text.strip()
            else:
                return found.get(attribute, '').strip()
        return ''
    except (AttributeError, TypeError):
        return ''

def extract_mp_info(html_content):
    """Extract MP information from HTML content"""
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find all MP entries
    mp_items = soup.find_all('li', id=True)
    
    mp_data = []
    for item in mp_items:
        try:
            # Extract all information using safe_extract
            name = safe_extract(item, 'div', 'name')
            rank = safe_extract(item, 'div', 'rank')
            # Extract email specifically
            email_div = item.find('div', class_='email')
            email = ''
            if email_div:
                email_span = email_div.find('span', style='color:#d11212')
                if email_span:
                    email = email_span.text.strip()
            
            # Handle phone number specifically
            tel_div = item.find('div', class_='tel')
            phone = ''
            if tel_div:
                phone_div = tel_div.find('div')
                if phone_div:
                    phone = phone_div.text.strip()
            
            # Extract address
            detail_div = item.find('div', class_='detail')
            address = clean_text(detail_div.text) if detail_div else ''
            
            # Extract constituency from rank
            constituency = ''
            if rank:
                constituency_match = re.search(r'\((.*?)\*?\)', rank)
                constituency = constituency_match.group(1) if constituency_match else ''
            
            mp_data.append({
                'name': name,
                'rank': rank,
                'constituency': constituency,
                'email': email,
                'phone': phone,
                'address': address
            })
            
        except Exception as e:
            print(f"Error processing MP entry: {e}")
            continue
    
    return pd.DataFrame(mp_data)

def scrape_mp_info(url):
    """Main function to scrape MP information from the given URL"""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        # Extract information and create DataFrame
        df = extract_mp_info(response.text)
        return df
    
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None

In [9]:
url = "https://www.sgdi.gov.sg/organs-of-state/parl#ANG-Wei-Neng_-West-Coast-"
df = scrape_mp_info(url)
df.head()

Unnamed: 0,name,rank,constituency,email,phone,address
0,SEAH Kian Peng,Speaker of Parliament,,seah_kian_peng@parl.gov.sg,63325500,
1,Ms TAN Zhi Yi Chloe,Personal Assistant to the Speaker of Parliament,,chloe_tan@parl.gov.sg,63325500,
2,Lawrence WONG,"Prime Minister, Minister for Finance (Marsilin...",Marsiling-Yew Tee,Lawrence_wong@pmo.gov.sg,63327415,Ministry of Finance100 High Street #10-01The T...
3,LEE Hsien Loong,Senior Minister (Ang Mo Kio*),Ang Mo Kio,lee_hsien_loong@pmo.gov.sg,63327200,Prime Minister's OfficeIstana AnnexeOrchard Ro...
4,GAN Kim Yong,"Deputy Prime Minister, Minister for Trade and ...",Chua Chu Kang,GAN_Kim_Yong@mti.gov.sg,63327933,Ministry of Trade and Industry 100 High Street...


In [11]:
df['email'].nunique()

99

In [12]:
df.to_csv('sg_mp.csv', index=False)