In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import time

# Read the list of buildings whose detail pages will be scraped
df = pd.read_csv('midland_building_list.csv')

# EdgeChromiumDriverManager().install() supplies the driver path handed to Service
service = Service(EdgeChromiumDriverManager().install())

# Start Edge with the headless and GPU-disabling flags
options = Options()
for edge_flag in ('--headless', '--disable-gpu'):
    options.add_argument(edge_flag)
driver = webdriver.Edge(service=service, options=options)

# Detect and replace function for SVG icons
def detect_and_replace_svg(icon_url):
    """Map an embedded SVG icon source to a "Yes"/"No"/"Unknown" label.

    The check is a plain substring match on the icon source string.
    'PD94bW' is how the base64 encoding of '<?xml' begins; 'jUwl' and
    'gNTAi' are presumably fragments that tell the two icon variants
    apart (TODO confirm against the actual page markup).

    Args:
        icon_url: The icon's source string (e.g. a base64 data URI).

    Returns:
        "Yes" if the string contains both 'PD94bW' and 'jUwl',
        "No" if it contains both 'PD94bW' and 'gNTAi',
        "Unknown" otherwise.
    """
    if 'PD94bW' in icon_url and "jUwl" in icon_url:
        return "Yes"
    # The original `elif:` had a stray colon, which made the module unparsable
    elif "PD94bW" in icon_url and "gNTAi" in icon_url:
        return "No"
    return "Unknown"  # Default value if the icon is not detected


# Function to handle webscraping
def scrape_building_info(row, driver):
    """Scrape one building's detail page into a flat dict.

    Args:
        row: Record providing the 'Detail URL', 'Building Name' and
            'Address' keys (a DataFrame row in this script).
        driver: Selenium WebDriver used to load the page.

    Returns:
        A dict with 'Building Name', 'Address' and 'URL', plus one entry per
        ``meta-info-container`` block keyed by that block's title ('Unknown'
        when the block has no title). Returns None if loading or parsing the
        page raised; the error is printed.
    """
    try:
        url = row['Detail URL'] + "?lang=english"
        driver.get(url)
        time.sleep(2)  # fixed pause, presumably to let the page finish rendering

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        additional_info_sections = soup.find_all('div', class_='meta-info-container')

        building_info = {
            'Building Name': row['Building Name'],
            'Address': row['Address'],
            'URL': url
        }

        for block in additional_info_sections:
            # Look each tag up once instead of once for the test and once for the value
            title_tag = block.find('div', class_='title')
            content_tag = block.find('div', class_='content')
            icon_tag = block.find('div', class_='icon')

            title = title_tag.text.strip() if title_tag else 'Unknown'
            content = content_tag.text.strip() if content_tag else 'N/A'

            # An icon with a usable source overrides the text content.
            # .get() avoids the KeyError that icon_tag['src'] raises when the
            # attribute is absent, which used to discard the whole building.
            # NOTE(review): `src` is read from the div itself; if the page puts
            # it on a nested <img>, this lookup never matches - confirm.
            icon_src = icon_tag.get('src') if icon_tag else None
            if icon_src:
                content = detect_and_replace_svg(icon_src)
            building_info[title] = content

        return building_info
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip this row
        print(f"An error occured while scraping {row['Building Name']}: {e}")
        return None
    
# Process each building and save progressively
results = []

try:
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc='Scraping Buildings:'):
        result = scrape_building_info(row, driver)
        if result:
            results.append(result)
            # Rewrite the output after every success so an interrupted run keeps its progress
            pd.DataFrame(results).to_csv('midlandici_building_list_lv2.csv', index=False)
finally:
    # Close the browser even if the loop is interrupted or a write fails
    driver.quit()

# Save the final result
if results:
    pd.DataFrame(results).to_csv('midlandici_building_list_lv2.csv', index=False)
    print("Scraping completed successfully!")

In [4]:
import time
import pandas as pd
from tqdm import tqdm
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

def initialize_driver():
    """Install a matching ChromeDriver and return a headless Chrome WebDriver."""
    chromedriver_autoinstaller.install()

    chrome_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )
    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(options=chrome_options)
    return browser

def detect_and_replace_svg(icon_url):
    """Translate an SVG icon source string into "Yes", "No" or "Unknown".

    Both recognised icons must contain 'PD94bW'; 'jUwl' then selects "Yes"
    and 'gNTAi' selects "No", checked in that order.
    """
    if 'PD94bW' not in icon_url:
        return "Unknown"

    # Order matters: a string holding both markers resolves to "Yes"
    for marker, label in (("jUwl", "Yes"), ("gNTAi", "No")):
        if marker in icon_url:
            return label
    return "Unknown"

def scrape_building_info(row, driver):
    """Scrape detailed information for an individual building.

    Args:
        row: Record providing the 'Detail URL', 'Building Name' and
            'Address' keys (a DataFrame row in this script).
        driver: Selenium WebDriver used to load the page.

    Returns:
        A dict with 'Building Name', 'Address' and 'URL', plus one entry per
        ``meta-info-container`` block that has a title and either text
        content or an icon. Returns None if loading or parsing the page
        raised; a truncated error message is printed.
    """
    try:
        url = row['Detail URL'] + "?lang=english"
        driver.get(url)
        time.sleep(1.5)  # Reduced sleep for faster scraping

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        building_info = {
            'Building Name': row['Building Name'],
            'Address': row['Address'],
            'URL': url
        }

        # Extract meta information blocks
        for block in soup.find_all('div', class_='meta-info-container'):
            title = block.find('div', class_='title')
            content = block.find('div', class_='content')
            icon = block.find('div', class_='icon')

            # Blocks without a title, or with neither content nor icon, are skipped
            if title is None or (content is None and icon is None):
                continue

            if content is not None:
                # Text content takes priority over the icon
                value = content.text.strip()
            else:
                # .get() avoids the KeyError that icon['src'] raises when the
                # attribute is absent, which used to discard the whole building.
                # NOTE(review): `src` is read from the div itself; if the page
                # puts it on a nested <img>, this is always "Unknown" - confirm.
                icon_src = icon.get('src')
                value = detect_and_replace_svg(icon_src) if icon_src else "Unknown"

            building_info[title.text.strip()] = value

        return building_info
    except Exception as e:
        # Best-effort scraping: report (truncated) and let the caller skip this row
        print(f"Error scraping {row['Building Name']}: {str(e)[:100]}...")
        return None

# Phase 1: Scrape building list
print("Starting building list scraping...")
# NOTE(review): no list-scraping code is present in this cell; only the driver
# is started here. It is reused by Phase 2 - the original started a second
# driver and overwrote this one without quit(), leaking a browser process.
driver = initialize_driver()


# Phase 2: Scrape detailed information
print("\nStarting detailed information scraping...")
results = []  # defined before the try so the interrupt handler can always save it

try:
    # Read inside the try so a missing or unreadable CSV still closes the browser
    df = pd.read_csv("midlandici_building_list.csv")

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing buildings"):
        result = scrape_building_info(row, driver)
        if result:
            results.append(result)
            # Progressive save every 5 entries
            if len(results) % 5 == 0:
                pd.DataFrame(results).to_csv("midlandici_detailed_info.csv", index=False)

    # Final save
    if results:
        pd.DataFrame(results).to_csv("midlandici_detailed_info.csv", index=False)
        print(f"\nSuccessfully scraped {len(results)}/{len(df)} buildings")

except KeyboardInterrupt:
    # Keep whatever was collected; written to a separate file from the normal output
    print("\nUser interrupted! Saving current progress...")
    pd.DataFrame(results).to_csv("midlandici_detailed_info_PARTIAL.csv", index=False)

finally:
    driver.quit()


Starting building list scraping...
Found 694 buildings


Collecting building list: 100%|██████████| 694/694 [00:12<00:00, 57.66it/s]


Initial building list saved

Starting detailed information scraping...


Processing buildings:  91%|█████████▏| 635/694 [34:17<1:30:15, 91.80s/it]

Error scraping 83 Wing Hong Street: Message: timeout: Timed out receiving message from renderer: 298.612
  (Session info: chrome=134.0.6...


Processing buildings: 100%|██████████| 694/694 [36:47<00:00,  3.18s/it]  



Successfully scraped 693/694 buildings
