In [None]:
import time
import random
import re
import os
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import chromedriver_autoinstaller
from tqdm import tqdm


def generate_session_id(length=10):
    """Generate a random session ID consisting of lowercase letters and digits.

    Args:
        length: Number of characters in the generated ID (default 10).

    Returns:
        A string of ``length`` characters, each drawn independently from
        ``a-z`` and ``0-9``.
    """
    # Imported here because the file's import block does not bring in
    # ``string``; without it this function fails with NameError.
    import string

    alphabet = string.ascii_lowercase + string.digits
    return ''.join(random.choices(alphabet, k=length))


def clean_subdistrict(subdistrict):
    """
    Turn a subdistrict name into a lowercase, hyphen-separated URL slug.

    The name is split on every run of characters outside A-Z, a-z and 0-9.
    The remaining alphanumeric pieces are joined with single hyphens, so the
    slug never begins or ends with a hyphen, and the result is lowercased.
    """
    pieces = re.split(r'[^A-Za-z0-9]+', subdistrict)
    # Leading/trailing separators leave empty pieces at the ends; drop them.
    slug = '-'.join(piece for piece in pieces if piece)
    return slug.lower()


def initialize_driver():
    """
    Create and return a headless Chrome WebDriver.

    chromedriver_autoinstaller.install() is called first so that a matching
    chromedriver is available, then Chrome is started with a fixed desktop
    user-agent and a set of command-line switches.
    """
    chromedriver_autoinstaller.install()

    chrome_args = (
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.6943.127 Safari/537.36",
        "--ignore-certificate-errors",
        "--disable-extensions",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--headless",  # Run without a visible browser window
    )

    chrome_options = webdriver.ChromeOptions()
    for arg in chrome_args:
        chrome_options.add_argument(arg)
    return webdriver.Chrome(options=chrome_options)


def random_sleep(min_delay=1, max_delay=3):
    """Block for a random number of seconds drawn uniformly between min_delay and max_delay."""
    delay_seconds = random.uniform(min_delay, max_delay)
    time.sleep(delay_seconds)


def scroll_down(driver):
    """Jump to the bottom of the current page, then pause for a random delay.

    Scrolling to the bottom is intended to trigger lazy-loaded content; the
    pause afterwards uses random_sleep() with its default bounds.
    """
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    driver.execute_script(jump_to_bottom)
    random_sleep()


def extract_estate_data(driver):
    """
    Extracts estate information from the current page.

    The expected DOM structure is:
      - Name and Address are inside <div class="flex f-dir-col basic-info">
      - Other details are inside <div class="flex basic-data hidden-xs-only">
            Blocks, Units, Unit Rate, MoM, Trans Record, For Sale, For Rent.

    Args:
        driver: Selenium WebDriver currently showing an estate listing page.

    Returns:
        A list with one row per estate, each row being
        [name, address, blocks, units, unit_rate, mom, trans_record,
        for_sale, for_rent] as stripped text. An empty list is returned when
        no estate items appear within 20 seconds.

    Only the Selenium exceptions that signal "element missing", "element went
    stale" or "wait timed out" are handled here; anything else (for example a
    dead browser session) propagates to the caller instead of being hidden.
    """
    # Local import so this function carries its own dependency on the
    # exception classes (selenium is already a dependency of this file).
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
        TimeoutException,
    )

    # Label text of each detail cell, in output-column order. The value is
    # read from the <div> that follows the label <div>.
    detail_labels = (
        "No. of Block(s)",
        "No. of Units",
        "Unit Rate of Saleable Area",
        "MoM",
        "Trans. Record",
        "For Sale",
        "For Rent",
    )

    try:
        estate_items = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.flex.f-dir-col.basic-info"))
        )
    except TimeoutException:
        return []  # No estate items showed up on this page

    data = []
    for item in estate_items:
        try:
            # Name and address live in the basic-info section itself.
            name = item.find_element(By.CSS_SELECTOR, "div.main-text").text.strip()
            address = item.find_element(By.CSS_SELECTOR, "div.address.f-middle").text.strip()

            # Walk three levels up to reach the container holding the other details.
            parent = item.find_element(By.XPATH, "../../..")
            details = [
                parent.find_element(
                    By.XPATH,
                    f".//div[contains(text(), '{label}')]/following-sibling::div",
                ).text.strip()
                for label in detail_labels
            ]
        except (NoSuchElementException, StaleElementReferenceException) as exc:
            # Skip the item when a field is missing, but say so rather than
            # dropping the row silently. tqdm.write keeps progress bars intact.
            tqdm.write(f"Skipping estate item ({type(exc).__name__}): {exc.msg}")
            continue
        data.append([name, address, *details])
    return data


def main():
    """
    Scrape estate listings for every area in Centanet_Res_Area_Code.xlsx.

    The Excel file must provide the columns "Region", "District",
    "Subdistrict" and "Code". For each row an area URL is built and loaded,
    every result page is scraped by following the "next" button, and the
    collected rows are appended to "<YYYY-MM-DD>_centanet_estates.csv"
    (UTF-8 with BOM). Any CSV of the same name left from an earlier run is
    deleted first. If the Excel file cannot be read, the error is printed and
    the function returns without starting a browser.
    """
    # Base URL for the estate listings.
    base_url = "https://hk.centanet.com/findproperty/en/list/estate"
    csv_columns = ["Name", "Address", "Blocks", "Units", "Unit Rate", "MoM", "Trans Record",
                   "For Sale", "For Rent", "Region", "District", "Subdistrict", "Code",
                   "Area_URL"]

    # Read area codes from the Excel file.
    try:
        area_df = pd.read_excel("Centanet_Res_Area_Code.xlsx", engine="openpyxl")
    except Exception as e:
        print("Error reading Centanet_Res_Area_Code.xlsx:", e)
        return

    file_path = f"{datetime.today().strftime('%Y-%m-%d')}_centanet_estates.csv"

    # Remove existing CSV file if exists. This happens BEFORE the browser is
    # launched so that a failed removal cannot leave a Chrome process behind.
    if os.path.exists(file_path):
        os.remove(file_path)

    driver = initialize_driver()
    try:
        # Iterate over each area with a progress bar.
        for _, row in tqdm(area_df.iterrows(), total=area_df.shape[0], desc="Processing areas"):
            region = row["Region"]
            district = row["District"]
            subdistrict = row["Subdistrict"]
            code = row["Code"]
            subdistrict_part = clean_subdistrict(subdistrict)
            session_id = generate_session_id()
            area_url = f"{base_url}/{subdistrict_part}_19-{code}?q={session_id}"
            driver.get(area_url)
            random_sleep()

            area_rows = []
            while True:
                scroll_down(driver)
                page_data = extract_estate_data(driver)
                if not page_data:
                    break  # Exit loop if no data found on this page
                area_rows.extend(
                    row_data + [region, district, subdistrict, code, area_url]
                    for row_data in page_data
                )

                try:
                    next_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "button.btn-next:not([disabled])"))
                    )
                    # Scroll to and click via JavaScript rather than a native click.
                    driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                    driver.execute_script("arguments[0].click();", next_button)
                    random_sleep()
                except Exception:
                    break  # Exit loop if no next page button found

            if area_rows:
                df = pd.DataFrame(area_rows, columns=csv_columns)
                # Write the header only when the file is being created.
                df.to_csv(file_path, mode="a", index=False, header=not os.path.exists(file_path), encoding="utf-8-sig")
            driver.delete_all_cookies()
            random_sleep()
    finally:
        # Always shut the browser down, including on Ctrl-C.
        driver.quit()


# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Processing areas:   0%|          | 0/178 [00:00<?, ?it/s]


Processing area: Kennedy Town with URL: https://hk.centanet.com/findproperty/en/list/estate/kennedy-town_19-HMA111?q=rfl8qpbmw5

Scraping page 1 for area: Kennedy Town ...
Found 24 estate items on this page.
Extracted: The Merton | 38 New Praya Kennedy Town
Extracted: Manhattan Heights | 28 New Praya Kennedy Town
Extracted: Smithfield Terrace | 71-77 Smithfield
Extracted: University Heights | 23 Pokfield Road
Extracted: Pearl Court | 13 Belcher's Street
Extracted: Cadogan | 37A Cadogan Street
Extracted: The Hudson | 11 Davis Street
Extracted: Cayman Rise | 29 Ka Wai Man Road
Extracted: Imperial Kennedy | 68 Belcher's Street
Extracted: The Sail At Victoria | 86 Victoria Road
Extracted: Smithfield Court | 43 Smithfield
Extracted: Belcher's Hill | 9 Rock Hill Street
Extracted: Jade Court | 48-49 Praya, Kennedy Town
Extracted: Harbour View Garden | 21 North Street
Extracted: Scholar Court | 15 Sands Street
Extracted: Kennedy Town Centre | 38 Praya, Kennedy Town
Extracted: Axeford Villa | 

Processing areas:   1%|          | 1/178 [01:38<4:49:17, 98.07s/it]


Processing area: Shek Tong Tsui with URL: https://hk.centanet.com/findproperty/en/list/estate/shek-tong-tsui_19-HMA047?q=c3r1lrxhjw

Scraping page 1 for area: Shek Tong Tsui ...
Found 24 estate items on this page.
Extracted: Novum West | 460 Queen's Road West
Extracted: Chong Yip Centre | 423-425 Queen's Road West
Extracted: One South Lane | 1 South Lane
Extracted: Eivissa Crest | 100 Hill Road
Extracted: High West | 36 Clarence Terrence
Extracted: Kwan Yick Building Phase 1 | 430-440A Des Voeux Road West
Extracted: Harbour One | 458 Des Voeux Road West
Extracted: Green View Court | 14-20 Woo Hop Street
Extracted: Kwok Ga Building | 6-12 Woo Hop Street
Extracted: Hill Court | 28 Hill Road
Extracted: Eight South Lane | 8-12 Eight South Lane
Extracted: King's Building | 521-525 Queen's Road West
Extracted: Ka Fai Court | 18-22 Clarence Terrace
Extracted: Dragonfair Garden | 485 Queen's Road West
Extracted: Yip Cheong (Cheung) Building | 4-16 Hill Road
Extracted: Fortune Villa | 61-69 Hi

Processing areas:   1%|          | 1/178 [01:50<5:26:46, 110.77s/it]


KeyboardInterrupt: 