In [2]:
import configparser
import hashlib
import time
import random
import string
import re
import os
import glob
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import chromedriver_autoinstaller
from tqdm import tqdm

# Configuration management
CONFIG_FILE = 'scraper_config.ini'
config = configparser.ConfigParser()

def load_config():
    """Initialize and load configuration"""
    config.read(CONFIG_FILE)
    
    if not config.has_section('FileSettings'):
        config.add_section('FileSettings')
        config.set('FileSettings', 'base_name', 'centanet_res_estates')
        config.set('FileSettings', 'backup_name', 'centanet_res_estates_backup')
        config.set('FileSettings', 'update_frequency', 'monthly')
        config.set('FileSettings', 'force_update', 'False')
        config.set('FileSettings', 'min_file_size_ratio', '0.9')
        
    if not config.has_section('ScraperSettings'):
        config.add_section('ScraperSettings')
        config.set('ScraperSettings', 'headless', 'True')
        config.set('ScraperSettings', 'min_delay', '1')
        config.set('ScraperSettings', 'max_delay', '3')
        config.set('ScraperSettings', 'area_limit', 'None')
        config.set('ScraperSettings', 'max_retries', '3')

    with open(CONFIG_FILE, 'w') as configfile:
        config.write(configfile)

load_config()

# Utility functions
def generate_session_id(length=10):
    return ''.join(random.choices(string.ascii_lowercase + string.digits, k=length))

def clean_subdistrict(subdistrict):
    cleaned = re.sub(r'[^A-Za-z0-9]+', '-', subdistrict)
    return cleaned.strip('-').lower()

def initialize_driver():
    chromedriver_autoinstaller.install()
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.6943.127 Safari/537.36")
    options.add_argument("--ignore-certificate-errors")
    options.add_argument("--disable-extensions")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    
    if config.getboolean('ScraperSettings', 'headless'):
        options.add_argument("--headless")
        
    return webdriver.Chrome(options=options)

def random_sleep():
    min_d = config.getint('ScraperSettings', 'min_delay')
    max_d = config.getint('ScraperSettings', 'max_delay')
    time.sleep(random.uniform(min_d, max_d))

def get_current_ym():
    return datetime.now().strftime("%Y%m")

def get_latest_file():
    pattern = f"{config.get('FileSettings', 'base_name')}_*.csv"
    files = glob.glob(pattern)
    return max(files, key=os.path.getctime) if files else None

def backup_existing():
    latest = get_latest_file()
    if not latest:
        return
    
    backup_path = f"{config.get('FileSettings', 'backup_name')}.csv"
    if os.path.exists(backup_path):
        os.remove(backup_path)
    os.rename(latest, backup_path)
    print(f"Created backup: {backup_path}")

def should_run_update():
    if config.get('FileSettings', 'update_frequency') != 'monthly':
        return True
        
    current_ym = get_current_ym()
    latest = get_latest_file()
    if not latest:
        return True
        
    file_ym = re.search(r'_(\d{6})\.csv', latest).group(1)
    return file_ym != current_ym or config.getboolean('FileSettings', 'force_update')

def validate_update(new_path, backup_path):
    if not os.path.exists(backup_path):
        return
    
    new_size = os.path.getsize(new_path)
    backup_size = os.path.getsize(backup_path)
    ratio = new_size / backup_size
    
    if ratio < config.getfloat('FileSettings', 'min_file_size_ratio'):
        print(f"Warning: New file is {ratio:.0%} of backup size. Potential data loss!")

# Core scraping functions
def extract_estate_data(driver, existing_links):
    data = []
    try:
        estate_items = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.property-text.flex.def-property-box"))
        )
        for item in estate_items:
            try:
                estate_link = item.get_attribute("href")
                if estate_link in existing_links:
                    continue
                
                name = item.find_element(By.CSS_SELECTOR, "div.main-text").text.strip()
                address = item.find_element(By.CSS_SELECTOR, "div.address.f-middle").text.strip()
                blocks = item.find_element(By.XPATH, ".//div[contains(text(), 'No. of Block(s)')]/following-sibling::div").text.strip()
                units = item.find_element(By.XPATH, ".//div[contains(text(), 'No. of Units')]/following-sibling::div").text.strip()
                unit_rate = item.find_element(By.XPATH, ".//div[contains(text(), 'Unit Rate of Saleable Area')]/following-sibling::div").text.strip()
                mom = item.find_element(By.XPATH, ".//div[contains(text(), 'MoM')]/following-sibling::div").text.strip()
                trans_record = item.find_element(By.XPATH, ".//div[contains(text(), 'Trans. Record')]/following-sibling::div").text.strip()
                for_sale = item.find_element(By.XPATH, ".//div[contains(text(), 'For Sale')]/following-sibling::div").text.strip()
                for_rent = item.find_element(By.XPATH, ".//div[contains(text(), 'For Rent')]/following-sibling::div").text.strip()

                data.append([name, address, blocks, units, unit_rate, mom, trans_record, 
                            for_sale, for_rent, estate_link])
            except Exception:
                continue
    except Exception:
        pass
    return data

def scrape_estate_listings(excel_path):
    if not should_run_update():
        print("Update not required based on configuration")
        return get_latest_file()
    
    driver = initialize_driver()
    existing_links = set()
    existing_data = pd.DataFrame()
    
    # Load existing data
    latest_file = get_latest_file()
    if latest_file:
        existing_data = pd.read_csv(latest_file)
        existing_links = set(existing_data['Estate Link'].tolist())
    
    # Prepare new collection
    area_df = pd.read_excel(excel_path, engine="openpyxl")
    if config.get('ScraperSettings', 'area_limit') != 'None':
        area_limit = config.getint('ScraperSettings', 'area_limit')
        area_df = area_df[:area_limit]
    
    new_data = []
    for idx, row in tqdm(area_df[:5].iterrows(), total=len(area_df), desc="Processing areas"):
        region = row["Region"]
        district = row["District"]
        subdistrict = row["Subdistrict"]
        code = row["Code"]
        
        subdistrict_part = clean_subdistrict(subdistrict)
        session_id = generate_session_id()
        area_url = f"https://hk.centanet.com/findproperty/en/list/estate/{subdistrict_part}_19-{code}?q={session_id}"
        
        driver.get(area_url)
        random_sleep()
        
        current_page = 1
        while True:
            page_data = extract_estate_data(driver, existing_links)
            if not page_data:
                break
                
            # Add regional metadata
            new_data.extend([entry + [region, district, subdistrict, code] 
                           for entry in page_data])
            existing_links.update(entry[9] for entry in page_data)
            
            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "button.btn-next:not([disabled])"))
                )
                driver.execute_script("arguments[0].click();", next_button)
                random_sleep()
                current_page += 1
            except Exception:
                break

    driver.quit()
    
    # Create final dataset
    if new_data:
        new_df = pd.DataFrame(new_data, columns=[
            "Name", "Address", "Blocks", "Units", "Unit Rate", "MoM", 
            "Trans Record", "For Sale", "For Rent", "Estate Link",
            "Region", "District", "Subdistrict", "Code"
        ])
        combined_df = pd.concat([existing_data, new_df], ignore_index=True)
        combined_df.drop_duplicates(subset=['Estate Link'], keep='last', inplace=True)
        
        # Save results
        output_path = f"{config.get('FileSettings', 'base_name')}_{get_current_ym()}.csv"
        backup_existing()
        combined_df.to_csv(output_path, index=False, encoding='utf-8-sig')
        validate_update(output_path, f"{config.get('FileSettings', 'backup_name')}.csv")
        return output_path
    
    print("No new data found")
    return latest_file

# Rest of the functions remain similar with config integration
def scrape_estate_details(input_csv=None, output_csv=None):
    """
    Scrape detailed information for estates in the input CSV.
    
    Args:
        input_csv (str, optional): Path to input CSV with estate URLs. If None, finds the latest CSV
        output_csv (str, optional): Path to save enriched CSV. If None, generates based on input filename
        
    Returns:
        str: Path to the saved CSV file
    """
    # Find latest CSV if not provided
    if input_csv is None:
        input_csv = find_latest_estates_csv()
        if input_csv is None:
            print("No CSV files matching the specified pattern found.")
            return None
    
    print(f"Using input file: {input_csv}")
    
    # Generate output path if not provided
    if output_csv is None:
        output_csv = input_csv.replace("_centanet_estates.csv", "_centanet_estates_scraped.csv")
    
    # Initialize driver
    driver = initialize_driver()
    
    try:
        # Read the original CSV
        df = pd.read_csv(input_csv)
        print(f"Loaded {len(df)} rows from {input_csv}")
        
        # Create new columns for scraped data if they don't already exist
        for col in ["Scraped Estate Name", "Occupation Permit", "Scraped Blocks",
                    "Scraped Units", "School Net Info", "Estate Detailed Address", "Developer"]:
            if col not in df.columns:
                df[col] = None
        
        # Iterate over each row using tqdm for progress indication
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing URLs"):
            url = row["Estate Link"]
            try:
                driver.get(url)
                random_sleep(2, 3)  # Allow the page to load
                scroll_down(driver)  # Scroll to load lazy-loaded content if needed
        
                # Extract Estate Name
                try:
                    estate_name_elem = driver.find_element(By.CLASS_NAME, "estate-detail-banner-title")
                    estate_name = estate_name_elem.text.strip()
                except Exception:
                    estate_name = None
        
                # Extract other details: Occupation Permit, Blocks, Units
                occupation, blocks_text, units_text = None, None, None
                try:
                    table_items = driver.find_elements(By.CLASS_NAME, "table-item")
                    for item in table_items:
                        try:
                            title_elem = item.find_element(By.CLASS_NAME, "table-item-title")
                            text_elem = item.find_element(By.CLASS_NAME, "table-item-text")
                            text_content = text_elem.text.strip()
                            if "Date of Occupation Permit" in text_content:
                                occupation = title_elem.text.strip()
                            elif "No. of Blocks" in text_content:
                                blocks_text = title_elem.text.strip().split()[0]
                            elif "No. of Units" in text_content:
                                units_text = title_elem.text.strip()
                        except Exception:
                            continue
                except Exception:
                    pass
        
                # Extract School Net information
                school_net_val = None
                try:
                    items_divs = driver.find_elements(By.CLASS_NAME, "item")
                    for div in items_divs:
                        try:
                            label_elem = div.find_element(By.CLASS_NAME, "label-item-left")
                            if "School Net" in label_elem.text.strip():
                                links_elems = div.find_elements(By.TAG_NAME, "a")
                                if len(links_elems) >= 2:
                                    primary_net = links_elems[0].text.strip()
                                    secondary_net = links_elems[1].text.strip()
                                    school_net_val = f"{primary_net} | {secondary_net}"
                                break
                        except Exception:
                            continue
                except Exception:
                    pass
        
                # Extract Estate Detailed Address
                estate_address = None
                try:
                    address_elem = driver.find_element(By.CLASS_NAME, "estate-detail-banner-position")
                    estate_address = address_elem.text.strip()
                except Exception:
                    pass
        
                # Extract Developer information
                developer_val = None
                try:
                    developer_divs = driver.find_elements(By.CLASS_NAME, "item")
                    for div in developer_divs:
                        try:
                            label_elem = div.find_element(By.CLASS_NAME, "label-item-left")
                            if "Developer" in label_elem.text.strip():
                                developer_span_elem = div.find_element(By.CLASS_NAME, "label-item-right")
                                developer_val = developer_span_elem.text.strip()
                                break
                        except Exception:
                            continue
                except Exception:
                    pass
        
                # Save the scraped data into the DataFrame (for the current row only)
                df.at[idx, "Scraped Estate Name"] = estate_name
                df.at[idx, "Occupation Permit"] = occupation
                df.at[idx, "Scraped Blocks"] = blocks_text
                df.at[idx, "Scraped Units"] = units_text
                df.at[idx, "School Net Info"] = school_net_val
                df.at[idx, "Estate Detailed Address"] = estate_address
                df.at[idx, "Developer"] = developer_val
        
            except Exception as e:
                print(f"Error processing URL {url}: {e}")
            
            # Write the current DataFrame to CSV to prevent data loss after each iteration
            df.to_csv(output_csv, index=False)
            
            # Pause briefly before processing the next URL
            random_sleep(2, 3)

        print(f"Scraped data saved to: {output_csv}")
        
    except Exception as e:
        print(f"Error during scraping estate details: {e}")
    finally:
        driver.quit()
    
    return output_csv

def main():
    if not should_run_update():
        print("Update not required based on configuration")
        return
    
    print("========== SCRAPING ESTATE LISTINGS ==========")
    listings_path = scrape_estate_listings('Centanet_Res_Area_Code.xlsx')
    
    print("\n========== SCRAPING ESTATE DETAILS ==========")
    details_path = scrape_estate_details(listings_path)
    
    return {
        'listings': listings_path,
        'details': details_path
    }

if __name__ == "__main__":
    main()




Processing areas:   3%|▎         | 5/178 [04:17<2:28:37, 51.54s/it]



Using input file: centanet_res_estates_202504.csv
Loaded 1235 rows from centanet_res_estates_202504.csv


Processing URLs:   0%|          | 0/1235 [00:04<?, ?it/s]

Error processing URL https://hk.centanet.com/estate/en/The-Merton/2-SSPPWPPYPS: random_sleep() takes 0 positional arguments but 2 were given
Error during scraping estate details: random_sleep() takes 0 positional arguments but 2 were given



