# In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import logging
from datetime import datetime
import os
import csv
from urllib.parse import urlparse

# Column order of the output CSV.
# The first nine fields are filled from each scraped property card; the
# last three ('region', 'district', 'code') are added per area when rows
# are written.
CSV_HEADERS = [
    'property_name',
    'property_type',
    'address',
    'area',
    'year',
    'floors',
    'developer',
    'transaction_count',
    'image_url',
    'region',
    'district',
    'code',
]

# Logging configuration
def setup_logging(log_dir="logs"):
    """Configure root logging to a timestamped file plus the console.

    Args:
        log_dir: Directory for log files; created if it does not exist.

    Returns:
        The logger named after this module.

    Note:
        ``logging.basicConfig`` leaves the root logger untouched when it
        already has handlers, so a repeated call in the same process may
        not attach the new file handler.
    """
    os.makedirs(log_dir, exist_ok=True)

    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_path = f"{log_dir}/scraper_{run_stamp}.log"

    # Build the handlers in the same order they are registered: file, console.
    file_handler = logging.FileHandler(log_path)
    console_handler = logging.StreamHandler()

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[file_handler, console_handler],
    )
    return logging.getLogger(__name__)

def init_csv():
    """Ensure today's output CSV exists and return its filename.

    The file is named ``<YYYYMMDD>_centanet_estates_ici.csv`` in the current
    working directory. If it is absent, it is created containing only the
    header row; an existing file is left untouched so rows can be appended.

    Returns:
        The CSV filename (relative path).
    """
    date_stamp = datetime.now().strftime('%Y%m%d')
    csv_filename = f"{date_stamp}_centanet_estates_ici.csv"

    if os.path.exists(csv_filename):
        return csv_filename

    with open(csv_filename, 'w', newline='', encoding='utf-8') as handle:
        csv.DictWriter(handle, fieldnames=CSV_HEADERS).writeheader()
    return csv_filename

def safe_extract(card, selector, attr=None):
    """Return the stripped text, or an attribute, of the first selector match.

    Args:
        card: Object exposing ``select_one(selector)`` (e.g. a BeautifulSoup
            tag for one property card).
        selector: CSS selector handed to ``card.select_one``.
        attr: Optional attribute name; when given, that attribute's value is
            returned instead of the element text.

    Returns:
        The stripped string, or ``''`` when nothing matches, the attribute is
        missing, or extraction raises. Failures are logged, never propagated.
    """
    try:
        elem = card.select_one(selector)
        if not elem:
            return ''
        if attr:
            return elem.get(attr, '').strip()
        return elem.text.strip()
    except Exception as e:
        # Resolve the logger here instead of relying on the global `logger`
        # that only exists after main() has run; otherwise this error path
        # raises NameError when the function is used on its own.
        logging.getLogger(__name__).error(f"Extraction error: {e}")
        return ''

def extract_property_data(card):
    """Build one CSV record (without area fields) from a property card.

    Args:
        card: Parsed element for a single property card, as accepted by
            ``safe_extract``.

    Returns:
        Dict with the keys ``property_name``, ``property_type``, ``address``,
        ``area``, ``year``, ``floors``, ``developer``, ``transaction_count``
        and ``image_url``. Fields that cannot be found are ``''``.
    """
    # The summary line is queried once and reused; the original looked it up
    # three times per card.
    summary = safe_extract(card, 'div.col-center p')
    segments = summary.split('|')

    return {
        'property_name': safe_extract(card, 'h3.col-top-title'),
        # The type is only present when the line contains a '|' separator.
        'property_type': segments[0].strip() if '|' in summary else '',
        # The address is the text after the last '|', or the whole line.
        'address': segments[-1].strip(),
        'area': safe_extract(card, 'p.area'),
        'year': safe_extract(card, 'p.opDate'),
        'floors': safe_extract(card, 'p.floor'),
        'developer': safe_extract(card, 'p.developer'),
        'transaction_count': safe_extract(card, 'div.col-top-butn p span'),
        'image_url': safe_extract(card, 'div.img-size img', 'src'),
    }

def scrape_page(url):
    """Fetch one results page and extract every property card on it.

    Args:
        url: Absolute URL of the results page.

    Returns:
        A ``(records, ok)`` tuple. ``records`` is a list of dicts produced by
        ``extract_property_data`` (empty when no card matches). ``ok`` is
        ``False`` only when the request or parsing raised; the error is
        logged and ``([], False)`` is returned.
    """
    # Parse once; the Referer is the root of the site being requested.
    parts = urlparse(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
        'Referer': parts.scheme + '://' + parts.netloc + '/'
    }

    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.select('div.property-database-results-card')
        return [extract_property_data(card) for card in cards], True
    except Exception as e:
        # Use the module logger directly: the global `logger` is only bound
        # by main(), and a NameError here would mask the real failure.
        logging.getLogger(__name__).error(f"Page scrape failed: {e}")
        return [], False

def scrape_area(region, district, code, csv_filename, max_pages=100, delay=1.2):
    """Scrape all result pages of one area and append the rows to the CSV.

    Pages are requested from ``pageindex=1`` upward until a page fails,
    returns no property cards, or ``max_pages`` is reached.

    Args:
        region: Region name; lower-cased for the URL, stored as given.
        district: District name; lower-cased with spaces replaced by ``-``
            for the URL, stored as given.
        code: Area code; lower-cased for the URL, stored as given.
        csv_filename: Existing CSV file (with header) to append rows to.
        max_pages: Upper bound on the number of pages requested.
        delay: Seconds to sleep after each saved page.

    Returns:
        Number of pages whose records were written.
    """
    # Module logger, so the function does not depend on main() having bound
    # the global `logger` first.
    log = logging.getLogger(__name__)

    district_url = district.lower().replace(' ', '-')
    base_url = f"https://oir.centanet.com/en/property/all-usage/{region.lower()}-{district_url}/{code.lower()}"
    area_fields = {'region': region, 'district': district, 'code': code}

    page_index = 1
    while page_index <= max_pages:
        page_url = f"{base_url}/?pageindex={page_index}"
        properties, success = scrape_page(page_url)

        # A failed request and an empty page both end pagination.
        if not success or not properties:
            break

        with open(csv_filename, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
            for prop in properties:
                prop.update(area_fields)
                writer.writerow(prop)

        log.info(f"Page {page_index} saved: {len(properties)} records")
        page_index += 1
        time.sleep(delay)

    # page_index points at the first page that was NOT saved.
    return page_index - 1

def main():
    """Run the scraper for every area listed in the area-code spreadsheet.

    Sets up logging, makes sure today's CSV exists, loads
    ``Centanet_ICI_Area_Code.xlsx`` (rows are read by their ``Region``,
    ``District`` and ``Code`` columns) and scrapes each area in turn.
    Failures are logged with a traceback; a failure in one area does not
    stop the remaining areas.
    """
    # Other functions in this file log through the module-global `logger`,
    # so it has to be bound before any of them run.
    global logger
    logger = setup_logging()
    csv_filename = init_csv()

    try:
        area_codes = pd.read_excel("Centanet_ICI_Area_Code.xlsx").to_dict('records')
    except Exception as e:
        # Without the area list there is nothing to scrape.
        logger.exception(f"Main execution failed: {e}")
        return

    logger.info(f"Loaded {len(area_codes)} area codes")

    for area in area_codes:
        # Isolate each area so one malformed row or unexpected error does
        # not abort the rest of the run.
        try:
            logger.info(f"Scraping {area['Region']} - {area['District']}")
            pages_scraped = scrape_area(
                area['Region'],
                area['District'],
                area['Code'],
                csv_filename
            )
            logger.info(f"Completed {pages_scraped} pages for {area['District']}")
        except Exception as e:
            logger.exception(f"Main execution failed: {e}")

# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
