In [1]:
import requests
import pandas as pd
from tqdm import tqdm
import time
import random
import os

def _fetch_listing_json(base_url, headers, cookies, params, timeout=10):
    """Issue one GET against the listing API and return the decoded JSON body.

    A JSON ``null`` body is normalised to ``{}`` so callers can chain ``.get``.
    Every request carries a timeout so a stalled connection cannot hang the run.
    """
    response = requests.get(
        base_url,
        headers=headers,
        cookies=cookies,
        params=params,
        timeout=timeout,
    )
    return response.json() or {}


def _listing_record(item, district, code):
    """Flatten one API item into the row layout stored in the listing CSV."""
    # ``areaInfo`` / ``developers`` may be present but null; treat that as empty
    # instead of raising and aborting the whole district.
    area_info = item.get("areaInfo") or {}
    developers = item.get("developers") or []
    return {
        "queriedDistrict": district,
        "queriedCode": code,
        "propertyID": item.get("propertyID"),
        "buildingNameEn": item.get("buildingNameEn"),
        "address": item.get("address"),
        "developers": ", ".join(developers),
        "opDateDisplayName": item.get("opDateDisplayName"),
        "floorDisplayName": item.get("floorDisplayName"),
        "districtNameEn": area_info.get("districtNameEn"),
        "zoneEn": area_info.get("zoneEn"),
        "sellCount": item.get("sellCount"),
        "rentCount": item.get("rentCount"),
        "transCount": item.get("transCount"),
    }


def scrape_building_listing(excel_file, base_url, headers, cookies, csv_filename="centanet_ici_buildings.csv"):
    """Scrape the building listing of every district and merge it into a CSV.

    For each district the API is first probed with ``PageSize=1`` to read
    ``data.totalCount``. A district is skipped when the number of rows already
    stored for it in ``csv_filename`` equals that count; otherwise all of its
    pages are fetched (24 items per page) until a page comes back empty.

    Parameters
    ----------
    excel_file : str
        Path to a CSV file (read with ``pd.read_csv``) that has a ``District``
        and a ``Code`` column.
    base_url : str
        Listing API endpoint.
    headers, cookies : dict
        Passed through unchanged to every ``requests.get`` call.
    csv_filename : str
        Output CSV. If it already exists its rows are kept and new rows are
        appended, de-duplicated on ``propertyID`` (first occurrence wins).

    Returns
    -------
    pandas.DataFrame
        The frame that was written to ``csv_filename``.

    Notes
    -----
    Any exception raised while processing a district is reported through
    ``tqdm.write`` and the loop moves on to the next district; rows collected
    for that district before the failure are kept.
    """
    fields = [
        "queriedDistrict", "queriedCode", "propertyID", "buildingNameEn",
        "address", "developers", "opDateDisplayName", "floorDisplayName",
        "districtNameEn", "zoneEn", "sellCount", "rentCount", "transCount"
    ]

    # District table driving the scrape
    districts_df = pd.read_csv(excel_file)

    # Previously saved rows; bound up-front so the save step does not need to
    # re-check the filesystem.
    existing_df = None
    existing_counts = {}
    if os.path.exists(csv_filename):
        existing_df = pd.read_csv(csv_filename)
        existing_counts = existing_df['queriedDistrict'].value_counts().to_dict()

    records = []

    with tqdm(total=len(districts_df), desc="🏙️ Districts", position=0, mininterval=0.5, dynamic_ncols=True) as main_progress:
        for _, district_row in districts_df.iterrows():
            district = district_row["District"]
            code = district_row["Code"]

            try:
                # Probe request (with a random delay) to learn the district size
                time.sleep(random.uniform(0.3, 1.2))
                probe = _fetch_listing_json(
                    base_url, headers, cookies,
                    {"PageSize": 1, "districtids": code, "lang": "EN"},
                )
                total_count = (probe.get("data") or {}).get("totalCount") or 0

                # Nothing new for this district: skip the paging loop
                if existing_counts.get(district, 0) == total_count:
                    main_progress.set_postfix_str(f"{district} ✓", refresh=True)
                    continue

                collected = 0
                with tqdm(total=total_count, desc=f"📄 {district}", position=1, leave=False, mininterval=0.3) as item_progress:
                    page_index = 1
                    while True:
                        # Throttle requests
                        time.sleep(random.uniform(0.2, 0.8))

                        page = _fetch_listing_json(
                            base_url, headers, cookies,
                            {
                                "PageSize": 24,
                                "pageindex": page_index,
                                "districtids": code,
                                "lang": "EN"
                            },
                        )
                        items = (page.get("data") or {}).get("items") or []
                        if not items:
                            break

                        records.extend(_listing_record(item, district, code) for item in items)
                        collected += len(items)

                        # Advance by exactly the number of items on this page so
                        # the bar can never run past ``total_count``.
                        item_progress.update(len(items))
                        item_progress.set_postfix_str(f"Page {page_index} ✔️", refresh=True)
                        page_index += 1

                main_progress.set_postfix_str(f"{district} {collected} items", refresh=True)

            except Exception as e:
                main_progress.set_postfix_str(f"🚨 {district}", refresh=True)
                tqdm.write(f"\nError processing {district}: {str(e)}")

            finally:
                # Exactly one tick per district, whichever path was taken above
                main_progress.update(1)
                main_progress.refresh()

    # Passing ``columns`` keeps the schema even when nothing was scraped, so the
    # CSV always has a header row and can be read back on the next run.
    new_df = pd.DataFrame(records, columns=fields)

    if existing_df is not None:
        # Skip the concat when there is nothing new to avoid mixing an empty
        # frame into dtype resolution.
        merged = existing_df if new_df.empty else pd.concat([existing_df, new_df])
        combined_df = merged.drop_duplicates(subset=["propertyID"])
    else:
        combined_df = new_df

    combined_df.to_csv(csv_filename, index=False)
    return combined_df


In [2]:
# --- Inputs for the listing scraper -------------------------------------------
excel_file = "centanet_oir_area_code.csv"  # district table; read as CSV despite the variable name
base_url = "https://oir.centanet.com/api/Property/GetPropertyList"

# Cookies sent with every request; extend this mapping if more are needed
cookies = dict(gr_user_id="24005ea2-e47e-4e55-baed-171d8f324a03")

# Browser-like request headers sent with every request
headers = {
    "sec-ch-ua": '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
    "referer": "https://oir.centanet.com/en/property/search/?pageindex=2&depts=Any&districtids=WS012",
}

# --- Run the scraper (it also writes its result to its default CSV path) ------
df_output = scrape_building_listing(
    excel_file=excel_file,
    base_url=base_url,
    headers=headers,
    cookies=cookies,
)

# Preview the first rows of the merged listing
df_output.head(5)

🏙️ Districts: 100%|██████████| 53/53 [00:52<00:00,  1.01it/s, Tin Shui Wai ✓]            


Unnamed: 0,queriedDistrict,queriedCode,propertyID,buildingNameEn,address,developers,opDateDisplayName,floorDisplayName,districtNameEn,zoneEn,sellCount,rentCount,transCount
0,Admiralty,WS005,ea0d8e9f-0de0-4b04-ac83-53e6c602859a,"Lippo Centre, Tower 2",89 Queensway,Sino Land Company Limited,1987 Year,47 Floor(s),Admiralty,Hong Kong,36,22,37
1,Admiralty,WS005,a5c52d7e-e865-40b4-b594-ba7a33a0e899,"Lippo Centre, Tower 1",89 Queensway,Sino Land Company Limited,1987 Year,51 Floor(s),Admiralty,Hong Kong,21,21,28
2,Admiralty,WS005,09eeef57-e096-4827-955f-5aadc50263d6,Admiralty Centre Tower 1,18 Harcourt Road,Cheung Kong (Holdings) Limited,1980 Year,33 Floor(s),Admiralty,Hong Kong,18,19,22
3,Admiralty,WS005,8d493726-ea4c-4864-aaa8-c677373415c5,Far East Finance Centre,16 Harcourt Road,,1982 Year,48 Floor(s),Admiralty,Hong Kong,11,13,19
4,Admiralty,WS005,8945cdfc-f462-48e6-a917-b6e0be4709bf,United Centre,95 Queensway,World-Wide Properties Corporation Limited,1981 Year,39 Floor(s),Admiralty,Hong Kong,12,11,20


In [3]:
# Explicit re-export of the listing frame. scrape_building_listing already writes
# its result to this same default path, so this only matters if df_output was
# modified after the scrape.
df_output.to_csv(path_or_buf="centanet_ici_buildings.csv", index=False)

In [4]:
def _merge_and_save_details(existing_data, results, output_csv):
    """Merge freshly scraped detail rows into earlier results and write the CSV.

    When ``results`` is empty nothing is written and ``existing_data`` is
    returned unchanged.
    """
    if not results:
        return existing_data

    new_data = pd.DataFrame(results)
    # Leave an empty "no previous file" frame out of the concat
    frames = [frame for frame in (existing_data, new_data) if not frame.empty]
    combined_data = pd.concat(frames, ignore_index=True)

    # A re-scraped property replaces its older row
    combined_data = combined_data.drop_duplicates(subset=['property_id'], keep='last')

    # Convert only the numeric columns that were actually scraped; a field that
    # never appeared on any page has no column and must not raise KeyError.
    numeric_cols = [
        col for col in ('completion_year', 'total_floors', 'lifts')
        if col in combined_data.columns
    ]
    if numeric_cols:
        combined_data[numeric_cols] = combined_data[numeric_cols].apply(
            pd.to_numeric, errors='coerce'
        )

    combined_data.to_csv(output_csv, index=False)
    return combined_data


def scrape_building_details(input_df, headers, cookies, output_csv="property_details.csv"):
    """Scrape the detail page of every property in ``input_df`` not yet saved.

    Properties whose ``propertyID`` already appears in the ``property_id``
    column of ``output_csv`` are skipped, so an interrupted run can be resumed
    by calling the function again.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must provide ``propertyID``, ``buildingNameEn``, ``districtNameEn`` and
        ``zoneEn`` columns. The caller's frame is not modified.
    headers, cookies : dict
        Passed through unchanged to every ``requests.get`` call.
    output_csv : str
        CSV holding earlier results; rewritten with old and new rows merged
        whenever at least one new row was scraped.

    Returns
    -------
    pandas.DataFrame
        Merged old + new rows, or the previously saved rows (possibly an empty
        frame) when nothing new was scraped.

    Notes
    -----
    * Rows scraped so far are written to ``output_csv`` even if the loop is
      cut short (e.g. by ``KeyboardInterrupt``); the interrupting exception
      still propagates to the caller afterwards.
    * Per-property failures are reported with ``tqdm.write`` and skipped.
    * The ``district`` value parsed from the page is overwritten by
      ``districtNameEn`` from ``input_df`` when the metadata is added.
    """
    from bs4 import BeautifulSoup
    import re

    # Page label -> output column name
    detail_fields = {
        'District': 'district',
        'Address': 'full_address',
        'Grade': 'grade',
        'Usage': 'usage',
        'Property Type': 'property_type',
        'Year of Completion': 'completion_year',
        'Title': 'title_status',
        'Management Company': 'management_company',
        'Developers': 'developers',
        'Transportation': 'transportation',
        'Floor': 'total_floors',
        'Floor Area': 'floor_area',
        'Height': 'ceiling_height',
        'A/C System': 'ac_system',
        'No. of Lift': 'lifts',
        'Carpark': 'carpark'
    }

    def slugify(text):
        """Lower-case ``text`` and collapse non-alphanumeric runs into '-'."""
        return re.sub(r'[^a-z0-9]+', '-', text.lower()).strip('-')

    # Load existing data if available
    existing_data = pd.DataFrame()
    scraped_ids = set()
    if os.path.exists(output_csv):
        existing_data = pd.read_csv(output_csv)
        scraped_ids = set(existing_data['property_id'].astype(str))

    # Work on a copy so the caller's frame keeps its original dtypes
    input_df = input_df.copy()
    input_df['propertyID'] = input_df['propertyID'].astype(str)
    todo_df = input_df[~input_df['propertyID'].isin(scraped_ids)]

    results = []

    try:
        for _, row in tqdm(todo_df.iterrows(), total=len(todo_df), desc="Scraping Details"):
            try:
                url = (
                    f"https://oir.centanet.com/en/property/office/"
                    f"{slugify(row['zoneEn'])}-{slugify(row['districtNameEn'])}-{slugify(row['buildingNameEn'])}"
                    f"/detail/{row['propertyID']}/"
                )

                # Throttle before every request so failed or skipped pages are
                # rate-limited too, not only the successful ones.
                time.sleep(random.uniform(1.0, 2.5))

                # Single attempt with a timeout (no retry)
                response = requests.get(url, headers=headers, cookies=cookies, timeout=15)
                if response.status_code != 200:
                    tqdm.write(f"Skipping {row['propertyID']}: HTTP {response.status_code}")
                    continue

                soup = BeautifulSoup(response.content, 'html.parser')
                container = soup.find('section', class_='property-info')
                if not container:
                    tqdm.write(f"Skipping {row['propertyID']}: no property-info section found")
                    continue

                # Extract every labelled value that has a mapping
                property_details = {'source_url': url}
                for col in container.find_all('div', class_='col'):
                    title = col.find('p', class_='col-title')
                    text = col.find('p', class_='col-text') or col.find('div', class_='col-text')

                    if title and text:
                        key = detail_fields.get(title.text.strip())
                        if key:
                            property_details[key] = ' '.join(text.stripped_strings)

                # Add metadata from the listing row
                property_details.update({
                    'property_id': row['propertyID'],
                    'building_name': row['buildingNameEn'],
                    'zone': row['zoneEn'],
                    'district': row['districtNameEn']
                })

                results.append(property_details)

            except Exception as e:
                tqdm.write(f"Error processing {row['propertyID']}: {str(e)}")
                continue
    finally:
        # Runs on normal completion and on interruption alike, so partial
        # progress is never lost.
        combined_data = _merge_and_save_details(existing_data, results, output_csv)

    return combined_data


In [5]:
# Listing produced by the previous scraping step, re-read from disk
base_df = pd.read_csv("centanet_ici_buildings.csv")

# Cookies sent with every request; extend this mapping if more are needed
cookies = dict(gr_user_id="24005ea2-e47e-4e55-baed-171d8f324a03")

# Browser-like request headers (anti-scraping measures)
headers = {
    "sec-ch-ua": '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
    "referer": "https://oir.centanet.com/en/property/search/?pageindex=2&depts=Any&districtids=WS012",
}

# Scrape the detail page of every listed property not already in the output CSV
detail_df = scrape_building_details(base_df, headers, cookies, output_csv="property_details.csv")

print(f"Scraped {len(detail_df)} detailed listings")

Scraping Details:   4%|▎         | 55/1541 [01:49<49:21,  1.99s/it]  


KeyboardInterrupt: 

In [44]:
# Show the (rows, columns) shape of the detail frame as this cell's output.
# NOTE(review): this cell's execution count is out of sequence with the cells above,
# and the preceding scrape was interrupted — the displayed shape presumably comes
# from a different run; confirm after Restart & Run All.
detail_df.shape

(40, 20)