In [16]:
import requests
import csv
import time
from tqdm import tqdm
import pandas as pd
import os

# Input workbook listing the districts to query, and the CSV that collects the results
excel_file = "Centanet_ICI_Area_Code.xlsx"
csv_filename = "centanet_ici_buildings.csv"

# Load the district table from Sheet1 (columns: Region, District, Code)
df = pd.read_excel(excel_file, sheet_name="Sheet1")

# CSV column order: the queried district context first, then the per-building
# attributes, the area info, and finally the listing/transaction counters
fields = (
    ["queriedDistrict", "queriedCode"]
    + ["propertyID", "buildingNameEn", "address", "developers"]
    + ["opDateDisplayName", "floorDisplayName"]
    + ["districtNameEn", "zoneEn"]
    + ["sellCount", "rentCount", "transCount"]
)

# Start a fresh CSV with just the header row; an existing file is left
# untouched so that later appends keep whatever was collected before
csv_is_missing = not os.path.exists(csv_filename)
if csv_is_missing:
    with open(csv_filename, mode="w", newline="", encoding="utf-8") as header_file:
        csv.writer(header_file).writerow(fields)

# Origin shared by the API endpoint and the referer header
_SITE_ORIGIN = "https://oir.centanet.com"

# Endpoint that returns the paginated property list
base_url = f"{_SITE_ORIGIN}/api/Property/GetPropertyList"

# Browser-like request headers (anti-scraping measures), assembled in the
# same order as a flat literal: client hints, fetch metadata, UA, referer
_client_hints = {
    "sec-ch-ua": '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
}
_fetch_metadata = {
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
}
headers = {
    **_client_hints,
    **_fetch_metadata,
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/133.0.0.0 Safari/537.36"
    ),
    "referer": f"{_SITE_ORIGIN}/en/property/search/?pageindex=2&depts=Any&districtids=WS012",
}

# Cookies sent with every request; add further entries here if needed
cookies = dict(gr_user_id="24005ea2-e47e-4e55-baed-171d8f324a03")

# Seconds to wait on a single HTTP request before treating it as failed;
# without a timeout, requests.get() can block forever on a stalled connection
REQUEST_TIMEOUT = 30


def _extract_items(payload):
    """Return the list at payload["data"]["items"], or [] if absent or malformed."""
    if not isinstance(payload, dict):
        return []
    data = payload.get("data")
    if not isinstance(data, dict):
        return []
    items = data.get("items")
    return items if isinstance(items, list) else []


def _build_row(item, queried_district, queried_code):
    """Flatten one API item into a dict keyed by the CSV column names."""
    # `or` also covers an explicit JSON null, which .get(key, default) would
    # pass through unchanged and then fail on .get()/join()
    area_info = item.get("areaInfo") or {}
    developers = item.get("developers") or []
    if isinstance(developers, str):
        # A bare string would otherwise be joined character by character
        developers = [developers]
    return {
        "queriedDistrict": queried_district,
        "queriedCode": queried_code,
        "propertyID": item.get("propertyID"),
        "buildingNameEn": item.get("buildingNameEn"),
        "address": item.get("address"),
        # Developers arrive as a list; store them as one comma-separated string
        "developers": ", ".join(str(dev) for dev in developers),
        "opDateDisplayName": item.get("opDateDisplayName"),
        "floorDisplayName": item.get("floorDisplayName"),
        "districtNameEn": area_info.get("districtNameEn"),
        "zoneEn": area_info.get("zoneEn"),
        "sellCount": item.get("sellCount"),
        "rentCount": item.get("rentCount"),
        "transCount": item.get("transCount"),
    }


# Iterate over each district in the Excel file using tqdm
for _, district_row in tqdm(df.iterrows(), total=len(df), desc="Processing Districts"):
    queried_district = district_row["District"]
    queried_code = district_row["Code"]
    page_index = 1

    # Walk the pages of this district until an empty page or an error is hit
    while True:
        # Build the query parameters for each page
        params = {
            "PageSize": 24,
            "pageindex": page_index,
            "depts": "Any",
            "districtids": queried_code,
            "lang": "EN",
        }

        try:
            response = requests.get(
                base_url,
                headers=headers,
                cookies=cookies,
                params=params,
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as e:
            # Covers connection errors and timeouts; give up on this district only
            print(f"Error fetching {queried_district} (code: {queried_code}) page {page_index}: {e}")
            break

        if response.status_code != 200:
            print(f"Non-200 status for {queried_district} (code: {queried_code}) page {page_index}: {response.status_code}")
            break

        try:
            result_json = response.json()
        except ValueError as e:
            # JSON decode errors raised by response.json() derive from ValueError
            print(f"JSON parsing error for {queried_district} (code: {queried_code}) page {page_index}: {e}")
            break

        # Check for items; if empty then we assume the last page has been reached
        items = _extract_items(result_json)
        if not items:
            break

        # Open CSV in append mode so results are written immediately (protecting against data loss)
        with open(csv_filename, mode='a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fields)
            writer.writerows(
                _build_row(item, queried_district, queried_code) for item in items
            )

        page_index += 1
        time.sleep(1)  # Pause between requests as an anti-scraping measure

print("Scraping complete. Data saved to", csv_filename)


Processing Districts: 100%|██████████| 53/53 [02:36<00:00,  2.96s/it]

Scraping complete. Data saved to centanet_ici_buildings.csv



