In [15]:
import time
import random
import requests
import pandas as pd
import datetime
from tqdm import tqdm
import urllib.parse
import os
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Configuration
DEFAULT_START_DATE = "2010-01-01"
DEFAULT_END_DATE = datetime.date.today().strftime("%Y-%m-%d")
AREA_CODE_FILE = "Centanet_ICI_Area_Code.xlsx"
BASE_URL = "https://oir.centanet.com/api/Transaction/GetTransactionList"
PAGESIZE = 10000

def get_random_user_agent():
    ua = UserAgent()
    return ua.random

def get_cookies():
    # This is a placeholder. In a real scenario, you'd implement a way to get fresh cookies.
    return {
        "cookie1": f"value1_{random.randint(1000, 9999)}",
        "cookie2": f"value2_{random.randint(1000, 9999)}"
    }

def create_session():
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

def main():
    try:
        start_dt = datetime.datetime.strptime(DEFAULT_START_DATE, "%Y-%m-%d")
        end_dt = datetime.datetime.strptime(DEFAULT_END_DATE, "%Y-%m-%d")
    except Exception as e:
        print("Error parsing dates:", e)
        return

    start_api = start_dt.strftime("%d/%m/%Y")
    end_api = end_dt.strftime("%d/%m/%Y")
    date_range = f"{start_api}-{end_api}"
    date_range_encoded = urllib.parse.quote(date_range)

    try:
        area_df = pd.read_excel(AREA_CODE_FILE, engine="openpyxl")
    except Exception as e:
        print(f"Error reading {AREA_CODE_FILE}:", e)
        return

    output_file = f"{datetime.date.today().strftime('%Y-%m-%d')}_centanet_ici_transaction.csv"
    if os.path.exists(output_file):
        os.remove(output_file)

    session = create_session()
    cookies = get_cookies()

    for idx, row in tqdm(area_df.iterrows(), total=area_df.shape[0], desc="Processing Areas"):
        if idx % 10 == 0:
            cookies = get_cookies()
            session = create_session()

        region = row["Region"]
        district = row["District"]
        code = row["Code"]

        print(f"\nProcessing area: {district} (Code: {code}, Region: {region})")
        page_index = 1
        area_results = []

        while True:
            url = (f"{BASE_URL}?pageindex={page_index}&pagesize={PAGESIZE}"
                   f"&daterang={date_range_encoded}&posttype=B&districtids={code}&lang=EN")
            
            headers = {
                "User-Agent": get_random_user_agent(),
                "Accept": "application/json, text/plain, */*",
                "Accept-Language": "en-US,en;q=0.9",
                "Referer": "https://oir.centanet.com/",
                "Origin": "https://oir.centanet.com",
                "Connection": "keep-alive"
            }

            #print(f"Requesting page {page_index} for area {district} …")
            try:
                response = session.get(url, headers=headers, cookies=cookies, timeout=20)
                response.raise_for_status()
                json_data = response.json()
            except requests.exceptions.RequestException as e:
                print("Error during the API request for area", district, ":", e)
                break

            if json_data.get("responseCode") != 1:
                print("API response not successful; ending pagination for", district)
                break

            items = json_data.get("data", {}).get("recordList", {}).get("items", [])
            if not items:
                #print(f"No items found on page {page_index} for area {district}.")
                break

            for item in items:
                item["Region"] = region
                item["District"] = district
                item["AreaCode"] = code
                area_results.append(item)
            
            print(f"Fetched {len(items)} items on page {page_index} for area {district}.")
            page_index += 1

            time.sleep(random.uniform(3, 5))

        if area_results:
            df_area = pd.DataFrame(area_results)
            df_area.to_csv(output_file, mode="a", index=False, header=not os.path.exists(output_file), encoding="utf-8-sig")
            #print(f"Saved {len(area_results)} items for area {district} into {output_file}.")
        else:
            print(f"No data collected for area {district}.")
        
        time.sleep(random.uniform(5, 7))

    print("\nScraping complete. All data saved in:", output_file)

if __name__ == "__main__":
    main()


Processing Areas:   0%|          | 0/53 [00:00<?, ?it/s]


Processing area: Admiralty (Code: WS005, Region: HK_Island)
Fetched 2527 items on page 1 for area Admiralty.


Processing Areas:   2%|▏         | 1/53 [00:11<09:37, 11.11s/it]


Processing area: Happy Valley (Code: WS048, Region: HK_Island)
Fetched 567 items on page 1 for area Happy Valley.


Processing Areas:   4%|▍         | 2/53 [00:22<09:21, 11.02s/it]


Processing area: Chai Wan (Code: WS012, Region: HK_Island)
Fetched 7672 items on page 1 for area Chai Wan.


Processing Areas:   6%|▌         | 3/53 [00:33<09:25, 11.31s/it]


Processing area: Tin Hau (Code: WS046, Region: HK_Island)
Fetched 1914 items on page 1 for area Tin Hau.


Processing Areas:   8%|▊         | 4/53 [00:45<09:23, 11.51s/it]


Processing area: Taikoo Shing (Code: WS049, Region: HK_Island)
Fetched 122 items on page 1 for area Taikoo Shing.


Processing Areas:   9%|▉         | 5/53 [00:56<09:06, 11.39s/it]


Processing area: Shau Kei Wan (Code: WS011, Region: HK_Island)
Fetched 849 items on page 1 for area Shau Kei Wan.


Processing Areas:  11%|█▏        | 6/53 [01:08<08:56, 11.42s/it]


Processing area: Siu Sai Wan (Code: WS013, Region: HK_Island)
Fetched 88 items on page 1 for area Siu Sai Wan.


Processing Areas:  13%|█▎        | 7/53 [01:58<18:23, 24.00s/it]


Processing area: Aberdeen (Code: WS047, Region: HK_Island)
Fetched 840 items on page 1 for area Aberdeen.


Processing Areas:  15%|█▌        | 8/53 [02:09<14:56, 19.92s/it]


Processing area: Western District (Code: WS001, Region: HK_Island)
Fetched 3545 items on page 1 for area Western District.


Processing Areas:  17%|█▋        | 9/53 [02:21<12:53, 17.58s/it]


Processing area: Causeway Bay (Code: WS007, Region: HK_Island)
Fetched 7932 items on page 1 for area Causeway Bay.


Processing Areas:  19%|█▉        | 10/53 [02:34<11:33, 16.12s/it]


Processing area: Central (Code: WS004, Region: HK_Island)
Fetched 10000 items on page 1 for area Central.
Fetched 3416 items on page 2 for area Central.


Processing Areas:  21%|██        | 11/53 [02:59<13:05, 18.70s/it]


Processing area: North Point (Code: WS008, Region: HK_Island)
Fetched 4418 items on page 1 for area North Point.


Processing Areas:  23%|██▎       | 12/53 [03:24<14:13, 20.83s/it]


Processing area: Quarry Bay (Code: WS009, Region: HK_Island)
Fetched 2263 items on page 1 for area Quarry Bay.


Processing Areas:  25%|██▍       | 13/53 [03:37<12:08, 18.22s/it]


Processing area: Sai Wan Ho (Code: WS010, Region: HK_Island)
Fetched 538 items on page 1 for area Sai Wan Ho.


Processing Areas:  26%|██▋       | 14/53 [03:48<10:36, 16.32s/it]


Processing area: Sheung Wan (Code: WS003, Region: HK_Island)
Fetched 10000 items on page 1 for area Sheung Wan.
Fetched 2165 items on page 2 for area Sheung Wan.


Processing Areas:  28%|██▊       | 15/53 [04:07<10:47, 17.05s/it]


Processing area: Southern District (Code: WS002, Region: HK_Island)
Fetched 7401 items on page 1 for area Southern District.


Processing Areas:  30%|███       | 16/53 [04:18<09:26, 15.31s/it]


Processing area: Wan Chai (Code: WS006, Region: HK_Island)
Fetched 10000 items on page 1 for area Wan Chai.
Fetched 5788 items on page 2 for area Wan Chai.


Processing Areas:  32%|███▏      | 17/53 [04:40<10:22, 17.29s/it]


Processing area: Cheung Sha Wan (Code: WS015, Region: Kowloon)
Fetched 10000 items on page 1 for area Cheung Sha Wan.
Fetched 10000 items on page 2 for area Cheung Sha Wan.
Fetched 3499 items on page 3 for area Cheung Sha Wan.


Processing Areas:  34%|███▍      | 18/53 [05:14<12:54, 22.12s/it]


Processing area: Jordan (Code: WS022, Region: Kowloon)
Fetched 4618 items on page 1 for area Jordan.


Processing Areas:  36%|███▌      | 19/53 [05:27<10:57, 19.33s/it]


Processing area: Kowloon Bay (Code: WS031, Region: Kowloon)
Fetched 10000 items on page 1 for area Kowloon Bay.
Fetched 2994 items on page 2 for area Kowloon Bay.


Processing Areas:  38%|███▊      | 20/53 [05:46<10:38, 19.35s/it]


Processing area: Kowloon Tong (Code: WS024, Region: Kowloon)
Fetched 76 items on page 1 for area Kowloon Tong.


Processing Areas:  40%|███▉      | 21/53 [05:59<09:14, 17.32s/it]


Processing area: Mei Foo (Code: WS014, Region: Kowloon)
Fetched 235 items on page 1 for area Mei Foo.


Processing Areas:  42%|████▏     | 22/53 [06:10<08:06, 15.70s/it]


Processing area: Ho Man Tin (Code: WS045, Region: Kowloon)
Fetched 547 items on page 1 for area Ho Man Tin.


Processing Areas:  43%|████▎     | 23/53 [06:21<07:02, 14.09s/it]


Processing area: San Po Kong (Code: WS030, Region: Kowloon)
Fetched 10000 items on page 1 for area San Po Kong.
Fetched 4124 items on page 2 for area San Po Kong.


Processing Areas:  45%|████▌     | 24/53 [06:47<08:36, 17.80s/it]


Processing area: Tai Kok Tsui (Code: WS018, Region: Kowloon)
Fetched 3893 items on page 1 for area Tai Kok Tsui.


Processing Areas:  47%|████▋     | 25/53 [07:00<07:37, 16.35s/it]


Processing area: Tsim Sha Tsui (Code: WS023, Region: Kowloon)
Fetched 10000 items on page 1 for area Tsim Sha Tsui.
Fetched 10000 items on page 2 for area Tsim Sha Tsui.
Fetched 897 items on page 3 for area Tsim Sha Tsui.


Processing Areas:  49%|████▉     | 26/53 [07:25<08:30, 18.91s/it]


Processing area: Wong Tai Sin (Code: WS029, Region: Kowloon)
Fetched 358 items on page 1 for area Wong Tai Sin.


Processing Areas:  51%|█████     | 27/53 [07:37<07:13, 16.66s/it]


Processing area: Hung Hom (Code: WS027, Region: Kowloon)
Fetched 5006 items on page 1 for area Hung Hom.


Processing Areas:  53%|█████▎    | 28/53 [07:49<06:24, 15.40s/it]


Processing area: Kai Tak Development Area (Code: WS028, Region: Kowloon)
Fetched 54 items on page 1 for area Kai Tak Development Area.


Processing Areas:  55%|█████▍    | 29/53 [08:03<06:00, 15.02s/it]


Processing area: Kowloon City (Code: WS025, Region: Kowloon)
Fetched 540 items on page 1 for area Kowloon City.


Processing Areas:  57%|█████▋    | 30/53 [08:16<05:32, 14.47s/it]


Processing area: Kwun Tong (Code: WS032, Region: Kowloon)
Fetched 10000 items on page 1 for area Kwun Tong.
Fetched 10000 items on page 2 for area Kwun Tong.
Fetched 10000 items on page 3 for area Kwun Tong.
Fetched 10000 items on page 4 for area Kwun Tong.
Fetched 10000 items on page 5 for area Kwun Tong.
Fetched 1912 items on page 6 for area Kwun Tong.


Processing Areas:  58%|█████▊    | 31/53 [09:15<10:07, 27.63s/it]


Processing area: Mongkok (Code: WS020, Region: Kowloon)
Fetched 10000 items on page 1 for area Mongkok.
Fetched 1469 items on page 2 for area Mongkok.


Processing Areas:  60%|██████    | 32/53 [09:47<10:09, 29.04s/it]


Processing area: Prince Edward (Code: WS019, Region: Kowloon)
Fetched 2815 items on page 1 for area Prince Edward.


Processing Areas:  62%|██████▏   | 33/53 [09:58<07:54, 23.73s/it]


Processing area: Sham Shui Po (Code: WS016, Region: Kowloon)
Fetched 2507 items on page 1 for area Sham Shui Po.


Processing Areas:  64%|██████▍   | 34/53 [10:21<07:22, 23.31s/it]


Processing area: To Kwa Wan (Code: WS026, Region: Kowloon)
Fetched 2845 items on page 1 for area To Kwa Wan.


Processing Areas:  66%|██████▌   | 35/53 [10:32<05:57, 19.84s/it]


Processing area: West Kowloon (Code: WS017, Region: Kowloon)
Fetched 121 items on page 1 for area West Kowloon.


Processing Areas:  68%|██████▊   | 36/53 [10:43<04:48, 16.95s/it]


Processing area: Yau Ma Tei (Code: WS021, Region: Kowloon)
Fetched 2540 items on page 1 for area Yau Ma Tei.


Processing Areas:  70%|██████▉   | 37/53 [10:58<04:24, 16.53s/it]


Processing area: Island District (Code: WS044, Region: NT)
Fetched 75 items on page 1 for area Island District.


Processing Areas:  72%|███████▏  | 38/53 [11:12<03:55, 15.69s/it]


Processing area: Lai King (Code: WS043, Region: NT)
Fetched 5 items on page 1 for area Lai King.


Processing Areas:  74%|███████▎  | 39/53 [11:25<03:29, 14.95s/it]


Processing area: Fanling (Code: WS052, Region: NT)
Fetched 2159 items on page 1 for area Fanling.


Processing Areas:  75%|███████▌  | 40/53 [11:38<03:04, 14.20s/it]


Processing area: Sha Tin (Code: WS035, Region: NT)
Fetched 10000 items on page 1 for area Sha Tin.
Fetched 8627 items on page 2 for area Sha Tin.


Processing Areas:  77%|███████▋  | 41/53 [11:59<03:15, 16.27s/it]


Processing area: Ma On Shan (Code: WS054, Region: NT)
Fetched 133 items on page 1 for area Ma On Shan.


Processing Areas:  79%|███████▉  | 42/53 [12:11<02:46, 15.11s/it]


Processing area: Tseung Kwan O (Code: WS037, Region: NT)
Fetched 455 items on page 1 for area Tseung Kwan O.


Processing Areas:  81%|████████  | 43/53 [12:24<02:25, 14.58s/it]


Processing area: Tsuen Wan (Code: WS040, Region: NT)
Fetched 10000 items on page 1 for area Tsuen Wan.
Fetched 10000 items on page 2 for area Tsuen Wan.
Fetched 7576 items on page 3 for area Tsuen Wan.


Processing Areas:  83%|████████▎ | 44/53 [12:54<02:51, 19.03s/it]


Processing area: Yuen Long (Code: WS038, Region: NT)
Fetched 4900 items on page 1 for area Yuen Long.


Processing Areas:  85%|████████▍ | 45/53 [13:06<02:16, 17.00s/it]


Processing area: Kwai Chung (Code: WS042, Region: NT)
Fetched 10000 items on page 1 for area Kwai Chung.
Fetched 10000 items on page 2 for area Kwai Chung.
Fetched 10000 items on page 3 for area Kwai Chung.
Fetched 3914 items on page 4 for area Kwai Chung.


Processing Areas:  87%|████████▋ | 46/53 [13:46<02:46, 23.74s/it]


Processing area: Sheung Shui (Code: WS051, Region: NT)
Fetched 1999 items on page 1 for area Sheung Shui.


Processing Areas:  89%|████████▊ | 47/53 [13:58<02:02, 20.42s/it]


Processing area: Sai Kung (Code: WS036, Region: NT)
Fetched 237 items on page 1 for area Sai Kung.


Processing Areas:  91%|█████████ | 48/53 [14:12<01:32, 18.43s/it]


Processing area: Tai Wai (Code: WS053, Region: NT)
Fetched 805 items on page 1 for area Tai Wai.


Processing Areas:  92%|█████████▏| 49/53 [14:23<01:04, 16.07s/it]


Processing area: Tai Po (Code: WS034, Region: NT)
Fetched 1477 items on page 1 for area Tai Po.


Processing Areas:  94%|█████████▍| 50/53 [14:33<00:42, 14.28s/it]


Processing area: Tsing Yi (Code: WS041, Region: NT)
Fetched 1317 items on page 1 for area Tsing Yi.


Processing Areas:  96%|█████████▌| 51/53 [14:45<00:27, 13.73s/it]


Processing area: Tuen Mun (Code: WS039, Region: NT)
Fetched 10000 items on page 1 for area Tuen Mun.
Fetched 1515 items on page 2 for area Tuen Mun.


Processing Areas:  98%|█████████▊| 52/53 [15:04<00:15, 15.20s/it]


Processing area: Tin Shui Wai (Code: WS050, Region: NT)
Fetched 488 items on page 1 for area Tin Shui Wai.


Processing Areas: 100%|██████████| 53/53 [15:17<00:00, 17.31s/it]


Scraping complete. All data saved in: 2025-02-27_centanet_ici_transaction.csv





In [17]:
#simple cleansing
import pandas as pd 

df_clean = pd.read_csv('2025-02-27_centanet_ici_transaction.csv')
print(df_clean.shape)

  df_clean = pd.read_csv('2025-02-27_centanet_ici_transaction.csv')


(348777, 34)


In [None]:
import ast

df_cleansed = df_clean[['id', 'deptDisplayName', 'centabldg', 'transactionDate', 'transactionType',
                        'propertyNameCn', 'propertyNameEn', 'propertyUsageDisplayName','floor',
                        'unit', 'isPriceEstimated', 'transactionArea', 'sourceDisplayName',
                        'priceInfo', 'ibsContractID', 'addressDisplayName', 'Region', 'District',
                        'AreaCode']]

def safe_get_value(x, key, default=None):
    if isinstance(x, str):
        try:
            x = ast.literal_eval(x)
        except (SyntaxError, ValueError):
            return default
    if isinstance(x, dict):
        return x.get(key, default)
    return default

# Use .loc columns assignment on the DataFrame copy
df_cleansed.loc[:, 'price'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'price'))
# df_cleansed.loc[:, 'priceDisplayName'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'priceDisplayName'))
df_cleansed.loc[:, 'pricePostTypeDisplayName'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'pricePostTypeDisplayName'))
df_cleansed.loc[:, 'avgPrice'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'avgPrice'))
#df_cleansed.loc[:, 'avgPriceDisplayName'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'avgPriceDisplayName'))
df_cleansed.loc[:, 'rental'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'rental'))
# df_cleansed.loc[:, 'rentalDisplayName'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'rentalDisplayName'))
df_cleansed.loc[:, 'rentPostTypeDisplayName'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'rentPostTypeDisplayName'))
df_cleansed.loc[:, 'avgRental'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'avgRental'))
#df_cleansed.loc[:, 'avgRentalDisplayName'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'avgRentalDisplayName'))
#df_cleansed.loc[:, 'gains_Price'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'gains_Price'))
#df_cleansed.loc[:, 'gains_Rental'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'gains_Rental'))
#df_cleansed.loc[:, 'priceTo'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'priceTo'))
#df_cleansed.loc[:, 'rentalTo'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'rentalTo'))
#df_cleansed.loc[:, 'unitPriceTo'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'unitPriceTo'))
#df_cleansed.loc[:, 'unitRentalTo'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'unitRentalTo'))
#df_cleansed.loc[:, 'priceDesc'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'priceDesc'))
#df_cleansed.loc[:, 'fhPriceInfo'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'fhPriceInfo'))
#df_cleansed.loc[:, 'fhAvgPriceInfo'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'fhAvgPriceInfo'))
#df_cleansed.loc[:, 'fhRentInfo'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'fhRentInfo'))
#df_cleansed.loc[:, 'fhAvgRentInfo'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'fhAvgRentInfo'))

# Optionally, drop the original priceInfo containing the dictionaries
df_cleansed.drop(columns=['priceInfo'], inplace=True)

df_cleansed.to_excel('2025-02-27_centanet_ici_transaction.xlsx', index=False)

print(f'After Selecting Columns, the updated shape is: {df_cleansed.shape}.')
	
					
			
#436.5MB csv -> 53.1MB xlsx -> 51MB xlsx

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleansed.loc[:, 'price'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'price'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleansed.loc[:, 'pricePostTypeDisplayName'] = df_cleansed['priceInfo'].apply(lambda x: safe_get_value(x, 'pricePostTypeDisplayName'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

After Selecting Columns, the updated shape is: (348777, 24).


NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location
