In [1]:
# Midland ICI Transaction #1 
# https://www.midlandici.com.hk/ics/property/transaction/json?ics_type=&date_min=2000-01-01&date_max=2025-02-12&lang=english&page_size=50000&cursor=1&order=tx_date-desc

In [4]:
import requests
import csv
import time
import random
from dateutil.relativedelta import relativedelta
from datetime import datetime
from tqdm import tqdm
import json

# Configuration
# JSON endpoint that scrape_month() sends every request to.
BASE_URL = "https://www.midlandici.com.hk/ics/property/transaction/json"
# HTTP headers attached to every request: a desktop-Chrome User-Agent string,
# a JSON-first Accept value and the site's home page as Referer.
# NOTE(review): presumably these make the endpoint treat the script like a
# browser session - confirm whether all three are actually required.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "application/json, text/plain, */*",
    "Referer": "https://www.midlandici.com.hk/"
}
# Sent as the `page_size` query parameter; scrape_month() also treats a page
# with fewer records than this as the last page.
# NOTE(review): the "allowed by API" limit below is not verifiable from this
# notebook (the sample URL in the first cell uses page_size=50000) - confirm.
MAX_PAGE_SIZE = 20000  # Maximum page size allowed by API


def generate_month_ranges(start_date, end_date):
    """
    Yield one (date_min, date_max) pair per calendar month, newest month first.

    The window [start_date, end_date] is cut at month boundaries. The newest
    range ends on end_date and the oldest range starts on start_date, so no
    range reaches outside the requested window. Only the calendar day of each
    bound is used; any time-of-day component is ignored.

    Args:
        start_date (date | datetime): First day to include.
        end_date (date | datetime): Last day to include.

    Yields:
        tuple[str, str]: Range bounds formatted as YYYY-MM-DD. Nothing is
        yielded when end_date falls before start_date.
    """
    # Local import: the surrounding cell only imports the `datetime` class.
    from datetime import datetime as _datetime, timedelta

    def _as_day(value):
        # Compare calendar days only. With full datetimes, a start bound whose
        # clock time is later than the end bound's would drop the final day.
        return value.date() if isinstance(value, _datetime) else value

    first_day = _as_day(start_date)
    current = _as_day(end_date)

    while current >= first_day:
        month_first = current.replace(day=1)
        # Clamp the oldest range so a mid-month start_date is respected
        # instead of silently widening the window to the 1st of that month.
        range_start = max(month_first, first_day)
        yield (range_start.strftime("%Y-%m-%d"), current.strftime("%Y-%m-%d"))
        # Step to the last day of the previous month.
        current = month_first - timedelta(days=1)


def scrape_month(date_min, date_max, max_retries=5, timeout=30, retry_delay=5):
    """
    Scrape data for one date range using cursor-based pagination.

    Pages are requested from BASE_URL with an increasing ``cursor`` until one
    of the following happens: the response carries no transactions, the
    number of collected records reaches the ``count`` reported by the
    response, or a page comes back with fewer than MAX_PAGE_SIZE records.

    Args:
        date_min (str): Start date in YYYY-MM-DD format.
        date_max (str): End date in YYYY-MM-DD format.
        max_retries (int): How many times a failing page is retried before
            the error is re-raised to the caller.
        timeout (float): Seconds handed to ``requests.get`` as ``timeout`` so
            a stalled connection fails instead of hanging.
        retry_delay (float): Seconds to wait before retrying a failed page.

    Returns:
        list: The transaction records collected for the range, in the order
        the pages returned them.

    Raises:
        requests.exceptions.RequestException: The same page failed more than
            ``max_retries`` times in a row (network error, timeout or HTTP
            error status).
        json.JSONDecodeError: The same page repeatedly returned a body that
            is not valid JSON.
    """
    all_results = []
    # NOTE(review): the cursor is advanced as a 1-based page number - confirm
    # this matches the endpoint's pagination contract.
    cursor = 1
    failures = 0  # consecutive failed attempts for the current page

    with tqdm(desc=f"Scraping {date_min} to {date_max}", leave=False) as pbar:
        while True:
            params = {
                "ics_type": "",
                "date_min": date_min,
                "date_max": date_max,
                "lang": "english",
                "page_size": MAX_PAGE_SIZE,
                "cursor": cursor,
                "order": "tx_date-desc"
            }

            try:
                response = requests.get(BASE_URL, headers=HEADERS, params=params, timeout=timeout)
                response.raise_for_status()
                data = response.json()
            except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
                failures += 1
                if failures > max_retries:
                    # A permanent failure (e.g. HTTP 4xx) would otherwise
                    # retry forever; surface it instead.
                    print(f"Error: {e}. Giving up after {max_retries} retries.")
                    raise
                print(f"Error: {e}. Retrying...")
                time.sleep(retry_delay)
                continue

            failures = 0
            page = data.get('transactions')
            if not page:
                break

            all_results.extend(page)
            expected_total = data.get('count')
            pbar.total = expected_total
            pbar.update(len(page))
            pbar.set_postfix_str(f"Cursor {cursor} | Total {len(all_results)}")

            # Stop once the reported total is reached or the page is short.
            # A missing/null count must not end pagination on its own, so in
            # that case only the short-page test applies.
            reached_total = expected_total is not None and len(all_results) >= expected_total
            if reached_total or len(page) < MAX_PAGE_SIZE:
                break

            cursor += 1
            # Randomised pause between pages to avoid hammering the server.
            time.sleep(random.uniform(0.5, 1.5))

    return all_results


def save_to_csv(data, filename):
    """
    Save scraped data to a CSV file.

    The header is the union of the keys of all records, in first-seen order,
    so a record that carries a key the first record lacks no longer aborts
    the export. Records missing a column get an empty cell for it.

    Args:
        data (list): List of dictionaries containing transaction data.
        filename (str): Name of the output CSV file.

    Returns:
        None. When ``data`` is empty a notice is printed and no file is
        created.
    """
    if not data:
        print("No data to save.")
        return

    # dict.fromkeys de-duplicates while keeping insertion order; taking only
    # data[0].keys() makes DictWriter raise ValueError on any extra key.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    # utf-8-sig writes a BOM at the start of the file.
    with open(filename, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

    print(f"Data saved to {filename}")


def scrape_all_data(start_date, end_date):
    """
    Collect every transaction between two dates, one month at a time.

    The window is split with generate_month_ranges(), each piece is fetched
    with scrape_month(), and the per-month results are concatenated in the
    order the ranges are produced. Progress is shown on a tqdm bar.

    Args:
        start_date (datetime): Start date for scraping.
        end_date (datetime): End date for scraping.

    Returns:
        list: A list of all transaction data scraped.
    """
    # Materialise the generator so tqdm knows how many months there are.
    month_windows = [*generate_month_ranges(start_date, end_date)]
    collected = []

    with tqdm(month_windows, desc="Processing Months") as progress:
        for window_start, window_end in progress:
            collected += scrape_month(window_start, window_end)
            progress.set_postfix_str(f"Total Records: {len(collected)}")

    return collected


if __name__ == "__main__":
    # Scrape window: from the start of 2000 up to the moment this cell runs.
    START_DATE, END_DATE = datetime(2000, 1, 1), datetime.now()

    # Download every month in the window into one list of records.
    all_data = scrape_all_data(START_DATE, END_DATE)

    # Write a single consolidated CSV; the timestamp in the name keeps
    # successive runs from overwriting each other.
    final_filename = f"midlandici_all_transactions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    save_to_csv(data=all_data, filename=final_filename)

    print(f"\nTotal scraped transactions: {len(all_data)}")


Processing Months: 100%|██████████| 303/303 [02:28<00:00,  2.05it/s, Total Records: 316121]


Data saved to all_transactions_20250330_191952.csv

Total scraped transactions: 316121


In [1]:
import requests
import time
import random
from dateutil.relativedelta import relativedelta
from datetime import datetime
from tqdm import tqdm
import json
import pandas as pd

def _month_ranges_desc(start_date, end_date):
    """Yield (date_min, date_max) YYYY-MM-DD pairs per calendar month, newest first, clamped to the window."""
    # Local import: the surrounding cell only imports the `datetime` class.
    from datetime import timedelta

    # Compare calendar days only so a time-of-day component cannot drop a day.
    first_day = start_date.date() if isinstance(start_date, datetime) else start_date
    current = end_date.date() if isinstance(end_date, datetime) else end_date

    while current >= first_day:
        month_first = current.replace(day=1)
        # Clamp so a mid-month start_date is not widened to the 1st.
        range_start = max(month_first, first_day)
        yield (range_start.strftime("%Y-%m-%d"), current.strftime("%Y-%m-%d"))
        current = month_first - timedelta(days=1)


def _scrape_month_pages(date_min, date_max, base_url, headers, max_page_size,
                        max_retries, timeout, retry_delay):
    """Download every page of one date range and return its transaction records as a list."""
    month_data = []
    # NOTE(review): the cursor is advanced as a 1-based page number - confirm
    # this matches the endpoint's pagination contract.
    cursor = 1
    failures = 0  # consecutive failed attempts for the current page

    with tqdm(desc=f"Scraping {date_min} to {date_max}", leave=False) as pbar:
        while True:
            params = {
                "ics_type": "",
                "date_min": date_min,
                "date_max": date_max,
                "lang": "english",
                "page_size": max_page_size,
                "cursor": cursor,
                "order": "tx_date-desc"
            }

            try:
                response = requests.get(base_url, headers=headers, params=params, timeout=timeout)
                response.raise_for_status()
                data = response.json()
            except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
                failures += 1
                if failures > max_retries:
                    # A permanent failure would otherwise retry forever.
                    print(f"Error: {e}. Giving up after {max_retries} retries.")
                    raise
                print(f"Error: {e}. Retrying...")
                time.sleep(retry_delay)
                continue

            failures = 0
            page = data.get('transactions')
            if not page:
                break

            month_data.extend(page)
            expected_total = data.get('count')
            pbar.total = expected_total
            pbar.update(len(page))
            pbar.set_postfix_str(f"Cursor {cursor} | Total {len(month_data)}")

            # Stop once the reported total is reached or the page is short;
            # a missing/null count leaves only the short-page test.
            reached_total = expected_total is not None and len(month_data) >= expected_total
            if reached_total or len(page) < max_page_size:
                break

            cursor += 1
            # Randomised pause between pages to avoid hammering the server.
            time.sleep(random.uniform(0.5, 1.5))

    return month_data


def scrape_midlandici_transactions(start_date=datetime(2000, 1, 1), end_date=None,
                                  base_url="https://www.midlandici.com.hk/ics/property/transaction/json",
                                  headers=None, max_page_size=20000,
                                  max_retries=5, timeout=30, retry_delay=5):
    """
    Scrape all transaction data from Midland ICI website and return as pandas DataFrame.

    The window [start_date, end_date] is split into calendar months (newest
    first, clamped to the window) and each month is downloaded page by page.

    Args:
        start_date (datetime): Start date for scraping.
        end_date (datetime | None): End date for scraping. ``None`` (the
            default) means "now", resolved when the function is called
            rather than when it was defined.
        base_url (str): API endpoint URL.
        headers (dict): HTTP headers for the request. ``None`` selects a
            built-in browser-like header set.
        max_page_size (int): Maximum number of records per page.
        max_retries (int): How many times a failing page is retried before
            the error is re-raised.
        timeout (float): Seconds handed to ``requests.get`` as ``timeout``.
        retry_delay (float): Seconds to wait before retrying a failed page.

    Returns:
        pandas.DataFrame: DataFrame containing all transaction data, one row
        per record, in download order.

    Raises:
        requests.exceptions.RequestException: A page failed more than
            ``max_retries`` times in a row.
        json.JSONDecodeError: A page repeatedly returned invalid JSON.
    """
    # Resolve "now" per call; a datetime.now() default in the signature is
    # frozen at definition time and goes stale in a long-lived kernel.
    if end_date is None:
        end_date = datetime.now()

    # Set default headers if none provided
    if headers is None:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Referer": "https://www.midlandici.com.hk/"
        }

    month_ranges = list(_month_ranges_desc(start_date, end_date))
    all_transactions = []

    # Scrape data for each month range
    with tqdm(month_ranges, desc="Processing Months") as month_pbar:
        for date_min, date_max in month_pbar:
            month_data = _scrape_month_pages(
                date_min, date_max, base_url, headers, max_page_size,
                max_retries, timeout, retry_delay,
            )
            all_transactions.extend(month_data)
            month_pbar.set_postfix_str(f"Trans: {len(all_transactions)}")

    return pd.DataFrame(all_transactions)


In [2]:
if __name__ == "__main__":
    # Scrape window: from the start of 2000 up to the moment this cell runs.
    START_DATE, END_DATE = datetime(2000, 1, 1), datetime.now()

    # Fetch the whole window straight into a DataFrame.
    transactions_df = scrape_midlandici_transactions(START_DATE, END_DATE)

    # Quick sanity check: first rows and overall dimensions.
    print(transactions_df.head())
    print(f"DataFrame shape: {transactions_df.shape}")

    # To persist the result, uncomment:
    # transactions_df.to_csv("transactions.csv", index=False)


Processing Months:   5%|▌         | 16/304 [00:08<02:37,  1.83it/s, Total Records: 16491]


KeyboardInterrupt: 