In [1]:
# Midland Residential Transaction Data

In [5]:
import json 
import requests
import csv
import time
import os
import pandas as pd 
from tqdm import tqdm
from bs4 import BeautifulSoup
import glob 
from datetime import datetime

# Define a function to handle web requests 
def get_soup(url, params=None):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 \
            Satari/537.36 Edg/131.0.0.0', 
        'authorization':'''Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJndWlkIjoibXItMjAyNC0xMi0wNy0tLVlWN0hKU2QxelRzOHpwVDhJNEdjdGxLcjQ1Z0l4cWhsdVp3SEdvZXVSX1o3RkU2cmh1Q1NjVVpqM1E3SXIzZWVQSmZpMy1JSSIsImF1ZCI6Im15cGFnZWFwcC1tbm5rYiIsInN1YiI6Im1yLTIwMjQtMTItMDctLS1ZVjdISlNkMXpUczh6cFQ4STRHY3RsS3I0NWdJeHFobHVad0hHb2V1Ul9aN0ZFNnJodUNTY1VaajNRN0lyM2VlUEpmaTMtSUkiLCJpYXQiOjE3MzM1NDk0MjUsImV4cCI6MTc2ODEwOTQyNSwiaXNzIjoiZGF0YS5taWRsYW5kLmNvbS5oayJ9.LOOVgc_Nw7OPNnAlB8iC1kRHL0W8UVNVa0GaJYaxTxVZtO33ZbkR64rxMHSifvZOzYr38aJENj-SDIbkq4Y75CxqMPegyBUgHtaub-Fez5qaH2W0Dz71pUdYijDG3rB4Dkbdf8k21QsHerJmOFnpryzTVnZDxv-3g8Lmjz2WUhmrqMamKox3w-T9wRJ4p_wzcJwvXWgtvxkapr3Ep0YSJy3fJsV-Nwm_QiJf2JR0V4rOAu7f-YLMSy7IYje3W-HvVqAZV2cDphg_cYnf6CpirJPu_ix2z6BtIMpYMXeSiZyZtKCHiWFNtUm6QTD2adArWtLl_NvbgcH9mhVYuWi8NcrZBdBh4c72bSNRm104oEbRb9-vb1AylH2oFkEz33xXXEAJRtbQxoQ3qZj_yoDIexrinOSlkJB50fSu98Xizv9eZstnbtzkgVjfKpOAWQFdHKennjN9Azq6yTlejDVspL7A0JsY4ZlO4HQNdkNhiOQDYypHgx8jQMm0B0rbaa0cEz1S0s43Lh01eNVBN9Is35jAWFsJIP-iLvHqXJ9d0pGoHe0N7PQk2dmLo9E5szP0U04MZxt4m9TEpJkn-0uS_ZDSVABlBU2KGIkTmuzm1VltsDhPhoNrbJBJVdxJJdublpDnVFk8aO1gFWKNzptw48ipmLfpRosynC_x3Ud6QMU'''
    }

    response = requests.get(url, headers=headers, params=params)
    response.raise_for_status() # Will raise HTTPError for bad requests (4XX or 5XX)
    #return BeautifulSoup(response.text, 'html.parser')
    return response.json()

# Define a field map for nested fields
FIELD_MAP = {
    'id': ['id'],
    'region_id': ['region', 'id'],
    'region_name': ['region', 'name'],
    'subregion_id': ['subregion', 'id'],
    'subregion': ['subregion', 'name'],
    'district_id': ['district', 'id'],
    'district': ['district', 'name'],
    'sm_district_id': ['sm_district', 'id'],
    'sm_district': ['sm_district', 'name'],
    'combined_district_id': ['combined_district', 'id'],
    'combined_district': ['combined_district', 'name'],
    'int_district_id': ['int_district', 'id'],
    'int_district': ['int_district', 'name'],
    'int_sm_district_id': ['int_sm_district', 'id'],
    'int_sm_district': ['int_sm_district', 'name'],
    'estate_id': ['estate', 'id'],
    'estate': ['estate', 'name'],
    'building_id': ['building', 'id'],
    'building': ['building', 'name'],
    'building_first_op_date': ['building', 'first_op_date'],
    'unit': ['unit', 'id'],
    'floor': ['floor'],
    'floor_level': ['floor_level', 'name'],
    'floor_level_id': ['floor_level', 'id'],
    'flat': ['flat'],
    'area': ['area'],
    'net_area': ['net_area'],
    'price': ['price'],
    'tags': ['tags'],
    'tx_date': ['tx_date'],
    'tx_type': ['tx_type'],
    'last_tx_date': ['last_tx_date'],
    'holding_period': ['holding_period'],
    'last_price': ['last_price'],
    'mkt_type': ['mkt_type'],
    'source': ['source'],
    'original_source': ['original_source'],
    'update_date': ['update_date'],
    'gain': ['gain'],
    'transaction_type': ['transaction_type'],
    'url_desc': ['url_desc'],
    'location': ['location']
}
    
# Function to extract fields based on a mapping
def extract_fields(result, field_map):
    row = {}
    for key, path in field_map.items():
        temp = result
        for p in path:
            temp = temp.get(p, None)
            if temp is None:  # Stop if any part of the path is missing
                break
        row[key] = temp
    return row


def scrape_data(district_ids, base_url, params):
    all_data = []
    for district_id in tqdm(district_ids, desc='Scraping district IDs:'):
        params['intsmdist_ids'] = district_id
        page = 1
        while True:
            params['page'] = page
            try:
                data = get_soup(base_url, params)  # Use updated get_soup function
                #print(f"Scraping page {page} for district ID {district_id}")
                
                results = data.get('result', [])
                if not results:
                    #print(f"No more results found for district ID {district_id} on page {page}")
                    break
                
                for result in results:
                    row = extract_fields(result, FIELD_MAP)
                    if any(row.values()):
                        all_data.append(row)
                
                page += 1
                time.sleep(3)  # Avoid rate-limiting
                
            except Exception as e:
                print(f"Error occurred while scraping district ID {district_id}, page {page}: {e}")
                break

    return all_data

def save_to_csv(data, filename):
    if data:
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f'Data saved ({len(data)} records) to {filename}')
        
# Read the district IDs from the file
def read_district_ids_from_excel(filename, column_name):
    df = pd.read_excel(filename, usecols=[column_name])
    return df[column_name].dropna().unique().tolist()

# Script Execution 
base_url = "https://data.midland.com.hk/search/v2/transactions"
params = {
    "ad": "true",
    "chart": "true",
    "lang": "en",
    "currency": "HKD",
    "unit": "feet",
    "search_behavior": "normal",
    "tx_date": "3year",
    "limit": 1000  # Number of records per page
}

# Read district IDs from Excel file (ensure file exists)
district_ids = read_district_ids_from_excel('midland_res_area_code.xlsx', 'm_idstrict_code')

# Scrape data and save to CSV
scraped_data = scrape_data(district_ids, base_url, params)
save_to_csv(scraped_data, 'midland_res_transaction_data.csv')


Scraping district IDs::  84%|████████▍ | 109/130 [17:04<04:19, 12.34s/it]

Error occurred while scraping district ID 130ND30023, page 4: 504 Server Error: Gateway Time-out for url: https://data.midland.com.hk/search/v2/transactions?ad=true&chart=true&lang=en&currency=HKD&unit=feet&search_behavior=normal&tx_date=3year&limit=1000&intsmdist_ids=130ND30023&page=4


Scraping district IDs:: 100%|██████████| 130/130 [20:24<00:00,  9.42s/it]


Data saved (201135 records) to midland_res_transaction_data.csv
