In [4]:
import requests
import csv
import json
import time
from tqdm import tqdm

# Endpoint queried for the office-building listing, plus request headers
# that resemble the ones a browser sends.
base_url = "https://www.leasinghub.com/office/buildings"
headers = {
    "Accept": "application/json, text/plain, */*",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
}

# Query-string parameters sent with every call. Insertion order is kept
# unchanged; `limit` is also used later as the step between successive
# `limitstart` offsets.
params = dict(
    task="buildings.fetch",
    format="json",
    with_images=1,
    with_prices=1,
    limit=20,
    filter_order="default",
    filter_order_Dir="ASC",
    usage=1,
    is_new="",
    year="",
    keyword="",
    starting="",
)

# Destination CSV and the number of attempts allowed for each page request.
output_file = "leasinghub_office_buildings.csv"
max_retries = 3

# Column titles for the CSV (adjust together with the row-building code).
csv_columns = (
    "ID", "Name", "Street Name", "Area Name", "Year Built",
    "Highest Floor", "Typical Floor Area", "District Name",
    "Latitude", "Longitude", "Grade",
    "Min Rent", "Max Rent", "Min Price", "Max Price",
)

# Mode "w" truncates any previous output, so the file starts with only the
# header row; data rows are appended afterwards.
with open(output_file, mode="w", newline="", encoding="utf-8") as header_file:
    csv.writer(header_file).writerow(csv_columns)

# --- Step 1: Get initial page to determine total pages ---
# This first call is made to read the pager metadata. Any failure here aborts
# the run, because the page count drives the main loop.
params["limitstart"] = 0
try:
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response_init = requests.post(base_url, headers=headers, params=params, timeout=30)
    # Fail on 4xx/5xx instead of trying to interpret an error page.
    response_init.raise_for_status()
except requests.RequestException as e:
    print(f"Initial request failed: {e}")
    # SystemExit is raised directly: the bare `exit` name is a helper that the
    # `site` module provides for interactive sessions, not for program logic.
    raise SystemExit(1)

content_type_init = response_init.headers.get("Content-Type", "")
if not content_type_init.startswith("application/json"):
    print(f"Initial response is not JSON. Content-Type: {content_type_init}")
    raise SystemExit(1)

try:
    data_init = response_init.json()
except ValueError as e:
    # ValueError is the common base of the JSON decode errors that
    # Response.json() can raise.
    print(f"JSON decode error on initial request: {e}")
    raise SystemExit(1)

# Extract pager information to set up pagination. Each level is type-checked
# so that a null or list where an object was expected is reported as
# "could not determine total pages" rather than an AttributeError.
data_section = data_init.get("data") if isinstance(data_init, dict) else None
pager = data_section.get("pager") if isinstance(data_section, dict) else None
if not isinstance(pager, dict):
    pager = {}

# The value is passed to range() later, so coerce it to an int up front.
try:
    pages_total = int(pager.get("pagesTotal") or 0)
except (TypeError, ValueError):
    pages_total = 0
if pages_total <= 0:
    print("Could not determine total pages from the initial response.")
    raise SystemExit(1)

# Optionally, print a summary of the overall data
total_items = pager.get("total", "unknown")
print(f"Found {total_items} items across ~{pages_total} pages.")

# --- Step 2: Loop through each page with tqdm ---
def _nested_field(record, group, key):
    """Return record[group][key], or None if the group is absent or not a dict."""
    section = record.get(group)
    return section.get(key) if isinstance(section, dict) else None


def _fetch_page_json(limitstart):
    """Request one page and return its decoded JSON, or None if it must be skipped.

    Network errors, HTTP error statuses and JSON decode errors are retried up
    to ``max_retries`` attempts with a 2-second pause between attempts.
    A successful response whose Content-Type is not JSON is not retried.
    """
    params["limitstart"] = limitstart
    for attempt in range(1, max_retries + 1):
        try:
            # Without a timeout, a stalled connection would block the whole run.
            response = requests.post(base_url, headers=headers, params=params, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Request failed at limitstart {limitstart}: {e}")
        else:
            content_type = response.headers.get("Content-Type", "")
            if not content_type.startswith("application/json"):
                # Not retried: the page is simply skipped.
                print(f"Non-JSON response at limitstart {limitstart}. Content-Type: {content_type}")
                return None
            try:
                return response.json()
            except ValueError as e:
                # ValueError is the common base of the decode errors json() raises.
                print(f"JSON decode error at limitstart {limitstart}: {e}")
        # Pause only if another attempt is coming.
        if attempt < max_retries:
            time.sleep(2)
    print(f"Giving up on limitstart {limitstart} after {max_retries} attempts.")
    return None


for page in tqdm(range(pages_total), desc="Scraping pages", unit="page"):
    current_limitstart = page * params["limit"]
    data = _fetch_page_json(current_limitstart)

    if data is None:
        print(f"Skipping page with limitstart {current_limitstart}.")
        continue

    # Type-check each level so a null or list in place of an object counts as
    # "no items" instead of raising AttributeError.
    payload = data.get("data") if isinstance(data, dict) else None
    items = payload.get("items") if isinstance(payload, dict) else None
    if not items:
        # Sometimes an empty items list indicates no more data
        # You can choose to break or continue to next page
        print(f"No items returned at limitstart {current_limitstart}.")
        continue

    # Append items from this page to the CSV. The file is reopened per page so
    # rows already fetched are on disk if a later page fails.
    with open(output_file, mode="a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerows(
            [
                item.get("id"),
                item.get("name_locale"),
                item.get("street_name"),
                item.get("area_name"),
                item.get("year"),
                item.get("highest_floor"),
                item.get("typfloor_area_text"),
                item.get("district_name"),
                item.get("building_lat"),
                item.get("building_lng"),
                item.get("grade"),
                _nested_field(item, "stock_rents", "min_rent"),
                _nested_field(item, "stock_rents", "max_rent"),
                _nested_field(item, "stock_prices", "min_price"),
                _nested_field(item, "stock_prices", "max_price"),
            ]
            for item in items
        )

    # Optional short delay to avoid overwhelming the server
    time.sleep(1)

print(f"Data scraping completed. Output saved in '{output_file}'.")


Found 1675 items across ~84 pages.


Scraping pages: 100%|██████████| 84/84 [01:58<00:00,  1.41s/page]

Data scraping completed. Output saved in 'leasinghub_office_buildings.csv'.



