In [22]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import concurrent.futures
import time

In [None]:
# Max 1000 property links per suburb

def get_property_links(suburb, postcode, max_pages=50):
    # Base URL for the property listings with placeholders for suburb and postcode
    base_url = f'https://www.domain.com.au/rent/{suburb}-vic-{postcode}/?sort=price-desc&page='
    property_links = []

    # Loop through the pages up to the maximum specified
    for page in range(1, max_pages + 1):
        url = base_url + str(page)

        try:
            # Make a GET request to the URL with a specified User-Agent header
            response = requests.get(url, headers={'User-Agent': 'PostmanRuntime/7.6.0'})
        except requests.ConnectionError as e:
            # Handle connection errors and continue to the next page
            print(f"CONNECTION_ERROR: Failed to retrieve page {page}, {suburb}, {postcode}. Error: {e}. Exiting loop.")
            continue

        # Handle rate limiting by checking for a 403 status code
        if response.status_code == 403:
            print(f"RATE_LIMITING: Failed to retrieve page {page}, {suburb}, {postcode}. Status code: {response.status_code}. Exiting loop.")
            break

        # Handle incorrect suburb names by checking for a 404 status code
        if response.status_code == 404:
            print(f"SUBURB_NOT_FOUND: Failed to retrieve page {page}, {suburb}, {postcode} likely due to incorrect suburb name. Status code: {response.status_code}. Exiting loop.")
            break
        
        # Check if the response status code is not 200 and handle it
        elif response.status_code != 200:
            print(f"Failed to retrieve page {page}, {suburb}, {postcode}. Status code: {response.status_code}. Exiting loop.")
            break
        
        # Parse the response content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all listings with the class 'address'
        listings = soup.find_all('a', class_='address')

        # If no listings are found, exit the loop
        if not listings:
            print(f"NO LISTINGS: No listings found on page {page}, {suburb}, {postcode}. Exiting loop.")
            break

        # Extract the href attribute from each listing and add it to the property_links list
        for listing in listings:
            link = listing.get('href')
            if link:
                property_links.append(link)

        # Sleep for 1 second to avoid overwhelming the server
        time.sleep(1)
    
    # Return the list of property links
    return property_links

In [None]:
# Initialize a dictionary to store property data with all fields set to None
empty_data = {
    'title': None,                  # Title of the property listing
    'description': None,            # Description of the property
    'street_address': None,         # Street address of the property
    'suburb': None,                 # Suburb where the property is located
    'postcode': None,               # Postcode of the property's location
    'price': None,                  # Price of the property
    'bedrooms': None,               # Number of bedrooms in the property
    'bathrooms': None,              # Number of bathrooms in the property
    'parking': None,                # Number of parking spaces available
    'primary_property_type': None,  # Primary type of the property (e.g., house, apartment)
    'property_features': None,      # Features of the property (e.g., pool, garden)
    'structured_features': None,    # Structured features of the property (e.g., floor area)
    'video_count': None,            # Number of videos available for the property
    'photo_count': None,            # Number of photos available for the property
    'date_listed': None,            # Date when the property was listed
    'days_listed': None,            # Number of days the property has been listed
    'floor_plans_count': None,      # Number of floor plans available for the property
    'virtual_tour': None,           # Availability of a virtual tour for the property
    'nbn_details': None,            # NBN (National Broadband Network) details for the property
    'nearby_schools': None,         # Information about nearby schools
}

In [25]:
def extract_property_details(soup):
    # Locate the JSON data embedded in the HTML
    script_data = soup.find('script', id='__NEXT_DATA__').string
    json_data = json.loads(script_data)

    # Use .get() to safely access keys that may or may not be present
    layout_props = json_data.get('props', {}).get('pageProps', {}).get('layoutProps', {})
    property_details = layout_props.get("digitalData", {}).get("page", {}).get("pageInfo", {}).get("property", {})
    component_props = json_data.get('props', {}).get('pageProps', {}).get('componentProps', {})

    # Extract property details into a dictionary
    data = {
        'title': layout_props.get('title', None),
        'description': layout_props.get('description', None),
        'street_address': property_details.get('address', None),
        'suburb': property_details.get('suburb', None),
        'postcode': property_details.get('postcode', None),
        'price': property_details.get('price', None),
        'bedrooms': property_details.get('bedrooms', None),
        'bathrooms': property_details.get('bathrooms', None),
        'parking': property_details.get('parking', None),
        'primary_property_type': property_details.get('primaryPropertyType', None),
        'property_features': property_details.get('propertyFeatures', None),
        'structured_features': property_details.get('structuredFeatures', []),  # Default empty list
        'video_count': property_details.get('videoCount', None),
        'photo_count': property_details.get('photoCount', None),
        'date_listed': property_details.get('dateListed', None),
        'days_listed': property_details.get('daysListed', None),
        'floor_plans_count': property_details.get('floorPlansCount', None),
        'virtual_tour': property_details.get('virtualTour', None),
        'nbn_details': layout_props.get('nbnDetails', None),
        'nearby_schools': component_props.get('schoolCatchment', {}).get('schools', []),  # Default empty list
    }

    return data


In [26]:
def scrape_property(url):
    # Fetch the page content
    try:
        response = requests.get(url, headers={'User-Agent': 'PostmanRuntime/7.6.0'})
    except requests.ConnectionError as e:
        # Handle connection errors
        print(f"CONNECTION_ERROR: Failed to retrieve page. Error: {e}. Exiting loop.")
        return None
    
    # Handle rate limiting by checking for a 403 status code
    if response.status_code == 403:
        print(f"RATE_LIMITING: Failed to retrieve page {url}. Status code: {response.status_code}. Exiting loop.")
        return None  # Exit the function early if rate-limited

    # Check if the response status code is not 200
    elif response.status_code != 200:
        print(f"Failed to retrieve page {url}. Status code: {response.status_code}. Exiting loop.")
        return None
    
    # Parse the response content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the property details
    property_data = extract_property_details(soup)

    # Sleep for 0.5 seconds to avoid overwhelming the server
    time.sleep(0.5)
    
    return property_data

In [27]:
# Load the postcode mapping from the JSON file
with open('../../data/landing/postcode_mapping.json', 'r') as json_file:
    postcode_mapping = json.load(json_file)

In [28]:
def collect_all_property_links_threaded(postcode_mapping, max_workers=5):
    # Initialize an empty list to store all property links
    all_property_links = []
    
    # Create a ThreadPoolExecutor to manage threads
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        
        # Iterate over the postcode mapping dictionary
        for postcode, suburbs in postcode_mapping.items():
            # Iterate over each suburb in the list of suburbs for the current postcode
            for suburb in suburbs:
                # Submit a task to the executor to get property links for the suburb and postcode
                futures.append(executor.submit(get_property_links, suburb, postcode))
                # Sleep for 0.5 seconds to avoid overwhelming the server
                time.sleep(0.5)

        # Process the completed futures as they finish
        for future in concurrent.futures.as_completed(futures):
            # If the future result is not None, extend the all_property_links list with the result
            if future.result() is not None:
                all_property_links.extend(future.result())
            
    # Return the list of all collected property links
    return all_property_links

In [29]:
def scrape_properties_threaded(property_links, max_workers=5):
    # Initialize an empty list to store all scraped property details
    all_properties = []
    
    # Create a ThreadPoolExecutor to manage threads
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        
        # Iterate over the list of property links
        for i, url in enumerate(property_links):
            # Submit a task to the executor to scrape property details from the URL
            futures.append(executor.submit(scrape_property, url))
            # Sleep for 0.5 seconds to avoid overwhelming the server
            time.sleep(0.5)
        
        # Process the completed futures as they finish
        for future in concurrent.futures.as_completed(futures):
            # If the future result is not None, append the result to the all_properties list
            if future.result() is not None:
                all_properties.append(future.result())

    # Convert the list of all properties to a DataFrame
    df = pd.DataFrame(all_properties)
    # Return the DataFrame
    return df

In [30]:
# Warning: This will take a long time to run (20-40 mins)
# Status codes 404 are common because of duplicate suburb names, so don't worry about them
# Status codes 403 are rate limiting, so you may need to slow down your requests

# Recommended max_workers = 20. If you get rate limited, reduce the number of workers
property_links = collect_all_property_links_threaded(postcode_mapping, 15)

SUBURB_NOT_FOUND: Failed to retrieve page 1, Melbourne, 3001 likely due to incorrect suburb name. Status code: 404. Exiting loop.
SUBURB_NOT_FOUND: Failed to retrieve page 1, East Melbourne, 3002 likely due to incorrect suburb name. Status code: 404. Exiting loop.
SUBURB_NOT_FOUND: Failed to retrieve page 1, West Melbourne, 3003 likely due to incorrect suburb name. Status code: 404. Exiting loop.
SUBURB_NOT_FOUND: Failed to retrieve page 1, World Trade Centre, 3005 likely due to incorrect suburb name. Status code: 404. Exiting loop.
SUBURB_NOT_FOUND: Failed to retrieve page 1, University Of Melbourne, 3010 likely due to incorrect suburb name. Status code: 404. Exiting loop.
SUBURB_NOT_FOUND: Failed to retrieve page 1, West Footscray, 3012 likely due to incorrect suburb name. Status code: 404. Exiting loop.
SUBURB_NOT_FOUND: Failed to retrieve page 1, South Kingsville, 3015 likely due to incorrect suburb name. Status code: 404. Exiting loop.
SUBURB_NOT_FOUND: Failed to retrieve page 1, 

In [31]:
# Convert to DataFrame and save to CSV
links_df = pd.DataFrame(property_links, columns=['property_link'])
links_df.drop_duplicates(inplace=True)

# Save to parquet file
links_df.to_parquet('../../data/raw/property_links.parquet', index=False)

In [32]:
len(links_df)

14459

In [33]:
property_df = scrape_properties_threaded(links_df['property_link'], 200)

property_df.head()

Failed to retrieve page https://www.domain.com.au/407-33-blackwood-street-north-melbourne-vic-3051-16672345. Status code: 500. Exiting loop.
Failed to retrieve page https://www.domain.com.au/22-200-wattletree-road-malvern-vic-3144-17185487. Status code: 500. Exiting loop.


Unnamed: 0,title,description,street_address,suburb,postcode,price,bedrooms,bathrooms,parking,primary_property_type,property_features,structured_features,video_count,photo_count,date_listed,days_listed,floor_plans_count,virtual_tour,nbn_details,nearby_schools
0,"60 Little Windrock Lane, Craigieburn VIC 3064 ...","View this 2 bedroom, 1 bathroom rental house a...","60 Little Windrock Lane, Craigieburn VIC 3064",Craigieburn,3064,$450 Per Week,2.0,1.0,1.0,House,,"[{'name': 'Built in wardrobes', 'category': 'I...",0.0,21.0,2024-08-22T16:07:26.000,14.0,0.0,False,,"[{'id': '', 'educationLevel': 'primary', 'name..."
1,"53 Were Street, Brighton VIC 3186 - House For ...","View this $1,500/week 4 bedroom, 2 bathroom re...","53 Were Street, Brighton VIC 3186",Brighton,3186,"$1,490.00",4.0,2.0,2.0,House,,[],0.0,6.0,2024-06-02T18:11:41.000,95.0,2.0,True,,"[{'id': '11562', 'educationLevel': 'combined',..."
2,"43 Tackle Drive, Point Cook VIC 3030 - Townhou...","View this 3 bedroom, 2 bathroom rental townhou...","43 Tackle Drive, Point Cook VIC 3030",Point Cook,3030,$550 per Week,3.0,2.0,2.0,Townhouse/Villa,"Air conditioning, Ensuite, Built in wardrobes,...","[{'name': 'Secure Parking', 'category': 'Outdo...",0.0,17.0,2024-09-03T12:01:18.000,2.0,0.0,True,,"[{'id': '', 'educationLevel': 'combined', 'nam..."
3,"3 Rostrevor Parade, Mont Albert VIC 3127 - Hou...","View this 5 bedroom, 2 bathroom rental house a...","3 Rostrevor Parade, Mont Albert VIC 3127",Mont Albert,3127,$800 weekly,5.0,2.0,2.0,House,,[],0.0,8.0,2024-07-01T12:53:48.000,66.0,0.0,False,,"[{'id': '2804', 'educationLevel': 'primary', '..."
4,"48 Roberts Street, Frankston VIC 3199 - Studio...","View this 9 bedroom, 3 bathroom rental studio ...","48 Roberts Street, Frankston VIC 3199",Frankston,3199,$299 per week,9.0,3.0,4.0,Apartment,"Furnished, Heating, Modern Bathroom, Modern Ki...","[{'name': 'Furnished', 'category': 'Indoor', '...",0.0,20.0,2024-07-02T11:24:10.000,65.0,1.0,False,,"[{'id': '1717', 'educationLevel': 'primary', '..."


In [34]:
# Drop rows where title, postcode = None (Could not scrape)
property_df = property_df[property_df['title'].notnull()]
property_df = property_df[property_df['postcode'].notnull()]
len(property_df)

14446

In [35]:
property_df.to_parquet(f'../../data/raw/property_details.parquet', index=False)