# Nearest supermarket, train station & CBD

## Purpose
Compute proximity of properties to key amenities and Melbourne CBD. This pipeline uses OpenStreetMap (OSM) and routing APIs (OpenRouteService or Google Maps) to determine the nearest amenities and driving distances/times for accessibility analysis.

## Inputs
- Domain property dataset: `domain_cleaned.csv`

## Outputs
- Nearest amenities for each property: `property_nearest_amenities.csv`
- Driving distances and times from each property to:
  - Each nearest amenity
  - Melbourne CBD
  Saved as: `property_with_distances_copy.csv`

## Key Steps
1. Load properties as a DataFrame.
2. For each amenity type:
   - Query OSM via Overpass API within a search radius (e.g., 3 km).
   - Retrieve nearest amenity coordinates per property.
   - Handle rate-limits and timeouts with retries.
3. Aggregate amenity data per property (pivot table):
   - Each property has separate columns for each amenity type’s latitude and longitude.
4. Compute driving distance and duration:
   - To each nearest amenity.
   - To Melbourne CBD (Flinders Street coordinates).
   - Use batch-safe calls to OpenRouteService or Google Maps Distance Matrix.
   - Handle API rate limits and retries.
5. Save outputs incrementally to CSV to allow resuming after interruptions.

## Notes
- Pipeline supports resuming from partially processed CSVs.
- Distances are in meters, durations in seconds.


In [5]:
# Libraries
import pandas as pd
import numpy as np
import folium 
import json
import csv
import time
import requests
import os
from geopy.distance import geodesic
import openrouteservice as ors
import googlemaps

In [6]:
# Load domain data
# Cleaned Domain rental listings; the `lat`, `lon` and `address` columns are the
# ones read by the amenity lookup further down.
domain_df = pd.read_csv("../../datasets/raw/cleaned/domain_cleaned.csv")
# Preview the first rows (rendered as the cell output in the notebook).
domain_df.head()

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,ensuite,dishwasher,garden,gym,pets_allowed,gas,intercom,security_system,washing_machine,property_type_grouped
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,0,0,0,0,1,0,0,0,0,Apartment
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,0,0,0,0,1,0,0,0,1,Apartment
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,1,1,0,0,1,1,0,0,1,Townhouse
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,0,1,0,0,0,0,0,0,0,Townhouse
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,0,0,0,0,0,0,0,0,0,Apartment


### Get the nearest supermarket & train station coordinates using OpenStreetMap

In [None]:
# Config
# Overpass tag filters per amenity type (each type may list several filters).
OSM_TAGS = {
    "supermarket": ['["shop"="supermarket"]'],
    "train_station": ['["railway"="station"]'],
}

AMENITY_TYPES = [*OSM_TAGS]
BATCH_SIZE = 100        # properties handled between two CSV appends
SEARCH_RADIUS = 3000    # radius passed to the Overpass `around:` filter
OUTPUT_FILE = '../../datasets/raw/property_nearest_amenities.csv'

# Load properties
domain_df = pd.read_csv("../../datasets/raw/cleaned/domain_cleaned.csv")

# Resume state: start empty, then pick up whatever a previous run already wrote.
rows = []
processed_coords = set()
if os.path.exists(OUTPUT_FILE):
    summary_df = pd.read_csv(OUTPUT_FILE)
    processed_coords = {
        (done_lat, done_lon)
        for done_lat, done_lon in zip(summary_df['Property_Lat'], summary_df['Property_Lon'])
    }
    rows = summary_df.to_dict('records')
    print(f"Resuming. {len(processed_coords)} properties already processed.")
# Function to query OSM and get nearest amenity
def get_nearest_coordinates(lat, lon, amenity_type):
    """Return the (lat, lon) of the closest OSM feature of `amenity_type`.

    For every tag filter in ``OSM_TAGS[amenity_type]`` the Overpass API is asked
    for nodes, ways and relations within ``SEARCH_RADIUS`` of the given point;
    the candidate with the smallest geodesic distance wins.

    Args:
        lat: Latitude of the property.
        lon: Longitude of the property.
        amenity_type: Key into ``OSM_TAGS`` (e.g. ``"supermarket"``).

    Returns:
        ``(amenity_lat, amenity_lon)`` of the nearest match, or ``(None, None)``
        when nothing was found. The same value is returned when every attempt
        for a tag failed; that case is reported on stdout so it can be told
        apart from a genuine "no amenity in range".

    Raises:
        requests.exceptions.HTTPError: For HTTP error statuses other than
            429 / 504, which are not retried.
    """
    max_attempts = 5
    nearest = None
    min_dist = float('inf')

    for tag in OSM_TAGS[amenity_type]:
        query = f"""
        [out:json];
        (
          node{tag}(around:{SEARCH_RADIUS},{lat},{lon});
          way{tag}(around:{SEARCH_RADIUS},{lat},{lon});
          relation{tag}(around:{SEARCH_RADIUS},{lat},{lon});
        );
        out center;
        """
        for attempt in range(max_attempts):
            try:
                response = requests.get(
                    'http://overpass-api.de/api/interpreter',
                    params={'data': query},
                    timeout=60
                )
                response.raise_for_status()
                elements = response.json().get('elements', [])
            except requests.exceptions.HTTPError as e:
                # Read the status from the exception itself rather than from a
                # loop-local variable.
                status = e.response.status_code if e.response is not None else None
                if status not in (429, 504):
                    raise
                wait = 5 * (attempt + 1)  # linear back-off: 5s, 10s, 15s, ...
                print(f"Rate limit/timeout for {amenity_type} at ({lat},{lon}), retry in {wait}s...")
                time.sleep(wait)
                continue
            except requests.exceptions.RequestException as e:
                print(f"Request error for {amenity_type} at ({lat},{lon}): {e}, retrying in 5s...")
                time.sleep(5)
                continue

            for a in elements:
                # Nodes carry lat/lon directly; ways and relations expose a
                # representative point under "center" (requested by `out center`).
                if 'lat' in a and 'lon' in a:
                    a_lat, a_lon = a['lat'], a['lon']
                elif 'center' in a:
                    a_lat, a_lon = a['center']['lat'], a['center']['lon']
                else:
                    continue

                dist = geodesic((lat, lon), (a_lat, a_lon)).meters
                if dist < min_dist:
                    min_dist = dist
                    nearest = (a_lat, a_lon)
            break
        else:
            # Every attempt failed: make the gap visible instead of silently
            # looking like "no amenity within the radius".
            print(f"Giving up on {amenity_type} at ({lat},{lon}) after {max_attempts} attempts.")
    return nearest if nearest else (None, None)

# Process properties in batches
# Fixed column order so every appended batch lines up with the CSV header.
AMENITY_CSV_COLUMNS = [
    'Property_Lat', 'Property_Lon', 'Address',
    'Amenity_Type', 'Amenity_Lat', 'Amenity_Lon',
]

for start in range(0, len(domain_df), BATCH_SIZE):
    batch = domain_df.iloc[start:start + BATCH_SIZE]
    print(f"\nProcessing properties {start + 1} to {start + len(batch)}")

    batch_rows = []
    for _, row in batch.iterrows():
        lat = row['lat']
        lon = row['lon']
        address = row.get('address', '')

        # Coordinates already present in the output (or handled earlier in this
        # run) are skipped, which is what makes the loop resumable.
        if (lat, lon) in processed_coords:
            continue

        # Collect all amenity rows for this property first. If any lookup
        # raises, the property is neither written nor marked as processed, so a
        # later run retries it instead of keeping a half-filled record forever.
        property_rows = []
        try:
            for amenity_type in AMENITY_TYPES:
                amenity_lat, amenity_lon = get_nearest_coordinates(lat, lon, amenity_type)
                property_rows.append({
                    'Property_Lat': lat,
                    'Property_Lon': lon,
                    'Address': address,
                    'Amenity_Type': amenity_type,
                    'Amenity_Lat': amenity_lat,
                    'Amenity_Lon': amenity_lon
                })
                time.sleep(1)  # pause between consecutive Overpass requests
        except Exception as e:
            print(f"Error processing ({lat},{lon}) for {amenity_type}: {e}")
            continue

        batch_rows.extend(property_rows)
        processed_coords.add((lat, lon))

    # Append batch to CSV
    # Skip empty batches: writing an empty frame to a not-yet-existing file
    # would create it without a header, and later appends never add one.
    if batch_rows:
        batch_df = pd.DataFrame(batch_rows, columns=AMENITY_CSV_COLUMNS)
        needs_header = (not os.path.exists(OUTPUT_FILE)) or os.path.getsize(OUTPUT_FILE) == 0
        batch_df.to_csv(OUTPUT_FILE, index=False, mode='a', header=needs_header)

    print(f"Batch saved. Total properties processed so far: {len(processed_coords)}")

print("\nAll properties processed. Data saved to:", OUTPUT_FILE)


Processing properties 1 to 100
Rate limit/timeout for train_station at (-37.82627,144.8679), retry in 5s...
Rate limit/timeout for train_station at (-37.73087,144.95424), retry in 5s...
Rate limit/timeout for supermarket at (-36.22527,145.5591), retry in 5s...
Rate limit/timeout for supermarket at (-37.82033,144.71889), retry in 5s...
Rate limit/timeout for train_station at (-37.8311,144.71599), retry in 5s...
Batch saved. Total properties processed so far: 100

Processing properties 101 to 200
Rate limit/timeout for train_station at (-37.76404,144.67564), retry in 5s...
Batch saved. Total properties processed so far: 200

Processing properties 201 to 300
Rate limit/timeout for train_station at (-37.76833,144.69543), retry in 5s...
Rate limit/timeout for supermarket at (-37.90359,145.04881), retry in 5s...
Rate limit/timeout for supermarket at (-37.903347,145.04514), retry in 5s...
Rate limit/timeout for train_station at (-37.90227,145.04028), retry in 5s...
Rate limit/timeout for tra

## Convert coordinates to driving distances using OpenRouteService & the Google Maps API

### OpenRouteService

In [None]:
# Read the long-format nearest-amenity table (one row per property and amenity type)
amenities_stats = pd.read_csv("../../datasets/raw/property_nearest_amenities.csv")

# Reshape to one row per property with a latitude/longitude column per amenity type.
# NOTE(review): with pivot_table's default dropna=True, properties whose amenity
# coordinates are all NaN appear to be dropped here and would then never get a
# CBD distance — confirm whether that is intended.
amenities_pivot = pd.pivot_table(
    amenities_stats,
    values=['Amenity_Lat', 'Amenity_Lon'],
    index=['Property_Lat', 'Property_Lon', 'Address'],
    columns='Amenity_Type',
    aggfunc='first',
)
amenities_pivot = amenities_pivot.reset_index()

# Collapse the two-level column labels into single names such as
# "Amenity_Lat_supermarket"; index columns have an empty second level and keep
# their plain name.
amenities_pivot.columns = [
    '_'.join(part for part in col if part).strip('_')
    for col in amenities_pivot.columns
]
amenities_pivot.head()

Unnamed: 0,Property_Lat,Property_Lon,Address,Amenity_Lat_supermarket,Amenity_Lat_train_station,Amenity_Lon_supermarket,Amenity_Lon_train_station
0,-38.665943,146.32794,12 Gunn Street,-38.665239,,146.32347,
1,-38.65126,146.20575,14 Church Hill Road,-38.652791,,146.201848,
2,-38.633835,145.7225,5 Howsam Place,-38.632875,,145.728641,
3,-38.632805,145.73022,2/1 High Street,-38.632875,,145.728641,
4,-38.631218,145.72937,23 Bayview Avenue,-38.632875,,145.728641,


In [None]:
# INSERT API KEY HERE
# Preferably export ORS_API_KEY in the environment so the key is never saved
# inside the notebook; alternatively replace the empty default below.
ORS_KEY = os.environ.get('ORS_API_KEY', '')

In [None]:
# CONFIG
# CBD reference point in (lon, lat) order, matching how the origins and amenity
# destinations are built for the OpenRouteService request below.
CBD_COORDS = (144.9671, -37.8183)  
# Seconds to sleep after each processed batch.
BATCH_DELAY = 1         
# Number of attempts safe_distance_matrix makes before giving up.
MAX_RETRIES = 3
# Budget of origin x destination pairs per matrix request, used to size batches
# (presumably the ORS matrix quota — confirm against the account's limits).
MAX_ROUTES = 3500 

# NOTE: despite the name this is the in-memory pivot DataFrame, not a path; it
# is not referenced by the code visible in this section.
INPUT_CSV = amenities_pivot
# Results file; also read at start-up to resume an interrupted run.
OUTPUT_CSV = "../../datasets/raw/property_with_distances_copy.csv"


# Initialize client
client = ors.Client(key=ORS_KEY)

# Utility func
def chunk_list(lst, n):
    """Yield consecutive slices of `lst`, each holding at most `n` items."""
    offsets = range(0, len(lst), n)
    yield from (lst[lo:lo + n] for lo in offsets)

def safe_distance_matrix(origins, destinations):
    """Compute an ORS driving distance/duration matrix, retrying on failure.

    Args:
        origins: List of source coordinates (the caller in this file passes
            (lon, lat) tuples).
        destinations: List of target coordinates in the same order convention.

    Returns:
        The response of ``client.distance_matrix``; the caller reads its
        ``'distances'`` and ``'durations'`` entries as
        ``matrix[origin_pos][destination_pos]``. If all ``MAX_RETRIES`` attempts
        fail, a dict with the same two keys holding NaN-filled matrices of shape
        ``len(origins) x len(destinations)`` is returned instead.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # Origins and destinations are sent as one location list; `sources`
            # and `destinations` then select them by position.
            return client.distance_matrix(
                locations=origins + destinations,
                profile='driving-car',
                sources=list(range(len(origins))),
                destinations=list(range(len(origins), len(origins) + len(destinations))),
                metrics=['distance', 'duration']
            )
        except Exception as e:
            if attempt < MAX_RETRIES:
                print(f"ORS attempt {attempt} failed: {e}. Retrying in 5s...")
                time.sleep(5)
            else:
                # No retry follows the last attempt, so do not wait for one.
                print(f"ORS attempt {attempt} failed: {e}.")
    print("ORS failed after retries. Returning NaN.")
    n = len(destinations)
    return {'distances': [[float('nan')] * n for _ in origins],
            'durations': [[float('nan')] * n for _ in origins]}


# load/create output
# Column layout of the distances CSV. The empty frame must be created WITH
# these columns: adding a row through .loc to a DataFrame that has no columns
# raises "cannot set a frame with no defined columns", which would crash the
# very first run (when no output file exists yet).
DISTANCE_KEY_COLUMNS = ['Property_Lat', 'Property_Lon', 'Address']
DISTANCE_VALUE_COLUMNS = [
    'Distance_to_supermarket_m', 'TravelTime_to_supermarket_s',
    'Distance_to_train_station_m', 'TravelTime_to_train_station_s',
    'Distance_to_CBD_m', 'TravelTime_to_CBD_s',
]

if os.path.exists(OUTPUT_CSV):
    distance_df = pd.read_csv(OUTPUT_CSV)

    # Only consider rows with valid CBD distance as completed
    completed_indices = set(distance_df[~distance_df['Distance_to_CBD_m'].isna()].index)
    print(f"Resuming from {len(completed_indices)} completed properties...")
else:
    distance_df = pd.DataFrame(columns=DISTANCE_KEY_COLUMNS + DISTANCE_VALUE_COLUMNS)
    completed_indices = set()


def _amenity_destination(row, amenity):
    """Return the (lon, lat) of the row's nearest `amenity`, or None if it has none."""
    a_lon = row.get(f'Amenity_Lon_{amenity}')
    a_lat = row.get(f'Amenity_Lat_{amenity}')
    if pd.notna(a_lon) and pd.notna(a_lat):
        return (a_lon, a_lat)
    return None


def _route_metrics(distances_matrix, durations_matrix, origin_pos, dest_pos):
    """Return (distance, duration) for one origin/destination pair; NaN pair if dest_pos is None."""
    if dest_pos is None:
        return float('nan'), float('nan')
    return distances_matrix[origin_pos][dest_pos], durations_matrix[origin_pos][dest_pos]


# Properties that still need routing (row labels of amenities_pivot)
property_indices = [idx for idx in amenities_pivot.index if idx not in completed_indices]

# Destinations are collected per batch rather than across the whole dataset, so
# a request never asks for routes to other batches' amenities. Worst case a
# batch of c properties needs 2c distinct amenities plus the CBD, i.e.
# c * (2c + 1) origin/destination pairs; choose the largest c within MAX_ROUTES.
max_origins_per_batch = max(1, int(((1 + 8 * MAX_ROUTES) ** 0.5 - 1) // 4))

# Process batches
for idx_chunk in chunk_list(property_indices, max_origins_per_batch):
    print(f"Processing batch of {len(idx_chunk)} properties...")

    chunk_rows = [amenities_pivot.loc[prop_idx] for prop_idx in idx_chunk]
    origin_chunk = [(row['Property_Lon'], row['Property_Lat']) for row in chunk_rows]

    # coordinate -> column position in the matrix; dict keys de-duplicate
    # amenities shared by several properties while keeping a stable order.
    dest_position = {CBD_COORDS: 0}
    wanted = []
    for row in chunk_rows:
        supermarket_dest = _amenity_destination(row, 'supermarket')
        train_dest = _amenity_destination(row, 'train_station')
        for dest in (supermarket_dest, train_dest):
            if dest is not None:
                dest_position.setdefault(dest, len(dest_position))
        wanted.append((supermarket_dest, train_dest))
    destinations_list = list(dest_position)

    res_matrix = safe_distance_matrix(origin_chunk, destinations_list)
    distances_matrix = res_matrix['distances']
    durations_matrix = res_matrix['durations']

    for i, (prop_idx, row, (supermarket_dest, train_dest)) in enumerate(
            zip(idx_chunk, chunk_rows, wanted)):
        # .get(None) yields None for a missing amenity, which maps to NaN metrics.
        dist_supermarket, time_supermarket = _route_metrics(
            distances_matrix, durations_matrix, i, dest_position.get(supermarket_dest))
        dist_train, time_train = _route_metrics(
            distances_matrix, durations_matrix, i, dest_position.get(train_dest))
        dist_CBD, time_CBD = _route_metrics(
            distances_matrix, durations_matrix, i, dest_position[CBD_COORDS])

        metric_values = [dist_supermarket, time_supermarket,
                         dist_train, time_train,
                         dist_CBD, time_CBD]

        # If row exists in distance_df, overwrite it
        if prop_idx in distance_df.index:
            distance_df.loc[prop_idx, DISTANCE_VALUE_COLUMNS] = metric_values
        else:
            new_row = {
                'Property_Lat': row['Property_Lat'],
                'Property_Lon': row['Property_Lon'],
                'Address': row.get('Address', ''),
            }
            new_row.update(zip(DISTANCE_VALUE_COLUMNS, metric_values))
            distance_df.loc[prop_idx] = new_row

    # Save progress after each batch
    distance_df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved progress for {len(distance_df)} properties.")

    time.sleep(BATCH_DELAY)

print(" All batches processed.")

### Google Maps API

Note: The Google Maps API (free trial) was used here to speed up processing, because of the OpenRouteService API's call limits.

In [None]:
# INSERT API KEY HERE
# Preferably export GOOGLE_MAPS_API_KEY in the environment so the key is never
# saved inside the notebook; alternatively replace the empty default below.
GMAPS_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "")

In [None]:
# CONFIG
# CBD reference point in (lat, lon) order, the order used to build the
# "lat,lon" strings sent to Google Maps below.
# NOTE: this rebinds CBD_COORDS, which the OpenRouteService section defines in
# (lon, lat) order — re-run the matching CONFIG cell when switching sections.
CBD_COORDS = (-37.8183, 144.9671)  
# Seconds to sleep after each processed property.
BATCH_DELAY = 1  
# Number of attempts safe_distance_matrix makes before giving up.
MAX_RETRIES = 3

# NOTE: despite the name this is the in-memory pivot DataFrame, not a path; it
# is not referenced by the code visible in this section.
INPUT_CSV = amenities_pivot
# Same results file as the OpenRouteService section; also read to resume a run.
OUTPUT_CSV = "../../datasets/raw/property_with_distances_copy.csv"


# Initialize client
gmaps = googlemaps.Client(key=GMAPS_KEY)

# Utility func
def safe_distance_matrix(origins, destinations):
    """Query the Google Maps Distance Matrix (driving mode), retrying on failure.

    Args:
        origins: List of origin locations (the caller in this file passes
            "lat,lon" strings).
        destinations: List of destination locations in the same format.

    Returns:
        The response of ``gmaps.distance_matrix``. If all ``MAX_RETRIES``
        attempts fail, a stand-in with the same ``rows -> elements`` nesting is
        returned whose ``distance``/``duration`` values are NaN, one element per
        destination for every origin.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            return gmaps.distance_matrix(origins, destinations, mode="driving")
        except Exception as e:
            if attempt < MAX_RETRIES:
                print(f"Google Maps attempt {attempt} failed: {e}. Retrying in 5s...")
                time.sleep(5)
            else:
                # No retry follows the last attempt, so do not wait for one.
                print(f"Google Maps attempt {attempt} failed: {e}.")
    print("⚠️ Google Maps failed after retries. Returning NaN.")
    return {
        "rows": [{"elements": [{"distance": {"value": float('nan')},
                                "duration": {"value": float('nan')}} for _ in destinations]}
                 for _ in origins]
    }


# Load/create output
# Column layout of the distances CSV. The empty frame must be created WITH
# these columns: adding a row through .loc to a DataFrame that has no columns
# raises "cannot set a frame with no defined columns", which would crash the
# very first run (when no output file exists yet).
GMAPS_DISTANCE_COLUMNS = [
    'Property_Lat', 'Property_Lon', 'Address',
    'Distance_to_supermarket_m', 'TravelTime_to_supermarket_s',
    'Distance_to_train_station_m', 'TravelTime_to_train_station_s',
    'Distance_to_CBD_m', 'TravelTime_to_CBD_s',
]

if os.path.exists(OUTPUT_CSV):
    distance_df = pd.read_csv(OUTPUT_CSV)
    # Rows without a CBD distance count as unfinished and are processed again.
    completed_indices = set(distance_df[~distance_df['Distance_to_CBD_m'].isna()].index)
    print(f"Resuming from {len(completed_indices)} completed properties...")
else:
    distance_df = pd.DataFrame(columns=GMAPS_DISTANCE_COLUMNS)
    completed_indices = set()


def _element_metrics(element):
    """Return (distance, duration) values of one matrix element, or a NaN pair.

    Elements for which no route was found carry no "distance"/"duration"
    entries, so missing keys are mapped to NaN instead of raising KeyError.
    """
    try:
        return element["distance"]["value"], element["duration"]["value"]
    except (KeyError, TypeError):
        return float('nan'), float('nan')


# Process each property
for idx, row in amenities_pivot.iterrows():
    if idx in completed_indices:
        continue

    origins = [f"{row['Property_Lat']},{row['Property_Lon']}"]
    destinations = [f"{CBD_COORDS[0]},{CBD_COORDS[1]}"]  # CBD is always element 0

    # Remember where each amenity lands in `destinations` so results are read
    # back by position rather than by comparing formatted coordinate strings
    # (which misassigns when both amenities share the same coordinates).
    element_pos = {}
    for amenity in ('supermarket', 'train_station'):
        a_lat = row.get(f'Amenity_Lat_{amenity}')
        a_lon = row.get(f'Amenity_Lon_{amenity}')
        if pd.notna(a_lat) and pd.notna(a_lon):
            element_pos[amenity] = len(destinations)
            destinations.append(f"{a_lat},{a_lon}")

    res_matrix = safe_distance_matrix(origins, destinations)
    elements = res_matrix["rows"][0]["elements"]

    # Assign distances/durations
    missing = (float('nan'), float('nan'))
    dist_CBD, time_CBD = _element_metrics(elements[0])
    amenity_metrics = {
        amenity: _element_metrics(elements[pos]) for amenity, pos in element_pos.items()
    }
    dist_supermarket, time_supermarket = amenity_metrics.get('supermarket', missing)
    dist_train, time_train = amenity_metrics.get('train_station', missing)

    # Save row
    distance_df.loc[idx] = {
        'Property_Lat': row['Property_Lat'],
        'Property_Lon': row['Property_Lon'],
        'Address': row.get('Address', ''),
        'Distance_to_supermarket_m': dist_supermarket,
        'TravelTime_to_supermarket_s': time_supermarket,
        'Distance_to_train_station_m': dist_train,
        'TravelTime_to_train_station_s': time_train,
        'Distance_to_CBD_m': dist_CBD,
        'TravelTime_to_CBD_s': time_CBD
    }

    # Save progress after each property
    distance_df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved property {idx}")
    time.sleep(BATCH_DELAY)
