#  DOMAIN EDA

In [None]:
import os
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import folium
import numpy as np
from pathlib import Path
import statsmodels.api as sm
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from collections import Counter


In [281]:
# Read the curated Domain listings CSV (the file name indicates SA2-level features
# are already attached) and show which columns are available.
domain_csv_path = "../../datasets/raw/cleaned/curated/domain_combined_all_with_sa2_features.csv"
domain_df = pd.read_csv(domain_csv_path)
domain_df.columns

Index(['sa2_code', 'sa2_name', 'suburb', 'postcode', 'weekly_rent', 'bond',
       'address', 'lat', 'lon', 'bedrooms', 'bathrooms', 'carspaces',
       'property_type', 'balcony', 'car_parking', 'heating',
       'air_conditioning', 'builtin_wardrobes', 'swimming_pool', 'ensuite',
       'dishwasher', 'garden', 'gym', 'pets_allowed', 'gas', 'intercom',
       'security_system', 'washing_machine', 'property_type_grouped',
       'distance_supermarket_m', 'travel_time_supermarket_mins',
       'distance_train_station_m', 'travel_time_train_station_mins',
       'distance_school_m', 'travel_time_school_mins', 'distance_healthcare_m',
       'travel_time_healthcare_mins', 'distance_cbd_m', 'travel_time_cbd_mins',
       'crime_rate_sa2_2025', 'sa2_erp_2025',
       'sa2_pop_density_2025_people_per_km2', 'sa2_pop_growth_2020_2024',
       'sa2_median_historical_growth', 'sa2_erp_change_per_cent_2023_24',
       'sa2_median_income_cagr', 'sa2_predicted_median_income_25'],
      dtype='objec

### CHECK DATA QUALITY

In [282]:
# Size of the loaded listings frame as (rows, columns).
domain_df.shape

(12616, 47)

In [283]:
# Count missing values per column to see which fields need imputation.
domain_df.isna().sum()

sa2_code                                  0
sa2_name                                  0
suburb                                    0
postcode                                  0
weekly_rent                               0
bond                                      0
address                                  91
lat                                       0
lon                                       0
bedrooms                                  0
bathrooms                                 0
carspaces                                 0
property_type                             0
balcony                                   0
car_parking                               0
heating                                   0
air_conditioning                          0
builtin_wardrobes                         0
swimming_pool                             0
ensuite                                   0
dishwasher                                0
garden                                    0
gym                             

In [284]:
# Add one 0/1 indicator per facility: 1 when a distance was recorded for the
# listing, 0 when the distance is missing.
# Original note: the cap for supermarket/train is a 3 km radius and for
# school/healthcare 15 km.
for facility in ('supermarket', 'train_station', 'school', 'healthcare'):
    domain_df[f'{facility}_nearby'] = domain_df[f'distance_{facility}_m'].notna().astype(int)
domain_df

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,sa2_pop_density_2025_people_per_km2,sa2_pop_growth_2020_2024,sa2_median_historical_growth,sa2_erp_change_per_cent_2023_24,sa2_median_income_cagr,sa2_predicted_median_income_25,supermarket_nearby,train_station_nearby,school_nearby,healthcare_nearby
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,2277.875547,2.724838,4.891701,2.3348,3.920093,85258.92381,1,1,1,1
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,2277.875547,2.724838,4.891701,2.3348,3.920093,85258.92381,1,1,1,1
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,1006.350051,3.364510,4.702436,3.4317,4.220320,70070.10476,1,1,1,1
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,2277.875547,2.724838,4.891701,2.3348,3.920093,85258.92381,1,1,1,1
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.826270,144.86790,2,...,2277.875547,2.724838,4.891701,2.3348,3.920093,85258.92381,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12611,208011169,Brighton (Vic.),BRIGHTON,3186,1575.0,6300.0,,-37.912884,144.99155,2,...,2910.591738,-0.120152,0.364024,2.3174,5.161190,80802.13333,0,0,1,1
12612,208011169,Brighton (Vic.),BRIGHTON,3186,2625.0,10500.0,2/71 Roslyn Street,-37.922750,145.00224,4,...,2910.591738,-0.120152,0.364024,2.3174,5.161190,80802.13333,1,1,1,1
12613,208011169,Brighton (Vic.),BRIGHTON,3186,2200.0,13200.0,23 Bay Street,-37.903280,144.98697,5,...,2910.591738,-0.120152,0.364024,2.3174,5.161190,80802.13333,1,1,1,1
12614,208011169,Brighton (Vic.),BRIGHTON,3186,1390.0,8340.0,2/7B Wilson Street,-37.909650,144.99810,3,...,2910.591738,-0.120152,0.364024,2.3174,5.161190,80802.13333,1,1,1,1


In [285]:
# Impute on a copy so domain_df keeps its original NaNs; they are what identifies
# the rows that were filled in.
imputed_df = domain_df.copy()

# Fill-in distance (metres) used when no distance to the facility was recorded.
# NOTE(review): each value is 100 m above a round 3 km / 15 km figure — presumably so
# imputed rows sit just beyond the search radius; confirm against the upstream step.
caps = {
    'supermarket': 3100,
    'train_station': 3100,
    'school': 15100,
    'healthcare': 15100
}

# For each facility: fill missing distances with the cap, then fill missing travel
# times using the average speed observed on rows that have both values.
for facility, cap_value in caps.items():
    dist_col = f'distance_{facility}_m'
    time_col = f'travel_time_{facility}_mins'

    # Average speed (m/min) as the mean of per-row distance / time ratios, using only
    # rows where both values are present and the time is positive (avoids divide-by-zero).
    valid_mask = imputed_df[dist_col].notna() & imputed_df[time_col].notna() & (imputed_df[time_col] > 0)
    avg_speed = (imputed_df.loc[valid_mask, dist_col] / imputed_df.loc[valid_mask, time_col]).mean()

    # Missing distances take the fixed cap value.
    imputed_df.loc[imputed_df[dist_col].isna(), dist_col] = cap_value

    # Evaluate the missing-time mask once, after distances are filled, so every row
    # needing a time has a distance to derive it from.
    missing_time = imputed_df[time_col].isna()

    # .mean() yields NaN when no row qualified, and a zero speed would produce inf;
    # either would silently leave unusable travel times in the exported table.
    if missing_time.any() and (not np.isfinite(avg_speed) or avg_speed <= 0):
        raise ValueError(
            f"{facility}: cannot impute travel times, average speed is {avg_speed} "
            f"(no usable rows with both distance and positive travel time)"
        )

    imputed_df.loc[missing_time, time_col] = imputed_df.loc[missing_time, dist_col] / avg_speed

    print(f"{facility}: capped missing dist at {cap_value} m, avg_speed ≈ {avg_speed:.1f} m/min")


supermarket: capped missing dist at 3100 m, avg_speed ≈ 379.6 m/min
train_station: capped missing dist at 3100 m, avg_speed ≈ 392.2 m/min
school: capped missing dist at 15100 m, avg_speed ≈ 434.6 m/min
healthcare: capped missing dist at 15100 m, avg_speed ≈ 480.8 m/min


In [286]:
# Sanity-check the imputation: per facility, report how many distances were missing
# before imputation and print the first few filled-in rows.
for facility in caps:
    dist_col = f'distance_{facility}_m'
    time_col = f'travel_time_{facility}_mins'

    # domain_df still carries the original NaNs, so its mask picks out the imputed rows.
    was_missing = domain_df[dist_col].isna()
    imputed_rows = was_missing.sum()
    preview = imputed_df.loc[was_missing, [dist_col, time_col]].head(5)

    print(f"\n--- {facility.upper()} ---")
    print(f"Imputed {imputed_rows} rows")
    print(preview)



--- SUPERMARKET ---
Imputed 598 rows
    distance_supermarket_m  travel_time_supermarket_mins
11                  3100.0                      8.167505
26                  3100.0                      8.167505
28                  3100.0                      8.167505
37                  3100.0                      8.167505
43                  3100.0                      8.167505

--- TRAIN_STATION ---
Imputed 2596 rows
    distance_train_station_m  travel_time_train_station_mins
6                     3100.0                        7.904461
7                     3100.0                        7.904461
8                     3100.0                        7.904461
11                    3100.0                        7.904461
26                    3100.0                        7.904461

--- SCHOOL ---
Imputed 15 rows
      distance_school_m  travel_time_school_mins
2941            15100.0                 34.74432
4400            15100.0                 34.74432
5134            15100.0           

In [287]:
# Re-count missing values per column after imputation.
imputed_df.isna().sum()

sa2_code                                0
sa2_name                                0
suburb                                  0
postcode                                0
weekly_rent                             0
bond                                    0
address                                91
lat                                     0
lon                                     0
bedrooms                                0
bathrooms                               0
carspaces                               0
property_type                           0
balcony                                 0
car_parking                             0
heating                                 0
air_conditioning                        0
builtin_wardrobes                       0
swimming_pool                           0
ensuite                                 0
dishwasher                              0
garden                                  0
gym                                     0
pets_allowed                      

In [288]:
# Size of the imputed frame as (rows, columns).
imputed_df.shape

(12616, 51)

In [289]:
# Write the imputed table to CSV, leaving the row index out of the file.
modelling_csv_path = '../../datasets/raw/cleaned/curated/domain_modelling.csv'
imputed_df.to_csv(modelling_csv_path, index=False)

### EDA DOMAIN DATASET