### Domain Data

In [217]:
# Libraries
import os
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import folium
import numpy as np
from pathlib import Path
import statsmodels.api as sm
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from fuzzywuzzy import process
from collections import Counter

In [218]:
# Load the cleaned Domain listings table and preview its first rows.
domain_df = pd.read_csv(
    filepath_or_buffer="../../datasets/raw/cleaned/domain_cleaned.csv",
)
domain_df.head(n=5)

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,ensuite,dishwasher,garden,gym,pets_allowed,gas,intercom,security_system,washing_machine,property_type_grouped
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,0,0,0,0,1,0,0,0,0,Apartment
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,0,0,0,0,1,0,0,0,1,Apartment
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,1,1,0,0,1,1,0,0,1,Townhouse
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,0,1,0,0,0,0,0,0,0,Townhouse
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,0,0,0,0,0,0,0,0,0,Apartment


### Prepare Domain Data for analysis

In [219]:
# Two per-property proximity tables:
#   * school_health_df - nearest education / health facility and CBD metrics
#   * distances_df     - supermarket, train station and CBD metrics
school_health_df = pd.read_csv(
    filepath_or_buffer='../../datasets/property/sch,hospital,cbd(in).csv',
)
distances_df = pd.read_csv(
    filepath_or_buffer="../../datasets/property/property_with_distances_copy.csv",
)
distances_df.head(n=5)

Unnamed: 0,Property_Lat,Property_Lon,Address,Distance_to_supermarket_m,TravelTime_to_supermarket_s,Distance_to_train_station_m,TravelTime_to_train_station_s,Distance_to_CBD_m,TravelTime_to_CBD_s
0,-38.665943,146.32794,12 Gunn Street,576.89,85.03,,,182294.98,8502.36
1,-38.65126,146.20575,14 Church Hill Road,418.16,74.67,,,170509.42,7966.59
2,-38.633835,145.7225,5 Howsam Place,679.56,157.02,,,146906.58,7232.62
3,-38.632805,145.73022,2/1 High Street,420.97,89.12,,,146913.31,7204.46
4,-38.631218,145.72937,23 Bayview Avenue,433.16,84.57,,,146803.75,7191.09


In [220]:
# Align the coordinate column names with domain_df so both frames share join keys.
distances_df = distances_df.rename(columns={
    "Property_Lat": "lat",
    "Property_Lon": "lon"
})

# Keep one distance record per coordinate pair (the school/health lookup is
# de-duplicated the same way before its merge). A repeated (lat, lon) in the
# lookup table would otherwise fan matching listings out into extra rows.
distances_unique = distances_df.drop_duplicates(subset=["lat", "lon"])

# Left join on exact lat/lon equality: listings without an identical coordinate
# pair keep NaN in every distance / travel-time column. validate= makes pandas
# raise instead of silently multiplying rows if the lookup keys are not unique.
merged_distance = pd.merge(
    domain_df,
    distances_unique,
    on=["lat", "lon"],
    how="left",
    validate="many_to_one",
)
merged_distance.head()

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,security_system,washing_machine,property_type_grouped,Address,Distance_to_supermarket_m,TravelTime_to_supermarket_s,Distance_to_train_station_m,TravelTime_to_train_station_s,Distance_to_CBD_m,TravelTime_to_CBD_s
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,0,0,Apartment,3/53 Greene Street,1774.0,338.0,1928.0,302.0,11552.0,1084.0
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,0,1,Apartment,1/3 New Street,2051.0,353.0,2205.0,317.0,11829.0,1099.0
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,0,1,Townhouse,19/92 New Street,2969.0,359.0,2318.0,325.0,11942.0,1107.0
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,0,0,Townhouse,3/14 Saltley Street,1831.0,334.0,1985.0,299.0,11609.0,1080.0
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,0,0,Apartment,4/2B Saltley Street,2127.0,366.0,2281.0,330.0,11905.0,1112.0


In [221]:
# Listings whose CBD distance is missing after the left join above.
cbd_distance_missing = merged_distance["Distance_to_CBD_m"].isna()
unmatched = merged_distance.loc[cbd_distance_missing]
unmatched

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,security_system,washing_machine,property_type_grouped,Address,Distance_to_supermarket_m,TravelTime_to_supermarket_s,Distance_to_train_station_m,TravelTime_to_train_station_s,Distance_to_CBD_m,TravelTime_to_CBD_s
11,217031472,Colac,COLAC,3250,650.0,2600.0,,-38.341020,143.58546,4,...,0,0,House,,,,,,,
26,216031418,Shepparton Surrounds - East,KATANDRA,3634,440.0,1906.0,15 Queen Street,-36.225270,145.55910,3,...,0,1,House,,,,,,,
28,210021235,Romsey,CLARKEFIELD,3430,510.0,2217.0,892 Heaths Lane,-37.470654,144.84047,3,...,0,0,House,,,,,,,
37,213041359,Rockbank - Mount Cottrell,TRUGANINA,3029,520.0,2260.0,38 Caucasus Street,-37.762420,144.69594,4,...,0,0,House,,,,,,,
43,213041359,Rockbank - Mount Cottrell,TRUGANINA,3029,520.0,2260.0,6 Banahaw Road,-37.750180,144.68143,4,...,0,1,House,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12606,208011169,Brighton (Vic.),BRIGHTON,3186,3500.0,14000.0,,-37.904400,144.99974,3,...,1,1,Townhouse,,,,,,,
12608,208011169,Brighton (Vic.),BRIGHTON,3186,3500.0,14000.0,,-37.904400,144.99974,3,...,0,1,Townhouse,,,,,,,
12609,208011169,Brighton (Vic.),BRIGHTON,3186,1750.0,7000.0,,-37.912884,144.99155,2,...,0,1,Apartment,,,,,,,
12610,208011169,Brighton (Vic.),BRIGHTON,3186,1575.0,6300.0,,-37.904400,144.99974,2,...,0,1,Apartment,,,,,,,


In [222]:
# Number of listings without distance data.
unmatched.shape[0]

543

In [223]:
# Travel-time columns that are stored in seconds (suffix "_s").
time_cols_s = [
    "TravelTime_to_supermarket_s",
    "TravelTime_to_train_station_s",
    "TravelTime_to_CBD_s",
]

# snake_case target names; the time columns become "_mins" because they are
# divided by 60 in the chain below.
rename_map = {
    "Distance_to_supermarket_m": "distance_supermarket_m",
    "Distance_to_train_station_m": "distance_train_station_m",
    "Distance_to_CBD_m": "distance_cbd_m",
    "TravelTime_to_supermarket_s": "travel_time_supermarket_mins",
    "TravelTime_to_train_station_s": "travel_time_train_station_mins",
    "TravelTime_to_CBD_s": "travel_time_cbd_mins",
}

merged_distance = (
    merged_distance
    # seconds -> minutes
    .assign(**{name: merged_distance[name].div(60) for name in time_cols_s})
    .rename(columns=rename_map)
    # "Address" came from the distance table and duplicates domain_df's "address"
    .drop(columns=["Address"])
)
merged_distance.head()

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,intercom,security_system,washing_machine,property_type_grouped,distance_supermarket_m,travel_time_supermarket_mins,distance_train_station_m,travel_time_train_station_mins,distance_cbd_m,travel_time_cbd_mins
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,0,0,0,Apartment,1774.0,5.633333,1928.0,5.033333,11552.0,18.066667
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,0,0,1,Apartment,2051.0,5.883333,2205.0,5.283333,11829.0,18.316667
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,0,0,1,Townhouse,2969.0,5.983333,2318.0,5.416667,11942.0,18.45
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,0,0,0,Townhouse,1831.0,5.566667,1985.0,4.983333,11609.0,18.0
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,0,0,0,Apartment,2127.0,6.1,2281.0,5.5,11905.0,18.533333


In [224]:
# Preview the school / health / CBD table to see which columns can be joined on.
school_health_df.head()

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,security_system,washing_machine,median_weekly_rent_sa2,geometry,nearest_education_dist_m,nearest_education_time_mins,nearest_health_dist_m,nearest_health_time_mins,cbd_dist_m,cbd_time_mins
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994,3/53 Greene Street,-37.830982,144.87091,2,...,0,0,650.0,POINT (144.87091 -37.830982),1060.74,3.023333,2427.16,4.2655,11051.07,17.002833
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738,1/3 New Street,-37.826218,144.86755,2,...,0,1,650.0,POINT (144.86755 -37.826218),1401.92,4.15,2549.43,4.859167,11392.26,18.0705
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454,19/92 New Street,-37.831226,144.86632,3,...,0,1,670.0,POINT (144.86632 -37.831226),1449.81,3.266167,1983.01,3.499833,11440.15,18.458167
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933,3/14 Saltley Street,-37.827423,144.86768,3,...,0,0,650.0,POINT (144.86768 -37.827423),1118.54,3.500167,2504.92,4.7915,11108.88,17.420833
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955,4/2B Saltley Street,-37.82627,144.8679,2,...,0,0,650.0,POINT (144.8679 -37.82627),1320.59,3.875667,2630.75,5.1335,11310.93,17.796333


In [225]:
# Columns taken from the school/health table: the join keys plus the six metrics.
school_health_cols = [
    'lat',
    'lon',
    'nearest_education_dist_m',
    'nearest_education_time_mins',
    'nearest_health_dist_m',
    'nearest_health_time_mins',
    'cbd_dist_m',
    'cbd_time_mins',
]

# One lookup row per coordinate pair so the left join cannot add rows.
school_health_df_unique = school_health_df.drop_duplicates(subset=['lat', 'lon'])

properties_df = pd.merge(
    merged_distance,
    school_health_df_unique[school_health_cols],
    how='left',
    on=['lat', 'lon'],
)
properties_df.head()

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,distance_train_station_m,travel_time_train_station_mins,distance_cbd_m,travel_time_cbd_mins,nearest_education_dist_m,nearest_education_time_mins,nearest_health_dist_m,nearest_health_time_mins,cbd_dist_m,cbd_time_mins
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,1928.0,5.033333,11552.0,18.066667,1060.74,3.023333,2427.16,4.2655,11051.07,17.002833
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,2205.0,5.283333,11829.0,18.316667,1401.92,4.15,2549.43,4.859167,11392.26,18.0705
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,2318.0,5.416667,11942.0,18.45,1449.81,3.266167,1983.01,3.499833,11440.15,18.458167
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,1985.0,4.983333,11609.0,18.0,1118.54,3.500167,2504.92,4.7915,11108.88,17.420833
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,2281.0,5.5,11905.0,18.533333,1320.59,3.875667,2630.75,5.1335,11310.93,17.796333


In [226]:
# Listings with no CBD distance from the school/health table (an empty frame
# means every listing matched).
cbd_dist_missing = properties_df["cbd_dist_m"].isna()
unmatched = properties_df.loc[cbd_dist_missing]
unmatched

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,distance_train_station_m,travel_time_train_station_mins,distance_cbd_m,travel_time_cbd_mins,nearest_education_dist_m,nearest_education_time_mins,nearest_health_dist_m,nearest_health_time_mins,cbd_dist_m,cbd_time_mins


In [227]:
# Two sets of CBD metrics exist at this point:
#   * distance_cbd_m / travel_time_cbd_mins from the distance table, which is
#     missing for the listings that had no coordinate match, and
#   * cbd_dist_m / cbd_time_mins from the school/health table, which has no gaps.
# Keep the complete set and drop the other.
#
# Guard: the following cell renames cbd_dist_m -> distance_cbd_m, so re-running
# this cell after it would otherwise delete the only remaining CBD columns.
replacement_cbd_cols = {"cbd_dist_m", "cbd_time_mins"}
if replacement_cbd_cols.issubset(properties_df.columns):
    properties_df = properties_df.drop(columns=["distance_cbd_m", "travel_time_cbd_mins"])

In [228]:
# Bring the school / health / CBD columns in line with the
# distance_<place>_m / travel_time_<place>_mins naming used for the other metrics.
rename_map = {
    # distances
    "nearest_education_dist_m": "distance_school_m",
    "nearest_health_dist_m": "distance_healthcare_m",
    "cbd_dist_m": "distance_cbd_m",
    # travel times
    "nearest_education_time_mins": "travel_time_school_mins",
    "nearest_health_time_mins": "travel_time_healthcare_mins",
    "cbd_time_mins": "travel_time_cbd_mins",
}
properties_df = properties_df.rename(rename_map, axis="columns")

In [229]:
# Preview the listings with all proximity columns under their final names.
properties_df.head()

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,distance_supermarket_m,travel_time_supermarket_mins,distance_train_station_m,travel_time_train_station_mins,distance_school_m,travel_time_school_mins,distance_healthcare_m,travel_time_healthcare_mins,distance_cbd_m,travel_time_cbd_mins
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,1774.0,5.633333,1928.0,5.033333,1060.74,3.023333,2427.16,4.2655,11051.07,17.002833
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,2051.0,5.883333,2205.0,5.283333,1401.92,4.15,2549.43,4.859167,11392.26,18.0705
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,2969.0,5.983333,2318.0,5.416667,1449.81,3.266167,1983.01,3.499833,11440.15,18.458167
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,1831.0,5.566667,1985.0,4.983333,1118.54,3.500167,2504.92,4.7915,11108.88,17.420833
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,2127.0,6.1,2281.0,5.5,1320.59,3.875667,2630.75,5.1335,11310.93,17.796333


In [230]:
# Crime data keyed by SA2 code (SA2_CODE_2021), with one column per metric and year.
crime_df = pd.read_csv(
    filepath_or_buffer="../../datasets/crime/crime_dataset_weighted_to_SA2(in) (1).csv",
)
crime_df.head(n=5)

Unnamed: 0,SA2_CODE_2021,SA2_NAME_2021,Incidents_2016,Incidents_2017,Incidents_2018,Incidents_2019,Incidents_2020,Incidents_2021,Incidents_2022,Incidents_2023,...,VictimRate_2016,VictimRate_2017,VictimRate_2018,VictimRate_2019,VictimRate_2020,VictimRate_2021,VictimRate_2022,VictimRate_2023,VictimRate_2024,VictimRate_2025
0,201011001,Alfredton,14605.67603,14906.55716,14393.76512,13148.18194,14094.50163,11625.98224,12240.68562,13308.32835,...,5740.096618,5300.755085,4992.193889,4344.419848,4677.325451,3521.461799,3882.390463,3769.703537,4313.093763,4245.666569
1,201011002,Ballarat,14605.67603,14906.55716,14393.76512,13148.18194,14094.50163,11625.98224,12240.68562,13308.32835,...,5740.096618,5300.755085,4992.193889,4344.419848,4677.325451,3521.461799,3882.390463,3769.703537,4313.093763,4245.666569
2,201011005,Buninyong,14548.83225,14879.64518,14357.93044,13111.79632,14042.6387,11655.65456,12212.32826,13260.75245,...,5562.381238,5160.2999,4856.329142,4219.608124,4531.211146,3444.831718,3763.718188,3670.649736,4203.021792,4143.264308
3,201011006,Delacombe,14605.67603,14906.55716,14393.76512,13148.18194,14094.50163,11625.98224,12240.68562,13308.32835,...,5740.096618,5300.755085,4992.193889,4344.419848,4677.325451,3521.461799,3882.390463,3769.703537,4313.093763,4245.666569
4,201011007,Smythes Creek,2700.0,2659.5,2164.5,2299.5,2115.0,2025.0,1903.5,2286.0,...,1948.582849,1733.516119,1220.300979,1436.371273,1288.680949,1132.894103,1074.20718,1084.766775,1363.10134,1354.460609


In [232]:
# Take only the SA2 key and the 2025 crime rate, renamed to match the property frame.
crime_2025 = (
    crime_df
    .loc[:, ["SA2_CODE_2021", "CrimeRate_2025"]]
    .rename(columns={
        "SA2_CODE_2021": "sa2_code",
        "CrimeRate_2025": "crime_rate_sa2_2025",
    })
)

# Left join keeps every listing; an SA2 code absent from the crime table yields NaN.
properties_merged = pd.merge(
    properties_df,
    crime_2025,
    how='left',
    on='sa2_code',
)


In [233]:
# Count listings that did not receive a crime rate from the join.
properties_merged.loc[:, ['crime_rate_sa2_2025']].isna().sum()

crime_rate_sa2_2025    0
dtype: int64

In [234]:
# Preview the first 10 listings with the SA2 crime rate attached.
properties_merged.head(10)

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,travel_time_supermarket_mins,distance_train_station_m,travel_time_train_station_mins,distance_school_m,travel_time_school_mins,distance_healthcare_m,travel_time_healthcare_mins,distance_cbd_m,travel_time_cbd_mins,crime_rate_sa2_2025
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,5.633333,1928.0,5.033333,1060.74,3.023333,2427.16,4.2655,11051.07,17.002833,6491.46985
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,5.883333,2205.0,5.283333,1401.92,4.15,2549.43,4.859167,11392.26,18.0705,6491.46985
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,5.983333,2318.0,5.416667,1449.81,3.266167,1983.01,3.499833,11440.15,18.458167,6491.46985
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,5.566667,1985.0,4.983333,1118.54,3.500167,2504.92,4.7915,11108.88,17.420833,6491.46985
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,6.1,2281.0,5.5,1320.59,3.875667,2630.75,5.1335,11310.93,17.796333,6491.46985
5,213021344,Newport,SOUTH KINGSVILLE,3015,750.0,3258.0,2/142 Blackshaws Road,-37.833134,144.86981,3,...,5.066667,2133.0,5.35,1254.39,2.357167,2167.3,3.599333,11257.9,17.456833,6491.46985
6,203031049,Lorne - Anglesea,LORNE,3232,625.0,2715.0,7 Holliday Road,-38.528168,143.97572,3,...,2.511167,,,2125.16,3.9435,2272.08,4.296167,160523.05,123.344833,2510.609182
7,203031049,Lorne - Anglesea,LORNE,3232,425.0,1846.0,16/22-28 Mountjoy Parade,-38.53726,143.974,1,...,1.015167,,,380.84,0.6935,527.77,1.046167,161371.39,124.453833,2510.609182
8,203031049,Lorne - Anglesea,LORNE,3232,270.0,1173.0,1/4 Smithers Street,-38.52739,143.98016,1,...,2.139833,,,2002.49,3.572167,2149.42,3.924833,160089.58,122.6865,2510.609182
9,217031472,Colac,COLAC,3250,410.0,1640.0,143 Hearn Street,-38.349373,143.58618,3,...,2.104333,1086.23,2.470833,586.29,1.4975,1071.14,1.903667,152663.2,116.597833,6091.996375


In [235]:
# List every column carried forward into the SA2 feature merges.
properties_merged.columns

Index(['sa2_code', 'sa2_name', 'suburb', 'postcode', 'weekly_rent', 'bond',
       'address', 'lat', 'lon', 'bedrooms', 'bathrooms', 'carspaces',
       'property_type', 'balcony', 'car_parking', 'heating',
       'air_conditioning', 'builtin_wardrobes', 'swimming_pool', 'ensuite',
       'dishwasher', 'garden', 'gym', 'pets_allowed', 'gas', 'intercom',
       'security_system', 'washing_machine', 'property_type_grouped',
       'distance_supermarket_m', 'travel_time_supermarket_mins',
       'distance_train_station_m', 'travel_time_train_station_mins',
       'distance_school_m', 'travel_time_school_mins', 'distance_healthcare_m',
       'travel_time_healthcare_mins', 'distance_cbd_m', 'travel_time_cbd_mins',
       'crime_rate_sa2_2025'],
      dtype='object')

In [236]:
# Row / column count, used as the baseline when checking the later joins.
properties_merged.shape

(12616, 40)

### Merge latest population and income data per SA2

In [239]:
# Inputs for the SA2-level population and income features.
income_df = pd.read_csv(
    filepath_or_buffer="../../datasets/income/income_predictions_2017_final(in).csv",
)
erp_df = pd.read_csv(
    filepath_or_buffer="../../datasets/raw/cleaned/curated/full_erp_only_population_data.csv",
)
population_df = pd.read_csv(
    filepath_or_buffer="../../datasets/raw/cleaned/curated/population_data.csv",
)

In [241]:
# Inspect which ERP years and demographic columns the population table already has.
population_df.columns

Index(['sa2_code', 'sa2_name', 'erp_2015', 'erp_2016', 'erp_2017', 'erp_2018',
       'erp_2019', 'erp_2020', 'erp_2021', 'erp_2022', 'erp_2023', 'erp_2024',
       'erp_change_number_2023_24', 'erp_change_per_cent_2023_24', 'area_km2',
       'pop_density_2024_people_per_km2', 'births_2021_22', 'deaths_2021_22',
       'natural_increase_2021_22', 'internal_arrivals_2021_22',
       'internal_departures_2021_22', 'net_internal_migration_2021_22',
       'overseas_arrivals_2021_22', 'overseas_departures_2021_22',
       'net_overseas_migration_2021_22', 'births_2022_23', 'deaths_2022_23',
       'natural_increase_2022_23', 'internal_arrivals_2022_23',
       'internal_departures_2022_23', 'net_internal_migration_2022_23',
       'overseas_arrivals_2022_23', 'overseas_departures_2022_23',
       'net_overseas_migration_2022_23', 'births_2023_24', 'deaths_2023_24',
       'natural_increase_2023_24', 'internal_arrivals_2023_24',
       'internal_departures_2023_24', 'net_internal_migration

In [242]:
# Size of the population table before de-duplication.
population_df.shape

(530, 79)

In [243]:
# Size of the ERP-only table, for comparison with population_df.
erp_df.shape

(530, 23)

In [244]:
# Inspect which ERP years the ERP-only table covers.
erp_df.columns

Index(['sa2_code', 'erp_2015', 'erp_2016', 'erp_2017', 'erp_2018', 'erp_2019',
       'erp_2020', 'erp_2021', 'erp_2022', 'erp_2023', 'erp_2024', 'erp_2025',
       'erp_2026', 'erp_2027', 'erp_2028', 'erp_2029', 'erp_2030', 'erp_2031',
       'erp_2032', 'erp_2033', 'erp_2034', 'erp_2035', 'erp_2036'],
      dtype='object')

In [246]:
# Keep the first row for each SA2 code in both tables so that later joins on
# sa2_code cannot multiply rows.
population_df = population_df.drop_duplicates(subset=['sa2_code'], keep='first')
erp_df = erp_df.drop_duplicates(subset=['sa2_code'], keep='first')

In [247]:
# ERP columns for 2015-2036 that population_df does not already carry.
missing_erp_cols = [
    f'erp_{year}'
    for year in range(2015, 2037)
    if f'erp_{year}' not in population_df.columns
]

# Pull every missing year from erp_df in a single left join rather than one
# merge per year. The guard makes the cell a no-op when nothing is missing,
# e.g. when it is re-run.
if missing_erp_cols:
    population_df = population_df.merge(
        erp_df[['sa2_code'] + missing_erp_cols],
        on='sa2_code',
        how='left'
    )

In [248]:
# Size of the population table after de-duplication and adding the extra ERP years.
population_df.shape

(522, 88)

In [249]:
# 2025 population density: erp_2025 divided by the SA2 area in km^2.
population_df['pop_density_2025_people_per_km2'] = population_df['erp_2025'].div(
    population_df['area_km2']
)

In [250]:
# Persist the population table with the added ERP years and 2025 density.
population_df.to_csv(
    path_or_buf="../../datasets/raw/cleaned/curated/population_data_with_predicted.csv",
    index=False,
)

In [251]:
# SA2 join key plus the population features to attach to each listing.
sa2_pop_features = [
    'sa2_code',
    'erp_2025',
    'pop_density_2025_people_per_km2',
    'pop_growth_2020_2024',
    'median_historical_growth',
    'erp_change_per_cent_2023_24',
]
pop_df_selected = population_df.loc[:, sa2_pop_features].copy()

In [252]:
# Attach the SA2 population features to every listing.
# validate='many_to_one' makes pandas raise if pop_df_selected ever holds a
# repeated sa2_code, instead of silently duplicating listings; the row count
# therefore has to stay equal to properties_merged's.
property_with_pop = properties_merged.merge(
    pop_df_selected,
    how='left',
    on='sa2_code',
    validate='many_to_one',
)
property_with_pop.columns

Index(['sa2_code', 'sa2_name', 'suburb', 'postcode', 'weekly_rent', 'bond',
       'address', 'lat', 'lon', 'bedrooms', 'bathrooms', 'carspaces',
       'property_type', 'balcony', 'car_parking', 'heating',
       'air_conditioning', 'builtin_wardrobes', 'swimming_pool', 'ensuite',
       'dishwasher', 'garden', 'gym', 'pets_allowed', 'gas', 'intercom',
       'security_system', 'washing_machine', 'property_type_grouped',
       'distance_supermarket_m', 'travel_time_supermarket_mins',
       'distance_train_station_m', 'travel_time_train_station_mins',
       'distance_school_m', 'travel_time_school_mins', 'distance_healthcare_m',
       'travel_time_healthcare_mins', 'distance_cbd_m', 'travel_time_cbd_mins',
       'crime_rate_sa2_2025', 'erp_2025', 'pop_density_2025_people_per_km2',
       'pop_growth_2020_2024', 'median_historical_growth',
       'erp_change_per_cent_2023_24'],
      dtype='object')

In [253]:
# Preview the listings with the SA2 population features attached.
property_with_pop.head()

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,distance_healthcare_m,travel_time_healthcare_mins,distance_cbd_m,travel_time_cbd_mins,crime_rate_sa2_2025,erp_2025,pop_density_2025_people_per_km2,pop_growth_2020_2024,median_historical_growth,erp_change_per_cent_2023_24
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,2427.16,4.2655,11051.07,17.002833,6491.46985,20138.014352,2277.875547,2.724838,4.891701,2.3348
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,2549.43,4.859167,11392.26,18.0705,6491.46985,20138.014352,2277.875547,2.724838,4.891701,2.3348
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,1983.01,3.499833,11440.15,18.458167,6491.46985,16287.17176,1006.350051,3.36451,4.702436,3.4317
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,2504.92,4.7915,11108.88,17.420833,6491.46985,20138.014352,2277.875547,2.724838,4.891701,2.3348
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,2630.75,5.1335,11310.93,17.796333,6491.46985,20138.014352,2277.875547,2.724838,4.891701,2.3348


In [254]:
# The row count should equal properties_merged's: the left join must not add rows.
property_with_pop.shape

(12616, 45)

In [255]:
# Count listings that did not receive each population feature from the join.
property_with_pop[['erp_2025', 'pop_density_2025_people_per_km2', 'pop_growth_2020_2024', 'median_historical_growth', 'erp_change_per_cent_2023_24']].isna().sum()

erp_2025                           0
pop_density_2025_people_per_km2    0
pop_growth_2020_2024               0
median_historical_growth           0
erp_change_per_cent_2023_24        0
dtype: int64

In [256]:
# Prefix the population features with "sa2_" to mark them as SA2-level
# (not per-listing) values.
property_with_pop = property_with_pop.rename(columns={
    feature: f'sa2_{feature}'
    for feature in (
        'erp_2025',
        'pop_density_2025_people_per_km2',
        'pop_growth_2020_2024',
        'median_historical_growth',
        'erp_change_per_cent_2023_24',
    )
})

In [257]:
# Shape is unchanged by the rename (same rows and columns as before).
property_with_pop.shape

(12616, 45)

In [258]:
# Confirm each SA2 code appears only once in the income table before joining on it.
# The check runs on income_df directly because income_df_selected is only created
# in a later cell, so referencing it here fails on a fresh kernel. Row 0 is
# skipped to match that later selection, which also drops it.
print(income_df['SA2'].drop(0).duplicated().sum())

0


In [259]:
# Inspect the income table's columns to pick the SA2 key and the features to keep.
income_df.columns

Index(['SA2', 'SA2 NAME', '2017-18.sum', '2018-19.sum', '2019-20.sum',
       '2020-21.sum', '2021-22.sum', '2017.med', '2018.med', '2019.med',
       '2020.med', '2021.med', '2022.med', '2017-18.mean', '2018-19.mean',
       '2019-20.mean', '2020-21.mean', '2021-22.mean', 'med_CAGR',
       'Predicted_Income_2023', 'Predicted_Income_2024',
       'Predicted_Income_2025', 'Predicted_Income_2026',
       'Predicted_Income_2027', 'Predicted_Income_2028',
       'Predicted_Income_2029', 'Predicted_Income_2030'],
      dtype='object')

In [260]:
# SA2 key, median-income CAGR and the 2025 income prediction.
sa2_income_features = ['SA2', 'med_CAGR', 'Predicted_Income_2025']

# Discard the row labelled 0 before selecting the feature columns.
income_df_selected = income_df.drop(index=0)[sa2_income_features].copy()
income_df_selected.head(n=5)

Unnamed: 0,SA2,med_CAGR,Predicted_Income_2025
1,201011001,3.132328,62308.65714
2,201011002,2.924842,59205.51429
3,201011005,1.836983,58239.14286
4,201011006,2.866193,57409.67619
5,201011007,2.740581,61069.40952


In [261]:
# Use the notebook's sa2_* snake_case names; 'sa2_code' is the shared join key.
income_column_names = {
    'SA2': 'sa2_code',
    'med_CAGR': 'sa2_median_income_cagr',
    'Predicted_Income_2025': 'sa2_predicted_median_income_25',
}
income_df_selected = income_df_selected.rename(income_column_names, axis='columns')
income_df_selected.head(n=5)

Unnamed: 0,sa2_code,sa2_median_income_cagr,sa2_predicted_median_income_25
1,201011001,3.132328,62308.65714
2,201011002,2.924842,59205.51429
3,201011005,1.836983,58239.14286
4,201011006,2.866193,57409.67619
5,201011007,2.740581,61069.40952


In [262]:
# Cast the income key to int before joining.
# NOTE(review): presumably needed so it compares equal to the property frame's
# sa2_code values - confirm both dtypes.
income_df_selected['sa2_code'] = income_df_selected['sa2_code'].astype(int)

# Attach the SA2 income features to every listing.
# validate='many_to_one' makes pandas raise if a sa2_code repeats in the income
# table, instead of silently duplicating listings.
property_with_pop_income = property_with_pop.merge(
    income_df_selected,
    how='left',
    on='sa2_code',
    validate='many_to_one',
)
property_with_pop_income.columns

Index(['sa2_code', 'sa2_name', 'suburb', 'postcode', 'weekly_rent', 'bond',
       'address', 'lat', 'lon', 'bedrooms', 'bathrooms', 'carspaces',
       'property_type', 'balcony', 'car_parking', 'heating',
       'air_conditioning', 'builtin_wardrobes', 'swimming_pool', 'ensuite',
       'dishwasher', 'garden', 'gym', 'pets_allowed', 'gas', 'intercom',
       'security_system', 'washing_machine', 'property_type_grouped',
       'distance_supermarket_m', 'travel_time_supermarket_mins',
       'distance_train_station_m', 'travel_time_train_station_mins',
       'distance_school_m', 'travel_time_school_mins', 'distance_healthcare_m',
       'travel_time_healthcare_mins', 'distance_cbd_m', 'travel_time_cbd_mins',
       'crime_rate_sa2_2025', 'sa2_erp_2025',
       'sa2_pop_density_2025_people_per_km2', 'sa2_pop_growth_2020_2024',
       'sa2_median_historical_growth', 'sa2_erp_change_per_cent_2023_24',
       'sa2_median_income_cagr', 'sa2_predicted_median_income_25'],
      dtype='objec

In [263]:
# Preview the listings with the population and income features attached.
property_with_pop_income.head()

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,distance_cbd_m,travel_time_cbd_mins,crime_rate_sa2_2025,sa2_erp_2025,sa2_pop_density_2025_people_per_km2,sa2_pop_growth_2020_2024,sa2_median_historical_growth,sa2_erp_change_per_cent_2023_24,sa2_median_income_cagr,sa2_predicted_median_income_25
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,11051.07,17.002833,6491.46985,20138.014352,2277.875547,2.724838,4.891701,2.3348,3.920093,85258.92381
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,11392.26,18.0705,6491.46985,20138.014352,2277.875547,2.724838,4.891701,2.3348,3.920093,85258.92381
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,11440.15,18.458167,6491.46985,16287.17176,1006.350051,3.36451,4.702436,3.4317,4.22032,70070.10476
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,11108.88,17.420833,6491.46985,20138.014352,2277.875547,2.724838,4.891701,2.3348,3.920093,85258.92381
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,11310.93,17.796333,6491.46985,20138.014352,2277.875547,2.724838,4.891701,2.3348,3.920093,85258.92381


In [264]:
# Count listings that did not receive each income feature from the join.
property_with_pop_income[['sa2_median_income_cagr', 'sa2_predicted_median_income_25']].isna().sum()

sa2_median_income_cagr            0
sa2_predicted_median_income_25    0
dtype: int64

In [265]:
# Final size check: the row count should still equal the number of listings.
property_with_pop_income.shape

(12616, 47)

In [266]:
# Write the combined listing + SA2 feature table for the downstream steps.
property_with_pop_income.to_csv(
    path_or_buf='../../datasets/raw/cleaned/curated/domain_combined_all_with_sa2_features.csv',
    index=False,
)