# Domain Data Integration

## Purpose
Merge and enrich the Domain property dataset with amenity distances, crime rates, population, and income data at the SA2 level for analysis and modeling.

## Inputs
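- `domain_cleaned.csv` – cleaned Domain property listings (the base dataset every other input is merged onto)  
- `property_with_distances_copy.csv` – distances and travel times to supermarkets, train stations, and the CBD  
- `sch,hospital,cbd(in).csv` – distances and travel times to the nearest school, healthcare facility, and the CBD (these CBD values are the ones kept in the output)  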
- `crime_dataset_weighted_to_SA2(in) (1).csv` – crime rates  
- `population_data.csv`, `full_erp_only_population_data.csv` – population and ERP data  
- `income_predictions_2017_final(in).csv` – predicted income and CAGR  

## Outputs
- `domain_combined_all_with_sa2_features.csv` – integrated dataset combining property and SA2-level features

## Key Steps
1. Merge property dataset with distances and amenities.  
2. Add SA2-level crime, population, and income features.  
3. Rename and standardize feature columns.  
4. Export final integrated dataset.


In [9]:
# Libraries
import os
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import folium
import numpy as np
from pathlib import Path
import statsmodels.api as sm
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from fuzzywuzzy import process
from collections import Counter

In [10]:
# Load the cleaned Domain listings; every later merge is joined onto this frame.
domain_csv_path = "../../datasets/raw/cleaned/domain_cleaned.csv"
domain_df = pd.read_csv(domain_csv_path)
domain_df.head()

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,ensuite,dishwasher,garden,gym,pets_allowed,gas,intercom,security_system,washing_machine,property_type_grouped
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,0,0,0,0,1,0,0,0,0,Apartment
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,0,0,0,0,1,0,0,0,1,Apartment
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,1,1,0,0,1,1,0,0,1,Townhouse
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,0,1,0,0,0,0,0,0,0,Townhouse
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,0,0,0,0,0,0,0,0,0,Apartment


## Prepare Domain Data for analysis

### Merge with nearest amenities information

In [11]:
# Amenity inputs, both keyed by property coordinates:
#   - supermarket / train station / CBD distances and travel times
#   - nearest school / healthcare / CBD distances and travel times
distances_csv_path = "../../datasets/property/property_with_distances_copy.csv"
school_health_csv_path = '../../datasets/property/sch,hospital,cbd(in).csv'

distances_df = pd.read_csv(distances_csv_path)
school_health_df = pd.read_csv(school_health_csv_path)
distances_df.head()

Unnamed: 0,Property_Lat,Property_Lon,Address,Distance_to_supermarket_m,TravelTime_to_supermarket_s,Distance_to_train_station_m,TravelTime_to_train_station_s,Distance_to_CBD_m,TravelTime_to_CBD_s
0,-38.665943,146.32794,12 Gunn Street,576.89,85.03,,,182294.98,8502.36
1,-38.65126,146.20575,14 Church Hill Road,418.16,74.67,,,170509.42,7966.59
2,-38.633835,145.7225,5 Howsam Place,679.56,157.02,,,146906.58,7232.62
3,-38.632805,145.73022,2/1 High Street,420.97,89.12,,,146913.31,7204.46
4,-38.631218,145.72937,23 Bayview Avenue,433.16,84.57,,,146803.75,7191.09


In [12]:
# Align the coordinate column names with domain_df so they can act as join keys.
distances_df = distances_df.rename(columns={"Property_Lat": "lat", "Property_Lon": "lon"})

# Keep one distance record per coordinate pair. Without this, any (lat, lon)
# that appears more than once in distances_df would duplicate the matching
# property rows in the left merge. The school/healthcare merge further down
# de-duplicates on the same keys for the same reason.
distances_unique = distances_df.drop_duplicates(subset=["lat", "lon"])

# Attach supermarket, train station and CBD distances / travel times.
merged_distance = pd.merge(domain_df, distances_unique, on=["lat", "lon"], how="left")

# Travel-time columns arrive in seconds (suffix "_s"); convert them all to
# minutes in one vectorised step.
time_cols_s = ["TravelTime_to_supermarket_s", "TravelTime_to_train_station_s", "TravelTime_to_CBD_s"]
merged_distance[time_cols_s] = merged_distance[time_cols_s] / 60

# Standardise names: snake_case, with the unit as a suffix.
rename_map = {
    "Distance_to_supermarket_m": "distance_supermarket_m",
    "TravelTime_to_supermarket_s": "travel_time_supermarket_mins",
    "Distance_to_train_station_m": "distance_train_station_m",
    "TravelTime_to_train_station_s": "travel_time_train_station_mins",
    "Distance_to_CBD_m": "distance_cbd_m",
    "TravelTime_to_CBD_s": "travel_time_cbd_mins",
}
merged_distance = merged_distance.rename(columns=rename_map)

# The distances file's "Address" duplicates the listing's own "address" column.
merged_distance = merged_distance.drop(columns=["Address"])
merged_distance.head()

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,intercom,security_system,washing_machine,property_type_grouped,distance_supermarket_m,travel_time_supermarket_mins,distance_train_station_m,travel_time_train_station_mins,distance_cbd_m,travel_time_cbd_mins
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,0,0,0,Apartment,1774.0,5.633333,1928.0,5.033333,11552.0,18.066667
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,0,0,1,Apartment,2051.0,5.883333,2205.0,5.283333,11829.0,18.316667
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,0,0,1,Townhouse,2969.0,5.983333,2318.0,5.416667,11942.0,18.45
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,0,0,0,Townhouse,1831.0,5.566667,1985.0,4.983333,11609.0,18.0
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,0,0,0,Apartment,2127.0,6.1,2281.0,5.5,11905.0,18.533333


In [13]:
# Merge with the school, healthcare and CBD dataset.
# Columns carried over from that dataset (join keys first).
school_health_cols = [
    'lat', 'lon',
    'nearest_education_dist_m', 'nearest_education_time_mins',
    'nearest_health_dist_m', 'nearest_health_time_mins',
    'cbd_dist_m', 'cbd_time_mins',
]

# One row per coordinate pair so the left join cannot multiply property rows.
school_health_df_unique = school_health_df.drop_duplicates(subset=['lat', 'lon'])

properties_df = pd.merge(
    merged_distance,
    school_health_df_unique[school_health_cols],
    on=['lat', 'lon'],
    how='left',
)

properties_df.head()

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,distance_train_station_m,travel_time_train_station_mins,distance_cbd_m,travel_time_cbd_mins,nearest_education_dist_m,nearest_education_time_mins,nearest_health_dist_m,nearest_health_time_mins,cbd_dist_m,cbd_time_mins
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,1928.0,5.033333,11552.0,18.066667,1060.74,3.023333,2427.16,4.2655,11051.07,17.002833
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,2205.0,5.283333,11829.0,18.316667,1401.92,4.15,2549.43,4.859167,11392.26,18.0705
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,2318.0,5.416667,11942.0,18.45,1449.81,3.266167,1983.01,3.499833,11440.15,18.458167
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,1985.0,4.983333,11609.0,18.0,1118.54,3.500167,2504.92,4.7915,11108.88,17.420833
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,2281.0,5.5,11905.0,18.533333,1320.59,3.875667,2630.75,5.1335,11310.93,17.796333


In [14]:
# Two sets of CBD distance/time columns exist at this point. Discard the pair
# that came from the distances file and let the pair from the
# school/healthcare/CBD file take over the standard column names.
superseded_cbd_cols = ["distance_cbd_m", "travel_time_cbd_mins"]

rename_map = {
    "nearest_education_dist_m": "distance_school_m",
    "nearest_education_time_mins": "travel_time_school_mins",
    "nearest_health_dist_m": "distance_healthcare_m",
    "nearest_health_time_mins": "travel_time_healthcare_mins",
    "cbd_dist_m": "distance_cbd_m",
    "cbd_time_mins": "travel_time_cbd_mins",
}

properties_df = (
    properties_df
    .drop(columns=superseded_cbd_cols)
    .rename(columns=rename_map)
)

### Merge with crime data

In [15]:
# SA2-level crime dataset (yearly columns per SA2, keyed by SA2_CODE_2021).
crime_csv_path = "../../datasets/crime/crime_dataset_weighted_to_SA2(in) (1).csv"
crime_df = pd.read_csv(crime_csv_path)
crime_df.head()

Unnamed: 0,SA2_CODE_2021,SA2_NAME_2021,Incidents_2016,Incidents_2017,Incidents_2018,Incidents_2019,Incidents_2020,Incidents_2021,Incidents_2022,Incidents_2023,...,VictimRate_2016,VictimRate_2017,VictimRate_2018,VictimRate_2019,VictimRate_2020,VictimRate_2021,VictimRate_2022,VictimRate_2023,VictimRate_2024,VictimRate_2025
0,201011001,Alfredton,14605.67603,14906.55716,14393.76512,13148.18194,14094.50163,11625.98224,12240.68562,13308.32835,...,5740.096618,5300.755085,4992.193889,4344.419848,4677.325451,3521.461799,3882.390463,3769.703537,4313.093763,4245.666569
1,201011002,Ballarat,14605.67603,14906.55716,14393.76512,13148.18194,14094.50163,11625.98224,12240.68562,13308.32835,...,5740.096618,5300.755085,4992.193889,4344.419848,4677.325451,3521.461799,3882.390463,3769.703537,4313.093763,4245.666569
2,201011005,Buninyong,14548.83225,14879.64518,14357.93044,13111.79632,14042.6387,11655.65456,12212.32826,13260.75245,...,5562.381238,5160.2999,4856.329142,4219.608124,4531.211146,3444.831718,3763.718188,3670.649736,4203.021792,4143.264308
3,201011006,Delacombe,14605.67603,14906.55716,14393.76512,13148.18194,14094.50163,11625.98224,12240.68562,13308.32835,...,5740.096618,5300.755085,4992.193889,4344.419848,4677.325451,3521.461799,3882.390463,3769.703537,4313.093763,4245.666569
4,201011007,Smythes Creek,2700.0,2659.5,2164.5,2299.5,2115.0,2025.0,1903.5,2286.0,...,1948.582849,1733.516119,1220.300979,1436.371273,1288.680949,1132.894103,1074.20718,1084.766775,1363.10134,1354.460609


In [16]:
# Keep just the SA2 key and the 2025 crime rate, renamed to match the
# property table's naming conventions.
crime_2025 = (
    crime_df.loc[:, ["SA2_CODE_2021", "CrimeRate_2025"]]
    .rename(columns={
        "SA2_CODE_2021": "sa2_code",
        "CrimeRate_2025": "crime_rate_sa2_2025",
    })
)

# Attach the SA2 crime rate to every property in that SA2.
properties_merged = pd.merge(properties_df, crime_2025, how='left', on='sa2_code')


### Merge with population and income data

In [17]:
# SA2-level population, ERP and income inputs.
curated_dir = "../../datasets/raw/cleaned/curated/"

population_df = pd.read_csv(curated_dir + "population_data.csv")
erp_df = pd.read_csv(curated_dir + "full_erp_only_population_data.csv")
income_df = pd.read_csv("../../datasets/income/income_predictions_2017_final(in).csv")

In [18]:
# One row per SA2 so later joins on sa2_code cannot multiply rows.
population_df = population_df.drop_duplicates(subset='sa2_code')
erp_df = erp_df.drop_duplicates(subset='sa2_code')

# Back-fill any yearly ERP column (erp_2015 .. erp_2036) that population_df
# lacks, taking it from erp_df. All missing columns are pulled in with a
# single merge rather than one merge per year; the resulting columns and
# their order are the same.
missing_erp_cols = [
    f'erp_{year}'
    for year in range(2015, 2037)
    if f'erp_{year}' not in population_df.columns
]
if missing_erp_cols:
    population_df = population_df.merge(
        erp_df[['sa2_code', *missing_erp_cols]],
        on='sa2_code',
        how='left',
    )

# Derived feature: 2025 ERP divided by the SA2's area_km2.
population_df['pop_density_2025_people_per_km2'] = population_df['erp_2025'] / population_df['area_km2']

In [19]:
# SA2 population features carried into the property table (join key first).
sa2_pop_features = [
    'sa2_code',
    'erp_2025',
    'pop_density_2025_people_per_km2',
    'pop_growth_2020_2024',
    'median_historical_growth',
    'erp_change_per_cent_2023_24',
]
# Independent copy so later edits do not touch population_df.
pop_df_selected = population_df.loc[:, sa2_pop_features].copy()

In [20]:
# Attach the SA2 population features to each property, then list the
# resulting columns.
property_with_pop = pd.merge(properties_merged, pop_df_selected, on='sa2_code', how='left')
property_with_pop.columns

Index(['sa2_code', 'sa2_name', 'suburb', 'postcode', 'weekly_rent', 'bond',
       'address', 'lat', 'lon', 'bedrooms', 'bathrooms', 'carspaces',
       'property_type', 'balcony', 'car_parking', 'heating',
       'air_conditioning', 'builtin_wardrobes', 'swimming_pool', 'ensuite',
       'dishwasher', 'garden', 'gym', 'pets_allowed', 'gas', 'intercom',
       'security_system', 'washing_machine', 'property_type_grouped',
       'distance_supermarket_m', 'travel_time_supermarket_mins',
       'distance_train_station_m', 'travel_time_train_station_mins',
       'distance_school_m', 'travel_time_school_mins', 'distance_healthcare_m',
       'travel_time_healthcare_mins', 'distance_cbd_m', 'travel_time_cbd_mins',
       'crime_rate_sa2_2025', 'erp_2025', 'pop_density_2025_people_per_km2',
       'pop_growth_2020_2024', 'median_historical_growth',
       'erp_change_per_cent_2023_24'],
      dtype='object')

In [None]:
# Prefix the SA2-level population features with "sa2_" so they are clearly
# distinguishable from property-level columns.
# NOTE(review): this cell has no execution count and the recorded outputs
# further down still show the unprefixed names (e.g. erp_2025) - re-run it
# before exporting if the prefixed names are expected in the CSV.
sa2_population_renames = {
    'erp_2025': 'sa2_erp_2025',
    'pop_density_2025_people_per_km2': 'sa2_pop_density_2025_people_per_km2',
    'pop_growth_2020_2024': 'sa2_pop_growth_2020_2024',
    'median_historical_growth': 'sa2_median_historical_growth',
    'erp_change_per_cent_2023_24': 'sa2_erp_change_per_cent_2023_24',
}
property_with_pop = property_with_pop.rename(columns=sa2_population_renames)

In [22]:
# List the income dataset's columns to pick the features selected below.
income_df.columns

Index(['SA2', 'SA2 NAME', '2017-18.sum', '2018-19.sum', '2019-20.sum',
       '2020-21.sum', '2021-22.sum', '2017.med', '2018.med', '2019.med',
       '2020.med', '2021.med', '2022.med', '2017-18.mean', '2018-19.mean',
       '2019-20.mean', '2020-21.mean', '2021-22.mean', 'med_CAGR',
       'Predicted_Income_2023', 'Predicted_Income_2024',
       'Predicted_Income_2025', 'Predicted_Income_2026',
       'Predicted_Income_2027', 'Predicted_Income_2028',
       'Predicted_Income_2029', 'Predicted_Income_2030'],
      dtype='object')

In [None]:
# Income features to carry over: SA2 key, median-income CAGR and the 2025
# predicted income.
sa2_income_features = ['SA2', 'med_CAGR', 'Predicted_Income_2025']

# Map the source names onto the dataset's "sa2_" naming convention.
income_column_names = {
    'SA2': 'sa2_code',
    'med_CAGR': 'sa2_median_income_cagr',
    'Predicted_Income_2025': 'sa2_predicted_median_income_25',
}

# NOTE(review): the row with index label 0 is discarded by position -
# presumably a non-SA2 row (header/total) in the source file; confirm against
# the CSV, since this would silently drop a real SA2 if the layout changes.
income_df_selected = (
    income_df[sa2_income_features]
    .drop(index=0)
    .rename(columns=income_column_names)
)
income_df_selected.head()

Unnamed: 0,sa2_code,sa2_median_income_cagr,sa2_predicted_median_income_25
1,201011001,3.132328,62308.65714
2,201011002,2.924842,59205.51429
3,201011005,1.836983,58239.14286
4,201011006,2.866193,57409.67619
5,201011007,2.740581,61069.40952


In [24]:
# Cast the income table's join key to int before merging
# (presumably to match the dtype of sa2_code in the property table - confirm).
income_df_selected['sa2_code'] = income_df_selected['sa2_code'].astype(int)

# Attach SA2 income features to each property.
property_with_pop_income = pd.merge(
    property_with_pop,
    income_df_selected,
    on='sa2_code',
    how='left',
)

In [25]:
# Preview the merged property + SA2 feature table.
property_with_pop_income.head()

Unnamed: 0,sa2_code,sa2_name,suburb,postcode,weekly_rent,bond,address,lat,lon,bedrooms,...,distance_cbd_m,travel_time_cbd_mins,crime_rate_sa2_2025,erp_2025,pop_density_2025_people_per_km2,pop_growth_2020_2024,median_historical_growth,erp_change_per_cent_2023_24,sa2_median_income_cagr,sa2_predicted_median_income_25
0,213021344,Newport,SOUTH KINGSVILLE,3015,460.0,1994.0,3/53 Greene Street,-37.830982,144.87091,2,...,11051.07,17.002833,6491.46985,20138.014352,2277.875547,2.724838,4.891701,2.3348,3.920093,85258.92381
1,213021344,Newport,SOUTH KINGSVILLE,3015,400.0,1738.0,1/3 New Street,-37.826218,144.86755,2,...,11392.26,18.0705,6491.46985,20138.014352,2277.875547,2.724838,4.891701,2.3348,3.920093,85258.92381
2,213021343,Altona North,SOUTH KINGSVILLE,3015,795.0,3454.0,19/92 New Street,-37.831226,144.86632,3,...,11440.15,18.458167,6491.46985,16287.17176,1006.350051,3.36451,4.702436,3.4317,4.22032,70070.10476
3,213021344,Newport,SOUTH KINGSVILLE,3015,675.0,2933.0,3/14 Saltley Street,-37.827423,144.86768,3,...,11108.88,17.420833,6491.46985,20138.014352,2277.875547,2.724838,4.891701,2.3348,3.920093,85258.92381
4,213021344,Newport,SOUTH KINGSVILLE,3015,450.0,1955.0,4/2B Saltley Street,-37.82627,144.8679,2,...,11310.93,17.796333,6491.46985,20138.014352,2277.875547,2.724838,4.891701,2.3348,3.920093,85258.92381


In [265]:
# (rows, columns) of the merged table, checked before export.
property_with_pop_income.shape

(12616, 47)

In [266]:
# Write the integrated dataset; index=False keeps the row index out of the CSV.
output_csv_path = '../../datasets/raw/cleaned/curated/domain_combined_all_with_sa2_features.csv'
property_with_pop_income.to_csv(output_csv_path, index=False)