# This Notebook finalise the dataset that we will use for training the time series model

In [11]:
import pandas as pd
import numpy as np
properties_df = pd.read_csv('../datasets/all_properties_tidy_enriched.csv')

  properties_df = pd.read_csv('../datasets/all_properties_tidy_enriched.csv')


# Cleaning property data

In [12]:
missing_both = properties_df[properties_df["SA2_CODE21"].isna() | properties_df["LGA_CODE21"].isna()]
missing_both

Unnamed: 0,Suburb,date,bedrooms,property_type,Count,Median,Lat,Lng,SA2_CODE21,SA2_NAME21,LGA_CODE21,LGA_NAME21
45120,Group Total,1/03/2000,1.0,flat,7746,160,,,,,,
45121,Group Total,1/03/2000,2.0,flat,9079,245,,,,,,
45122,Group Total,1/03/2000,2.0,house,2677,260,,,,,,
45123,Group Total,1/03/2000,3.0,flat,1448,370,,,,,,
45124,Group Total,1/03/2000,3.0,house,1816,320,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
91990,Wanagaratta,1/03/2025,2.0,house,23,390,,,,,,
91991,Wanagaratta,1/03/2025,3.0,flat,22,450,,,,,,
91992,Wanagaratta,1/03/2025,3.0,house,234,450,,,,,,
91993,Wanagaratta,1/03/2025,4.0,house,70,580,,,,,,


In [13]:
#Convert to integers
properties_df["SA2_CODE21"] = properties_df["SA2_CODE21"].astype("Int64")
properties_df["LGA_CODE21"] = properties_df["LGA_CODE21"].astype("Int64")

In [14]:
#Delete all "group total" rows, clean Wanagaratta SA2 code

# Drop rows where Suburb == "Group Total"
properties_df = properties_df[properties_df["Suburb"] != "Group Total"].copy()

# Update  Wanagaratta row
mask = properties_df["Suburb"] == "Wanagaratta"
properties_df.loc[mask, "SA2_CODE21"] = 204021066
properties_df.loc[mask, "SA2_NAME21"] = "Wangaratta"
properties_df.loc[mask, "Lat"] = -36.3588908
properties_df.loc[mask, "Lng"] = 146.3096576

In [15]:
#Delete all LGA related
properties_df = properties_df.drop(columns=["LGA_CODE21", "LGA_NAME21"])

Taking only the total of the properties, nan bedrooms and property type

In [16]:
properties_df = properties_df[
    properties_df["bedrooms"].isna() & properties_df["property_type"].isna()
]
properties_df = properties_df.drop(columns=["bedrooms", "property_type","Count"])

In [17]:
#Change to date format
properties_df["date"] = pd.to_datetime(
    properties_df["date"],
    format="%d/%m/%Y"   # day/month/year
)

In [18]:
# Filter only properties within 2017–2025
start_date = pd.Timestamp("2017-01-01")
end_date   = pd.Timestamp("2025-12-31")

properties_df = properties_df[(properties_df["date"] >= start_date) & (properties_df["date"] <= end_date)]
properties_df

Unnamed: 0,Suburb,date,Median,Lat,Lng,SA2_CODE21,SA2_NAME21
482,Albert Park-Middle Park-West St Kilda,2017-03-01,520,-37.853484,144.9701609,206051128,Albert Park
489,Albert Park-Middle Park-West St Kilda,2017-06-01,532,-37.853484,144.9701609,206051128,Albert Park
496,Albert Park-Middle Park-West St Kilda,2017-09-01,530,-37.853484,144.9701609,206051128,Albert Park
503,Albert Park-Middle Park-West St Kilda,2017-12-01,530,-37.853484,144.9701609,206051128,Albert Park
510,Albert Park-Middle Park-West St Kilda,2018-03-01,550,-37.853484,144.9701609,206051128,Albert Park
...,...,...,...,...,...,...,...
100146,Yarraville-Seddon,2024-03-01,570,-37.812809,144.884163,213031352,Yarraville
100153,Yarraville-Seddon,2024-06-01,590,-37.812809,144.884163,213031352,Yarraville
100160,Yarraville-Seddon,2024-09-01,595,-37.812809,144.884163,213031352,Yarraville
100167,Yarraville-Seddon,2024-12-01,600,-37.812809,144.884163,213031352,Yarraville


# Combining with external features

Start with population

In [19]:
#Import population
population_df = pd.read_csv('../Amanda-workspace/Predict_2030_ERP_only.csv')

In [20]:
import pandas as pd
import re

# Keeping only data from 2017 to 2025
years = range(2017, 2026)
keep_cols = ["sa2_code"] + [f"erp_{y}" for y in years if f"erp_{y}" in population_df.columns]
pop_wide = population_df[keep_cols].copy()

# one row per SA2
if pop_wide.duplicated("sa2_code").any():
    pop_wide = (pop_wide.groupby("sa2_code", as_index=False)
                .agg({c: "mean" for c in pop_wide.columns if c != "sa2_code"}))

# Making it to wide format
pop_long = pop_wide.melt(id_vars="sa2_code", var_name="year_lbl", value_name="ERP")
pop_long["year"] = pop_long["year_lbl"].str[-4:].astype(int)
pop_long = (pop_long.groupby(["sa2_code","year"], as_index=False)
                    .agg(ERP=("ERP","mean")))
pop_long["date"] = pd.to_datetime(pop_long["year"].astype(str) + "-01-01")

# Build ERP quarterly for 2015–2025
q_start = pd.Timestamp("2017-03-01")
q_end   = pd.Timestamp("2025-03-01")
quarterly_index = pd.date_range(start=q_start, end=q_end, freq="QS-MAR")

#Interpolate to quarterly
out = []
for code, g in pop_long.groupby("sa2_code"):
    g = (g[["date", "ERP"]]
         .sort_values("date")
         .set_index("date"))
    g = g[~g.index.duplicated(keep="last")]

    # Interpolate on union
    union_idx = g.index.union(quarterly_index)
    g_u = g.reindex(union_idx)
    g_u["ERP"] = g_u["ERP"].interpolate(method="time") #method="time" because it is linear with time - good for population

    # Keep quarterly rows and make columns tidy
    g_q = (g_u.reindex(quarterly_index)
               .rename(columns={"ERP": "ERP_quarterly"})
               .reset_index()                   
               .rename(columns={"index": "date"}))

    # Assign SA2 after reset_index so it doesn’t align by the old DateTimeIndex
    g_q["SA2_CODE21"] = code
    g_q["SA2_CODE21"] = g_q["SA2_CODE21"].astype("Int64")

    out.append(g_q)

pop_quarterly = pd.concat(out, ignore_index=True)
merged_df = properties_df.merge(pop_quarterly, how="left", on=["SA2_CODE21","date"])

In [21]:
merged_df

Unnamed: 0,Suburb,date,Median,Lat,Lng,SA2_CODE21,SA2_NAME21,ERP_quarterly
0,Albert Park-Middle Park-West St Kilda,2017-03-01,520,-37.853484,144.9701609,206051128,Albert Park,16536.854795
1,Albert Park-Middle Park-West St Kilda,2017-06-01,532,-37.853484,144.9701609,206051128,Albert Park,16594.323288
2,Albert Park-Middle Park-West St Kilda,2017-09-01,530,-37.853484,144.9701609,206051128,Albert Park,16651.791781
3,Albert Park-Middle Park-West St Kilda,2017-12-01,530,-37.853484,144.9701609,206051128,Albert Park,16708.635616
4,Albert Park-Middle Park-West St Kilda,2018-03-01,550,-37.853484,144.9701609,206051128,Albert Park,16785.060274
...,...,...,...,...,...,...,...,...
4813,Yarraville-Seddon,2024-03-01,570,-37.812809,144.884163,213031352,Yarraville,16280.121038
4814,Yarraville-Seddon,2024-06-01,590,-37.812809,144.884163,213031352,Yarraville,16337.039963
4815,Yarraville-Seddon,2024-09-01,595,-37.812809,144.884163,213031352,Yarraville,16393.958888
4816,Yarraville-Seddon,2024-12-01,600,-37.812809,144.884163,213031352,Yarraville,16450.259129


Continue with income

In [22]:
#Import income df
income_df = pd.read_csv("../Amanda-workspace/income_predictions_2017_final.csv")
income_df = income_df.drop(income_df.index[0]).reset_index(drop=True) #First row is victoria we dont need it
income_df = income_df.rename(columns={"SA2": "sa2_code"}) #Rename so that when merging it matches easily

In [23]:
income_df

Unnamed: 0,sa2_code,SA2 NAME,2017-18.sum,2018-19.sum,2019-20.sum,2020-21.sum,2021-22.sum,2017.med,2018.med,2019.med,...,2021-22.mean,med_CAGR,Predicted_Income_2023,Predicted_Income_2024,Predicted_Income_2025,Predicted_Income_2026,Predicted_Income_2027,Predicted_Income_2028,Predicted_Income_2029,Predicted_Income_2030
0,201011001,Alfredton,483036463.0,526674675.0,587104678.0,671910899.0,766120245.0,49982,50151,51620,...,69685,3.132328,58944.600000,60626.628571,62308.657143,63990.685714,65672.714286,67354.742857,69036.771429,70718.800000
1,201011002,Ballarat,583676689.0,614019577.0,622719767.0,658757843.0,683888799.0,48152,48937,51187,...,82715,2.924842,56348.600000,57777.057143,59205.514286,60633.971429,62062.428571,63490.885714,64919.342857,66347.800000
2,201011005,Buninyong,267515723.0,279770315.0,288252226.0,302723074.0,325191920.0,50469,48461,49846,...,68664,1.836983,55824.000000,57031.571429,58239.142857,59446.714286,60654.285714,61861.857143,63069.428571,64277.000000
3,201011006,Delacombe,230736055.0,265359118.0,303858080.0,347767491.0,421457036.0,46355,45598,47940,...,59595,2.866193,54359.733333,55884.704762,57409.676190,58934.647619,60459.619048,61984.590476,63509.561905,65034.533333
4,201011007,Smythes Creek,145342967.0,159605715.0,162558698.0,174786090.0,183108810.0,48415,48000,50938,...,64520,2.740581,57691.866667,59380.638095,61069.409524,62758.180952,64446.952381,66135.723810,67824.495238,69513.266667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,217031476,Otway,108079271.0,115082194.0,121471267.0,135387035.0,141211176.0,32085,30663,32420,...,54543,5.020252,41767.466667,43770.123810,45772.780952,47775.438095,49778.095238,51780.752381,53783.409524,55786.066667
518,217041477,Moyne - East,207102941.0,235911309.0,248715753.0,259197028.0,285466499.0,38224,40764,43158,...,61049,5.457629,51755.733333,54008.704762,56261.676190,58514.647619,60767.619048,63020.590476,65273.561905,67526.533333
519,217041478,Moyne - West,311159848.0,336313641.0,359222336.0,384277688.0,430826553.0,41581,41093,43243,...,64874,4.403630,52606.933333,54729.104762,56851.276190,58973.447619,61095.619048,63217.790476,65339.961905,67462.133333
520,217041479,Warrnambool - North,797046929.0,759894035.0,804199635.0,854591829.0,915937309.0,42537,43939,45632,...,60799,4.017371,53695.066667,55613.466667,57531.866667,59450.266667,61368.666667,63287.066667,65205.466667,67123.866667


Interpolate quarterly for income and then merge with the main dataset

In [None]:
import pandas as pd
import re

# Collect the columns 2017–2022 medians
median_cols = [c for c in income_df.columns if re.fullmatch(r"\d{4}\.med", c)]
median_cols = [c for c in median_cols if 2017 <= int(c[:4]) <= 2022]
median_cols = sorted(median_cols) 

# Predicted 2023–2024
pred_cols = [c for c in income_df.columns if re.fullmatch(r"Predicted_Income_\d{4}", c)]
pred_cols = [c for c in pred_cols if 2023 <= int(c.split("_")[-1]) <= 2024]
pred_cols = sorted(pred_cols)  

keep_cols = ["sa2_code"] + median_cols + pred_cols
income_keep = income_df[keep_cols].drop_duplicates(subset=["sa2_code"])

# Rename to a unified scheme
rename_map = {c: f"income_{c[:4]}" for c in median_cols}  
rename_map.update({c: f"income_{c.split('_')[-1]}" for c in pred_cols})  
income_keep = income_keep.rename(columns=rename_map)

# Change to Wide format
value_cols = [c for c in income_keep.columns if c.startswith("income_")]
inc_long = income_keep.melt(id_vars="sa2_code", value_vars=value_cols,
                            var_name="year_lbl", value_name="income_annual")
inc_long["year"] = inc_long["year_lbl"].str.extract(r"(\d{4})").astype(int)
inc_long = inc_long.drop(columns="year_lbl")

# Set only to 2017-2024
inc_long = inc_long[inc_long["year"].between(2017, 2024)]

# If there is any duplicate
inc_long = (inc_long
            .groupby(["sa2_code","year"], as_index=False)
            .agg(income_annual=("income_annual","mean")))

# Create annual datetime anchor 
inc_long["date"] = pd.to_datetime(inc_long["year"].astype(str) + "-01-01")

# Interpolate quarterly
target_dates = pd.to_datetime(pd.Index(sorted(merged_df["date"].unique()))).tz_localize(None)

inc_q_list = []
for code, g in inc_long.groupby("sa2_code"):
    g = (g[["date","income_annual"]]
         .sort_values("date")
         .set_index("date"))
    g = g[~g.index.duplicated(keep="last")]

    # Interpolate on the union of annual points and target dates
    union_idx = g.index.union(target_dates)
    g_u = g.reindex(union_idx)
    # linear in actual time 
    g_u["income"] = g_u["income_annual"].interpolate(method="time")
    g_q = g_u.reindex(target_dates)[["income"]]

    g_q = g_q.reset_index().rename(columns={"index":"date"})
    g_q["SA2_CODE21"] = pd.Series([code]*len(g_q), dtype="Int64")
    inc_q_list.append(g_q)

income_quarterly = pd.concat(inc_q_list, ignore_index=True)

# Merged with merged df
income_quarterly["date"] = pd.to_datetime(income_quarterly["date"]).dt.normalize()
merged_df = merged_df.merge(income_quarterly, how="left", on=["SA2_CODE21","date"])
merged_df = merged_df.rename(columns={"income": "Income_quarterly_med"})


In [25]:
merged_df

Unnamed: 0,Suburb,date,Median,Lat,Lng,SA2_CODE21,SA2_NAME21,ERP_quarterly,Income_quarterly_med
0,Albert Park-Middle Park-West St Kilda,2017-03-01,520,-37.853484,144.9701609,206051128,Albert Park,16536.854795,62618.808219
1,Albert Park-Middle Park-West St Kilda,2017-06-01,532,-37.853484,144.9701609,206051128,Albert Park,16594.323288,62804.068493
2,Albert Park-Middle Park-West St Kilda,2017-09-01,530,-37.853484,144.9701609,206051128,Albert Park,16651.791781,62989.328767
3,Albert Park-Middle Park-West St Kilda,2017-12-01,530,-37.853484,144.9701609,206051128,Albert Park,16708.635616,63172.575342
4,Albert Park-Middle Park-West St Kilda,2018-03-01,550,-37.853484,144.9701609,206051128,Albert Park,16785.060274,63400.523288
...,...,...,...,...,...,...,...,...,...
4813,Yarraville-Seddon,2024-03-01,570,-37.812809,144.884163,213031352,Yarraville,16280.121038,84590.600000
4814,Yarraville-Seddon,2024-06-01,590,-37.812809,144.884163,213031352,Yarraville,16337.039963,84590.600000
4815,Yarraville-Seddon,2024-09-01,595,-37.812809,144.884163,213031352,Yarraville,16393.958888,84590.600000
4816,Yarraville-Seddon,2024-12-01,600,-37.812809,144.884163,213031352,Yarraville,16450.259129,84590.600000


Continue with crime

In [26]:
crime_df = pd.read_csv("../Amanda-workspace/crime_dataset_weighted_to_SA2.csv")

In [29]:
crime_df = crime_df.filter(regex=r'SA2_CODE_2021|SA2_NAME_2021|CrimeRate')
crime_df

Unnamed: 0,SA2_CODE_2021,SA2_NAME_2021,CrimeRate_2016,CrimeRate_2017,CrimeRate_2018,CrimeRate_2019,CrimeRate_2020,CrimeRate_2021,CrimeRate_2022,CrimeRate_2023,CrimeRate_2024,CrimeRate_2025
0,201011001,Alfredton,8723.671498,8730.376785,8268.901940,7404.370838,7804.200815,6331.935438,6522.657334,6964.597125,7682.775713,8198.974132
1,201011002,Ballarat,8723.671498,8730.376785,8268.901940,7404.370838,7804.200815,6331.935438,6522.657334,6964.597125,7682.775713,8198.974132
2,201011005,Buninyong,8483.328020,8540.311327,8068.830310,7215.435385,7579.955379,6259.961080,6359.777932,6766.601567,7475.590640,7987.772959
3,201011006,Delacombe,8723.671498,8730.376785,8268.901940,7404.370838,7804.200815,6331.935438,6522.657334,6964.597125,7682.775713,8198.974132
4,201011007,Smythes Creek,2725.290698,2613.540884,2074.080462,2146.157077,1928.917344,1807.809738,1670.550136,1968.076863,2092.113756,2377.748157
...,...,...,...,...,...,...,...,...,...,...,...,...
517,217031476,Otway,5650.220017,7473.952304,5686.660862,5989.926033,5746.610398,5495.539917,5446.436616,4938.936782,5210.762332,6091.996375
518,217041477,Moyne - East,2579.056936,2970.921286,2779.558549,2711.269108,2978.374179,2991.612067,3128.963993,2499.661361,2767.442681,3024.476169
519,217041478,Moyne - West,2565.453877,2937.009061,2764.409195,2710.172582,2975.245530,2972.041713,3123.667073,2494.150671,2750.871619,3007.447548
520,217041479,Warrnambool - North,6391.257718,7427.871384,7703.823647,8484.408419,6909.805191,6152.682638,6133.001702,5810.118431,6352.989186,7558.350120


In [None]:
import pandas as pd

# Rename column for merging
crime_df = crime_df.rename(columns={
    'SA2_CODE_2021': 'SA2_CODE21',
    'SA2_NAME_2021': 'SA2_NAME'
})

# Melt to long format
crime_long = crime_df.melt(
    id_vars=["SA2_CODE21", "SA2_NAME"],
    var_name="year_lbl",
    value_name="crime_rate_annual"
)

# Extract numeric year from the crime rate columns
crime_long["year"] = crime_long["year_lbl"].str.extract(r"(\d{4})").astype(int)
crime_long = crime_long.drop(columns="year_lbl")

# Keep only year 2017–2025
crime_long = crime_long[crime_long["year"].between(2017, 2025)]

#Create annual datetime anchors 
crime_long["date"] = pd.to_datetime(crime_long["year"].astype(str) + "-03-01")

#Interpolate quarterly to match merged_df's dates 
target_dates = pd.to_datetime(pd.Index(sorted(merged_df["date"].unique()))).tz_localize(None)

crime_q_list = []
for code, g in crime_long.groupby("SA2_CODE21"):
    g = g[["date", "crime_rate_annual"]].sort_values("date").set_index("date")
    g = g[~g.index.duplicated(keep="last")]
    
    # Interpolate across union of yearly and target dates
    union_idx = g.index.union(target_dates)
    g_u = g.reindex(union_idx)
    g_u["crime_rate"] = g_u["crime_rate_annual"].interpolate(method="time")
    g_q = g_u.reindex(target_dates)[["crime_rate"]]

    g_q = g_q.reset_index().rename(columns={"index": "date"})
    g_q["SA2_CODE21"] = code
    crime_q_list.append(g_q)

crime_quarterly = pd.concat(crime_q_list, ignore_index=True)

# Merge with merged_df 
crime_quarterly["date"] = pd.to_datetime(crime_quarterly["date"]).dt.normalize()
merged_df = merged_df.merge(crime_quarterly, how="left", on=["SA2_CODE21", "date"])
merged_df = merged_df.rename(columns={"crime_rate": "CrimeRate_quarterly"})


In [31]:
merged_df

Unnamed: 0,Suburb,date,Median,Lat,Lng,SA2_CODE21,SA2_NAME21,ERP_quarterly,Income_quarterly_med,CrimeRate_quarterly
0,Albert Park-Middle Park-West St Kilda,2017-03-01,520,-37.853484,144.9701609,206051128,Albert Park,16536.854795,62618.808219,9573.331629
1,Albert Park-Middle Park-West St Kilda,2017-06-01,532,-37.853484,144.9701609,206051128,Albert Park,16594.323288,62804.068493,9388.538038
2,Albert Park-Middle Park-West St Kilda,2017-09-01,530,-37.853484,144.9701609,206051128,Albert Park,16651.791781,62989.328767,9203.744447
3,Albert Park-Middle Park-West St Kilda,2017-12-01,530,-37.853484,144.9701609,206051128,Albert Park,16708.635616,63172.575342,9020.959482
4,Albert Park-Middle Park-West St Kilda,2018-03-01,550,-37.853484,144.9701609,206051128,Albert Park,16785.060274,63400.523288,8840.183143
...,...,...,...,...,...,...,...,...,...,...
4813,Yarraville-Seddon,2024-03-01,570,-37.812809,144.884163,213031352,Yarraville,16280.121038,84590.600000,8598.317259
4814,Yarraville-Seddon,2024-06-01,590,-37.812809,144.884163,213031352,Yarraville,16337.039963,84590.600000,9026.984544
4815,Yarraville-Seddon,2024-09-01,595,-37.812809,144.884163,213031352,Yarraville,16393.958888,84590.600000,9455.651829
4816,Yarraville-Seddon,2024-12-01,600,-37.812809,144.884163,213031352,Yarraville,16450.259129,84590.600000,9879.659686


Combine with number of FOIs and PT stops

In [32]:
foi_counts_df = pd.read_csv("../Amanda-workspace/pivot_counts.csv")

In [33]:
# Merge on SA2 code
merged_df = merged_df.merge(
    foi_counts_df,
    on='SA2_CODE21',
    how='left'   # keep all rows from foi_counts_df
)

In [34]:
merged_df

Unnamed: 0,Suburb,date,Median,Lat,Lng,SA2_CODE21,SA2_NAME21,ERP_quarterly,Income_quarterly_med,CrimeRate_quarterly,cultural,education,health,others,tourist
0,Albert Park-Middle Park-West St Kilda,2017-03-01,520,-37.853484,144.9701609,206051128,Albert Park,16536.854795,62618.808219,9573.331629,4,7,2,103,12
1,Albert Park-Middle Park-West St Kilda,2017-06-01,532,-37.853484,144.9701609,206051128,Albert Park,16594.323288,62804.068493,9388.538038,4,7,2,103,12
2,Albert Park-Middle Park-West St Kilda,2017-09-01,530,-37.853484,144.9701609,206051128,Albert Park,16651.791781,62989.328767,9203.744447,4,7,2,103,12
3,Albert Park-Middle Park-West St Kilda,2017-12-01,530,-37.853484,144.9701609,206051128,Albert Park,16708.635616,63172.575342,9020.959482,4,7,2,103,12
4,Albert Park-Middle Park-West St Kilda,2018-03-01,550,-37.853484,144.9701609,206051128,Albert Park,16785.060274,63400.523288,8840.183143,4,7,2,103,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4813,Yarraville-Seddon,2024-03-01,570,-37.812809,144.884163,213031352,Yarraville,16280.121038,84590.600000,8598.317259,2,4,2,43,0
4814,Yarraville-Seddon,2024-06-01,590,-37.812809,144.884163,213031352,Yarraville,16337.039963,84590.600000,9026.984544,2,4,2,43,0
4815,Yarraville-Seddon,2024-09-01,595,-37.812809,144.884163,213031352,Yarraville,16393.958888,84590.600000,9455.651829,2,4,2,43,0
4816,Yarraville-Seddon,2024-12-01,600,-37.812809,144.884163,213031352,Yarraville,16450.259129,84590.600000,9879.659686,2,4,2,43,0


In [35]:
# Save to current folder
merged_df.to_csv("merged_df_ALL.csv", index=False)