In [None]:
import pandas as pd
import numpy as np
# Input tables, both read from the sibling "datasets" directory:
#   crime_df - crime table; carries the LGA_CODE21 / SA2_CODE21 id columns
#              and the Incidents_* / Victims_* columns used further down.
#   corr_df  - table with LGA_CODE_2021, SA2_CODE_2021, SA2_NAME_2021 and
#              RATIO_FROM_TO (presumably the ABS SA2 -> LGA 2021
#              correspondence - TODO confirm provenance).
crime_csv_path = "../datasets/crime_dataset.csv"
correspondence_csv_path = "../datasets/CG_SA2_2021_LGA_2021.csv"

crime_df = pd.read_csv(crime_csv_path)
corr_df = pd.read_csv(correspondence_csv_path)

In [None]:
# Normalise the LGA / SA2 identifier columns of both tables to text keys so
# they can be compared and joined as strings.
def _code_as_text(raw_codes, pad_to=None):
    """Return *raw_codes* rendered as whole-number strings.

    Values are parsed as numbers (entries that cannot be parsed are coerced
    to missing), rounded, cast to nullable ``Int64`` and finally to ``str``.
    When *pad_to* is given, the text is left-padded with zeros to that width.

    NOTE(review): after ``astype(str)`` a missing value appears to become the
    literal text "<NA>" (zero-padded when *pad_to* is set) - confirm that this
    is acceptable for a join key.
    """
    as_text = (
        pd.to_numeric(raw_codes, errors="coerce")
        .round()
        .astype("Int64")
        .astype(str)
    )
    if pad_to is None:
        return as_text
    return as_text.str.zfill(pad_to)


# LGA codes are kept unpadded; SA2 codes are padded to 9 characters.
crime_df["LGA_CODE21"] = _code_as_text(crime_df["LGA_CODE21"])
crime_df["SA2_CODE21"] = _code_as_text(crime_df["SA2_CODE21"], pad_to=9)

corr_df["LGA_CODE_2021"] = _code_as_text(corr_df["LGA_CODE_2021"])
corr_df["SA2_CODE_2021"] = _code_as_text(corr_df["SA2_CODE_2021"], pad_to=9)

# Keep only the correspondence rows whose SA2 code starts with "2"
# (the Victorian SA2s); copy so the new column below does not touch corr_df.
is_vic_sa2 = corr_df["SA2_CODE_2021"].str.startswith("2")
corr_vic = corr_df.loc[is_vic_sa2].copy()


def _share_of_group_total(ratios):
    """Divide every value in *ratios* by the sum of *ratios*."""
    return ratios / ratios.sum()


# Build the LGA -> SA2 weights: RATIO_FROM_TO (the SA2 -> LGA share) is
# rescaled within each LGA so that the weights of one LGA add up to 1.
# NOTE(review): this uses the normalised SA2 -> LGA ratio as the LGA -> SA2
# share - confirm that is the intended allocation basis.
ratio_by_lga = corr_vic.groupby("LGA_CODE_2021")["RATIO_FROM_TO"]
corr_vic["w_LGA_to_SA2"] = ratio_by_lga.transform(_share_of_group_total)

# Fan each LGA-level crime row out to every SA2 listed for that LGA.
# The left join keeps all crime rows; a row whose LGA code has no entry in
# corr_vic is kept with missing SA2 fields and a missing weight.
sa2_lookup = corr_vic[
    ["LGA_CODE_2021", "SA2_CODE_2021", "SA2_NAME_2021", "w_LGA_to_SA2"]
]
expanded = crime_df.merge(
    sa2_lookup,
    how="left",
    left_on="LGA_CODE21",
    right_on="LGA_CODE_2021",
)

# The columns to redistribute are those named "Incidents_*" or "Victims_*".
count_prefixes = ("Incidents_", "Victims_")
count_cols = [name for name in crime_df.columns if name.startswith(count_prefixes)]

# For every count column add a "<column>_w" companion holding the count
# multiplied by the row's LGA -> SA2 weight.
lga_to_sa2_weight = expanded["w_LGA_to_SA2"]
expanded = expanded.assign(
    **{name + "_w": expanded[name] * lga_to_sa2_weight for name in count_cols}
)

# Collapse to one row per SA2 by summing the weighted counts across every
# LGA that overlaps it. groupby drops rows whose SA2 key is missing (its
# default), so crime rows that found no SA2 in the merge do not appear here.
#
# The weighted columns are renamed back through an explicit mapping
# ("<column>_w" -> "<column>") rather than str.replace("_w", ""), which would
# also strip a "_w" occurring in the middle of a column name.
weighted_to_count = {c + "_w": c for c in count_cols}

sa2_weighted = (
    expanded.groupby(["SA2_CODE_2021", "SA2_NAME_2021"])[list(weighted_to_count)]
            .sum()
            .rename(columns=weighted_to_count)
            .reset_index()
)

# Verify that redistributing the LGA counts over SA2s preserved the grand
# total of all count columns (sum over columns, then over rows).
orig_total = crime_df[count_cols].sum(numeric_only=True).sum()
new_total = sa2_weighted[count_cols].sum(numeric_only=True).sum()

total_diff = new_total - orig_total
# Guard the division: report NaN when the original total is zero.
pct_diff = total_diff / orig_total * 100 if orig_total else np.nan

print("Original total incidents+victims:", float(orig_total))
print("Weighted SA2 total:", float(new_total))
print("Diff:", float(total_diff))
print("PErcentage diff:", float(pct_diff), "%")

# Stop the notebook here if the totals disagree (beyond floating-point
# noise) so that a lossy result is not written out by a later cell. A
# mismatch means some counts were lost or duplicated on the way to SA2 level.
assert np.isclose(new_total, orig_total, rtol=1e-6, atol=1e-6), (
    f"Weighted SA2 total {float(new_total)} does not match "
    f"original total {float(orig_total)}"
)

sa2_weighted


Original total incidents+victims: 134720943.0
Weighted SA2 total: 134720943.0
Diff: 0.0
PErcentage diff: 0.0 %


Unnamed: 0,SA2_CODE_2021,SA2_NAME_2021,Incidents_2016,Incidents_2017,Incidents_2018,Incidents_2019,Incidents_2020,Incidents_2021,Incidents_2022,Incidents_2023,...,Victims_2016,Victims_2017,Victims_2018,Victims_2019,Victims_2020,Victims_2021,Victims_2022,Victims_2023,Victims_2024,Victims_2025
0,201011001,Alfredton,14605.676027,14906.557159,14393.765122,13148.181941,14094.501630,11625.982235,12240.685623,13308.328350,...,9610.402179,9050.698568,8689.964737,7714.527519,8447.318663,6465.709057,7285.852788,7203.353123,8445.701023,8503.936081
1,201011002,Ballarat,14605.676027,14906.557159,14393.765122,13148.181941,14094.501630,11625.982235,12240.685623,13308.328350,...,9610.402179,9050.698568,8689.964737,7714.527519,8447.318663,6465.709057,7285.852788,7203.353123,8445.701023,8503.936081
2,201011005,Buninyong,14548.832253,14879.645181,14357.930442,13111.796323,14042.638700,11655.654556,12212.328264,13260.752451,...,9562.105530,9019.978570,8659.427899,7684.668281,8408.861144,6458.571304,7254.715485,7183.005061,8423.353420,8485.365000
3,201011006,Delacombe,14605.676027,14906.557159,14393.765122,13148.181941,14094.501630,11625.982235,12240.685623,13308.328350,...,9610.402179,9050.698568,8689.964737,7714.527519,8447.318663,6465.709057,7285.852788,7203.353123,8445.701023,8503.936081
4,201011007,Smythes Creek,2700.000000,2659.500000,2164.500000,2299.500000,2115.000000,2025.000000,1903.500000,2286.000000,...,1930.500000,1764.000000,1273.500000,1539.000000,1413.000000,1269.000000,1224.000000,1260.000000,1615.500000,1638.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,217031476,Otway,3221.500943,4307.789994,3312.247449,3523.099623,3416.339028,3272.212226,3226.838973,2935.916353,...,1980.409031,2391.437320,1969.732971,2047.134402,1804.254049,1550.697637,1582.725816,1502.655370,1548.028622,1878.986466
518,217041477,Moyne - East,1908.994094,2204.192887,2076.472650,2037.342973,2250.217049,2285.572393,2415.733116,1943.505238,...,1166.853868,1489.005605,1245.922040,1396.771744,1346.485664,1218.584455,1481.447710,1132.963613,1328.843721,1477.706042
519,217041478,Moyne - West,1922.715612,2207.259802,2094.322881,2068.248563,2276.036838,2298.159945,2439.588221,1965.221301,...,1171.630161,1497.675602,1252.858691,1419.993722,1359.176060,1228.542769,1494.515577,1143.592273,1334.764019,1489.598849
520,217041479,Warrnambool - North,4440.815176,5203.128393,5445.203123,6047.011239,4964.784621,4421.142158,4424.052573,4232.402245,...,2266.899495,2696.385248,2476.103189,2926.645794,2508.887961,2201.045996,2305.081474,2078.101560,2221.790653,2790.643008


In [None]:
# Save the result (one row per SA2) to the working directory.
# index=False keeps the positional row index out of the CSV.
sa2_weighted.to_csv("crime_dataset_weighted_corrected_SA2.csv", index=False)