In [24]:
import pandas as pd
import os
from pathlib import Path

# Directory that holds the project datasets. The default is the location this
# notebook was originally run from; set the CRIME_DATA_DIR environment variable
# to point somewhere else instead of editing the paths below.
DATA_DIR = Path(os.environ.get(
    "CRIME_DATA_DIR",
    "/home/eeamanda/project-2-group-real-estate-industry-project-7-2025/datasets",
))

# Crime table: later cells read LGA_CODE21, SA2_CODE21, LGA_NAME21 and the
# Incidents_* / Victims_* count columns from it.
crime_df = pd.read_csv(DATA_DIR / "crime_dataset.csv")

# SA2 <-> LGA correspondence table: later cells read LGA_CODE_2021,
# SA2_CODE_2021, SA2_NAME_2021 and RATIO_FROM_TO from it.
corr_df = pd.read_csv(DATA_DIR / "CG_SA2_2021_LGA_2021.csv")

In [26]:
import pandas as pd
import numpy as np

# --- 2) Clean and format IDs ---
# Codes can arrive as floats or numeric strings; coerce to nullable integers
# first, then to strings so both tables share the same join-key type.
# Unparseable codes become <NA> and then the literal string "<NA>" after astype(str).
crime_df["LGA_CODE21"] = pd.to_numeric(crime_df["LGA_CODE21"], errors="coerce").round().astype("Int64")
crime_df["SA2_CODE21"] = pd.to_numeric(crime_df["SA2_CODE21"], errors="coerce").round().astype("Int64")

crime_df["LGA_CODE21"] = crime_df["LGA_CODE21"].astype(str)
crime_df["SA2_CODE21"] = crime_df["SA2_CODE21"].astype(str).str.zfill(9)  # left-pad to 9 characters

corr_df["LGA_CODE_2021"] = pd.to_numeric(corr_df["LGA_CODE_2021"], errors="coerce").round().astype("Int64").astype(str)
corr_df["SA2_CODE_2021"] = pd.to_numeric(corr_df["SA2_CODE_2021"], errors="coerce").round().astype("Int64").astype(str).str.zfill(9)

# --- 3) Filter Victoria SA2s (codes that start with '2') ---
corr_vic = corr_df[corr_df["SA2_CODE_2021"].str.startswith("2")].copy()

# --- 4) Build LGA -> SA2 weights ---
# Renormalise RATIO_FROM_TO within each LGA so the weights of one LGA sum to 1.
# NOTE(review): the displayed result gives identical values to Alfredton, Ballarat
# and Delacombe, so this behaves like a near-equal split across the SA2s of an
# LGA rather than a population share - confirm that is the intended allocation.
corr_vic["w_LGA_to_SA2"] = corr_vic.groupby("LGA_CODE_2021")["RATIO_FROM_TO"].transform(lambda x: x / x.sum())

# --- 5) Expand LGA data to all SA2s inside each LGA ---
# The join key is the LGA code only, so every crime_df row is repeated once per
# SA2 of its LGA.
# NOTE(review): crime_df also carries SA2_CODE21; if it holds more than one row
# per LGA, each of those rows is fanned out and the LGA is counted repeatedly -
# confirm crime_df is one row per LGA.
expanded = crime_df.merge(
    corr_vic[["LGA_CODE_2021","SA2_CODE_2021","SA2_NAME_2021","w_LGA_to_SA2"]],
    left_on="LGA_CODE21", right_on="LGA_CODE_2021", how="left"
)

# --- 6) Choose which columns to weight (counts) ---
count_cols = [c for c in crime_df.columns if c.startswith(("Incidents_","Victims_"))]
weighted_cols = [c + "_w" for c in count_cols]

# --- 7) Apply the weights ---
for c, c_w in zip(count_cols, weighted_cols):
    expanded[c_w] = expanded[c] * expanded["w_LGA_to_SA2"]

# --- 8) Collapse to unique SA2s (sum across all LGA overlaps) ---
sa2_weighted = (
    expanded.groupby(["SA2_CODE_2021","SA2_NAME_2021"])[weighted_cols]
            .sum()
            # Map each weighted column back to its exact source name. A blanket
            # str.replace("_w", "") would also strip "_w" from the middle of a name.
            .rename(columns=dict(zip(weighted_cols, count_cols)))
            .reset_index()
)

# --- 9) Verify totals match original ---
# Weights sum to 1 per LGA, so totals are preserved when every LGA matched.
# Rows whose LGA found no match have NaN SA2 keys and are dropped by groupby,
# which shows up here as a non-zero difference.
orig_total = crime_df[count_cols].sum(numeric_only=True).sum()
new_total  = sa2_weighted[count_cols].sum(numeric_only=True).sum()

print("‚úÖ Original total incidents+victims:", float(orig_total))
print("‚úÖ Weighted SA2 total:", float(new_total))
print("Œî:", float(new_total - orig_total))
print("%Œî:", float((new_total - orig_total)/orig_total*100 if orig_total else np.nan), "%")

sa2_weighted


‚úÖ Original total incidents+victims: 134720943.0
‚úÖ Weighted SA2 total: 134720943.0
Œî: 0.0
%Œî: 0.0 %


Unnamed: 0,SA2_CODE_2021,SA2_NAME_2021,Incidents_2016,Incidents_2017,Incidents_2018,Incidents_2019,Incidents_2020,Incidents_2021,Incidents_2022,Incidents_2023,...,Victims_2016,Victims_2017,Victims_2018,Victims_2019,Victims_2020,Victims_2021,Victims_2022,Victims_2023,Victims_2024,Victims_2025
0,201011001,Alfredton,14605.676027,14906.557159,14393.765122,13148.181941,14094.501630,11625.982235,12240.685623,13308.328350,...,9610.402179,9050.698568,8689.964737,7714.527519,8447.318663,6465.709057,7285.852788,7203.353123,8445.701023,8503.936081
1,201011002,Ballarat,14605.676027,14906.557159,14393.765122,13148.181941,14094.501630,11625.982235,12240.685623,13308.328350,...,9610.402179,9050.698568,8689.964737,7714.527519,8447.318663,6465.709057,7285.852788,7203.353123,8445.701023,8503.936081
2,201011005,Buninyong,14548.832253,14879.645181,14357.930442,13111.796323,14042.638700,11655.654556,12212.328264,13260.752451,...,9562.105530,9019.978570,8659.427899,7684.668281,8408.861144,6458.571304,7254.715485,7183.005061,8423.353420,8485.365000
3,201011006,Delacombe,14605.676027,14906.557159,14393.765122,13148.181941,14094.501630,11625.982235,12240.685623,13308.328350,...,9610.402179,9050.698568,8689.964737,7714.527519,8447.318663,6465.709057,7285.852788,7203.353123,8445.701023,8503.936081
4,201011007,Smythes Creek,2700.000000,2659.500000,2164.500000,2299.500000,2115.000000,2025.000000,1903.500000,2286.000000,...,1930.500000,1764.000000,1273.500000,1539.000000,1413.000000,1269.000000,1224.000000,1260.000000,1615.500000,1638.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,217031476,Otway,3221.500943,4307.789994,3312.247449,3523.099623,3416.339028,3272.212226,3226.838973,2935.916353,...,1980.409031,2391.437320,1969.732971,2047.134402,1804.254049,1550.697637,1582.725816,1502.655370,1548.028622,1878.986466
518,217041477,Moyne - East,1908.994094,2204.192887,2076.472650,2037.342973,2250.217049,2285.572393,2415.733116,1943.505238,...,1166.853868,1489.005605,1245.922040,1396.771744,1346.485664,1218.584455,1481.447710,1132.963613,1328.843721,1477.706042
519,217041478,Moyne - West,1922.715612,2207.259802,2094.322881,2068.248563,2276.036838,2298.159945,2439.588221,1965.221301,...,1171.630161,1497.675602,1252.858691,1419.993722,1359.176060,1228.542769,1494.515577,1143.592273,1334.764019,1489.598849
520,217041479,Warrnambool - North,4440.815176,5203.128393,5445.203123,6047.011239,4964.784621,4421.142158,4424.052573,4232.402245,...,2266.899495,2696.385248,2476.103189,2926.645794,2508.887961,2201.045996,2305.081474,2078.101560,2221.790653,2790.643008


In [27]:
import pandas as pd
import numpy as np


# --- 2) Clean IDs ---
# Coerce codes to nullable integers, then to string join keys. The .round()
# matches the other weighting cells and stops astype("Int64") from raising
# when a code carries a float artefact instead of an exact integer value.
crime_df["LGA_CODE21"] = pd.to_numeric(crime_df["LGA_CODE21"], errors="coerce").round().astype("Int64").astype(str)
crime_df["SA2_CODE21"] = pd.to_numeric(crime_df["SA2_CODE21"], errors="coerce").round().astype("Int64").astype(str).str.zfill(9)

corr_df["LGA_CODE_2021"] = pd.to_numeric(corr_df["LGA_CODE_2021"], errors="coerce").round().astype("Int64").astype(str)
corr_df["SA2_CODE_2021"] = pd.to_numeric(corr_df["SA2_CODE_2021"], errors="coerce").round().astype("Int64").astype(str).str.zfill(9)

# --- 3) Filter VIC SA2s (codes that start with '2') ---
corr_vic = corr_df[corr_df["SA2_CODE_2021"].str.startswith("2")].copy()

# --- 4) Compute weights ---
# Renormalise RATIO_FROM_TO within each LGA so the weights of one LGA sum to 1.
corr_vic["w_LGA_to_SA2"] = corr_vic.groupby("LGA_CODE_2021")["RATIO_FROM_TO"].transform(lambda x: x / x.sum())

# --- 5) Merge by LGA only ---
# Every crime_df row is repeated once per SA2 of its LGA.
# NOTE(review): confirm crime_df holds one row per LGA; extra rows per LGA
# would each be fanned out and summed.
expanded = crime_df.merge(
    corr_vic[["LGA_CODE_2021","SA2_CODE_2021","SA2_NAME_2021","w_LGA_to_SA2"]],
    left_on="LGA_CODE21", right_on="LGA_CODE_2021", how="left"
)

# --- 6) Fill missing weights (if any LGA didn't match) ---
# Unmatched rows also have NaN SA2 keys, so groupby drops them below either
# way; the fill only keeps the per-row weighted columns free of NaN.
expanded["w_LGA_to_SA2"] = expanded["w_LGA_to_SA2"].fillna(0)

# --- 7) Apply weighting ---
count_cols = [c for c in crime_df.columns if c.startswith(("Incidents_","Victims_"))]
weighted_cols = [c + "_w" for c in count_cols]
for c, c_w in zip(count_cols, weighted_cols):
    expanded[c_w] = expanded[c] * expanded["w_LGA_to_SA2"]

# --- 8) Collapse to unique SA2 (sum overlapping LGA contributions) ---
sa2_weighted = (
    expanded.groupby(["SA2_CODE_2021","SA2_NAME_2021"])[weighted_cols]
            .sum()
            # Map each weighted column back to its exact source name. A blanket
            # str.replace("_w", "") would also strip "_w" from the middle of a name.
            .rename(columns=dict(zip(weighted_cols, count_cols)))
            .reset_index()
)

# --- 9) Check totals and sample weights ---
orig_total = crime_df[count_cols].sum(numeric_only=True).sum()
new_total  = sa2_weighted[count_cols].sum(numeric_only=True).sum()
print("‚úÖ Original:", orig_total, "  ‚úÖ Weighted total:", new_total)
print("Œî%:", (new_total - orig_total)/orig_total * 100)

# --- 10) Inspect Ballarat overlaps ---
# Show the SA2s and weights attached to rows whose LGA name contains "Ballarat".
check = expanded.loc[expanded["LGA_NAME21"].str.contains("Ballarat", case=False, na=False),
                     ["LGA_NAME21","SA2_NAME_2021","w_LGA_to_SA2"]]
print("\nBallarat SA2 weights:\n", check.head(10))

# --- 11) Display the final SA2 dataset (this cell does not write it to disk) ---
sa2_weighted

‚úÖ Original: 134720943.0   ‚úÖ Weighted total: 134720943.0
Œî%: 0.0

Ballarat SA2 weights:
    LGA_NAME21               SA2_NAME_2021  w_LGA_to_SA2
4    Ballarat                   Alfredton      0.107843
5    Ballarat                    Ballarat      0.107843
6    Ballarat                   Buninyong      0.106027
7    Ballarat                   Delacombe      0.107843
8    Ballarat     Wendouree - Miners Rest      0.107843
9    Ballarat  Ballarat East - Warrenheip      0.106952
10   Ballarat   Ballarat North - Invermay      0.107477
11   Ballarat      Canadian - Mount Clear      0.107843
12   Ballarat          Sebastopol - Redan      0.107843
13   Ballarat           Creswick - Clunes      0.016668


Unnamed: 0,SA2_CODE_2021,SA2_NAME_2021,Incidents_2016,Incidents_2017,Incidents_2018,Incidents_2019,Incidents_2020,Incidents_2021,Incidents_2022,Incidents_2023,...,Victims_2016,Victims_2017,Victims_2018,Victims_2019,Victims_2020,Victims_2021,Victims_2022,Victims_2023,Victims_2024,Victims_2025
0,201011001,Alfredton,14605.676027,14906.557159,14393.765122,13148.181941,14094.501630,11625.982235,12240.685623,13308.328350,...,9610.402179,9050.698568,8689.964737,7714.527519,8447.318663,6465.709057,7285.852788,7203.353123,8445.701023,8503.936081
1,201011002,Ballarat,14605.676027,14906.557159,14393.765122,13148.181941,14094.501630,11625.982235,12240.685623,13308.328350,...,9610.402179,9050.698568,8689.964737,7714.527519,8447.318663,6465.709057,7285.852788,7203.353123,8445.701023,8503.936081
2,201011005,Buninyong,14548.832253,14879.645181,14357.930442,13111.796323,14042.638700,11655.654556,12212.328264,13260.752451,...,9562.105530,9019.978570,8659.427899,7684.668281,8408.861144,6458.571304,7254.715485,7183.005061,8423.353420,8485.365000
3,201011006,Delacombe,14605.676027,14906.557159,14393.765122,13148.181941,14094.501630,11625.982235,12240.685623,13308.328350,...,9610.402179,9050.698568,8689.964737,7714.527519,8447.318663,6465.709057,7285.852788,7203.353123,8445.701023,8503.936081
4,201011007,Smythes Creek,2700.000000,2659.500000,2164.500000,2299.500000,2115.000000,2025.000000,1903.500000,2286.000000,...,1930.500000,1764.000000,1273.500000,1539.000000,1413.000000,1269.000000,1224.000000,1260.000000,1615.500000,1638.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,217031476,Otway,3221.500943,4307.789994,3312.247449,3523.099623,3416.339028,3272.212226,3226.838973,2935.916353,...,1980.409031,2391.437320,1969.732971,2047.134402,1804.254049,1550.697637,1582.725816,1502.655370,1548.028622,1878.986466
518,217041477,Moyne - East,1908.994094,2204.192887,2076.472650,2037.342973,2250.217049,2285.572393,2415.733116,1943.505238,...,1166.853868,1489.005605,1245.922040,1396.771744,1346.485664,1218.584455,1481.447710,1132.963613,1328.843721,1477.706042
519,217041478,Moyne - West,1922.715612,2207.259802,2094.322881,2068.248563,2276.036838,2298.159945,2439.588221,1965.221301,...,1171.630161,1497.675602,1252.858691,1419.993722,1359.176060,1228.542769,1494.515577,1143.592273,1334.764019,1489.598849
520,217041479,Warrnambool - North,4440.815176,5203.128393,5445.203123,6047.011239,4964.784621,4421.142158,4424.052573,4232.402245,...,2266.899495,2696.385248,2476.103189,2926.645794,2508.887961,2201.045996,2305.081474,2078.101560,2221.790653,2790.643008


In [None]:
import pandas as pd
import numpy as np

# --- 1) Take working copies of the frames loaded at the top of the notebook ---
# Copies keep the cleaning below from mutating crime_df / corr_df, so this cell
# gives the same result however often it is re-run.
crime = crime_df.copy()
corr = corr_df.copy()

# --- 2) Clean IDs (handle '22490.0', scientific notation, NaNs) ---
# LGA codes in the crime file
crime["LGA_CODE21"] = pd.to_numeric(crime["LGA_CODE21"], errors="coerce")\
                          .round().astype("Int64")       # nullable int
crime["LGA_CODE21"] = crime["LGA_CODE21"].astype(str)    # to string keys

# SA2 codes in the crime file -> strings left-padded to 9 characters
crime["SA2_CODE21"] = pd.to_numeric(crime["SA2_CODE21"], errors="coerce")\
                          .round().astype("Int64")
crime["SA2_CODE21"] = crime["SA2_CODE21"].astype(str).str.zfill(9)

# Correspondence file codes as strings
corr["LGA_CODE_2021"] = pd.to_numeric(corr["LGA_CODE_2021"], errors="coerce")\
                            .round().astype("Int64").astype(str)
corr["SA2_CODE_2021"] = pd.to_numeric(corr["SA2_CODE_2021"], errors="coerce")\
                            .round().astype("Int64").astype(str).str.zfill(9)

# Filter VIC SA2s (codes that start with '2')
corr_vic = corr[corr["SA2_CODE_2021"].str.startswith("2")].copy()

# --- 3) Build per-LGA weights (sum to 1 within each LGA) ---
# RATIO_FROM_TO is renormalised within each LGA to use for LGA->SA2 allocation.
corr_vic["w_LGA_to_SA2"] = corr_vic.groupby("LGA_CODE_2021")["RATIO_FROM_TO"]\
                                   .transform(lambda x: x / x.sum())

# --- 4) Expand LGA rows to all SA2s inside that LGA (join ONLY on LGA) ---
# NOTE(review): confirm the crime table holds one row per LGA; extra rows per
# LGA would each be fanned out to every SA2 and summed.
expanded = crime.merge(
    corr_vic[["LGA_CODE_2021","SA2_CODE_2021","SA2_NAME_2021","w_LGA_to_SA2"]],
    left_on="LGA_CODE21", right_on="LGA_CODE_2021", how="left"
)

# Quick check: any LGAs from crime not found in corr?
missing_lgas = expanded.loc[expanded["w_LGA_to_SA2"].isna(), "LGA_CODE21"].dropna().unique()
if len(missing_lgas) > 0:
    print("‚ö†Ô∏è LGAs missing from correspondence (will produce NaNs):", list(missing_lgas))

# --- 5) Columns to weight (counts only) ---
count_cols = [c for c in crime.columns if c.startswith(("Incidents_","Victims_"))]
weighted_cols = [c + "_w" for c in count_cols]

# --- 6) Allocate LGA totals down to SA2 using the weights ---
for c, c_w in zip(count_cols, weighted_cols):
    expanded[c_w] = expanded[c] * expanded["w_LGA_to_SA2"]

# --- 7) Collapse to UNIQUE SA2 (sum all LGA contributions) ---
sa2_weighted = (
    expanded.groupby(["SA2_CODE_2021","SA2_NAME_2021"])[weighted_cols]
            .sum()
            # Map each weighted column back to its exact source name. A blanket
            # str.replace("_w", "") would also strip "_w" from the middle of a name.
            .rename(columns=dict(zip(weighted_cols, count_cols)))
            .reset_index()
)

# --- 8) Verify totals preserved (should be ~identical; tiny rounding OK) ---
orig_total = crime[count_cols].sum(numeric_only=True).sum()
new_total  = sa2_weighted[count_cols].sum(numeric_only=True).sum()
print("‚úÖ Original total:", float(orig_total))
print("‚úÖ Weighted SA2 total:", float(new_total))
print("Œî:", float(new_total - orig_total),
      " | %Œî:", float((new_total - orig_total)/orig_total*100 if orig_total else np.nan))

# --- 9) Save result (one row per SA2) ---
# Written relative to the notebook's current working directory.
sa2_weighted.to_csv("crime_dataset_weighted_corrected_SA2.csv", index=False)
print("‚úÖ Saved: crime_dataset_weighted_corrected_SA2.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'crime_dataset.csv'