# Crime Data Preprocessing

## Purpose
Process and standardize crime and victim data from Victoria's LGA-level reports, and allocate LGA-level measures to SA2s using correspondence weights.

## Inputs
- LGA-level crime & victim data: `Data_Tables_LGA_Criminal_Incidents_Year_Ending_June_2025.xlsx`
- SA2 and LGA shapefiles: `SA2_2021_AUST_GDA2020.shp`, `LGA_2021_AUST_GDA2020.shp`
- Correspondence weights for SA2-LGA allocation: `CG_SA2_2021_LGA_2021(in).csv`

## Outputs
- SA2-level crime dataset: `crime_dataset_weighted_to_SA2.csv`
- Allocation check report: `weight_checks_by_LGA.csv`

## Key Steps
1. Load SA2 and LGA boundaries and map SA2s to LGA.
2. Clean LGA names for consistency.
3. Load and standardize crime and victim datasets.
4. Pivot data to wide format for counts and rates.
5. Merge LGA-wide crime measures with SA2s using correspondence weights.
6. Compute SA2-level counts and weighted rates.
7. Perform sanity checks on allocations and export final SA2 dataset.

In [5]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape
import fiona
import numpy as np
from pathlib import Path
import re
from shapely.geometry import Point
from rapidfuzz import process, fuzz

### Map SA2s to LGAs

In [8]:
# Load SA2 and LGA boundaries
clusters_df = pd.read_csv("../../datasets/district_shape/sa2_lookup/mapped_target_suburbs.csv")
clusters_gdf = gpd.GeoDataFrame(
    clusters_df,
    geometry=gpd.points_from_xy(clusters_df["Lng"], clusters_df["Lat"]),
    crs="EPSG:4283"
)

sa2 = gpd.read_file("../../datasets/district_shape/SA2_GDA2020_SHAPEFILE/SA2_2021_AUST_GDA2020.shp")
sa2 = sa2[sa2["STE_CODE21"] == "2"].copy().to_crs("EPSG:4283")

lga = gpd.read_file("../../datasets/district_shape/LGA_2021_AUST_GDA2020_SHP/LGA_2021_AUST_GDA2020.shp")
lga = lga[lga["STE_CODE21"] == "2"].copy()

In [None]:
# map sa2s to lgas
lga = lga.to_crs(sa2.crs)
sa2_lga_map = gpd.sjoin(sa2, lga, how="left", predicate="intersects")
sa2_lga_map = sa2_lga_map[["SA2_CODE21", "SA2_NAME21", "LGA_CODE21", "LGA_NAME21", "geometry"]].drop_duplicates()
sa2_lga_map.head()


Unnamed: 0,SA2_CODE21,SA2_NAME21,LGA_CODE21,LGA_NAME21,geometry
644,201011001,Alfredton,22490,Golden Plains,"POLYGON ((143.78281 -37.56667, 143.75557 -37.5..."
644,201011001,Alfredton,20570,Ballarat,"POLYGON ((143.78281 -37.56667, 143.75557 -37.5..."
645,201011002,Ballarat,20570,Ballarat,"POLYGON ((143.81896 -37.55583, 143.81644 -37.5..."
646,201011005,Buninyong,22490,Golden Plains,"POLYGON ((143.8417 -37.61597, 143.84175 -37.61..."
646,201011005,Buninyong,25150,Moorabool,"POLYGON ((143.8417 -37.61597, 143.84175 -37.61..."


### Clean LGA Names

In [None]:
# Standardize LGA names (remove extra words and formatting)
def clean_lga_name(series):
    return (
        series.astype(str)
        .str.lower()
        .str.replace(r"\(.*?\)", "", regex=True)
        .str.replace("rural city of", "", regex=False)
        .str.replace("city of", "", regex=False)
        .str.replace("shire of", "", regex=False)
        .str.replace("shire", "", regex=False)
        .str.replace("rural", "", regex=False)
        .str.replace("council", "", regex=False)
        .str.replace("city", "", regex=False)
        .str.replace("-", " ")
        .str.replace(r"[^a-z\s]", "", regex=True)
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
    )

sa2_lga_map["LGA_clean"] = clean_lga_name(sa2_lga_map["LGA_NAME21"])

# Fix known naming differences (like Merri-bek)
rename_map = {"merri bek": "merri-bek", "moreland": "merri-bek"}
sa2_lga_map["LGA_clean"] = sa2_lga_map["LGA_clean"].replace(rename_map)

### Load Crime Files

In [13]:
# Detect header row and load crime data
temp = pd.read_excel("../../datasets/crime/Data_Tables_LGA_Criminal_Incidents_Year_Ending_June_2025 (1).xlsx", sheet_name="Table 01", header=None)
header_row = temp[temp.apply(lambda r: r.astype(str).str.contains("Year", case=False)).any(axis=1)].index[0]
crime = pd.read_excel("../../datasets/crime/Data_Tables_LGA_Criminal_Incidents_Year_Ending_June_2025 (1).xlsx", sheet_name="Table 01", skiprows=header_row)
crime.columns = crime.columns.str.strip()
crime.head()

Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Incidents Recorded,"Rate per 100,000 population"
0,2025,June,1 North West Metro,Banyule,8326,6262.803403
1,2025,June,1 North West Metro,Brimbank,14558,7309.044199
2,2025,June,1 North West Metro,Darebin,15257,9429.867451
3,2025,June,1 North West Metro,Hobsons Bay,6458,6665.910684
4,2025,June,1 North West Metro,Hume,17689,6312.13833


### Load victim files

In [15]:
# detect header row and load victim data
temp_v = pd.read_excel("../../datasets/crime/Data_Tables_LGA_Criminal_Incidents_Year_Ending_June_2025 (1).xlsx", sheet_name="Table 01", header=None)
header_row_v = temp_v[temp_v.apply(lambda r: r.astype(str).str.contains("Year", case=False)).any(axis=1)].index[0]
victims = pd.read_excel("../../datasets/crime/Data_Tables_LGA_Criminal_Incidents_Year_Ending_June_2025 (1).xlsx", sheet_name="Table 01", skiprows=header_row_v)
victims.columns = victims.columns.str.strip()
victims.head()

Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Incidents Recorded,"Rate per 100,000 population"
0,2025,June,1 North West Metro,Banyule,8326,6262.803403
1,2025,June,1 North West Metro,Brimbank,14558,7309.044199
2,2025,June,1 North West Metro,Darebin,15257,9429.867451
3,2025,June,1 North West Metro,Hobsons Bay,6458,6665.910684
4,2025,June,1 North West Metro,Hume,17689,6312.13833


### Clean columns 

In [16]:
# Rename columns
crime = crime.rename(columns={
    crime.columns[0]: "Year",
    crime.columns[1]: "Year ending",
    crime.columns[2]: "Police Region",
    crime.columns[3]: "Local Government Area",
    crime.columns[4]: "Incidents Recorded",
    crime.columns[5]: "Crime Rate per 100k"
})
victims = victims.rename(columns={
    victims.columns[0]: "Year",
    victims.columns[1]: "Year ending",
    victims.columns[2]: "Police Region",
    victims.columns[3]: "Local Government Area",
    victims.columns[4]: "Victim Reports",
    victims.columns[5]: "Victim Rate per 100k"
})

# Clean and fix names
for df in [crime, victims]:
    df["LGA_clean"] = clean_lga_name(df["Local Government Area"])
    df["LGA_clean"] = df["LGA_clean"].replace(rename_map)


### Merge crime and victim datasets

In [17]:
crime_incidents_wide = crime.pivot_table(index="LGA_clean", columns="Year", values="Incidents Recorded", aggfunc="sum").add_prefix("Incidents_").reset_index()
crime_rate_wide = crime.pivot_table(index="LGA_clean", columns="Year", values="Crime Rate per 100k", aggfunc="mean").add_prefix("CrimeRate_").reset_index()
victims_wide = victims.pivot_table(index="LGA_clean", columns="Year", values="Victim Reports", aggfunc="sum").add_prefix("Victims_").reset_index()
victim_rate_wide = victims.pivot_table(index="LGA_clean", columns="Year", values="Victim Rate per 100k", aggfunc="mean").add_prefix("VictimRate_").reset_index()

crime_final = crime_incidents_wide.merge(crime_rate_wide, on="LGA_clean", how="left")
victim_final = victims_wide.merge(victim_rate_wide, on="LGA_clean", how="left")
lga_wide = crime_final.merge(victim_final, on="LGA_clean", how="left")


### Merge with SA2s

In [None]:
# Merge SA2 with LGA-wide data
sa2_with_lga = sa2_lga_map.merge(lga_wide, on="LGA_clean", how="left")

# Project geometries to a projected CRS (meters) for accurate centroids
# EPSG:3111 is Victoria Lambert Conformal Conic
sa2_projected = sa2_with_lga.to_crs(epsg=3111)

# Compute centroids in projected CRS
centroids = sa2_projected.geometry.centroid

# Convert centroids back to geographic CRS (lat/lon)
centroids_geo = gpd.GeoSeries(centroids, crs=3111).to_crs(epsg=4326)

# Add lat/lng to the dataframe
sa2_with_lga["lat"] = centroids_geo.y
sa2_with_lga["lng"] = centroids_geo.x

# Drop geometry column if not needed
final = sa2_with_lga.drop(columns="geometry")

final.head()


Unnamed: 0,SA2_CODE21,SA2_NAME21,LGA_CODE21,LGA_NAME21,LGA_clean,Incidents_2016,Incidents_2017,Incidents_2018,Incidents_2019,Incidents_2020,...,VictimRate_2018,VictimRate_2019,VictimRate_2020,VictimRate_2021,VictimRate_2022,VictimRate_2023,VictimRate_2024,VictimRate_2025,lat,lng
0,201011001,Alfredton,22490,Golden Plains,golden plains,646.0,541.0,504.0,470.0,466.0,...,2173.256867,1973.960521,1912.501026,1795.757673,1785.079578,1902.216024,2133.880093,2400.07443,-37.541753,143.74932
1,201011001,Alfredton,20570,Ballarat,ballarat,8952.0,9296.0,8659.0,8117.0,8458.0,...,8046.799494,7394.350159,7575.798289,6399.774457,6589.892338,7168.616562,7895.910781,8581.786754,-37.541753,143.74932
2,201011002,Ballarat,20570,Ballarat,ballarat,8952.0,9296.0,8659.0,8117.0,8458.0,...,8046.799494,7394.350159,7575.798289,6399.774457,6589.892338,7168.616562,7895.910781,8581.786754,-37.556157,143.836648
3,201011005,Buninyong,22490,Golden Plains,golden plains,646.0,541.0,504.0,470.0,466.0,...,2173.256867,1973.960521,1912.501026,1795.757673,1785.079578,1902.216024,2133.880093,2400.07443,-37.643868,143.880772
4,201011005,Buninyong,25150,Moorabool,moorabool,1755.0,2008.0,1741.0,1613.0,1704.0,...,5022.79153,4506.467745,4612.386314,5241.361118,3771.479001,4270.289633,4586.326418,5442.571097,-37.643868,143.880772


In [None]:
output_path = "../../datasets/raw/crime_dataset.csv"
final.to_csv(output_path, index=False)

### Missing data

In [20]:
missing_crime = final[final["CrimeRate_2025"].isna()][["SA2_CODE21", "SA2_NAME21", "LGA_NAME21", "LGA_clean"]]
print(f"missing: {len(missing_crime)}")

missing_report = pd.DataFrame({
    "missing_count": final.isna().sum(),
    "missing_pct": (final.isna().sum() / len(final) * 100).round(2)
}).query("missing_count > 0")

# Define which columns to inspect
key_cols = [
    col for col in final.columns 
    if any(keyword in col for keyword in ["2025", "CrimeRate", "VictimRate", "Incidents", "Victims"])
]

# Filter rows with missing values in those key columns
missing_sa2s = final[final[key_cols].isna().any(axis=1)][
    ["SA2_CODE21", "SA2_NAME21", "LGA_NAME21", "LGA_clean"] + key_cols
]

print("\nsa2s missing data")
display_cols = ["SA2_CODE21", "SA2_NAME21", "LGA_NAME21", "LGA_clean"]
print(missing_sa2s[display_cols].to_string(index=False))

missing: 13

sa2s missing data
SA2_CODE21                             SA2_NAME21         LGA_NAME21          LGA_clean
 204011054                              Alexandra Unincorporated Vic unincorporated vic
 204011057                       Mansfield (Vic.) Unincorporated Vic unincorporated vic
 204011061                     Upper Yarra Valley Unincorporated Vic unincorporated vic
 204031069                  Bright - Mount Beauty Unincorporated Vic unincorporated vic
 205011077                   Mount Baw Baw Region Unincorporated Vic unincorporated vic
 205021085                                 Orbost Unincorporated Vic unincorporated vic
 205031087                                 Foster Unincorporated Vic unincorporated vic
 205031088                          French Island Unincorporated Vic unincorporated vic
 205031092                     Wilsons Promontory Unincorporated Vic unincorporated vic
 214021379                      Hastings - Somers Unincorporated Vic unincorporated vic
 

## Converting LGAs to SA2 based on LGA/SA2 ratio

In [28]:
# Load Data
P_CORR  = Path("../../datasets/crime/CG_SA2_2021_LGA_2021(in).csv")
P_LGA   = Path("../../datasets/crime/crime_dataset(in) (1).csv")

In [29]:
OUT_SA2 = "../../datasets/raw/crime_dataset_weighted_to_SA2.csv"
OUT_CHECKS = "../../datasets/raw/weight_checks_by_LGA.csv"

In [30]:
corr = pd.read_csv(P_CORR, dtype=str)
lga  = pd.read_csv(P_LGA,  dtype=str)

In [31]:
# Normalize column names
def normcols(df):
    """Strip whitespace from column names."""
    df = df.rename(columns={c: c.strip() for c in df.columns})
    df.columns = [c.strip() for c in df.columns]  # just to be sure
    return df

corr = normcols(corr)
lga  = normcols(lga)

# Helper to pick column from list of possible names
def pick(name_options, cols):
    for n in name_options:
        if n in cols:
            return n
    return None

# Detect expected key/weight columns in correspondence
c_sa2_code = pick(["SA2_CODE_2021","SA2_CODE21","SA2_MAINCODE_2021","SA2_MAINCODE21","SA2_CODE"], corr.columns)
c_sa2_name = pick(["SA2_NAME_2021","SA2_NAME21","SA2_NAME"], corr.columns)
c_lga_code = pick(["LGA_CODE_2021","LGA_CODE21","LGA_CODE"], corr.columns)
c_lga_name = pick(["LGA_NAME_2021","LGA_NAME21","LGA_NAME"], corr.columns)
c_ratio    = pick(["RATIO_FROM_TO","RATIO_TO","RATIO","PROP","WEIGHT"], corr.columns)

missing = [x for x in [c_sa2_code,c_sa2_name,c_lga_code,c_lga_name,c_ratio] if x is None]
if missing:
    raise ValueError(f"Could not find these columns in correspondence file: {missing}\n"
                     f"Have columns: {list(corr.columns)}")

# Ensure codes are strings and trimmed
for c in [c_sa2_code, c_lga_code]:
    corr[c] = corr[c].astype(str).str.strip()

# Filter to Victoria SA2s only (codes starting with '2')
corr_vic = corr[corr[c_sa2_code].str.startswith("2")].copy()

In [None]:
#  Prepare LGA keys 
l_lga_code = pick(["LGA_CODE_2021","LGA_CODE21","LGA_CODE"], lga.columns)
l_lga_name = pick(["LGA_NAME_2021","LGA_NAME21","LGA_NAME"], lga.columns)

if (l_lga_code is None) and (l_lga_name is None):
    raise ValueError(f"LGA dataset has no LGA code/name column. Found columns: {list(lga.columns)}")

#  Compute normalized weights per LGA
corr_vic["ratio_raw"] = pd.to_numeric(corr_vic[c_ratio], errors="coerce").fillna(0.0)
grp_key = c_lga_code if l_lga_code else c_lga_name

if grp_key == c_lga_name:
    corr_vic[c_lga_name + "_key"] = corr_vic[c_lga_name].str.strip().str.lower()
    lga[l_lga_name + "_key"]       = lga[l_lga_name].str.strip().str.lower()
    corr_group_key = c_lga_name + "_key"
    lga_join_key   = l_lga_name + "_key"
else:
    corr_vic[c_lga_code + "_key"] = corr_vic[c_lga_code].str.strip()
    lga[l_lga_code + "_key"]       = lga[l_lga_code].str.strip()
    corr_group_key = c_lga_code + "_key"
    lga_join_key   = l_lga_code + "_key"

# Normalize weights so they sum to 1 within each LGA
sum_by_lga = corr_vic.groupby(corr_vic[corr_group_key])["ratio_raw"].sum().rename("ratio_sum_lga")
corr_vic = corr_vic.merge(sum_by_lga, left_on=corr_group_key, right_index=True, how="left")
corr_vic["weight_norm"] = np.where(
    corr_vic["ratio_sum_lga"] > 0,
    corr_vic["ratio_raw"] / corr_vic["ratio_sum_lga"],
    1.0 / corr_vic.groupby(corr_vic[corr_group_key])[corr_group_key].transform("count")
)

# Identify numeric columns in LGA 
def starts_with_any(s, prefixes):
    return any(s.startswith(p) for p in prefixes)

count_prefixes = ("Incidents_", "Victims_")
rate_prefixes  = ("CrimeRate_", "VictimRate_")

lga_numeric = []
for c in lga.columns:
    if c in [l_lga_code, l_lga_name, lga_join_key]:
        continue
    try:
        pd.to_numeric(lga[c])
        lga_numeric.append(c)
    except Exception:
        pass

count_cols = [c for c in lga_numeric if starts_with_any(c, count_prefixes)]
rate_cols  = [c for c in lga_numeric if starts_with_any(c, rate_prefixes)]

#  Merge LGA data with correspondence weights
to_merge_cols = [col for col in [l_lga_code, l_lga_name, lga_join_key] if col is not None]
lga_for_merge = lga[to_merge_cols + count_cols + rate_cols].copy()

merged = corr_vic.merge(
    lga_for_merge,
    left_on=corr_group_key,
    right_on=lga_join_key,
    how="inner"
)

# Coerce numeric measures
for c in count_cols + rate_cols:
    merged[c] = pd.to_numeric(merged[c], errors="coerce").fillna(0.0)

# Apply weights 
for c in count_cols:
    merged[c + "_alloc"] = merged[c] * merged["weight_norm"]

for c in rate_cols:
    merged[c + "_alloc"] = merged[c] * merged["weight_norm"]

# Aggregate to SA2 
sa2_keys = [c_sa2_code, c_sa2_name]
alloc_count_cols = [c + "_alloc" for c in count_cols]
alloc_rate_cols  = [c + "_alloc" for c in rate_cols]

sa2_counts = (
    merged.groupby(sa2_keys, dropna=False)[alloc_count_cols]
          .sum()
          .rename(columns=lambda x: x.replace("_alloc",""))
)

sa2_weight_sum = merged.groupby(sa2_keys, dropna=False)["weight_norm"].sum().rename("weight_sum_sa2")
sa2_rates_sum = (
    merged.groupby(sa2_keys, dropna=False)[alloc_rate_cols]
          .sum()
          .rename(columns=lambda x: x.replace("_alloc","_weighted_sum"))
)

sa2_rates = sa2_rates_sum.copy()
for c in rate_cols:
    sa2_rates[c] = np.where(sa2_weight_sum.values > 0,
                            sa2_rates_sum[c + "_weighted_sum"].values / sa2_weight_sum.values,
                            np.nan)
    del sa2_rates[c + "_weighted_sum"]

# Combine counts and rates
sa2_final = sa2_counts.join(sa2_rates, how="outer").reset_index()

# Sanity checks & exports 
checks = []
for c in count_cols:
    lga_alloc_back = (
        merged.groupby(lga_join_key)[c + "_alloc"].sum().rename("allocated_sum")
        .reset_index()
        .merge(lga_for_merge[[lga_join_key, c]], on=lga_join_key, how="left")
    )
    lga_alloc_back["abs_diff"] = (lga_alloc_back["allocated_sum"] - pd.to_numeric(lga_alloc_back[c], errors="coerce")).abs()
    lga_alloc_back["measure"] = c
    checks.append(lga_alloc_back[[lga_join_key, "measure", "allocated_sum", c, "abs_diff"]])

weight_checks = pd.concat(checks, ignore_index=True) if checks else pd.DataFrame()

# Export
weight_checks.to_csv(OUT_CHECKS, index=False)
sa2_final.to_csv(OUT_SA2, index=False)

print(f"Done. SA2-level dataset written to:\n  {OUT_SA2}")
if not weight_checks.empty:
    print(f"Allocation check by LGA written to:\n  {OUT_CHECKS}\n(abs_diff should be ~0 per LGA per measure)")

Done. SA2-level dataset written to:
  ../../datasets/raw/crime_dataset_weighted_to_SA2.csv
Allocation check by LGA written to:
  ../../datasets/raw/weight_checks_by_LGA.csv
(abs_diff should be ~0 per LGA per measure)
