import libraries


In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape
import fiona


load datasets

In [2]:
clusters_df = pd.read_csv("/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/datasets/raw/mapped_target_suburbs.csv")
clusters_gdf = gpd.GeoDataFrame(
    clusters_df,
    geometry=gpd.points_from_xy(clusters_df["Lng"], clusters_df["Lat"]),
    crs="EPSG:4283"
)

# Load SA2 and LGA boundaries
sa2 = gpd.read_file("/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/datasets/landing/SA2_GDA2020_SHAPEFILE/SA2_2021_AUST_GDA2020.shp")
sa2 = sa2[sa2["STE_CODE21"] == "2"].copy().to_crs("EPSG:4283")

lga = gpd.read_file("/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/datasets/landing/LGA_2021_AUST_GDA2020_SHP/LGA_2021_AUST_GDA2020.shp")
lga = lga[lga["STE_CODE21"] == "2"].copy()


map sa2s to lgas

In [3]:
# map sa2s to lgas
sa2_lga_map = gpd.sjoin(sa2, lga, how="left", predicate="intersects")
sa2_lga_map = sa2_lga_map[["SA2_CODE21", "SA2_NAME21", "LGA_CODE21", "LGA_NAME21", "geometry"]].drop_duplicates()
sa2_lga_map.head()



Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4283
Right CRS: EPSG:7844

  sa2_lga_map = gpd.sjoin(sa2, lga, how="left", predicate="intersects")


Unnamed: 0,SA2_CODE21,SA2_NAME21,LGA_CODE21,LGA_NAME21,geometry
644,201011001,Alfredton,22490,Golden Plains,"POLYGON ((143.78281 -37.56667, 143.75557 -37.5..."
644,201011001,Alfredton,20570,Ballarat,"POLYGON ((143.78281 -37.56667, 143.75557 -37.5..."
645,201011002,Ballarat,20570,Ballarat,"POLYGON ((143.81896 -37.55583, 143.81644 -37.5..."
646,201011005,Buninyong,22490,Golden Plains,"POLYGON ((143.8417 -37.61597, 143.84175 -37.61..."
646,201011005,Buninyong,25150,Moorabool,"POLYGON ((143.8417 -37.61597, 143.84175 -37.61..."


clean lga names

In [4]:
# Standardize LGA names (remove extra words and formatting)
def clean_lga_name(series):
    return (
        series.astype(str)
        .str.lower()
        .str.replace(r"\(.*?\)", "", regex=True)
        .str.replace("rural city of", "", regex=False)
        .str.replace("city of", "", regex=False)
        .str.replace("shire of", "", regex=False)
        .str.replace("shire", "", regex=False)
        .str.replace("rural", "", regex=False)
        .str.replace("council", "", regex=False)
        .str.replace("city", "", regex=False)
        .str.replace("-", " ")
        .str.replace(r"[^a-z\s]", "", regex=True)
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
    )

sa2_lga_map["LGA_clean"] = clean_lga_name(sa2_lga_map["LGA_NAME21"])

# Fix known naming differences (like Merri-bek)
rename_map = {"merri bek": "merri-bek", "moreland": "merri-bek"}
sa2_lga_map["LGA_clean"] = sa2_lga_map["LGA_clean"].replace(rename_map)


load crime files

In [5]:
# Detect header row and load crime data
temp = pd.read_excel("/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/datasets/landing/Data_Tables_LGA_Criminal_Incidents_Year_Ending_March_2025.xlsx", sheet_name="Table 01", header=None)
header_row = temp[temp.apply(lambda r: r.astype(str).str.contains("Year", case=False)).any(axis=1)].index[0]
crime = pd.read_excel("/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/datasets/landing/Data_Tables_LGA_Criminal_Incidents_Year_Ending_March_2025.xlsx", sheet_name="Table 01", skiprows=header_row)
crime.columns = crime.columns.str.strip()
crime.head()


Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Incidents Recorded,"Rate per 100,000 population"
0,2025,March,1 North West Metro,Banyule,8086,6082.275801
1,2025,March,1 North West Metro,Brimbank,14369,7214.154149
2,2025,March,1 North West Metro,Darebin,15001,9271.641976
3,2025,March,1 North West Metro,Hobsons Bay,6289,6491.46985
4,2025,March,1 North West Metro,Hume,17321,6180.821302


load victim files

In [None]:
# detect header row and load victim data
temp_v = pd.read_excel("/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/datasets/landing/Data_Tables_LGA_Victim_Reports_Year_Ending_March_2025.xlsx", sheet_name="Table 01", header=None)
header_row_v = temp_v[temp_v.apply(lambda r: r.astype(str).str.contains("Year", case=False)).any(axis=1)].index[0]
victims = pd.read_excel("/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/datasets/landing/Data_Tables_LGA_Victim_Reports_Year_Ending_March_2025.xlsx", sheet_name="Table 01", skiprows=header_row_v)
victims.columns = victims.columns.str.strip()
victims.head()

Unnamed: 0,Year,Year ending,Police Region,Local Government Area,Victim Reports,"Rate per 100,000 population"
0,2025,March,1 North West Metro,Banyule,4534,3410.467287
1,2025,March,1 North West Metro,Brimbank,8104,4068.7247
2,2025,March,1 North West Metro,Darebin,9260,5723.312093
3,2025,March,1 North West Metro,Hobsons Bay,3691,3809.829101
4,2025,March,1 North West Metro,Hume,10043,3583.741605


clean columns 

In [7]:
# Rename columns
crime = crime.rename(columns={
    crime.columns[0]: "Year",
    crime.columns[1]: "Year ending",
    crime.columns[2]: "Police Region",
    crime.columns[3]: "Local Government Area",
    crime.columns[4]: "Incidents Recorded",
    crime.columns[5]: "Crime Rate per 100k"
})
victims = victims.rename(columns={
    victims.columns[0]: "Year",
    victims.columns[1]: "Year ending",
    victims.columns[2]: "Police Region",
    victims.columns[3]: "Local Government Area",
    victims.columns[4]: "Victim Reports",
    victims.columns[5]: "Victim Rate per 100k"
})

# Clean and fix names
for df in [crime, victims]:
    df["LGA_clean"] = clean_lga_name(df["Local Government Area"])
    df["LGA_clean"] = df["LGA_clean"].replace(rename_map)


merge crime and victim datasets

In [8]:
crime_incidents_wide = crime.pivot_table(index="LGA_clean", columns="Year", values="Incidents Recorded", aggfunc="sum").add_prefix("Incidents_").reset_index()
crime_rate_wide = crime.pivot_table(index="LGA_clean", columns="Year", values="Crime Rate per 100k", aggfunc="mean").add_prefix("CrimeRate_").reset_index()
victims_wide = victims.pivot_table(index="LGA_clean", columns="Year", values="Victim Reports", aggfunc="sum").add_prefix("Victims_").reset_index()
victim_rate_wide = victims.pivot_table(index="LGA_clean", columns="Year", values="Victim Rate per 100k", aggfunc="mean").add_prefix("VictimRate_").reset_index()

crime_final = crime_incidents_wide.merge(crime_rate_wide, on="LGA_clean", how="left")
victim_final = victims_wide.merge(victim_rate_wide, on="LGA_clean", how="left")
lga_wide = crime_final.merge(victim_final, on="LGA_clean", how="left")


merge with sa2s

In [9]:
sa2_with_lga = sa2_lga_map.merge(lga_wide, on="LGA_clean", how="left")
sa2_with_lga["lat"] = sa2_with_lga.geometry.centroid.y
sa2_with_lga["lng"] = sa2_with_lga.geometry.centroid.x

final = sa2_with_lga.drop(columns="geometry")
final.head()



  sa2_with_lga["lat"] = sa2_with_lga.geometry.centroid.y

  sa2_with_lga["lng"] = sa2_with_lga.geometry.centroid.x


Unnamed: 0,SA2_CODE21,SA2_NAME21,LGA_CODE21,LGA_NAME21,LGA_clean,Incidents_2016,Incidents_2017,Incidents_2018,Incidents_2019,Incidents_2020,...,VictimRate_2018,VictimRate_2019,VictimRate_2020,VictimRate_2021,VictimRate_2022,VictimRate_2023,VictimRate_2024,VictimRate_2025,lat,lng
0,201011001,Alfredton,22490,Golden Plains,golden plains,600.0,591.0,481.0,511.0,470.0,...,1220.300979,1436.371273,1288.680949,1132.894103,1074.20718,1084.766775,1363.10134,1354.460609,-37.54175,143.749324
1,201011001,Alfredton,20570,Ballarat,ballarat,9029.0,9215.0,8898.0,8128.0,8713.0,...,4992.193889,4344.419848,4677.325451,3521.461799,3882.390463,3769.703537,4313.093763,4245.666569,-37.54175,143.749324
2,201011002,Ballarat,20570,Ballarat,ballarat,9029.0,9215.0,8898.0,8128.0,8713.0,...,4992.193889,4344.419848,4677.325451,3521.461799,3882.390463,3769.703537,4313.093763,4245.666569,-37.556157,143.836649
3,201011005,Buninyong,22490,Golden Plains,golden plains,600.0,591.0,481.0,511.0,470.0,...,1220.300979,1436.371273,1288.680949,1132.894103,1074.20718,1084.766775,1363.10134,1354.460609,-37.643867,143.880772
4,201011005,Buninyong,25150,Moorabool,moorabool,1669.0,1978.0,1823.0,1633.0,1637.0,...,2948.473833,2466.962814,2479.428324,2368.768135,2097.28495,2279.70525,2657.356709,2705.299687,-37.643867,143.880772


output 

In [10]:
output_path = "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/datasets/raw/crime_dataset.csv"
final.to_csv(output_path, index=False)

missing data

In [11]:
missing_crime = final[final["CrimeRate_2025"].isna()][["SA2_CODE21", "SA2_NAME21", "LGA_NAME21", "LGA_clean"]]
print(f"Total missing: {len(missing_crime)}")

missing_report = pd.DataFrame({
    "missing_count": final.isna().sum(),
    "missing_pct": (final.isna().sum() / len(final) * 100).round(2)
}).query("missing_count > 0")

# Define which columns to inspect
key_cols = [
    col for col in final.columns 
    if any(keyword in col for keyword in ["2025", "CrimeRate", "VictimRate", "Incidents", "Victims"])
]

# Filter rows (SA2s) with missing values in those key columns
missing_sa2s = final[final[key_cols].isna().any(axis=1)][
    ["SA2_CODE21", "SA2_NAME21", "LGA_NAME21", "LGA_clean"] + key_cols
]

print("\nSA2s Missing Key Data:")
display_cols = ["SA2_CODE21", "SA2_NAME21", "LGA_NAME21", "LGA_clean"]
print(missing_sa2s[display_cols].to_string(index=False))

Total missing: 13

SA2s Missing Key Data:
SA2_CODE21                             SA2_NAME21         LGA_NAME21          LGA_clean
 204011054                              Alexandra Unincorporated Vic unincorporated vic
 204011057                       Mansfield (Vic.) Unincorporated Vic unincorporated vic
 204011061                     Upper Yarra Valley Unincorporated Vic unincorporated vic
 204031069                  Bright - Mount Beauty Unincorporated Vic unincorporated vic
 205011077                   Mount Baw Baw Region Unincorporated Vic unincorporated vic
 205021085                                 Orbost Unincorporated Vic unincorporated vic
 205031087                                 Foster Unincorporated Vic unincorporated vic
 205031088                          French Island Unincorporated Vic unincorporated vic
 205031092                     Wilsons Promontory Unincorporated Vic unincorporated vic
 214021379                      Hastings - Somers Unincorporated Vic unincorpo