In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape
import fiona

# Load cluster centroids
clusters_df = pd.read_csv(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/vic_clusters_centroids.csv"
)
clusters_gdf = gpd.GeoDataFrame(
    clusters_df,
    geometry=gpd.points_from_xy(clusters_df["Lng"], clusters_df["Lat"]),
    crs="EPSG:4283"
)

# Load SA2 shapefile
with fiona.open(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/SA2_GDA2020_SHAPEFILE/SA2_2021_AUST_GDA2020.shp"
) as src:
    records = list(src)

features = []
for rec in records:
    if rec["geometry"] is not None:
        geom = shape(rec["geometry"])
        props = rec["properties"]
        props["geometry"] = geom
        features.append(props)

sa2_gdf = gpd.GeoDataFrame(features, geometry="geometry")
vic_sa2 = sa2_gdf[sa2_gdf["STE_CODE21"] == "2"].copy()
vic_sa2.set_crs(epsg=4283, inplace=True)

# Load LGA shapefile
lga_gdf = gpd.read_file(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/LGA_2021_AUST_GDA2020_SHP/LGA_2021_AUST_GDA2020.shp"
)
lga_vic = lga_gdf[lga_gdf["STE_CODE21"] == "2"].copy()

# Reproject clusters separately for SA2 and LGA
clusters_gdf_sa2 = clusters_gdf.to_crs(vic_sa2.crs)
clusters_gdf_lga = clusters_gdf.to_crs(lga_vic.crs)

# Direct SA2 ↔ LGA mapping (polygon to polygon)
sa2_lga_map = gpd.sjoin(vic_sa2, lga_vic, how="left", predicate="intersects")
sa2_lga_map = sa2_lga_map[["SA2_CODE21", "SA2_NAME21", "LGA_CODE21", "LGA_NAME21"]].drop_duplicates()

# Now join clusters → SA2, then attach LGA using lookup
clusters_sa2 = gpd.sjoin(clusters_gdf_sa2, vic_sa2, how="left", predicate="within")
clusters_full = clusters_sa2.merge(
    sa2_lga_map,
    on=["SA2_CODE21", "SA2_NAME21"],
    how="left"
)

# clean lga
def clean_lga_name(series):
    return (
        series.astype(str)
        .str.lower()
        .str.replace(r"\(.*?\)", "", regex=True)
        .str.replace("rural city of", "", regex=False)
        .str.replace("city of", "", regex=False)
        .str.replace("shire of", "", regex=False)
        .str.replace("shire", "", regex=False)
        .str.replace("rural", "", regex=False)
        .str.replace("council", "", regex=False)
        .str.replace("city", "", regex=False)
        .str.replace("-", " ")
        .str.replace(r"[^a-z\s]", "", regex=True) 
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)      
    )

clusters_full["LGA_clean"] = clean_lga_name(clusters_full["LGA_NAME21"])

# Load crime + victim datasets

crime = pd.read_excel(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/Data_Tables_LGA_Criminal_Incidents_Year_Ending_March_2025.xlsx",
    sheet_name="Table 01",
    skiprows=5
)

victims = pd.read_excel(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/Data_Tables_LGA_Victim_Reports_Year_Ending_March_2025.xlsx",
    sheet_name="Table 01",
    skiprows=5
)

# Rename columns explicitly 

crime = crime.rename(columns={
    crime.columns[0]: "Year",
    crime.columns[1]: "Year ending",
    crime.columns[2]: "Police Region",
    crime.columns[3]: "Local Government Area",
    crime.columns[4]: "Incidents Recorded",
    crime.columns[5]: "Crime Rate per 100k"   
})

victims = victims.rename(columns={
    victims.columns[0]: "Year",
    victims.columns[1]: "Year ending",
    victims.columns[2]: "Police Region",
    victims.columns[3]: "Local Government Area",
    victims.columns[4]: "Victim Reports",
    victims.columns[5]: "Victim Rate per 100k"  
})

# Keep valid rows and clean names

crime_latest = crime[crime["Local Government Area"].notna()].copy()
victims_latest = victims[victims["Local Government Area"].notna()].copy()

crime_latest["LGA_clean"] = clean_lga_name(crime_latest["Local Government Area"])
victims_latest["LGA_clean"] = clean_lga_name(victims_latest["Local Government Area"])

# Ensure one row per LGA
crime_latest = crime_latest.drop_duplicates(subset=["LGA_clean"])
victims_latest = victims_latest.drop_duplicates(subset=["LGA_clean"])

# Merge crime & victims

merged = (
    clusters_full.merge(
        crime_latest[["LGA_clean", "Incidents Recorded", "Crime Rate per 100k"]],
        how="left", on="LGA_clean"
    )
    .merge(
        victims_latest[["LGA_clean", "Victim Reports", "Victim Rate per 100k"]],
        how="left", on="LGA_clean"
    )
)

#  Feature engineering

merged["crime_per_1000"] = merged["Crime Rate per 100k"] / 100
merged["victim_rate_ratio"] = merged["Victim Rate per 100k"] / merged["Crime Rate per 100k"]

# Save final dataset

output_path = "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/crime_dataset.csv"

print("\n final dataset saved:", output_path)


  props["geometry"] = geom
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4283
Right CRS: EPSG:7844

  sa2_lga_map = gpd.sjoin(vic_sa2, lga_vic, how="left", predicate="intersects")



✅ Final dataset saved: /Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/crime_dataset.csv


In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape
import fiona

import pandas as pd
import geopandas as gpd
from shapely.geometry import shape
import fiona

# Load cluster centroids
clusters_df = pd.read_csv(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/vic_clusters_centroids.csv"
)
clusters_gdf = gpd.GeoDataFrame(
    clusters_df,
    geometry=gpd.points_from_xy(clusters_df["Lng"], clusters_df["Lat"]),
    crs="EPSG:4283"
)

# Load SA2 shapefile
with fiona.open(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/SA2_GDA2020_SHAPEFILE/SA2_2021_AUST_GDA2020.shp"
) as src:
    records = list(src)

features = []
for rec in records:
    if rec["geometry"] is not None:
        geom = shape(rec["geometry"])
        props = rec["properties"]
        props["geometry"] = geom
        features.append(props)

sa2_gdf = gpd.GeoDataFrame(features, geometry="geometry")
vic_sa2 = sa2_gdf[sa2_gdf["STE_CODE21"] == "2"].copy()
vic_sa2.set_crs(epsg=4283, inplace=True)

# Load LGA shapefile
lga_gdf = gpd.read_file(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/LGA_2021_AUST_GDA2020_SHP/LGA_2021_AUST_GDA2020.shp"
)
lga_vic = lga_gdf[lga_gdf["STE_CODE21"] == "2"].copy()

# Reproject clusters separately for SA2 and LGA
clusters_gdf_sa2 = clusters_gdf.to_crs(vic_sa2.crs)
clusters_gdf_lga = clusters_gdf.to_crs(lga_vic.crs)

# Direct SA2 ↔ LGA mapping (polygon to polygon)
sa2_lga_map = gpd.sjoin(vic_sa2, lga_vic, how="left", predicate="intersects")
sa2_lga_map = sa2_lga_map[["SA2_CODE21", "SA2_NAME21", "LGA_CODE21", "LGA_NAME21"]].drop_duplicates()

# Now join clusters → SA2, then attach LGA using lookup
clusters_sa2 = gpd.sjoin(clusters_gdf_sa2, vic_sa2, how="left", predicate="within")
clusters_full = clusters_sa2.merge(
    sa2_lga_map,
    on=["SA2_CODE21", "SA2_NAME21"],
    how="left"
)

# clean lga
def clean_lga_name(series):
    return (
        series.astype(str)
        .str.lower()
        .str.replace(r"\(.*?\)", "", regex=True)
        .str.replace("rural city of", "", regex=False)
        .str.replace("city of", "", regex=False)
        .str.replace("shire of", "", regex=False)
        .str.replace("shire", "", regex=False)
        .str.replace("rural", "", regex=False)
        .str.replace("council", "", regex=False)
        .str.replace("city", "", regex=False)
        .str.replace("-", " ")
        .str.replace(r"[^a-z\s]", "", regex=True) 
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)      
    )

clusters_full["LGA_clean"] = clean_lga_name(clusters_full["LGA_NAME21"])

# Load historical crime + victims
crime = pd.read_excel(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/Data_Tables_LGA_Criminal_Incidents_Year_Ending_March_2025.xlsx",
    sheet_name="Table 01",
    skiprows=5
)

victims = pd.read_excel(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/Data_Tables_LGA_Victim_Reports_Year_Ending_March_2025.xlsx",
    sheet_name="Table 01",
    skiprows=5
)

# Rename columns
crime = crime.rename(columns={
    crime.columns[0]: "Year",
    crime.columns[1]: "Year ending",
    crime.columns[2]: "Police Region",
    crime.columns[3]: "Local Government Area",
    crime.columns[4]: "Incidents Recorded",
    crime.columns[5]: "Crime Rate per 100k"
})

victims = victims.rename(columns={
    victims.columns[0]: "Year",
    victims.columns[1]: "Year ending",
    victims.columns[2]: "Police Region",
    victims.columns[3]: "Local Government Area",
    victims.columns[4]: "Victim Reports",
    victims.columns[5]: "Victim Rate per 100k"
})

# Convert Year ending to datetime (cleaner time axis)
for df in [crime, victims]:
    df["Year ending"] = pd.to_datetime(df["Year ending"], errors="coerce")

# Clean LGA names
def clean_lga_name(series):
    return (
        series.astype(str)
        .str.lower()
        .str.replace(r"\(.*?\)", "", regex=True)
        .str.replace("rural city of", "", regex=False)
        .str.replace("city of", "", regex=False)
        .str.replace("shire of", "", regex=False)
        .str.replace("shire", "", regex=False)
        .str.replace("rural", "", regex=False)
        .str.replace("council", "", regex=False)
        .str.replace("city", "", regex=False)
        .str.replace("-", " ")
        .str.replace(r"[^a-z\s]", "", regex=True) 
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
    )

crime["LGA_clean"] = clean_lga_name(crime["Local Government Area"])
victims["LGA_clean"] = clean_lga_name(victims["Local Government Area"])

# Pivot crime
crime_wide = crime.pivot_table(
    index="LGA_clean",
    columns="Year",
    values="Incidents Recorded",
    aggfunc="sum"
).add_prefix("Incidents_").reset_index()

crime_rate_wide = crime.pivot_table(
    index="LGA_clean",
    columns="Year",
    values="Crime Rate per 100k",
    aggfunc="mean"
).add_prefix("CrimeRate_").reset_index()

# Pivot victims
victims_wide = victims.pivot_table(
    index="LGA_clean",
    columns="Year",
    values="Victim Reports",
    aggfunc="sum"
).add_prefix("Victims_").reset_index()

victim_rate_wide = victims.pivot_table(
    index="LGA_clean",
    columns="Year",
    values="Victim Rate per 100k",
    aggfunc="mean"
).add_prefix("VictimRate_").reset_index()

# Merge wide tables
crime_final = crime_wide.merge(crime_rate_wide, on="LGA_clean", how="left")
victim_final = victims_wide.merge(victim_rate_wide, on="LGA_clean", how="left")

final = crime_final.merge(victim_final, on="LGA_clean", how="left")

# Save
output_path = "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/crime_dataset_historical_wide.csv"
final.to_csv(output_path, index=False)

print("✅ Wide historical dataset saved:", output_path)
print(final.head())


  props["geometry"] = geom
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4283
Right CRS: EPSG:7844

  sa2_lga_map = gpd.sjoin(vic_sa2, lga_vic, how="left", predicate="intersects")


✅ Wide historical dataset saved: /Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/crime_dataset_historical_wide.csv
Year   LGA_clean  Incidents_2016  Incidents_2017  Incidents_2018  \
0         alpine           328.0           280.0           319.0   
1         ararat           879.0           909.0           885.0   
2       ballarat          9029.0          9215.0          8898.0   
3        banyule          7216.0          7429.0          7358.0   
4     bass coast          2022.0          1987.0          1798.0   

Year  Incidents_2019  Incidents_2020  Incidents_2021  Incidents_2022  \
0              400.0           438.0           400.0           369.0   
1              912.0           894.0          1085.0           949.0   
2             8128.0          8713.0          7187.0          7567.0   
3             7003.0          7262.0          6283.0          5226.0   
4             1782.0          1788.0          2101.0     

In [10]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape
import fiona

# Load clusters (centroids with lat/lng)
clusters_df = pd.read_csv(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/mapped_target_suburbs.csv"
)
clusters_gdf = gpd.GeoDataFrame(
    clusters_df,
    geometry=gpd.points_from_xy(clusters_df["Lng"], clusters_df["Lat"]),
    crs="EPSG:4283"
)

# Load SA2 and LGA shapefiles
sa2 = gpd.read_file(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/SA2_GDA2020_SHAPEFILE/SA2_2021_AUST_GDA2020.shp"
)
sa2_vic = sa2[sa2["STE_CODE21"] == "2"].copy().to_crs("EPSG:4283")

lga = gpd.read_file(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/LGA_2021_AUST_GDA2020_SHP/LGA_2021_AUST_GDA2020.shp"
)
lga_vic = lga[lga["STE_CODE21"] == "2"].copy()

# Join SA2 ↔ LGA
sa2_lga_map = gpd.sjoin(sa2_vic, lga_vic, how="left", predicate="intersects")
sa2_lga_map = sa2_lga_map[
    ["SA2_CODE21", "SA2_NAME21", "LGA_CODE21", "LGA_NAME21", "geometry"]
].drop_duplicates()

# Clean LGA
def clean_lga_name(series):
    return (
        series.astype(str)
        .str.lower()
        .str.replace(r"\(.*?\)", "", regex=True)
        .str.replace("rural city of", "", regex=False)
        .str.replace("city of", "", regex=False)
        .str.replace("shire of", "", regex=False)
        .str.replace("shire", "", regex=False)
        .str.replace("rural", "", regex=False)
        .str.replace("council", "", regex=False)
        .str.replace("city", "", regex=False)
        .str.replace("-", " ")
        .str.replace(r"[^a-z\s]", "", regex=True)
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
    )

sa2_lga_map["LGA_clean"] = clean_lga_name(sa2_lga_map["LGA_NAME21"])

# Load crime/victim historical tables
crime = pd.read_excel(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/Data_Tables_LGA_Criminal_Incidents_Year_Ending_March_2025.xlsx",
    sheet_name="Table 01",
    skiprows=5
)

victims = pd.read_excel(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/Data_Tables_LGA_Victim_Reports_Year_Ending_March_2025.xlsx",
    sheet_name="Table 01",
    skiprows=5
)

# Rename columns
crime = crime.rename(columns={
    crime.columns[0]: "Year",
    crime.columns[1]: "Year ending",
    crime.columns[2]: "Police Region",
    crime.columns[3]: "Local Government Area",
    crime.columns[4]: "Incidents Recorded",
    crime.columns[5]: "Crime Rate per 100k"
})
victims = victims.rename(columns={
    victims.columns[0]: "Year",
    victims.columns[1]: "Year ending",
    victims.columns[2]: "Police Region",
    victims.columns[3]: "Local Government Area",
    victims.columns[4]: "Victim Reports",
    victims.columns[5]: "Victim Rate per 100k"
})

# Date parsing
for df in [crime, victims]:
    df["Year ending"] = pd.to_datetime(df["Year ending"], errors="coerce")

# Clean LGA names
crime["LGA_clean"] = clean_lga_name(crime["Local Government Area"])
victims["LGA_clean"] = clean_lga_name(victims["Local Government Area"])

# Pivot wide
crime_inc_wide = crime.pivot_table(
    index="LGA_clean", columns="Year", values="Incidents Recorded", aggfunc="sum"
).add_prefix("Incidents_").reset_index()

crime_rate_wide = crime.pivot_table(
    index="LGA_clean", columns="Year", values="Crime Rate per 100k", aggfunc="mean"
).add_prefix("CrimeRate_").reset_index()

victims_wide = victims.pivot_table(
    index="LGA_clean", columns="Year", values="Victim Reports", aggfunc="sum"
).add_prefix("Victims_").reset_index()

victim_rate_wide = victims.pivot_table(
    index="LGA_clean", columns="Year", values="Victim Rate per 100k", aggfunc="mean"
).add_prefix("VictimRate_").reset_index()

# Merge wide tables together
crime_final = crime_inc_wide.merge(crime_rate_wide, on="LGA_clean", how="left")
victim_final = victims_wide.merge(victim_rate_wide, on="LGA_clean", how="left")
lga_wide = crime_final.merge(victim_final, on="LGA_clean", how="left")

# Attach to SA2 + centroids
sa2_with_lga = sa2_lga_map.merge(lga_wide, on="LGA_clean", how="left")

# Add SA2 centroid lat/lng
sa2_with_lga["lat"] = sa2_with_lga.geometry.centroid.y
sa2_with_lga["lng"] = sa2_with_lga.geometry.centroid.x

# Drop geometry if not needed
final = sa2_with_lga.drop(columns="geometry")

output_path = "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/crime_dataset.csv"
final.to_csv(output_path, index=False)

print("✅ Final SA2-level historical dataset saved:", output_path)
print(final.head())


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4283
Right CRS: EPSG:7844

  sa2_lga_map = gpd.sjoin(sa2_vic, lga_vic, how="left", predicate="intersects")


✅ Final SA2-level historical dataset saved: /Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/crime_dataset.csv
  SA2_CODE21 SA2_NAME21 LGA_CODE21     LGA_NAME21      LGA_clean  \
0  201011001  Alfredton      22490  Golden Plains  golden plains   
1  201011001  Alfredton      20570       Ballarat       ballarat   
2  201011002   Ballarat      20570       Ballarat       ballarat   
3  201011005  Buninyong      22490  Golden Plains  golden plains   
4  201011005  Buninyong      25150      Moorabool      moorabool   

   Incidents_2016  Incidents_2017  Incidents_2018  Incidents_2019  \
0           600.0           591.0           481.0           511.0   
1          9029.0          9215.0          8898.0          8128.0   
2          9029.0          9215.0          8898.0          8128.0   
3           600.0           591.0           481.0           511.0   
4          1669.0          1978.0          1823.0          1633.0   

   Inci


  sa2_with_lga["lat"] = sa2_with_lga.geometry.centroid.y

  sa2_with_lga["lng"] = sa2_with_lga.geometry.centroid.x
