# Mapping Target Suburbs to SA2 Regions

## Purpose
Map target suburbs in Victoria to their corresponding SA2 and LGA regions using spatial joins and fuzzy string matching to clean and normalize suburb names.

## Inputs
- `SA2_2021_AUST_GDA2020.shp` – SA2 polygons for Victoria.  
- `LGA_2021_AUST_GDA2020.shp` – LGA polygons for Victoria.  

## Outputs
- `mapped_target_suburbs.csv` – target suburbs matched to SA2 and LGA with coordinates.

## Key Steps
1. Load SA2 and LGA shapefiles for Victoria and compute centroids for SA2s.  
2. Normalize and tokenize suburb/cluster names to remove directional terms and common stop words.  
3. Define function to match target components to clusters using exact, token-based, and fuzzy matching.  
4. Map target suburbs to clusters, computing average coordinates when multiple matches occur.  
5. Create GeoDataFrame of mapped suburbs and perform spatial joins with SA2 and LGA polygons.  
6. Save the final mapped dataset as CSV.  


In [1]:
# libaries
import re
import pandas as pd
from shapely.geometry import Point
import geopandas as gpd
from rapidfuzz import process, fuzz

### Load sa2 and compute centroids


In [2]:
shapefile = "../../datasets/raw/district_shape/SA2_GDA2020_SHAPEFILE/SA2_2021_AUST_GDA2020.shp"
gdf = gpd.read_file(shapefile)
vic = gdf[gdf["STE_CODE21"] == "2"].copy()

vic["Lat"] = vic.geometry.centroid.y
vic["Lng"] = vic.geometry.centroid.x

suburbs = {
    row["SA2_NAME21"]: (row["Lat"], row["Lng"])
    for _, row in vic.iterrows()
}


  vic["Lat"] = vic.geometry.centroid.y

  vic["Lng"] = vic.geometry.centroid.x


In [3]:
# Rename for consistency
vic_out = vic[["SA2_NAME21", "Lat", "Lng"]].rename(columns={"SA2_NAME21": "Cluster"})

### Load cluster centroids and fuzzy matching preparation

In [4]:
# Cleaning setup
DIRECTIONALS = {"north", "south", "east", "west", "upper", "lower", "central", "inner", "outer"}
STOP_TOKENS = DIRECTIONALS | {"vic", "city", "road", "rd"}

def normalize_name(s: str) -> str:
    s = s.lower()
    s = re.sub(r"\(.*?\)", "", s)
    s = s.replace("&", " and ")
    s = re.sub(r"[;/]", " ", s)
    s = s.replace("–", " ").replace("-", " ")
    s = re.sub(r"\bst\.?\b", "st", s)
    s = re.sub(r"\bmt\.?\b", "mount", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def tokenize_base(s: str):
    norm = normalize_name(s)
    toks = [t for t in re.split(r"[^a-z0-9]+", norm) if t]
    base_toks = [t for t in toks if t not in STOP_TOKENS]
    primary = base_toks[0] if base_toks else (toks[0] if toks else None)
    return norm, toks, base_toks, primary

vic_out["cluster_norm"] = vic_out["Cluster"].apply(normalize_name)
vic_out["tokens_all"] = vic_out["cluster_norm"].apply(lambda s: [t for t in re.split(r"[^a-z0-9]+", s) if t])
vic_out["tokens_base"] = vic_out["tokens_all"].apply(lambda toks: [t for t in toks if t not in STOP_TOKENS])

### Match suburb name to the closest SA2 cluster

In [5]:
def best_match(component: str):
    """
    Find the best matching cluster in 'vic' for a given component name.

    The function first checks for an exact match, then filters candidates
    using primary and base tokens. If multiple candidates remain, it selects
    the best one using fuzzy string matching.
    """
    
    comp_norm, comp_tokens, comp_base, comp_primary = tokenize_base(component)

    # Exact match
    exact = vic_out[vic_out["cluster_norm"] == comp_norm]
    if not exact.empty:
        return exact.iloc[0]

    # Filter primary token
    if comp_primary:
        cand = vic_out[vic_out["tokens_all"].apply(lambda toks: comp_primary in toks)].copy()
    else:
        cand = vic_out.copy()

    # Base token overlap
    if comp_primary and len(comp_base) > 0:
        cand["base_overlap"] = cand["tokens_base"].apply(lambda t: len(set(t) & set(comp_base)))
        if cand["base_overlap"].max() > 0:
            cand = cand[cand["base_overlap"] == cand["base_overlap"].max()]

    # Fuzzy match
    if not cand.empty:
        cand_strings = cand["cluster_norm"].tolist()
        match, score, idx = process.extractOne(comp_norm, cand_strings, scorer=fuzz.token_set_ratio)
        return cand.iloc[idx]

    return None

### Function to map target suburb to cluster centroid via coordinates

In [6]:
def map_target_suburb(target: str):
    parts = [p.strip() for p in re.split(r"\s*-\s*", target) if p.strip()]
    matches, coords = [], []

    for comp in parts:
        row = best_match(comp)
        if row is not None and pd.notnull(row["Lat"]) and pd.notnull(row["Lng"]):
            matches.append(row["Cluster"])
            coords.append((row["Lat"], row["Lng"]))

    if not coords:
        return None, None, None

    lat = sum(c[0] for c in coords) / len(coords)
    lng = sum(c[1] for c in coords) / len(coords)
    return "; ".join(matches), lat, lng

### Map all suburbs into clusters

In [7]:
target_suburbs = [
    "Albert Park-Middle Park-West St Kilda", "Armadale", "Carlton North", "Carlton-Parkville", "CBD-St Kilda Rd",
    "Collingwood-Abbotsford", "Docklands", "East Melbourne", "East St Kilda", "Elwood", "Fitzroy",
    "Fitzroy North-Clifton Hill", "Flemington-Kensington", "North Melbourne-West Melbourne", "Port Melbourne",
    "Prahran-Windsor", "Richmond-Burnley", "South Melbourne", "South Yarra", "Southbank", "St Kilda", "Toorak",
    "Balwyn", "Blackburn", "Box Hill", "Bulleen-Templestowe-Doncaster", "Burwood-Ashburton", "Camberwell-Glen Iris",
    "Canterbury-Surrey Hills-Mont Albert", "Chadstone-Oakleigh", "Clayton", "Doncaster East-Donvale", "East Hawthorn",
    "Glen Waverley-Mulgrave", "Hawthorn", "Kew", "Mount Waverley", "Nunawading-Mitcham",
    "Vermont-Forest Hill-Burwood East", "Aspendale-Chelsea-Carrum", "Bentleigh", "Brighton", "Brighton East",
    "Carnegie", "Caulfield", "Cheltenham", "Elsternwick", "Hampton-Beaumaris", "Malvern", "Malvern East",
    "Mentone-Parkdale-Mordialloc", "Murrumbeena-Hughesdale", "Altona", "Footscray", "Keilor East-Avondale Heights",
    "Melton", "Newport-Spotswood", "St Albans-Deer Park", "Sunshine", "Sydenham", "Werribee-Hoppers Crossing",
    "West Footscray", "Williamstown", "Yarraville-Seddon", "Broadmeadows-Roxburgh Park", "Brunswick",
    "Coburg-Pascoe Vale South", "Craigieburn", "East Brunswick", "Essendon", "Gladstone Park-Tullamarine", "Keilor",
    "Moonee Ponds-Ascot Vale", "Oak Park-Glenroy-Fawkner", "Pascoe Vale-Coburg North", "Sunbury", "West Brunswick",
    "Bundoora-Greensborough-Hurstbridge", "Eltham-Research-Montmorency", "Fairfield-Alphington",
    "Heidelberg-Heidelberg West", "Ivanhoe-Ivanhoe East", "Mill Park-Epping", "Northcote", "Preston", "Reservoir",
    "Thomastown-Lalor", "Thornbury", "Whittlesea", "Bayswater", "Boronia", "Croydon-Lilydale", "Ferntree Gully",
    "Ringwood", "Rowville", "Wantirna-Scoresby", "Yarra Ranges", "Berwick", "Cranbourne", "Dandenong",
    "Dandenong North-Endeavour Hills", "Narre Warren-Hampton Park", "Noble Park", "Pakenham", "Springvale",
    "Dromana-Portsea", "Frankston", "Hastings-Flinders", "Mt Eliza-Mornington-Mt Martha", "Seaford-Carrum Downs",
    "Belmont-Grovedale", "Corio", "Geelong-Newcombe", "Herne Hill-Geelong West", "Lara", "Newtown", "North Geelong",
    "Ballarat", "Mount Clear-Buninyong", "Sebastopol-Delacombe", "Wendouree-Alfredton", "Bendigo",
    "Flora Hill-Bendigo East", "Golden Square-Kangaroo Flat", "North Bendigo", "Bairnsdale", "Benalla", "Castlemaine",
    "Echuca", "Hamilton", "Horsham", "Mildura", "Moe-Newborough", "Morwell", "Ocean Grove-Barwon Heads", "Portland",
    "Sale-Maffra", "Seymour", "Shepparton", "Swan Hill", "Torquay", "Traralgon", "Wanagaratta", "Warragul",
    "Warrnambool", "Wodonga",
]

rows = []
for t in target_suburbs:
    matched, lat, lng = map_target_suburb(t)
    rows.append({"Target_Suburb": t, "Matched_Cluster": matched, "Lat": lat, "Lng": lng})

mapped_fixed = pd.DataFrame(rows)


### Load sa2 and lga shapefiles

In [8]:
sa2 = gpd.read_file("../../datasets/raw/district_shape/SA2_GDA2020_SHAPEFILE/SA2_2021_AUST_GDA2020.shp")
sa2_vic = sa2[sa2["STE_CODE21"] == "2"].copy().to_crs("EPSG:4283")

lga = gpd.read_file("../../datasets/raw/district_shape/LGA_2021_AUST_GDA2020_SHP/LGA_2021_AUST_GDA2020.shp")
lga_vic = lga[lga["STE_CODE21"] == "2"].copy().to_crs("EPSG:4283")

In [9]:
# spatial join
mapped_gdf = gpd.GeoDataFrame(mapped_fixed,geometry=[
        Point(xy) if pd.notnull(xy[0]) and pd.notnull(xy[1]) else None
        for xy in zip(mapped_fixed["Lng"], mapped_fixed["Lat"])
    ],
    crs="EPSG:4283"
)

mapped_with_sa2 = gpd.sjoin(
    mapped_gdf,
    sa2_vic[["SA2_CODE21", "SA2_NAME21", "geometry"]],
    how="left",
    predicate="within"
)

if "index_right" in mapped_with_sa2.columns:
    mapped_with_sa2 = mapped_with_sa2.drop(columns=["index_right"])

mapped_with_both = gpd.sjoin(
    mapped_with_sa2,
    lga_vic[["LGA_CODE21", "LGA_NAME21", "geometry"]],
    how="left",
    predicate="within"
)

mapped_with_both = mapped_with_both.drop(columns=["index_right"])

In [10]:
# save 
output_path = ("../../datasets/raw/mapped_target_suburbs.csv")
mapped_with_both.drop(columns="geometry").to_csv(output_path, index=False)
