In [15]:
import geopandas as gpd
import pandas as pd

# 1. Load SA2 shapefile
shapefile = "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/SA2_GDA2020_SHAPEFILE/SA2_2021_AUST_GDA2020.shp"
gdf = gpd.read_file(shapefile)

# 2. Filter Victoria (STE_CODE21 = "2")
vic = gdf[gdf["STE_CODE21"] == "2"].copy()

# 3. Compute centroids
vic["Lat"] = vic.geometry.centroid.y
vic["Lng"] = vic.geometry.centroid.x

# 4. Build dictionary (SA2_NAME21 → (Lat, Lng))
suburbs = {
    row["SA2_NAME21"]: (row["Lat"], row["Lng"])
    for _, row in vic.iterrows()
}

# 5. Save to CSV
vic_out = vic[["SA2_NAME21", "Lat", "Lng"]].rename(
    columns={"SA2_NAME21": "Cluster"}
)
vic_out.to_csv("vic_clusters_centroids.csv", index=False)

print("✅ Extracted", len(suburbs), "Victoria SA2 regions")
print("Sample:", list(suburbs.items())[:5])


✅ Extracted 524 Victoria SA2 regions
Sample: [('Alfredton', (-37.54173636281507, 143.749330252453)), ('Ballarat', (-37.5561439450457, 143.83665489612585)), ('Buninyong', (-37.643854141582494, 143.880777903821)), ('Delacombe', (-37.58222851797997, 143.77847784283048)), ('Smythes Creek', (-37.62024909240558, 143.74623319717654))]



  vic["Lat"] = vic.geometry.centroid.y

  vic["Lng"] = vic.geometry.centroid.x


In [None]:
import re
import pandas as pd
from shapely.geometry import Point
import geopandas as gpd
from rapidfuzz import process, fuzz

# Load cluster centroids
vic = pd.read_csv("/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/vic_clusters_centroids.csv") 

# cleaning
DIRECTIONALS = {"north", "south", "east", "west", "upper", "lower", "central", "inner", "outer"}
STOP_TOKENS = DIRECTIONALS | {"vic", "city", "road", "rd"}

def normalize_name(s: str) -> str:
    s = s.lower()
    s = re.sub(r"\(.*?\)", "", s)                 
    s = s.replace("&", " and ")
    s = re.sub(r"[;/]", " ", s)
    s = s.replace("–", " ").replace("-", " ")     
    s = re.sub(r"\bst\.?\b", "st", s)             
    s = re.sub(r"\bmt\.?\b", "mount", s)          
    s = re.sub(r"\s+", " ", s).strip()
    return s

def tokenize_base(s: str):
    norm = normalize_name(s)
    toks = [t for t in re.split(r"[^a-z0-9]+", norm) if t]
    base_toks = [t for t in toks if t not in STOP_TOKENS]
    primary = base_toks[0] if base_toks else (toks[0] if toks else None)
    return norm, toks, base_toks, primary

vic = vic.copy()
vic["cluster_norm"] = vic["Cluster"].apply(normalize_name)
vic["tokens_all"] = vic["cluster_norm"].apply(lambda s: [t for t in re.split(r"[^a-z0-9]+", s) if t])
vic["tokens_base"] = vic["tokens_all"].apply(lambda toks: [t for t in toks if t not in STOP_TOKENS])

# fuzzy matching function
def best_match(component: str):
    comp_norm, comp_tokens, comp_base, comp_primary = tokenize_base(component)

    # exact match
    exact = vic[vic["cluster_norm"] == comp_norm]
    if not exact.empty:
        return exact.iloc[0]

    # filter primary token
    if comp_primary:
        cand = vic[vic["tokens_all"].apply(lambda toks: comp_primary in toks)].copy()
    else:
        cand = vic.copy()

    # check base token overlap
    if comp_primary and len(comp_base) > 0:
        cand["base_overlap"] = cand["tokens_base"].apply(lambda t: len(set(t) & set(comp_base)))
        if cand["base_overlap"].max() > 0:
            cand = cand[cand["base_overlap"] == cand["base_overlap"].max()]

    # fuzzy match
    if not cand.empty:
        cand_strings = cand["cluster_norm"].tolist()
        match, score, idx = process.extractOne(comp_norm, cand_strings, scorer=fuzz.token_set_ratio)
        return cand.iloc[idx]

    return None

# map each suburb to a closest centroid
def map_target_suburb(target: str):
    parts = [p.strip() for p in re.split(r"\s*-\s*", target) if p.strip()]
    matches, coords = [], []

    for comp in parts:
        row = best_match(comp)
        if row is not None and pd.notnull(row["Lat"]) and pd.notnull(row["Lng"]):
            matches.append(row["Cluster"])
            coords.append((row["Lat"], row["Lng"]))

    if not coords:
        return None, None, None

    lat = sum(c[0] for c in coords) / len(coords)
    lng = sum(c[1] for c in coords) / len(coords)
    return "; ".join(matches), lat, lng

# target suburbs from excel
target_suburbs = [
    "Albert Park-Middle Park-West St Kilda",
    "Armadale",
    "Carlton North",
    "Carlton-Parkville",
    "CBD-St Kilda Rd",
    "Collingwood-Abbotsford",
    "Docklands",
    "East Melbourne",
    "East St Kilda",
    "Elwood",
    "Fitzroy",
    "Fitzroy North-Clifton Hill",
    "Flemington-Kensington",
    "North Melbourne-West Melbourne",
    "Port Melbourne",
    "Prahran-Windsor",
    "Richmond-Burnley",
    "South Melbourne",
    "South Yarra",
    "Southbank",
    "St Kilda",
    "Toorak",
    "Balwyn",
    "Blackburn",
    "Box Hill",
    "Bulleen-Templestowe-Doncaster",
    "Burwood-Ashburton",
    "Camberwell-Glen Iris",
    "Canterbury-Surrey Hills-Mont Albert",
    "Chadstone-Oakleigh",
    "Clayton",
    "Doncaster East-Donvale",
    "East Hawthorn",
    "Glen Waverley-Mulgrave",
    "Hawthorn",
    "Kew",
    "Mount Waverley",
    "Nunawading-Mitcham",
    "Vermont-Forest Hill-Burwood East",
    "Aspendale-Chelsea-Carrum",
    "Bentleigh",
    "Brighton",
    "Brighton East",
    "Carnegie",
    "Caulfield",
    "Cheltenham",
    "Elsternwick",
    "Hampton-Beaumaris",
    "Malvern",
    "Malvern East",
    "Mentone-Parkdale-Mordialloc",
    "Murrumbeena-Hughesdale",
    "Altona",
    "Footscray",
    "Keilor East-Avondale Heights",
    "Melton",
    "Newport-Spotswood",
    "St Albans-Deer Park",
    "Sunshine",
    "Sydenham",
    "Werribee-Hoppers Crossing",
    "West Footscray",
    "Williamstown",
    "Yarraville-Seddon",
    "Broadmeadows-Roxburgh Park",
    "Brunswick",
    "Coburg-Pascoe Vale South",
    "Craigieburn",
    "East Brunswick",
    "Essendon",
    "Gladstone Park-Tullamarine",
    "Keilor",
    "Moonee Ponds-Ascot Vale",
    "Oak Park-Glenroy-Fawkner",
    "Pascoe Vale-Coburg North",
    "Sunbury",
    "West Brunswick",
    "Bundoora-Greensborough-Hurstbridge",
    "Eltham-Research-Montmorency",
    "Fairfield-Alphington",
    "Heidelberg-Heidelberg West",
    "Ivanhoe-Ivanhoe East",
    "Mill Park-Epping",
    "Northcote",
    "Preston",
    "Reservoir",
    "Thomastown-Lalor",
    "Thornbury",
    "Whittlesea",
    "Bayswater",
    "Boronia",
    "Croydon-Lilydale",
    "Ferntree Gully",
    "Ringwood",
    "Rowville",
    "Wantirna-Scoresby",
    "Yarra Ranges",
    "Berwick",
    "Cranbourne",
    "Dandenong",
    "Dandenong North-Endeavour Hills",
    "Narre Warren-Hampton Park",
    "Noble Park",
    "Pakenham",
    "Springvale",
    "Dromana-Portsea",
    "Frankston",
    "Hastings-Flinders",
    "Mt Eliza-Mornington-Mt Martha",
    "Seaford-Carrum Downs",
    "Belmont-Grovedale",
    "Corio",
    "Geelong-Newcombe",
    "Herne Hill-Geelong West",
    "Lara",
    "Newtown",
    "North Geelong",
    "Ballarat",
    "Mount Clear-Buninyong",
    "Sebastopol-Delacombe",
    "Wendouree-Alfredton",
    "Bendigo",
    "Flora Hill-Bendigo East",
    "Golden Square-Kangaroo Flat",
    "North Bendigo",
    "Bairnsdale",
    "Benalla",
    "Castlemaine",
    "Echuca",
    "Hamilton",
    "Horsham",
    "Mildura",
    "Moe-Newborough",
    "Morwell",
    "Ocean Grove-Barwon Heads",
    "Portland",
    "Sale-Maffra",
    "Seymour",
    "Shepparton",
    "Swan Hill",
    "Torquay",
    "Traralgon",
    "Wanagaratta",
    "Warragul",
    "Warrnambool",
    "Wodonga",
]

rows = []
for t in target_suburbs:
    matched, lat, lng = map_target_suburb(t)
    rows.append({"Target_Suburb": t, "Matched_Cluster": matched, "Lat": lat, "Lng": lng})

mapped_fixed = pd.DataFrame(rows)


sa2 = gpd.read_file(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/SA2_GDA2020_SHAPEFILE/SA2_2021_AUST_GDA2020.shp"
)
sa2_vic = sa2[sa2["STE_CODE21"] == "2"].copy().to_crs("EPSG:4283")

# load lga
lga = gpd.read_file(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/landing/LGA_2021_AUST_GDA2020_SHP/LGA_2021_AUST_GDA2020.shp"
)
lga_vic = lga[lga["STE_CODE21"] == "2"].copy().to_crs("EPSG:4283")

# convert mapped centroids to geodataframe
mapped_gdf = gpd.GeoDataFrame(
    mapped_fixed,
    geometry=[Point(xy) if pd.notnull(xy[0]) and pd.notnull(xy[1]) else None 
              for xy in zip(mapped_fixed["Lng"], mapped_fixed["Lat"])],
    crs="EPSG:4283"
)

# spatial join to SA2
mapped_with_sa2 = gpd.sjoin(
    mapped_gdf,
    sa2_vic[["SA2_CODE21", "SA2_NAME21", "geometry"]],
    how="left", predicate="within"
)
if "index_right" in mapped_with_sa2.columns:
    mapped_with_sa2 = mapped_with_sa2.drop(columns=["index_right"])

# spatial join to lga
mapped_with_both = gpd.sjoin(
    mapped_with_sa2,
    lga_vic[["LGA_CODE21", "LGA_NAME21", "geometry"]],
    how="left", predicate="within"
)

mapped_with_both = mapped_with_both.drop(columns=["index_right"])

# save output
output_path = "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/mapped_target_suburbs.csv"
mapped_with_both.drop(columns="geometry").to_csv(output_path, index=False)


print("Saved:", output_path)


✅ Saved: /Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/mapped_target_suburbs.csv


In [None]:
import pandas as pd
import requests
import time
import numpy as np

# load datasets
df = pd.read_csv(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/mapped_target_suburbs.csv"
)

# filters valid lat and lng data
valid_suburbs = df[df["Lat"].notnull()].copy()

# load vic clusters
vic = pd.read_csv(
    "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/vic_clusters_centroids.csv"
)

# microburbs scraping
def get_microburbs_stat(lat, lng, stat_name):
    url = f"https://www.microburbs.com.au/heat-map-value?stat_name={stat_name}&lat={lat}&lng={lng}"
    resp = requests.get(url)
    if resp.status_code == 200:
        text = resp.text.strip()
        try:
            if "%" in text:
                val = float(text.split("%")[0].strip())
            elif "$" in text:  # e.g. "$2000 per month"
                val = float(text.replace("$", "").replace(",", "").split(" ")[0])
            else:
                val = float(text.split(" ")[0])
            return np.nan if val == 0.0 else val
        except Exception:
            return np.nan
    return np.nan

# get stats for target variable
def get_stats_for_target(target, clusters, metrics):
    results = {m: [] for m in metrics}
    for comp in clusters.split(";"):
        comp = comp.strip()
        row = vic[vic["Cluster"] == comp]
        if not row.empty:
            lat, lng = row.iloc[0]["Lat"], row.iloc[0]["Lng"]
            for m in metrics:
                val = get_microburbs_stat(lat, lng, m)
                if pd.notnull(val):
                    results[m].append(val)
            time.sleep(0.5)  
    return {m: np.nanmean(results[m]) if results[m] else np.nan for m in metrics}

# features
metrics = [
    "earning_over_2000_per_week",
    "earning_under_1000_per_week",
    "average_income",
    "mortgage_repayments",
    "tenant_rate"
]

# data
rows = []
for _, row in valid_suburbs.iterrows():
    clusters = row["Matched_Cluster"]
    stats = get_stats_for_target(row["Target_Suburb"], clusters, metrics)
    
    rows.append({
        "Target_Suburb": row["Target_Suburb"],
        "Matched_Cluster": row["Matched_Cluster"],
        "Lat": row["Lat"],
        "Lng": row["Lng"],
        "SA2_CODE21": row["SA2_CODE21"],   
        **stats   
    })

# save output
final_df = pd.DataFrame(rows)
output_path = "/Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/demographics_dataset.csv"
final_df.to_csv(output_path, index=False)

print("Saved results to:", output_path)


✅ Saved results to: /Users/ariqasri/Desktop/project-2-group-real-estate-industry-project-7-2025/ariqasri-workspace/dataset/raw/demographics_dataset.csv
                           Target_Suburb                      Matched_Cluster  \
0  Albert Park-Middle Park-West St Kilda         Albert Park; St Kilda - West   
1                               Armadale                             Armadale   
2                          Carlton North                              Carlton   
3                      Carlton-Parkville                   Carlton; Parkville   
4                        CBD-St Kilda Rd  Melbourne CBD - East; St Kilda East   

         Lat         Lng   SA2_CODE21  earning_over_2000_per_week  \
0 -37.853484  144.970161  206051128.0                         8.0   
1 -37.856747  145.020711  206061135.0                        26.0   
2 -37.800500  144.967804  206041117.0                        15.0   
3 -37.793796  144.959672  206041124.0                        15.0   
4 -37.839927  14