# Amenities Data Preprocessing

## Purpose

Clean and categorize Features of Interest (FOI) points in Victoria, assign them to SA2 regions, and create aggregated counts by amenity type for spatial analysis.

## Inputs
- `FOI_POINT.shp` – raw FOI point shapefile  
- `SA2_2021_AUST_GDA2020.shp` – SA2 boundaries  

## Outputs
- `pivot_counts.csv` – counts of amenities per SA2, grouped by category (education, health, tourist, cultural, others)  

## Key Steps
1. Load FOI points and SA2 shapefiles using GeoPandas.  
2. Clean FOI attributes and filter for Victoria.  
3. Ensure geometries are points and match CRS with SA2 boundaries.  
4. Assign each FOI to the SA2 it falls in; for unmatched points, assign to the SA2 whose representative point is nearest.  
5. Categorize FOIs into groups: education, health, tourist, cultural, others.  
6. Aggregate counts per SA2 and save as CSV.

In [None]:
# libraries
from pathlib import Path
import geopandas as gpd
import folium
from folium.plugins import MarkerCluster
import pandas as pd
from shapely.geometry import MultiPoint

In [26]:
# Read the raw FOI point layer and the SA2 boundary layer from disk.
foi_shapefile = "../../datasets/amenities/VMFEAT/FOI_POINT.shp"
sa2_shapefile = "../../datasets/district_shape/SA2_GDA2020_SHAPEFILE/SA2_2021_AUST_GDA2020.shp"

foi_points = gpd.read_file(foi_shapefile)
sa2 = gpd.read_file(sa2_shapefile)

# Preprocess FOI points and assign them to SA2 regions

In [27]:
# Attribute columns that are removed before any further processing.
cols_to_drop = [
    "UFI",
    "PFI",
    "FEATURE_ID",
    "PARENTFTID",
    "SUPER_PFI",
    "CRDATE_PFI",
    "CRDATE_UFI",
    "FEATURE_UF",
    "FEATURE_CR",
    "NAME_LABEL",
    "PARENTNAME",
    "VICNMSTATC",
    "CHILDEXIST",
    "AUTHORGC",
    "AUTHORGID",
    "AUTHORGVER",
    "VMADD_PFI",
    "VICNAMESID",
    "THEME1",
    "THEME2",
    "FEATSTATUS",
]

# Drop whichever of those columns are present (absent ones are skipped), then
# keep only the rows whose STATE equals "VIC", compared case-insensitively.
foi_points_clean = foi_points.drop(columns=cols_to_drop, errors="ignore")
is_vic = foi_points_clean["STATE"].str.upper().eq("VIC")
foi_points_clean = foi_points_clean.loc[is_vic].copy()

In [28]:

def to_point(g):
    """Return the first member of a non-empty MultiPoint; return anything else unchanged."""
    if not isinstance(g, MultiPoint):
        return g
    members = g.geoms
    if len(members) > 0:
        return members[0]
    # An empty MultiPoint has no member to pick, so it is passed through as-is.
    return g

# Collapse every MultiPoint geometry to its first member point.
foi_points_clean["geometry"] = foi_points_clean.geometry.apply(to_point)

# Ensure CRS match
foi_points_clean = foi_points_clean.to_crs(sa2.crs)

# Assign an SA2 to each FOI. how="left" keeps FOIs that intersect no SA2
# polygon; their SA2 columns are left as NaN.
foi_points_clean = gpd.sjoin(
    foi_points_clean,
    sa2[["SA2_CODE21", "SA2_NAME21", "geometry"]],
    how="left",
    predicate="intersects",
).drop(columns=["index_right"])

# A point lying on a border shared by several SA2 polygons intersects each of
# them, and the join returns one row per match under the same index label.
# Keep only the first match so every FOI appears (and is later counted) once.
# This relies on the index having been unique before the join.
foi_points_clean = foi_points_clean[~foi_points_clean.index.duplicated(keep="first")]

In [29]:
# FOIs that fall inside no SA2 polygon are assigned to the SA2 whose
# representative point is closest to them.
needs = foi_points_clean["SA2_CODE21"].isna()
if needs.any():
    # One representative point per SA2. SA2 rows with a missing or empty
    # geometry (if any) are skipped because they cannot be a nearest match.
    has_geom = sa2.geometry.notna() & ~sa2.geometry.is_empty
    sa2_pts = sa2.loc[has_geom].copy()
    sa2_pts["geometry"] = sa2_pts.geometry.representative_point()

    # Measure in a projected (UTM) CRS so that "nearest" is ranked by ground
    # distance and dist_to_sa2_m really is in metres, whatever CRS the SA2
    # layer uses.
    unmatched = foi_points_clean.loc[needs, ["geometry"]]
    metric_crs = unmatched.estimate_utm_crs()
    fix = gpd.sjoin_nearest(
        unmatched.to_crs(metric_crs),
        sa2_pts[["SA2_CODE21", "SA2_NAME21", "geometry"]].to_crs(metric_crs),
        how="left",
        distance_col="dist_to_sa2_m",
    )

    # Equidistant candidates produce several rows for the same FOI. Keep one
    # per FOI and put the rows in the same order as the mask, so the
    # positional assignment below always has exactly one value per target row.
    fix = fix[~fix.index.duplicated(keep="first")]
    fix = fix.reindex(foi_points_clean.index[needs])
    foi_points_clean.loc[needs, ["SA2_CODE21", "SA2_NAME21"]] = (
        fix[["SA2_CODE21", "SA2_NAME21"]].to_numpy()
    )




In [33]:
# Show the cleaned FOI table for a visual check.
# NOTE(review): the saved output below already contains the `group` column,
# which is only created in a later cell - presumably this cell was last run
# out of order; confirm before relying on the displayed output.
foi_points_clean

Unnamed: 0,FTYPE,FEATSUBTYP,NAME,STATE,geometry,SA2_CODE21,SA2_NAME21,group
0,control point,survey monument,,VIC,POINT (143.52433 -38.84588),217031476,Otway,others
1,control point,survey monument,,VIC,POINT (146.16829 -36.72986),204021064,Benalla Surrounds,others
2,control point,survey monument,,VIC,POINT (145.1764 -37.09963),204011060,Seymour Surrounds,others
3,control point,survey monument,,VIC,POINT (144.12032 -37.4246),201021011,Daylesford,others
4,control point,survey monument,,VIC,POINT (144.52872 -36.06312),216011408,Lockington - Gunbower,others
...,...,...,...,...,...,...,...,...
50672,place of worship,church,,VIC,POINT (146.38005 -36.57062),204021067,Wangaratta Surrounds,cultural
50673,place of worship,church,APSLEY CATHOLIC CHURCH,VIC,POINT (141.08392 -36.96737),215011393,West Wimmera,cultural
50675,community space,camp ground,LODDON FLOODWAY - MIDDLE BEND CAMPING AREA,VIC,POINT (143.69958 -35.3812),215031405,Swan Hill Surrounds,others
50676,health facility,maternal/child health centre,DIGGERS REST MATERNAL AND CHILD HEALTH,VIC,POINT (144.70974 -37.62036),210041539,Diggers Rest,health


In [30]:
# Feature subtypes that make up each amenity group. Matching is exact and
# case-sensitive; any subtype not listed here ends up in 'others'.
education = [
    'primary school',
    'secondary school',
    'primary/secondary school',
    'university',
]

health = [
    'maternal/child health centre',
    'community health centre',
    'day procedure centre',
    'disability support centre',
    'general hospital',
    'general hospital (emergency)',
    'bush nursing hospital',
    'ambulance station',
]

tourist = [
    'tourist information centre',
    'tourist attraction',
]

# NOTE(review): 'monastry' is kept exactly as written; presumably it mirrors
# the spelling used in the source data - confirm against the FEATSUBTYP values.
cultural = [
    'church',
    'mosque',
    'monastry',
    'vihara (buddhist)',
    'mandir (hindu)',
]


def assign_group(category):
    """Return the amenity group name for a feature subtype.

    The groups are checked in the order education, health, tourist, cultural;
    the first list containing ``category`` decides the result. A value found
    in none of the lists is reported as 'others'.
    """
    # The lists are looked up on every call, so later edits to them are honoured.
    group_members = (
        ('education', education),
        ('health', health),
        ('tourist', tourist),
        ('cultural', cultural),
    )
    for group_name, members in group_members:
        if category in members:
            return group_name
    return 'others'

# Label every FOI with the amenity group derived from its feature subtype,
# then preview the first rows.
subtype_groups = foi_points_clean['FEATSUBTYP'].map(assign_group)
foi_points_clean['group'] = subtype_groups
foi_points_clean.head()

Unnamed: 0,FTYPE,FEATSUBTYP,NAME,STATE,geometry,SA2_CODE21,SA2_NAME21,group
0,control point,survey monument,,VIC,POINT (143.52433 -38.84588),217031476,Otway,others
1,control point,survey monument,,VIC,POINT (146.16829 -36.72986),204021064,Benalla Surrounds,others
2,control point,survey monument,,VIC,POINT (145.1764 -37.09963),204011060,Seymour Surrounds,others
3,control point,survey monument,,VIC,POINT (144.12032 -37.4246),201021011,Daylesford,others
4,control point,survey monument,,VIC,POINT (144.52872 -36.06312),216011408,Lockington - Gunbower,others


In [None]:
# Cross-tabulate FOIs: one row per SA2 code, one column per amenity group.
# Each cell is the number of FOIs with a non-null FTYPE; combinations that
# never occur are filled with 0.
group_counts = foi_points_clean.pivot_table(
    index="SA2_CODE21",
    columns="group",
    values="FTYPE",
    aggfunc="count",
    fill_value=0,
)

# Turn the SA2 code back into an ordinary column, then preview the result.
pivot_counts = group_counts.reset_index()
pivot_counts.head()

group,SA2_CODE21,cultural,education,health,others,tourist
0,101041023,0,0,0,1,0
1,109031180,0,0,0,1,0
2,109031181,0,0,0,2,0
3,109031183,0,0,0,2,0
4,109031185,0,0,0,3,0


In [32]:
# Write the per-SA2 amenity counts to CSV without the row index.
pivot_counts_csv = "../../datasets/amenities/pivot_counts.csv"
pivot_counts.to_csv(pivot_counts_csv, index=False)