# Setup

In [221]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
from difflib import get_close_matches
from IPython.display import display

# Current working directory
%cd /content/drive/My Drive/unimelb-cluster-and-cloud-computing-comp90024-2020-sm1/city_analytics/geospatial

# Global constants
DATADIR = Path("../geodata")
POSTCODE_FILE = Path(DATADIR/"australian_postcodes.csv")
LOCALITY_FILE = DATADIR / "localities.csv"  # source statistical areas (SAs)
CITYFILE_TEST = DATADIR / "city_names_test.json"  # city names to map to SAs

/content/drive/My Drive/unimelb-cluster-and-cloud-computing-comp90024-2020-sm1/city_analytics/geospatial


# Create source SAs from postal areas
See references for download link of original file.  


In [222]:
# Original data
data_postcode = pd.read_csv(DATAFILE_POSTCODE, dtype={"sa3": "Int64", "sa4": "Int64"},
                            converters={"locality": str.lower, "state": str.lower})
data_postcode.dropna(subset=["sa3", "sa4"], inplace=True)  # clear NAs
data_postcode.drop_duplicates(subset=["locality", "state"], keep="last", inplace=True, ignore_index=True)

# Augmented data
duplicated = data_postcode.duplicated(subset="locality", keep=False)  # same locality, different state
data_postcode["locality_mixed"] = data_postcode.locality + duplicated*(" "+data_postcode.state)  # combine locality and state into mixed name for duplicated
display(data_postcode[duplicated].sort_values("locality"))
data_postcode.to_csv(LOCALITY_FILE, index=False)

Unnamed: 0,id,postcode,locality,state,long,lat,dc,type,status,sa3,sa3name,sa4,sa4name,region,locality_mixed
482,4586,2046,abbotsford,nsw,151.133865,-33.866044,LIDCOMBE DELIVERY CENTRE,Delivery Area,Updated 6-Feb-2020,12003,Strathfield - Burwood - Ashfield,120,Sydney - Inner West,R1,abbotsford nsw
5686,4625,3067,abbotsford,vic,144.998203,-37.803515,FITZROY DC,Delivery Area,Updated 6-Feb-2020,20607,Yarra,206,Melbourne - Inner,R1,abbotsford vic
11086,12626,4670,abbotsford,qld,152.297855,-24.887760,BUNDABERG DC,Delivery Area,Updated 6-Feb-2020,31902,Burnett,319,Wide Bay,R3,abbotsford qld
1553,3151,2336,aberdeen,nsw,151.102917,-32.146220,ABERDEEN LPO,Delivery Area,Updated 6-Feb-2020,10604,Upper Hunter,106,Hunter Valley exc Newcastle,R3,aberdeen nsw
16906,11359,7310,aberdeen,tas,146.239406,-41.237355,DEVONPORT DC,Delivery Area,Updated 6-Feb-2020,60402,Devonport,604,West and North West,R3,aberdeen tas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7199,7822,3505,yelta,vic,142.004192,-34.180532,MERBEIN,Delivery Area,Updated 6-Feb-2020,21502,Mildura,215,North West,R3,yelta vic
4365,3702,2681,yenda,nsw,146.146740,-34.116715,YENDA,Delivery Area,Updated 6-Feb-2020,11301,Griffith - Murrumbidgee (West),113,Riverina,R3,yenda nsw
10917,13026,4625,yenda,qld,151.644182,-25.571054,GAYNDAH LPO,Delivery Area,Updated 6-Feb-2020,31902,Burnett,319,Wide Bay,R3,yenda qld
16468,10043,7120,york plains,tas,147.487270,-42.316940,EASTERN SHORE DC,Delivery Area,Updated 6-Feb-2020,60303,South East Coast,603,South East,R3,york plains tas


# Map city names to SA codes

In [223]:
# City names
with open(CITYFILE_TEST) as f:
  citynames = list(map(str.lower, json.load(f)))
print(citynames)

# Source SA codes
data_locality = pd.read_csv(LOCALITY_FILE, dtype={"sa3": "Int64", "sa4": "Int64"})
codes = dict(zip(map(str.lower, data_locality.locality_mixed), data_locality.sa4))  # using SA4
print(codes)

def locality_to_sa4(locality, cutoff=0.6):
  """
  Map locality to code of statistical area level 4
  locality: city/suburb name
  cutoff: minimum match ratio to approve similar locality
  """

  locality = locality.lower()

  # Exact mapping
  if locality in codes:
    return codes[locality]
  
  # Approx. mapping
  matches = get_close_matches(locality, codes, n=1, cutoff=cutoff)
  if matches:
    [m] = matches  # best matched locality
    return codes[m]

  return 0

print(locality_to_sa4("melbourn"))  # exact map
print(locality_to_sa4("gold coast"))  # approx. map

['aberdeen', 'adaminaby', 'adelaide', 'adelaide river', 'adelong', 'agnes water', 'aireys inlet - fairhaven', 'airlie beach - cannonvale', 'albany', 'albury', 'aldinga', 'alexandra', 'alice river', 'alice springs', 'allansford', 'allendale east', 'alligator creek', 'allora', 'alstonville', 'alyangula', 'amata', 'american river', 'amity point', 'angaston', 'angle vale', 'anglesea', 'anna bay', 'apollo bay', 'appin', 'ararat', 'aratula', 'arcadia bay', 'arcadia downs', 'ardlethan', 'ardrossan', 'armidale', 'armstrong beach', 'arrawarra', 'arthurs seat', 'ashley', 'atherton', 'auburn', 'augathella', 'augusta', 'avenel', 'avoca', 'awaba', 'axedale', 'ayr', 'babinda', 'bacchus marsh', 'bagdad', 'bairnsdale', 'bakers hill', 'balaklava', 'baldivis', 'balgal beach', 'balhannah', 'balingup', 'ballan', 'ballarat', 'ballina', 'balnarring - balnarring beach', 'balranald', 'bangalow', 'bannockburn', 'barcaldine', 'barellan', 'bargara - innes park', 'bargo', 'barham', 'barmera', 'barnawartha', 'baro

# References
Urban centers and localities  
https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/1270.0.55.004July%202016?OpenDocument  

Postal codes including SA3, SA4  
https://www.matthewproctor.com/australian_postcodes  
