In [97]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import numpy as np

In [10]:

# State/territory codes; each one is used further down to locate a per-state
# GeoJSON file.
ALL_STATES = ['ACT', 'NSW', 'NT', 'QLD', 'SA', 'TAS', 'VIC', 'WA']

# Load both postcode sources. The text columns are read as str so pandas does
# not infer a numeric type for them, and the place-name column of each source
# is renamed to the shared label 'name'.
df = pd.read_csv(
    'data/postcodes/australian_postcodes.csv',
    dtype={'locality': str, 'state': str, 'postcode': str},
).rename(columns={'locality': 'name'})

# This file has no header row (header=None), so the column names are supplied.
df2 = pd.read_csv(
    'data/postcodes/postcode-dataout.txt',
    header=None,
    names=['suburb', 'state', 'postcode'],
    dtype={'suburb': str, 'state': str, 'postcode': str},
).rename(columns={'suburb': 'name'})

# Upper-case the name and state columns of both frames.
for frame in (df, df2):
    for text_column in ('name', 'state'):
        frame[text_column] = frame[text_column].str.upper()

In [3]:
def _clean_column_name(raw_name, state_lower):
    '''
    Apply the column-name clean-up used for every state file.

    The replacements run in this exact order:
      1. drop every occurrence of '<state>_'  (e.g. 'nsw_loca_2' -> 'loca_2')
      2. 'loca_'   -> 'local_'                (e.g. 'loca_2'     -> 'local_2')
      3. 'localit' -> 'locali'
      4. 'locali'  -> 'locality'              (so 'locali' and 'localit' both
                                               end up as 'locality')

    NOTE(review): a name that already contains the full word 'locality' would
    come out as 'localityy' after steps 3 and 4 - confirm no input file has one.
    '''
    replacements = (
        (f'{state_lower}_', ''),
        (f'loca_', 'local_'),
        (f'localit', 'locali'),
        (f'locali', 'locality'),
    )
    cleaned = raw_name
    for needle, substitute in replacements:
        cleaned = cleaned.replace(needle, substitute)
    return cleaned


# One GeoDataFrame per state, each with cleaned column names, a 'state' column
# and geometry reprojected to EPSG:4326.
gdfs_states = []
for state in ALL_STATES:
    state_lower = state.lower()
    gdf_state = gpd.read_file(f'data/geojson/suburb-10-{state_lower}.geojson')

    # Build the mapping from the columns as read from disk, i.e. before the
    # 'state' column is added below.
    new_column_names = {
        original: _clean_column_name(original, state_lower)
        for original in gdf_state.columns
    }

    gdf_state['state'] = state
    gdfs_states.append(
        gdf_state.rename(columns=new_column_names).to_crs('EPSG:4326')
    )


# Notes on the renamed columns:
#   entirely N/A:  local_1, local_3, local_6
#   never N/A:     local_2, local_5, local_7
#   sometimes N/A: local_4
#   loc_pid and lc_ply_pid are not unique
#   id is unique

# Stack the per-state frames into a single GeoDataFrame with a fresh 0..n-1
# index, then give three columns the names the rest of the notebook uses.
gdf = gpd.GeoDataFrame(
    pd.concat(gdfs_states, ignore_index=True)
).rename(
    columns={
        'locality': 'dt_locality',
        'local_2': 'name',
        'local_4': 'postcode',
    }
)


In [134]:
# Keep only the columns listed here, in this order; every other column of gdf
# is dropped.
gdf = gdf[[
    'name',
    'state',
    'postcode',
    'local_5',
    'lc_ply_pid',
    'loc_pid',
    'dt_create',
    'dt_locality',
    'id',
    'geometry',
]]

In [75]:
def find_ids_gdf(long, lat):
    '''
    Find the list of ids of all rows in gdf whose geometry contains the coordinate.

    long, lat: the coordinate to look up; it is turned into Point(long, lat).

    Returns the values of gdf['id'] for every matching row as a plain list.
    The list is empty when no geometry contains the point or when either
    coordinate is missing (NaN/None).
    '''
    # A missing coordinate cannot lie inside any polygon, so skip the spatial
    # test instead of building a point from NaN.
    if pd.isna(long) or pd.isna(lat):
        return []

    # contains() accepts a single geometry and tests it against every row, so
    # there is no need to wrap the point in a GeoSeries and union it back into
    # the same point on every call.
    hits = gdf.contains(Point(long, lat))
    return gdf.loc[hits, 'id'].tolist()

def find_ids_df(gdf_row_geometry):
    '''
    Find the list of ids of all rows in df whose (long, lat) point is contained
    in gdf_row_geometry.

    gdf_row_geometry must expose contains(), e.g. the geometry value taken from
    one row of gdf.
    '''
    # One containment test per df row, kept aligned with df's index.
    inside = pd.Series(
        [
            gdf_row_geometry.contains(Point(x, y))
            for x, y in zip(df['long'], df['lat'])
        ],
        index=df.index,
        dtype=bool,
    )
    return df.loc[inside, 'id'].tolist()


In [None]:
# For every postcode row, collect the ids of the gdf polygons that contain its
# (long, lat) point. Each cell of the new column holds a list (possibly empty).
# Original timing note: "3m" (presumably about three minutes to run).
df['polygon_ids'] = pd.Series(
    [find_ids_gdf(x, y) for x, y in zip(df['long'], df['lat'])],
    index=df.index,
)

In [36]:
# Same lookup as for 'polygon_ids', but driven by the Long_precise/Lat_precise
# columns. Each cell of the new column holds a list (possibly empty).
# Original timing note: "4m" (presumably about four minutes to run).
df['polygon_precise'] = pd.Series(
    [find_ids_gdf(x, y) for x, y in zip(df['Long_precise'], df['Lat_precise'])],
    index=df.index,
)

In [79]:
# Flatten the list columns into one (df id, polygon id) pair per row. Rows whose
# list was empty explode to a missing value and are removed by dropna().
polygon_postcode_pair = (
    df[['id', 'polygon_ids']]
    .explode('polygon_ids')
    .dropna()
)
polygon_postcode_pair_precise = (
    df[['id', 'polygon_precise']]
    .explode('polygon_precise')
    .dropna()
)

In [141]:
# Invert the pairs: one row per polygon id (exposed as column 'id') carrying the
# list of df ids whose point fell inside that polygon.
polygon_postcode = (
    polygon_postcode_pair
    .groupby('polygon_ids')
    .agg(ps=('id', list))
    .reset_index()
    .rename(columns={'polygon_ids': 'id'})
)
polygon_postcode_precise = (
    polygon_postcode_pair_precise
    .groupby('polygon_precise')
    .agg(ps_precise=('id', list))
    .reset_index()
    .rename(columns={'polygon_precise': 'id'})
)

In [142]:
def _split_postcodes(value):
    '''Turn a 'a, b, c' string into ['a', 'b', 'c']; return any non-string value unchanged.'''
    if isinstance(value, str):
        return value.split(', ')
    return value


# Replace gdf with the frame stored in suburbs.geojson.
# NOTE(review): no cell visible here writes suburbs.geojson; presumably it is an
# export of the gdf built earlier - confirm before a fresh Restart & Run All.
gdf : gpd.GeoDataFrame = gpd.read_file('suburbs.geojson')
gdf['postcode'] = gdf['postcode'].apply(_split_postcodes)

In [143]:
# Attach the 'ps' and 'ps_precise' list columns to gdf by polygon id. A left
# join keeps every gdf row; polygons without a match get a missing value.
# Running this cell a second time on the merged gdf would produce suffixed
# duplicate columns (ps_x / ps_y), so reload gdf first when re-running.
for id_lists in (polygon_postcode, polygon_postcode_precise):
    gdf = gdf.merge(id_lists, on='id', how='left')

In [170]:
# Lookup tables keyed by df's 'id' column:
#   id_to_data:     { id: [name, state, postcode], ... }
#   id_to_postcode: { id: postcode, ... }
# If an id occurs more than once, the last row with that id wins.
id_to_data = {
    row_id: [name, state, postcode]
    for row_id, name, state, postcode in zip(
        df['id'], df['name'], df['state'], df['postcode']
    )
}
id_to_postcode = dict(zip(df['id'], df['postcode']))

In [171]:

def _count_ids(ids):
    '''Length of an id list; non-list values (missing after the left merge) pass through unchanged.'''
    return len(ids) if isinstance(ids, list) else ids


def _ids_to_rows(ids):
    '''Map an id list to its [name, state, postcode] entries; NaN for non-list values.'''
    return [id_to_data[i] for i in ids] if isinstance(ids, list) else np.nan


def _ids_to_postcodes(ids):
    '''
    Map an id list to its distinct postcodes; NaN for non-list values.

    The result is sorted so the output does not depend on set iteration order,
    which for strings changes between interpreter runs (hash randomization).
    key=str keeps the sort working if a postcode is missing (NaN) rather than text.
    '''
    if not isinstance(ids, list):
        return np.nan
    return sorted({id_to_postcode[i] for i in ids}, key=str)


# (new column, source column, transform) - listed in the order the columns are
# added to gdf.
for target, source, transform in (
    ('ps_len', 'ps', _count_ids),
    ('ps_precise_len', 'ps_precise', _count_ids),
    ('ps1', 'ps', _ids_to_rows),
    ('ps1_precise', 'ps_precise', _ids_to_rows),
    ('psp', 'ps', _ids_to_postcodes),
    ('psp_precise', 'ps_precise', _ids_to_postcodes),
):
    gdf[target] = gdf[source].apply(transform)

In [167]:
# Export the rows of gdf that have no postcode, together with the candidate
# matches found through the point-in-polygon lookups.
gdf.loc[
    gdf['postcode'].isna(),
    ['name', 'state', 'postcode', 'ps1', 'ps1_precise', 'ps_len', 'ps_precise_len'],
].to_csv('missing-postcodes-analysis.csv', index=False)

In [172]:
# Export every row of gdf, without the geometry column, including the postcode
# lists derived from the point-in-polygon lookups.
gdf[[
    'name',
    'state',
    'postcode',
    'local_5',
    'lc_ply_pid',
    'loc_pid',
    'dt_create',
    'dt_locality',
    'psp',
    'psp_precise',
    'ps_len',
    'ps_precise_len',
    'id',
]].to_csv('suburbs-postcodes.csv', index=False)