In [1]:
import pyarrow.parquet as pq
import glob
from sklearn.neighbors import BallTree
import numpy as np
files = glob.glob('data/*.parquet')
def find_property(lat, lon, km = None , n = 1):
    deg = 110.574 # 1 lat equals 110.574km
    # min and max lat for all properties within Australia
    lat_min = -43.58301104
    lat_max = -9.23000371
    lon_min = 96.82159219
    lon_max = 167.99384663
      
        
    # lat and lon must be within above range
    lat = max(lat, lat_min)
    lat = min(lat, lat_max)
    lon = max(lon, lon_min)
    lon = min(lon, lon_max)
    
    def load_parquet():
        df = pq.\
            read_pandas(
            files, 
            columns=['ADDRESS_DETAIL_PID','LATITUDE', 'LONGITUDE'],
            filters=
            [('LATITUDE', '>=', lat - distance),
             ('LATITUDE', '<=', lat + distance),
             ('LONGITUDE', '>=', lon - distance),
             ('LONGITUDE', '<=', lon + distance)   
            ]).\
            to_pandas()
        return df

    min_distance = 0
    if km == None:
        distance = 1 / deg # start with 10 km
        while True:
            df = load_parquet()
            if df.shape[0] >= n: break
            min_distance = distance
            distance *= 2 
    else:
        distance = abs(km) / deg 
        df = load_parquet()
        if df.empty: return None
    
    while True:
        if df.shape[0] <= n + 10000: break
        middle_distance = (distance-min_distance)/2
        temp_df = df[
            df.LATITUDE.\
            between(
                lat - middle_distance, 
                lat + middle_distance
            ) &
            df.LONGITUDE.\
            between(
                lon - middle_distance, 
                lon + middle_distance
            )]
        if temp_df.shape[0] <= n + 10000: 
            break
        else:
            distance = middle_distance
            df = temp_df  
    bt = BallTree(np.deg2rad(df[['LATITUDE', 'LONGITUDE']].values), metric='haversine')
    distances, indices = bt.query(np.deg2rad(np.c_[lat, lon]), k= min(n, df.shape[0]))
    indices = indices[0].tolist()
    pids = df.ADDRESS_DETAIL_PID.iloc[indices].tolist()
    distance_map = dict(zip(pids ,[distance * 6371 for distance in distances[0]]))
    df =  pq.\
            read_pandas(
            files, 
            filters=
            [[('ADDRESS_DETAIL_PID', '=', pid)] for pid in pids]).\
            to_pandas()
    df['DISTANCE'] = df['ADDRESS_DETAIL_PID'].map(distance_map)
    return df.sort_values('DISTANCE')
#     return df.sort_values('DISTANCE').to_json(orient='records')



In [2]:
find_property(-33.8965368,151.2066979, n=10)

Unnamed: 0_level_0,ADDRESS_DETAIL_PID,DATE_CREATED,DATE_LAST_MODIFIED,DATE_RETIRED,BUILDING_NAME,LOT_NUMBER_PREFIX,LOT_NUMBER,LOT_NUMBER_SUFFIX,FLAT_TYPE_CODE,FLAT_NUMBER_PREFIX,...,FULL_ADDRESS,CARTESIAN_COOR,LGA_NAME_2016,SSC_NAME_2016,SA4_NAME_2016,SA3_NAME_2016,SA2_NAME_2016,SA1_7DIGITCODE_2016,MB_CODE_2016,DISTANCE
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
648507,GANSW706213980,2008-04-18,2021-05-18,,,,,,,,...,676ELIZABETHSTREETWATERLOONSW2017,"(-4634.411653893615, 2547.058076540619, -3553....",Sydney (C),Waterloo (NSW),Sydney - City and Inner South,Sydney Inner City,Waterloo - Beaconsfield,1133837,10755030000,0.023088
648501,GANSW706213978,2008-04-18,2021-05-18,,,,,,,,...,674ELIZABETHSTREETWATERLOONSW2017,"(-4634.414212878535, 2547.0588099694655, -3553...",Sydney (C),Waterloo (NSW),Sydney - City and Inner South,Sydney Inner City,Waterloo - Beaconsfield,1133837,10755030000,0.02323
648545,GANSW706213982,2008-04-18,2021-05-18,,,,,,,,...,678ELIZABETHSTREETWATERLOONSW2017,"(-4634.409175534078, 2547.0573663590226, -3553...",Sydney (C),Waterloo (NSW),Sydney - City and Inner South,Sydney Inner City,Waterloo - Beaconsfield,1133837,10755030000,0.023846
648532,GANSW706213976,2008-04-19,2021-05-18,,,,41.0,,,,...,41672ELIZABETHSTREETWATERLOONSW2017,"(-4634.416760745312, 2547.0595404466985, -3553...",Sydney (C),Waterloo (NSW),Sydney - City and Inner South,Sydney Inner City,Waterloo - Beaconsfield,1133837,10755030000,0.024288
648506,GANSW706213984,2008-04-18,2021-05-18,,,,,,,,...,680ELIZABETHSTREETWATERLOONSW2017,"(-4634.406770932971, 2547.0566777570384, -3553...",Sydney (C),Waterloo (NSW),Sydney - City and Inner South,Sydney Inner City,Waterloo - Beaconsfield,1133837,10755030000,0.02535
648549,GANSW706213974,2008-04-19,2021-05-18,,,,42.0,,,,...,42670ELIZABETHSTREETWATERLOONSW2017,"(-4634.419257180301, 2547.060256347886, -3553....",Sydney (C),Waterloo (NSW),Sydney - City and Inner South,Sydney Inner City,Waterloo - Beaconsfield,1133837,10755030000,0.026105
648009,GANSW706221291,2007-04-06,2021-05-18,,,,1.0,,,,...,1645ELIZABETHSTREETWATERLOONSW2017,"(-4634.39102525185, 2547.1034254093565, -3553....",Sydney (C),Waterloo (NSW),Sydney - City and Inner South,Sydney Inner City,Waterloo - Beaconsfield,1133840,10753210000,0.02713
648019,GANSW710337250,2008-01-05,2021-05-18,,,,,,,,...,645AELIZABETHSTREETWATERLOONSW2017,"(-4634.39102525185, 2547.1034254093565, -3553....",Sydney (C),Waterloo (NSW),Sydney - City and Inner South,Sydney Inner City,Waterloo - Beaconsfield,1133840,10753210000,0.02713
648005,GANSW706221290,2008-04-18,2021-05-18,,,,,,,,...,643ELIZABETHSTREETWATERLOONSW2017,"(-4634.393609713892, 2547.104108623732, -3553....",Sydney (C),Waterloo (NSW),Sydney - City and Inner South,Sydney Inner City,Waterloo - Beaconsfield,1133840,10753210000,0.027273
648546,GANSW706213986,2008-04-18,2021-05-18,,,,,,,,...,682ELIZABETHSTREETWATERLOONSW2017,"(-4634.404236248303, 2547.05595030938, -3553.0...",Sydney (C),Waterloo (NSW),Sydney - City and Inner South,Sydney Inner City,Waterloo - Beaconsfield,1133837,10755030000,0.027619


In [3]:
lat_min = -43.58301104
lat_max = -9.23000371
lon_min = 96.82159219
lon_max = 167.99384663
lat_list = np.random.uniform(lat_min, lat_max, size=100).tolist()
lon_list = np.random.uniform(lon_min, lon_max, size=100).tolist()

In [4]:
for lat, lon in zip(lat_list, lon_list):

    df = find_property(lat,lon, n=10)
    print(f'Most close address to {lat:.4f},{lon:.4f} is {df.iloc[0].FULL_ADDRESS} in {df.iloc[0].DISTANCE:.2f}km')

Most close address to -28.9312,129.3205 is 23OAKDRIVEOAK VALLEYSA5690 in 147.02km
Most close address to -10.3812,152.8654 is 73MAROUROADMER ISLANDQLD4875 in 966.09km
Most close address to -16.4123,142.9496 is 4BURKE DEVELOPMENTALROADHIGHBURYQLD4892 in 6.40km
Most close address to -35.0298,99.3826 is 1CONTOROADBORANUPWA6286 in 1433.10km
Most close address to -32.0410,155.5665 is 728SOUTHTERRACEWOOLINSW2462 in 325.76km
Most close address to -11.3672,148.5304 is 105BMAROUROADMER ISLANDQLD4875 in 515.35km
Most close address to -27.4146,103.9106 is 3090USELESS LOOPROADCARRARANGWA6532 in 946.52km
Most close address to -34.3595,145.6071 is 1577CARRATHOOLROADCARRATHOOLNSW2711 in 3.09km
Most close address to -39.3544,131.7315 is 102RIGHT WHALEROADSLEAFORDSA5607 in 601.65km
Most close address to -32.7956,144.4159 is 19913COBAR-IVANHOEROADIVANHOENSW2878 in 6.01km
Most close address to -36.8043,115.1344 is 299MACALLENWAYWINDY HARBOURWA6262 in 233.07km
Most close address to -23.8606,148.5541 is 6CO

In [5]:
 import timeit
    
%timeit -n 5 -r 4 find_property(np.random.uniform(lat_min, lat_max),np.random.uniform(lon_min, lon_max), n=50)

4.84 s ± 1.38 s per loop (mean ± std. dev. of 4 runs, 5 loops each)


In [6]:
import glob
import os
glob.glob(os.path.join('data', '*.parquet'))

['data/VIC-GNAF.parquet',
 'data/NT-GNAF.parquet',
 'data/ACT-GNAF.parquet',
 'data/NSW-GNAF.parquet',
 'data/SA-GNAF.parquet',
 'data/OT-GNAF.parquet',
 'data/QLD-GNAF.parquet',
 'data/WA-GNAF.parquet',
 'data/TAS-GNAF.parquet']