In [69]:
import pyarrow.parquet as pq
import glob
import os
from sklearn.neighbors import BallTree
import numpy as np
from pyarrow import fs

local = fs.LocalFileSystem()
files = glob.glob(os.path.join('data1','*.parquet'))

filepath = os.path.join('data1')

def find_property(lat, lon, filepath, km = None , n = 1, ):
    deg = 110.574 # 1 lat equals 110.574km
    # min and max lat for all properties within Australia
    lat_min = -43.58301104
    lat_max = -9.23000371
    lon_min = 96.82159219
    lon_max = 167.99384663
    files = glob.glob(os.path.join(filepath,'*.parquet'))
        
    # lat and lon must be within above range
#     lat = max(lat, lat_min)
#     lat = min(lat, lat_max)
#     lon = max(lon, lon_min)
#     lon = min(lon, lon_max)
    
    def load_parquet():
        df = pq.\
            read_pandas(
            files, 
            columns=['IDX','LATITUDE', 'LONGITUDE', 'FILE_NAME'],
            filesystem=local,
            filters=
            [('LATITUDE', '>=', lat - distance),
             ('LATITUDE', '<=', lat + distance),
             ('LONGITUDE', '>=', lon - distance),
             ('LONGITUDE', '<=', lon + distance)   
            ]).\
            to_pandas()
        return df

    min_distance = 0
    if km == None:
        distance = 10 / deg # start with 10 km
        while True:
            df = load_parquet()
            if df.shape[0] >= n: break
            min_distance = distance
            distance *= 2 ** 0.5
    else:
        distance = abs(km) / deg 
        df = load_parquet()
        if df.empty: return None
    
    while True:
        if df.shape[0] <= n + 10000: break
        middle_distance = (distance-min_distance)/2
        temp_df = df[
            df.LATITUDE.\
            between(
                lat - middle_distance, 
                lat + middle_distance
            ) &
            df.LONGITUDE.\
            between(
                lon - middle_distance, 
                lon + middle_distance
            )]
        if temp_df.shape[0] <= n + 10000: 
            break
        else:
            distance = middle_distance
            df = temp_df  
    bt = BallTree(np.deg2rad(df[['LATITUDE', 'LONGITUDE']].values), metric='haversine')
    distances, indices = bt.query(np.deg2rad(np.c_[lat, lon]), k= min(n, df.shape[0]))
    indices = indices[0].tolist()
    pids = df.IDX.iloc[indices].tolist()
    distance_map = dict(zip(pids ,[distance * 6371 for distance in distances[0]]))
    files = [os.path.join(filepath, filename) for filename in df.FILE_NAME.unique().tolist()]
    
    df =  pq.\
          read_pandas(
            files, 
            filesystem=local,
            filters=
            [[('IDX', '=', pid)] for pid in pids]).\
            to_pandas()
    df['DISTANCE'] = df['IDX'].map(distance_map)
    return df.sort_values('DISTANCE')
#     return df.sort_values('DISTANCE').to_json(orient='records')



In [58]:
find_property(-30.4379424, 137.916172, filepath, n=10)[['FULL_ADDRESS', 'LATITUDE', 'LONGITUDE', 'DISTANCE']]

Unnamed: 0_level_0,FULL_ADDRESS,LATITUDE,LONGITUDE,DISTANCE
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
126682,1682 MYRTLE SPRINGS ROAD MYRTLE SPRINGS SA 5731,-30.450277,138.217683,28.936483
126681,388 MYRTLE SPRINGS ROAD MYRTLE SPRINGS SA 5731,-30.452686,138.325823,28.936483
126645,19 SOUTH TERRACE FARINA SA 5731,-30.077352,138.27947,52.888333
126644,1 SOUTH TERRACE FARINA SA 5731,-30.077349,138.274681,52.888333
126643,21 SOUTH TERRACE FARINA SA 5731,-30.077351,138.280235,52.888333
126642,45 SOUTH TERRACE FARINA SA 5731,-30.07735,138.275258,52.888333
126641,133 SOUTH TERRACE FARINA SA 5731,-30.07735,138.28081,52.888333
126640,2 SOUTH TERRACE FARINA SA 5731,-30.077351,138.273909,52.888333
126638,160 SOUTH TERRACE FARINA SA 5731,-30.075504,138.273761,52.888333
126637,12 SOUTH TERRACE FARINA SA 5731,-30.07735,138.276979,52.888333


In [60]:
lat_min = -43.58301104
lat_max = -9.23000371
lon_min = 96.82159219
lon_max = 167.99384663
lat_list = np.random.uniform(lat_min, lat_max, size=100).tolist()
lon_list = np.random.uniform(lon_min, lon_max, size=100).tolist()

In [61]:
pq.read_pandas(files[0]).schema

index: int64
IDX: int32
FULL_ADDRESS: string
LATITUDE: double
LONGITUDE: double
LGA_NAME_2016: string
SSC_NAME_2016: string
SA4_NAME_2016: string
SA3_NAME_2016: string
SA2_NAME_2016: string
SA1_7DIGITCODE_2016: string
MB_CODE_2016: string
STREET_NAME: string
STREET_TYPE_CODE: string
LOCALITY_NAME: string
STATE: string
POSTCODE: string
ADDRESS_DETAIL_PID: string
FILE_NAME: string
-- schema metadata --
pandas: '{"column_indexes": [{"field_name": null, "metadata": null, "name' + 2540

In [71]:
for lat, lon in zip(lat_list, lon_list):

    df = find_property(lat,lon, filepath, n=10)
    print(f'Most close address to {lat:.4f},{lon:.4f} is {df.iloc[0].FULL_ADDRESS} in {df.iloc[0].DISTANCE:.2f}km')

Most close address to -9.5695,154.1721 is 97 BRAMSTON BEACH ROAD BRAMSTON BEACH QLD 4871 in 1234.44km
Most close address to -20.3126,97.5867 is 130 AIR FORCE ROAD WEST ISLAND 6799 in 905.02km
Most close address to -30.1784,135.6596 is 20031 BILLA KALINA ROAD MOUNT EBA SA 5720 in 0.75km
Most close address to -33.0397,119.5515 is 4 NEWDEGATE-RAVENSTHORPE ROAD LAKE KING WA 6356 in 5.59km
Most close address to -11.4221,166.9911 is 3 LADY ELLIOT ISLAND CORAL SEA QLD 4800 in 2065.15km
Most close address to -14.4016,148.3882 is 44 BACK BEACH ROAD YARRABAH QLD 4871 in 383.00km
Most close address to -40.5879,119.3027 is 501 SALMON HOLE ROAD TORNDIRRUP WA 6330 in 621.30km
Most close address to -41.6550,154.3305 is 1 AIRPORT ROAD MALLACOOTA VIC 3892 in 599.41km
Most close address to -40.0170,114.7730 is 348 TINGLEWOOD ROAD BROKE WA 6398 in 582.27km
Most close address to -31.6933,123.6530 is 43 EYRE HIGHWAY FRASER RANGE WA 6443 in 74.00km
Most close address to -42.3717,144.2510 is 5 CANNONBALL DRI

In [73]:
 import timeit
    
%timeit find_property(np.random.uniform(lat_min, lat_max),np.random.uniform(lon_min, lon_max), filepath, n=50)

The slowest run took 35.70 times longer than the fastest. This could mean that an intermediate result is being cached.
664 ms ± 1.04 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [74]:
def find_no_property(lat, lon, filepath, km = 10):
    deg = 110.574 # 1 lat equals 110.574km
    distance = abs(km) / deg 
    # min and max lat for all properties within Australia
    files = glob.glob(os.path.join(filepath,'*.parquet'))
        
    rowcount = pq.\
        read_pandas(
        files, 
        columns=['LATITUDE', 'LONGITUDE'],
        filesystem=local,
        filters=
        [('LATITUDE', '>=', lat - distance),
         ('LATITUDE', '<=', lat + distance),
         ('LONGITUDE', '>=', lon - distance),
         ('LONGITUDE', '<=', lon + distance)   
        ]).num_rows
    return rowcount

In [75]:
find_no_property(-33.8688, 151.2093, filepath, km = 2)

138817