In [41]:
import sys
import numpy as np
import pandas as pd
import os
from xml.dom.minidom import parse
from collections import Counter
from glob import glob
import math
assert sys.version_info >= (3, 5) # make sure we have Python 3.5+
from pyspark.sql import SparkSession, functions, types, Row
spark = SparkSession.builder.appName('OSM point of interest extracter').getOrCreate()
# Compare the version numerically: a plain string comparison is lexicographic,
# so a version such as '2.10' would sort below '2.4' and wrongly fail the check.
assert tuple(int(part) for part in spark.version.split('.')[:2]) >= (2, 4) # make sure we have Spark 2.4+
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext
# Use UTC as the Spark SQL session time zone.
spark.conf.set("spark.sql.session.timeZone", "UTC")

# Function Definitions

In [76]:
def haversine(lat1, lon1, lat2, lon2, earth_radius=6371):
    """Great-circle distance between two points given in decimal degrees.

    Adapted from: https://stackoverflow.com/questions/40807225/how-to-call-data-from-a-dataframe-into-haversine-function

    Every coordinate is converted with ``np.radians`` and all arithmetic uses
    NumPy ufuncs, so scalars, NumPy arrays and pandas Series are accepted and
    are combined element-wise.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : latitude and longitude of the second point(s), in degrees.
    earth_radius : radius of the sphere; the result is expressed in the same
        unit as this value (default 6371).

    Returns
    -------
    The distance(s) along the surface of the sphere.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    return earth_radius * 2 * np.arcsin(np.sqrt(a))

In [2]:
# Read the amenities file as line-delimited JSON (one record per line).
df = pd.read_json(path_or_buf='amenities-vancouver.json.gz', lines=True)
# Keep only the rows that have no missing value in any column.
named_loc_df = df.dropna(axis='index', how='any')

In [3]:
# df.sort_values(by='timestamp', ascending=False)
# am_c = Counter(list(df[df['name'].isnull()].amenity))

In [4]:
# Amenity value of every row that survived the NaN filter.
amenity_values = named_loc_df['amenity'].tolist()
# Distinct amenity values, and the number of rows carrying each one.
amenities_list = set(amenity_values)
counter_list = Counter(amenity_values)

In [5]:
# named_loc_df[named_loc_df['amenity'] == 'restaurant'].tags.map(lambda x: print(x))

In [54]:
# named_loc_df[named_loc_df['tags'] == {}]
# Show the NaN-filtered amenities frame as the cell output.
named_loc_df

Unnamed: 0,lat,lon,timestamp,amenity,name,tags
0,49.260812,-123.125736,2020-03-20T18:22:12.000-07:00,cafe,Starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ..."
1,49.260953,-123.125704,2019-08-02T18:11:20.000-07:00,fast_food,Salad Loop,{'opening_hours': 'Mo-Fr 07:00-17:00; Sa 10:00...
4,49.370898,-123.280448,2015-05-03T00:42:25.000-07:00,place_of_worship,St. Monica's Anglican Church,"{'addr:housenumber': '6404', 'addr:street': 'W..."
7,49.264041,-123.153407,2019-08-29T18:50:05.000-07:00,fuel,Shell,"{'brand:wikidata': 'Q154950', 'addr:housenumbe..."
13,49.126650,-123.182470,2020-03-30T09:08:51.000-07:00,restaurant,Best Bite Indian Cuisine,"{'addr:housenumber': '10-3891', 'phone': '+1-6..."
...,...,...,...,...,...,...
17712,49.250408,-123.076261,2017-07-08T05:22:57.000-07:00,restaurant,House of Dosas,"{'addr:housenumber': '1391', 'phone': '+1-604-..."
17713,49.278424,-122.806704,2013-03-26T23:45:49.000-07:00,cafe,Creekside Coffee,{}
17714,49.278770,-122.797628,2013-03-26T23:45:49.000-07:00,restaurant,Togo Sushi,{'cuisine': 'japanese'}
17716,49.282666,-122.826978,2019-09-13T13:56:49.000-07:00,pub,Brown's Social House,"{'addr:housenumber': '215', 'brewery': 'Guinne..."


# Read GPX Data into a DataFrame

In [68]:
# Sort the matches: glob() returns paths in arbitrary, filesystem-dependent
# order, so without sorting `gpx_files[0]` below could select a different
# track from one run or machine to the next.
gpx_files = sorted(glob('*.gpx'))

# One row per <trkpt> element: [source file, latitude, longitude].
row_data = []
for file in gpx_files:
    xmldata = parse(file)
    for trkpt in xmldata.getElementsByTagName("trkpt"):
        row_data.append([
            file,
            float(trkpt.getAttribute('lat')),
            float(trkpt.getAttribute('lon')),
        ])

# NOTE: this rebinds `df`, which is also used for the amenities frame.
df = pd.DataFrame(row_data, columns=['source', 'gpx_lat', 'gpx_lon'])
# Restrict to the track points that came from the first GPX file.
df1 = df[df['source'] == gpx_files[0]]

In [71]:
# Cartesian product: pair every GPX track point with every amenity row.
df1_merge = pd.merge(df1, named_loc_df, how='cross')

In [75]:
# Show the cross-joined frame as the cell output.
df1_merge

Unnamed: 0,source,gpx_lat,gpx_lon,lat,lon,timestamp,amenity,name,tags
0,route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.260812,-123.125736,2020-03-20T18:22:12.000-07:00,cafe,Starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ..."
1,route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.260953,-123.125704,2019-08-02T18:11:20.000-07:00,fast_food,Salad Loop,{'opening_hours': 'Mo-Fr 07:00-17:00; Sa 10:00...
2,route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.370898,-123.280448,2015-05-03T00:42:25.000-07:00,place_of_worship,St. Monica's Anglican Church,"{'addr:housenumber': '6404', 'addr:street': 'W..."
3,route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.264041,-123.153407,2019-08-29T18:50:05.000-07:00,fuel,Shell,"{'brand:wikidata': 'Q154950', 'addr:housenumbe..."
4,route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.126650,-123.182470,2020-03-30T09:08:51.000-07:00,restaurant,Best Bite Indian Cuisine,"{'addr:housenumber': '10-3891', 'phone': '+1-6..."
...,...,...,...,...,...,...,...,...,...
26565583,route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.250408,-123.076261,2017-07-08T05:22:57.000-07:00,restaurant,House of Dosas,"{'addr:housenumber': '1391', 'phone': '+1-604-..."
26565584,route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.278424,-122.806704,2013-03-26T23:45:49.000-07:00,cafe,Creekside Coffee,{}
26565585,route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.278770,-122.797628,2013-03-26T23:45:49.000-07:00,restaurant,Togo Sushi,{'cuisine': 'japanese'}
26565586,route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.282666,-122.826978,2019-09-13T13:56:49.000-07:00,pub,Brown's Social House,"{'addr:housenumber': '215', 'brewery': 'Guinne..."


In [80]:
# haversine() is built from NumPy ufuncs, so pass it whole columns instead of
# calling it once per row through DataFrame.apply(axis=1). The resulting
# values are the same, but this avoids millions of Python-level calls on the
# cross-joined frame.
df1_merge['distance'] = haversine(
    df1_merge['gpx_lat'], df1_merge['gpx_lon'],
    df1_merge['lat'], df1_merge['lon'],
)

In [102]:
# Distinct amenity values among pairs closer than 0.5 (same unit as the
# `distance` column) to a track point.
nearby_mask = df1_merge['distance'] < 0.5
set(df1_merge.loc[nearby_mask, 'amenity'])

{'bank',
 'cafe',
 'fast_food',
 'hospital',
 'parking',
 'pharmacy',
 'place_of_worship',
 'post_office',
 'restaurant',
 'school'}