In [124]:
import sys
import numpy as np
import pandas as pd
import os
import math
from PIL import Image
from PIL.ExifTags import TAGS, GPSTAGS
from xml.dom.minidom import parse
from collections import Counter
from glob import glob
assert sys.version_info >= (3, 5) # make sure we have Python 3.5+
from pyspark.sql import SparkSession, functions, types, Row
spark = SparkSession.builder.appName('OSM point of interest extracter').getOrCreate()
assert spark.version >= '2.4' # make sure we have Spark 2.4+
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext
spark.conf.set("spark.sql.session.timeZone", "UTC")

# Function Definitions

In [104]:
def haversine(lat1, lon1, lat2, lon2, earth_radius=6371):
    """ adapted from: https://stackoverflow.com/questions/40807225/how-to-call-data-from-a-dataframe-into-haversine-function """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2) ** 2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0) ** 2

    return earth_radius * 2 * np.arcsin(np.sqrt(a))

In [105]:
df = pd.read_json('amenities-vancouver.json.gz', lines=True)
named_loc_df = df.dropna(axis=0)

In [106]:
# df.sort_values(by='timestamp', ascending=False)
# am_c = Counter(list(df[df['name'].isnull()].amenity))

In [107]:
amenities_list = set(named_loc_df['amenity'])
counter_list = Counter(list(named_loc_df['amenity']))

In [108]:
# named_loc_df[named_loc_df['amenity'] == 'restaurant'].tags.map(lambda x: print(x))

In [109]:
# named_loc_df[named_loc_df['tags'] == {}]
named_loc_df

Unnamed: 0,lat,lon,timestamp,amenity,name,tags
0,49.260812,-123.125736,2020-03-20T18:22:12.000-07:00,cafe,Starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ..."
1,49.260953,-123.125704,2019-08-02T18:11:20.000-07:00,fast_food,Salad Loop,{'opening_hours': 'Mo-Fr 07:00-17:00; Sa 10:00...
4,49.370898,-123.280448,2015-05-03T00:42:25.000-07:00,place_of_worship,St. Monica's Anglican Church,"{'addr:housenumber': '6404', 'addr:street': 'W..."
7,49.264041,-123.153407,2019-08-29T18:50:05.000-07:00,fuel,Shell,"{'brand:wikidata': 'Q154950', 'addr:housenumbe..."
13,49.126650,-123.182470,2020-03-30T09:08:51.000-07:00,restaurant,Best Bite Indian Cuisine,"{'addr:housenumber': '10-3891', 'phone': '+1-6..."
...,...,...,...,...,...,...
17712,49.250408,-123.076261,2017-07-08T05:22:57.000-07:00,restaurant,House of Dosas,"{'addr:housenumber': '1391', 'phone': '+1-604-..."
17713,49.278424,-122.806704,2013-03-26T23:45:49.000-07:00,cafe,Creekside Coffee,{}
17714,49.278770,-122.797628,2013-03-26T23:45:49.000-07:00,restaurant,Togo Sushi,{'cuisine': 'japanese'}
17716,49.282666,-122.826978,2019-09-13T13:56:49.000-07:00,pub,Brown's Social House,"{'addr:housenumber': '215', 'brewery': 'Guinne..."


# Read GPX Data into a DataFrame

In [68]:
row_data = []
gpx_files = glob('gpx_data/*.gpx')
for file in gpx_files:   
    xmldata = parse(file)
    trkpts = xmldata.getElementsByTagName("trkpt")
    for i in trkpts:
        row_data.append([file, float(i.getAttribute('lat')), float(i.getAttribute('lon'))])
df = pd.DataFrame(row_data, columns=['source', 'gpx_lat', 'gpx_lon'])
df1 = df[df['source'] == gpx_files[0]]

In [71]:
df1_merge = df1.merge(named_loc_df, how='cross')

In [75]:
df1_merge

Unnamed: 0,source,gpx_lat,gpx_lon,lat,lon,timestamp,amenity,name,tags
0,route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.260812,-123.125736,2020-03-20T18:22:12.000-07:00,cafe,Starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ..."
1,route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.260953,-123.125704,2019-08-02T18:11:20.000-07:00,fast_food,Salad Loop,{'opening_hours': 'Mo-Fr 07:00-17:00; Sa 10:00...
2,route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.370898,-123.280448,2015-05-03T00:42:25.000-07:00,place_of_worship,St. Monica's Anglican Church,"{'addr:housenumber': '6404', 'addr:street': 'W..."
3,route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.264041,-123.153407,2019-08-29T18:50:05.000-07:00,fuel,Shell,"{'brand:wikidata': 'Q154950', 'addr:housenumbe..."
4,route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.126650,-123.182470,2020-03-30T09:08:51.000-07:00,restaurant,Best Bite Indian Cuisine,"{'addr:housenumber': '10-3891', 'phone': '+1-6..."
...,...,...,...,...,...,...,...,...,...
26565583,route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.250408,-123.076261,2017-07-08T05:22:57.000-07:00,restaurant,House of Dosas,"{'addr:housenumber': '1391', 'phone': '+1-604-..."
26565584,route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.278424,-122.806704,2013-03-26T23:45:49.000-07:00,cafe,Creekside Coffee,{}
26565585,route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.278770,-122.797628,2013-03-26T23:45:49.000-07:00,restaurant,Togo Sushi,{'cuisine': 'japanese'}
26565586,route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.282666,-122.826978,2019-09-13T13:56:49.000-07:00,pub,Brown's Social House,"{'addr:housenumber': '215', 'brewery': 'Guinne..."


In [80]:
df1_merge['distance'] = df1_merge.apply(lambda x: haversine(x['gpx_lat'], x['gpx_lon'], x['lat'] , x['lon']), axis=1)

In [None]:
set(df1_merge[df1_merge['distance'] < 0.5]['amenity'])

{'bank',
 'cafe',
 'fast_food',
 'hospital',
 'parking',
 'pharmacy',
 'place_of_worship',
 'post_office',
 'restaurant',
 'school'}

## Read Exif data from photos

In [237]:
jpg_files = set(glob('jpg_photos/*.jpg') + glob('jpg_photos/*.jpeg') + glob('jpg_photos/*.JPG') + glob('jpg_photos/*.JPEG'))
georows = []
_TAGS_r = dict(((v, k) for k, v in TAGS.items()))

def _get_if_exist(data, key):
    # ref from: https://gist.github.com/erans/983821
    if key in data:
        return data[key]
    return None

def convert_lat_lon(value):
    # ref: https://gis.stackexchange.com/questions/136925/how-to-parse-exif-gps-information-to-lat-lng-decimal-numbers
    return value[0] + value[1]/60.0 + value[2]/3600.0 

for jpg in jpg_files:
    #ref: https://hhsprings.bitbucket.io/docs/programming/examples/python/PIL/ExifTags.html
    with Image.open(jpg) as fh:
        exifd = fh.getexif()._get_merged_dict()
        keys = list(exifd.keys())
        keys = [k for k in keys if k in TAGS]
        gpsinfo = exifd[_TAGS_r["GPSInfo"]]
        # print(gpsinfo.keys())
        gps_data = [(GPSTAGS[k], gpsinfo[k]) for k, v in gpsinfo.items()]
        
        # ref: # ref from: https://gist.github.com/erans/983821
        lat_lon_dict = dict(gps_data[:4])
        gps_latitude = _get_if_exist(lat_lon_dict, "GPSLatitude")
        gps_latitude_ref = _get_if_exist(lat_lon_dict, 'GPSLatitudeRef')
        gps_longitude = _get_if_exist(lat_lon_dict, 'GPSLongitude')
        gps_longitude_ref = _get_if_exist(lat_lon_dict, 'GPSLongitudeRef')
        
        if gps_latitude and gps_latitude_ref and gps_longitude and gps_longitude_ref:
            lat = convert_lat_lon(gps_latitude)
            if gps_latitude_ref != 'N':
                lat = 0 - lat
            lon = convert_lat_lon(gps_longitude)
            if gps_longitude_ref != 'E':
                lon = 0 - lon
        georows.append([fh.filename, lat, lon])
        

[['jpg_photos/hand.jpeg', 49.27921388888889, -122.92164722222223],
 ['jpg_photos/food2.jpeg', 49.22669166666667, -123.09059166666667],
 ['jpg_photos/IMG_1019.JPG', 49.28308055555556, -123.11562222222221],
 ['jpg_photos/IMG_1027.JPG', 49.25456944444444, -123.02246111111111],
 ['jpg_photos/coffee.jpeg', 49.25308888888889, -123.10087499999999],
 ['jpg_photos/IMG_1020.JPG', 49.28401111111111, -123.11727777777777],
 ['jpg_photos/donut.jpeg', 49.206069444444445, -123.12573333333333]]

## Create Dataframe with Exif Lat Lon datapoints

In [238]:
exif_df = pd.DataFrame(georows, columns=['source', 'lat', 'lon'])

In [239]:
exif_df

Unnamed: 0,source,lat,lon
0,jpg_photos/hand.jpeg,49.279214,-122.921647
1,jpg_photos/food2.jpeg,49.226692,-123.090592
2,jpg_photos/IMG_1019.JPG,49.283081,-123.115622
3,jpg_photos/IMG_1027.JPG,49.254569,-123.022461
4,jpg_photos/coffee.jpeg,49.253089,-123.100875
5,jpg_photos/IMG_1020.JPG,49.284011,-123.117278
6,jpg_photos/donut.jpeg,49.206069,-123.125733


In [240]:
df_exif_merge = exif_df.merge(named_loc_df, how='cross')

In [244]:
df_exif_merge['distance'] = df_exif_merge.apply(lambda x: haversine(x['lat_x'], x['lon_x'], x['lat_y'] , x['lon_y']), axis=1)
df_exif_merge[df_exif_merge['distance'] < 0.5]

Unnamed: 0,source,lat_x,lon_x,lat_y,lon_y,timestamp,amenity,name,tags,distance
2543,jpg_photos/hand.jpeg,49.279214,-122.921647,49.278963,-122.915520,2019-05-03T23:33:22.000-07:00,fast_food,Smoke's Poutinerie,{'opening_hours': 'Mo-Th 11:00-19:00; Fr 11:00...,0.445385
3102,jpg_photos/hand.jpeg,49.279214,-122.921647,49.279624,-122.921021,2020-02-14T18:42:42.000-08:00,cafe,Starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ...",0.064328
3103,jpg_photos/hand.jpeg,49.279214,-122.921647,49.279929,-122.921205,2019-10-22T06:08:58.000-07:00,cafe,Tim Hortons,"{'brand:wikidata': 'Q175106', 'cuisine': 'coff...",0.085769
3824,jpg_photos/hand.jpeg,49.279214,-122.921647,49.279281,-122.917294,2012-05-11T07:34:52.000-07:00,fast_food,Triple O White Spot,{'cuisine': 'burger'},0.315837
3828,jpg_photos/hand.jpeg,49.279214,-122.921647,49.278885,-122.918930,2012-05-11T07:34:52.000-07:00,restaurant,Food Court,{},0.200437
...,...,...,...,...,...,...,...,...,...,...
53152,jpg_photos/donut.jpeg,49.206069,-123.125733,49.204850,-123.129117,2019-08-11T00:05:00.000-07:00,car_rental,Enterprise,"{'brand:wikidata': 'Q17085454', 'official_name...",0.280758
54121,jpg_photos/donut.jpeg,49.206069,-123.125733,49.205521,-123.131274,2019-09-13T13:57:04.000-07:00,restaurant,Cravings Restaurant and Sports Bar,"{'addr:housenumber': '8808', 'addr:street': 'O...",0.407084
54122,jpg_photos/donut.jpeg,49.206069,-123.125733,49.204737,-123.130834,2019-09-13T13:57:04.000-07:00,restaurant,White Spot,"{'brand:wikidata': 'Q7995414', 'addr:housenumb...",0.399059
56266,jpg_photos/donut.jpeg,49.206069,-123.125733,49.208028,-123.123115,2019-10-28T15:41:35.000-07:00,fuel,Husky,"{'brand:wikidata': 'Q702049', 'website': 'http...",0.289181
