In [10]:
import sys
import numpy as np
import pandas as pd
import os
import math
from PIL import Image
from PIL.ExifTags import TAGS, GPSTAGS
from xml.dom.minidom import parse
from collections import Counter
from glob import glob
assert sys.version_info >= (3, 5) # make sure we have Python 3.5+
from pyspark.sql import SparkSession, functions, types, Row
spark = SparkSession.builder.appName('OSM point of interest extracter').getOrCreate()
assert spark.version >= '2.4' # make sure we have Spark 2.4+
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext
spark.conf.set("spark.sql.session.timeZone", "UTC")

## Code installation instructions

code is run using anaconda, and the imported libraries above.
note: most libraries used in this project was pre-installed from the labs such as Numpy, Pandas and Pyspark.

new libraries installed are:
- folium using 'pip install folium'

# Function Definitions

In [11]:
def haversine(lat1, lon1, lat2, lon2, earth_radius=6371):
    """ adapted from: https://stackoverflow.com/questions/40807225/how-to-call-data-from-a-dataframe-into-haversine-function """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2) ** 2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0) ** 2

    return earth_radius * 2 * np.arcsin(np.sqrt(a))

In [24]:
df = pd.read_json('amenities-vancouver.json.gz', lines=True)
named_loc_df = df.dropna(subset=['name'])

In [25]:
# df.sort_values(by='timestamp', ascending=False)
# am_c = Counter(list(df[df['name'].isnull()].amenity))

In [26]:
amenities_list = set(named_loc_df['amenity'])
counter_list = Counter(list(named_loc_df['amenity']))

In [None]:
# named_loc_df[named_loc_df['amenity'] == 'restaurant'].tags.map(lambda x: print(x))

## Filter tourist locations

In [84]:
tourism_df = named_loc_df[named_loc_df['tags'].map(lambda x: True if 'tourism' in x and 'information' not in x.values() else False)]
tourism_df

Unnamed: 0,lat,lon,timestamp,amenity,name,tags
814,49.069151,-122.282881,2019-05-05T23:20:15.000-07:00,bench,Co-op Community Viewing Platform,"{'tourism': 'viewpoint', 'layer': '1'}"
1531,49.284384,-123.10889,2018-02-01T22:41:54.000-08:00,clock,Gastown Steam Clock,"{'wheelchair': 'yes', 'addr:housenumber': '305..."
7795,49.048782,-122.304278,2015-04-08T17:11:27.000-07:00,bench,Valley Feed Bag bench,{'website': 'http://www.tourismabbotsford.ca/B...
11371,49.40636,-123.213733,2010-12-23T22:11:54.000-08:00,bench,Bowen Lookout,{'tourism': 'viewpoint'}
11475,49.326682,-122.950121,2019-12-21T20:00:30.000-08:00,cafe,Honey Doughnuts & Goodies,"{'addr:housenumber': '4373', 'website': 'https..."
13684,49.047899,-122.290489,2018-07-30T07:50:41.000-07:00,bench,Expressions,{'inscription': 'Expressions bench sponsored b...
14989,49.285145,-123.114008,2019-01-21T19:15:04.000-08:00,clock,Birks Clock,"{'visibility': 'street', 'old_name': 'Trorey',..."
15137,49.049153,-122.290205,2015-06-14T02:34:20.000-07:00,bench,Three Men Telling Tall Tales Bench 1999,{'website': 'http://www.tourismabbotsford.ca/B...
15754,49.048869,-122.288672,2015-12-12T22:07:34.000-08:00,bench,STRAUSS AND GALLO WATCHING THE FARM,"{'description': 'The Abbotsford News', 'touris..."


# Read GPX Data into a DataFrame

In [29]:
row_data = []
gpx_files = glob('gpx_data/*.gpx')
for file in gpx_files:   
    xmldata = parse(file)
    trkpts = xmldata.getElementsByTagName("trkpt")
    for i in trkpts:
        row_data.append([file, float(i.getAttribute('lat')), float(i.getAttribute('lon'))])
df = pd.DataFrame(row_data, columns=['source', 'gpx_lat', 'gpx_lon'])
df1 = df[df['source'] == gpx_files[0]]

In [30]:
df1_merge = df1.merge(named_loc_df, how='cross')

In [31]:
df1_merge

Unnamed: 0,source,gpx_lat,gpx_lon,lat,lon,timestamp,amenity,name,tags
0,gpx_data/route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.260812,-123.125736,2020-03-20T18:22:12.000-07:00,cafe,Starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ..."
1,gpx_data/route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.260953,-123.125704,2019-08-02T18:11:20.000-07:00,fast_food,Salad Loop,{'opening_hours': 'Mo-Fr 07:00-17:00; Sa 10:00...
2,gpx_data/route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.370898,-123.280448,2015-05-03T00:42:25.000-07:00,place_of_worship,St. Monica's Anglican Church,"{'addr:housenumber': '6404', 'addr:street': 'W..."
3,gpx_data/route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.264041,-123.153407,2019-08-29T18:50:05.000-07:00,fuel,Shell,"{'brand:wikidata': 'Q154950', 'addr:housenumbe..."
4,gpx_data/route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.126650,-123.182470,2020-03-30T09:08:51.000-07:00,restaurant,Best Bite Indian Cuisine,"{'addr:housenumber': '10-3891', 'phone': '+1-6..."
...,...,...,...,...,...,...,...,...,...
26565583,gpx_data/route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.250408,-123.076261,2017-07-08T05:22:57.000-07:00,restaurant,House of Dosas,"{'addr:housenumber': '1391', 'phone': '+1-604-..."
26565584,gpx_data/route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.278424,-122.806704,2013-03-26T23:45:49.000-07:00,cafe,Creekside Coffee,{}
26565585,gpx_data/route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.278770,-122.797628,2013-03-26T23:45:49.000-07:00,restaurant,Togo Sushi,{'cuisine': 'japanese'}
26565586,gpx_data/route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.282666,-122.826978,2019-09-13T13:56:49.000-07:00,pub,Brown's Social House,"{'addr:housenumber': '215', 'brewery': 'Guinne..."


In [32]:
df1_merge['distance'] = df1_merge.apply(lambda x: haversine(x['gpx_lat'], x['gpx_lon'], x['lat'] , x['lon']), axis=1)Marker

In [40]:
set(df1_merge[df1_merge['distance'] < 0.5]['amenity'])

Unnamed: 0,source,gpx_lat,gpx_lon,lat,lon,timestamp,amenity,name,tags,distance
0,gpx_data/route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.260812,-123.125736,2020-03-20T18:22:12.000-07:00,cafe,Starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ...",12.733605
1,gpx_data/route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.260953,-123.125704,2019-08-02T18:11:20.000-07:00,fast_food,Salad Loop,{'opening_hours': 'Mo-Fr 07:00-17:00; Sa 10:00...,12.725778
2,gpx_data/route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.370898,-123.280448,2015-05-03T00:42:25.000-07:00,place_of_worship,St. Monica's Anglican Church,"{'addr:housenumber': '6404', 'addr:street': 'W...",4.558148
3,gpx_data/route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.264041,-123.153407,2019-08-29T18:50:05.000-07:00,fuel,Shell,"{'brand:wikidata': 'Q154950', 'addr:housenumbe...",10.972462
4,gpx_data/route_2021-12-02_11.32am.gpx,49.331310,-123.264115,49.126650,-123.182470,2020-03-30T09:08:51.000-07:00,restaurant,Best Bite Indian Cuisine,"{'addr:housenumber': '10-3891', 'phone': '+1-6...",23.516757
...,...,...,...,...,...,...,...,...,...,...
26565583,gpx_data/route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.250408,-123.076261,2017-07-08T05:22:57.000-07:00,restaurant,House of Dosas,"{'addr:housenumber': '1391', 'phone': '+1-604-...",10.714204
26565584,gpx_data/route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.278424,-122.806704,2013-03-26T23:45:49.000-07:00,cafe,Creekside Coffee,{},24.921591
26565585,gpx_data/route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.278770,-122.797628,2013-03-26T23:45:49.000-07:00,restaurant,Togo Sushi,{'cuisine': 'japanese'},25.546549
26565586,gpx_data/route_2021-12-02_11.32am.gpx,49.337907,-123.138132,49.282666,-122.826978,2019-09-13T13:56:49.000-07:00,pub,Brown's Social House,"{'addr:housenumber': '215', 'brewery': 'Guinne...",23.378419


## Read Exif data from photos

In [41]:
jpg_files = set(glob('jpg_photos/*.jpg') + glob('jpg_photos/*.jpeg') + glob('jpg_photos/*.JPG') + glob('jpg_photos/*.JPEG'))
georows = []
_TAGS_r = dict(((v, k) for k, v in TAGS.items()))

def _get_if_exist(data, key):
    # ref from: https://gist.github.com/erans/983821
    if key in data:
        return data[key]
    return None

def convert_lat_lon(value):
    # ref: https://gis.stackexchange.com/questions/136925/how-to-parse-exif-gps-information-to-lat-lng-decimal-numbers
    return value[0] + value[1]/60.0 + value[2]/3600.0 

for jpg in jpg_files:
    #ref: https://hhsprings.bitbucket.io/docs/programming/examples/python/PIL/ExifTags.html
    with Image.open(jpg) as fh:
        exifd = fh.getexif()._get_merged_dict()
        keys = list(exifd.keys())
        keys = [k for k in keys if k in TAGS]
        gpsinfo = exifd[_TAGS_r["GPSInfo"]]
        # print(gpsinfo.keys())
        gps_data = [(GPSTAGS[k], gpsinfo[k]) for k, v in gpsinfo.items()]
        
        # ref: # ref from: https://gist.github.com/erans/983821
        lat_lon_dict = dict(gps_data[:4])
        gps_latitude = _get_if_exist(lat_lon_dict, "GPSLatitude")
        gps_latitude_ref = _get_if_exist(lat_lon_dict, 'GPSLatitudeRef')
        gps_longitude = _get_if_exist(lat_lon_dict, 'GPSLongitude')
        gps_longitude_ref = _get_if_exist(lat_lon_dict, 'GPSLongitudeRef')
        
        if gps_latitude and gps_latitude_ref and gps_longitude and gps_longitude_ref:
            lat = convert_lat_lon(gps_latitude)
            if gps_latitude_ref != 'N':
                lat = 0 - lat
            lon = convert_lat_lon(gps_longitude)
            if gps_longitude_ref != 'E':
                lon = 0 - lon
        georows.append([fh.filename, lat, lon])
        

## Create Dataframe with Exif Lat Lon datapoints

In [42]:
exif_df = pd.DataFrame(georows, columns=['source', 'lat', 'lon'])

In [43]:
exif_df

Unnamed: 0,source,lat,lon
0,jpg_photos/IMG_1020.JPG,49.284011,-123.117278
1,jpg_photos/coffee.jpeg,49.253089,-123.100875
2,jpg_photos/IMG_1027.JPG,49.254569,-123.022461
3,jpg_photos/donut.jpeg,49.206069,-123.125733
4,jpg_photos/food2.jpeg,49.226692,-123.090592
5,jpg_photos/hand.jpeg,49.279214,-122.921647
6,jpg_photos/IMG_1019.JPG,49.283081,-123.115622


In [44]:
df_exif_merge = exif_df.merge(named_loc_df, how='cross')

In [45]:
df_exif_merge['distance'] = df_exif_merge.apply(lambda x: haversine(x['lat_x'], x['lon_x'], x['lat_y'] , x['lon_y']), axis=1)
df_exif_merge[df_exif_merge['distance'] < 0.25][['lat_y', 'lon_y']]

Unnamed: 0,lat_y,lon_y
156,49.282394,-123.118585
184,49.286003,-123.116602
191,49.284269,-123.119760
776,49.282128,-123.116459
777,49.282095,-123.116502
...,...,...
56036,49.282147,-123.118554
56649,49.284329,-123.117812
56706,49.284777,-123.116885
56752,49.281911,-123.113501


In [46]:
lat_lon_amenity = df_exif_merge[df_exif_merge['distance'] < 0.5][['lat_y', 'lon_y', 'name']]
lat_lon_amenity.drop_duplicates(inplace=True)

## Create plot of significant locations near photos taken

In [51]:
import matplotlib.pyplot as plt
import folium
import random
latmean=lat_lon_amenity['lat_y'].mean()
lonmean=lat_lon_amenity['lon_y'].mean()
map1 = folium.Map(location=[latmean, lonmean])
for lat, lon, name in zip(lat_lon_amenity['lat_y'], lat_lon_amenity['lon_y'], lat_lon_amenity['name']):
    folium.Marker(location=[lat, lon], popup = name).add_to(map1)
# plt.scatter(x=lat_lon_amenity['lon_y'], y=lat_lon_amenity['lat_y'])    
# map1

In [87]:

latmean=tourism_df['lat'].mean()
lonmean=tourism_df['lon'].mean()
map2 = folium.Map(location=[latmean, lonmean])
for lat, lon, name in zip(tourism_df['lat'], tourism_df['lon'], tourism_df['name']):
    folium.Marker(location=[lat, lon], popup=f"{name}").add_to(map2)
map2