# Getty TGN | check on longitudes and latitudes by cross-checking other sources
- 2024-05-18
- Geo check on Getty TGN data fetched via SPARQL-endpoint
- https://amandinancy16.medium.com/reverse-geocoding-with-geopy-c26cfb63f74c
- V. Martens

# Import

## Import packages

In [None]:
# creating time stamps
import time

# importing files
import glob
import os

# progress bar
# from tqdm.notebook import tqdm
from tqdm import tqdm

# regex module
import re

# for multi-threading
from concurrent import futures
from concurrent.futures import ThreadPoolExecutor
import multiprocessing as mp

# retrieve country names from country codes
import pycountry

# data wrangling
import pandas as pd
import numpy as np

# back up files
import pickle

# retrieve various geo locations
import reverse_geocoder as rg
from geopy.geocoders import Nominatim

# measure distance between two geo points
from haversine import haversine, Unit

# save and dump
import pickle

# surpress, warnings, uhoohhhh
import warnings
warnings.filterwarnings("ignore")

# preferences
# adjust pandas to show all cols
pd.set_option('display.max_colwidth', None)

## Import functions

In [None]:
def load_latest_file(filepath:str) -> str:
    '''
    loads created latest create file from a directory
    args: string with filepath
    returns: latest file from a list of files
    '''

    list_of_files = glob.glob(filepath)
    latest_file = max(list_of_files, key=os.path.getctime)
    print(latest_file)
    
    return latest_file

def join_lats_lons(df:pd.DataFrame) -> pd.DataFrame:
    '''
    args: pd.DataFrame with lats and lons
    returns: pd.DataFrame with corrected lats and lons and joined into one col 
    '''
    
    df['lat'] = df['lat'].str.replace('-.','-0.')
    df['lon'] = df['lon'].str.replace('-.','-0.')
    df['coordinates'] = list(zip(df['lat'], df['lon']))
    
    return df

def get_country(df:pd.DataFrame, i:int) -> list:

    '''
    args: pd.DataFrame() with coordinates column
    returns: a list with dicts with geo locations, every dict contains keys with country name, 3 admin names, lat, lon
    '''
    
    results = rg.search(df['coordinates'].iloc[i])

    for item in results:
        query = df['coordinates'].iloc[i]
        tgn_id = df['tgn_id'].iloc[i]
        country = item['cc']
        looked_lat = item['lat']
        looked_lon = item['lon']
        looked_name = item['name']
        
    return [tgn_id, query, country, looked_lat, looked_lon, looked_name]

def parse_reverse_geo_lookup(list_of_dicts:list) -> pd.DataFrame:
    '''
    args: a list with dicts with geo locations
    returns: pd.DataFrame() with parsed geo locations
    '''
    
    dfs = pd.DataFrame()

    for coord_dict in list_of_dicts:
        df = pd.DataFrame(coord_dict).T
        dfs = pd.concat([dfs, df])
    
    dfs = dfs.rename(columns={0 : 'tgn_id_lookup',
                              1 : 'query_look_up',
                              2 : 'country_code_lookup', 
                              3 : 'lat_lookup',
                              4 : 'lon_lookup', 
                              5 : 'municipality_lookup'})
    
    return dfs
    
def parse_parentstring(df:pd.DataFrame) -> pd.DataFrame:
    '''
    parses out info from parentstring
    args: pd.DataFrame() with tgn parentstrings
    returns: pd.DataFrame() with reordered and parsed columns
    '''
    
    df['parentstring_province'] = (df['parentstring']
                                    .str.split(',')
                                    .str[0]
                                    .str.strip())
    
    df['parentstring_country'] = (df['parentstring']
                                    .str.split(',')
                                    .str[1]
                                    .str.strip())
    
    df['parentstring_continent'] = (df['parentstring']
                                    .str.split(',')
                                    .str[2]
                                    .str.strip())  
    
    df['parentstring_world'] = (df['parentstring']
                                    .str.split(',')
                                    .str[3]
                                    .str.strip())  
    
    df = df[['tgn_id', 'city_name', 'inferred_city_name', 'parentstring_province',
             'parentstring_country', 'parentstring_continent', 'parentstring_world',
             'broader_parentstring', 'lat', 'lon', 'coordinates', 'query_look_up',
             'country_code_lookup', 'lat_lookup', 'lon_lookup',
             'municipality_lookup']]
    
    return df    

def parse_broader_parentstring(df:pd.DataFrame) -> pd.DataFrame:
    '''
    parses out info from broader parentstring
    args: pd.DataFrame() with tgn broader parentstrings
    returns: pd.DataFrame() with reordered and parsed columns
    '''
    
    df['broader_parentstring_country'] = (df['broader_parentstring']
                                    .str.split()
                                    .str[0]
                                    .str.replace(',','')
                                    .str.strip())
    
    df['broader_parentstring_continent'] = (df['broader_parentstring']
                                    .str.split()
                                    .str[1]
                                    .str.replace(',','')
                                    .str.strip())  
    
    df['broader_parentstring_world'] = (df['broader_parentstring']
                                    .str.split()
                                    .str[2]
                                    .str.replace(',','')
                                    .str.strip())  
    
    df = df[['tgn_id', 'city_name', 'inferred_city_name', 'parentstring_province',
       'parentstring_country', 'parentstring_continent', 'parentstring_world',
       'broader_parentstring_country', 'broader_parentstring_continent',
       'broader_parentstring_world', 'lat', 'lon', 'coordinates', 'query_look_up',
       'country_code_lookup', 'lat_lookup', 'lon_lookup', 'municipality_lookup']]
    
    return df

def country_code_lookup(country_code:str) -> str:
    '''
    
    retrieves countryname from countrycode
    args: string with countrycode
    returns: returns country name
    '''
    return pycountry.countries.get(alpha_2=country_code).name

def municipality_look_up(df:pd.DataFrame) -> dict:
    '''
    pd.DataFrame with city names retrieves city info, lats, lons
    args: pd.DataFrame with city name/municipalities
    returns: a dict with city names as keys and location info and lats, lons as values
    '''
    geolocator = Nominatim(user_agent="Nancy Amandi", timeout=10)
    
    dict_locs = {}
    
    for i in tqdm(range(len(df)), total=len(df), desc='find geo info based on city names'):
        dict_locs[df['municipality_lookup'].iloc[i]] = geolocator.geocode(df['municipality_lookup'].iloc[i])
    
    return dict_locs

def parse_geo_lookup(results_lookup:dict) -> pd.DataFrame:
    '''
    parses looked up geo data into a pd.DataFrame
    args: a dict with looked up geo locations based on city names
    returns: a pd.DataFrame with city names, lat-lons
    '''
    df_parsed_geo = (pd.DataFrame(results_lookup)
                    .T.reset_index()
                    .rename(columns={'index':'query_city_name',
                                    0 : 'query_city_city_info',
                                    1 : 'query_city_geo_data'}))
    
    return df_parsed_geo

def data_haversine_tuple_creator(df:pd.DataFrame) -> pd.DataFrame:
    '''
    args: pd.DataFrame with seperate lat lon
    returns: pd.Dataframe with a tuple of lat lon in one col
    '''
    
    df['coordinates_looked_up'] = list(zip(df['lat_lookup'].astype(float), df['lon_lookup'].astype(float)))
    df = df.drop(columns=['lat_lookup', 'lon_lookup'])
    
    return df

def haversine_distance_calculator(df) -> pd.DataFrame:
    '''
    measures distance between two tuples containing two geo points each 
    args: a pd.DataFrame with two columns containing tuples with floats
    returns: a pd.DataFrame with a column that has measured the difference in kms between two tuples with geo points
    '''
    df['km_difference'] = '' 
    
    for i in tqdm(range(len(df)), total=len(df), desc='find geo info based on city names'):
        df['km_difference'].iloc[i] = round(haversine(df['query_city_geo_data'].iloc[i], df['coordinates_looked_up'].iloc[i], unit=Unit.KILOMETERS),2)
        
    return df

## Constants

In [None]:
# create back up filename for a pickle
time_stamp = time.strftime('%Y%m%d-%H%M%S')
filename_df_errors = f'{time_stamp}_df_errors_tgn.pickle'

# create back up filename for a pickle
time_stamp = time.strftime('%Y%m%d-%H%M%S')
filename_df_lod_results = f'{time_stamp}_df_lod_results_tgn.pickle'

# create back up filename for a pickle
time_stamp = time.strftime('%Y%m%d-%H%M%S')
filename_excel_export = f'{time_stamp}_loc_check_TGN.xlsx'

# country_check 
country = 'Netherlands'
continent = 'Europe'

print(f"{filename_df_errors}, {filename_df_lod_results}, {filename_excel_export}")

20240607-143032_df_errors_tgn.pickle, 20240607-143032_df_lod_results_tgn.pickle, 20240607-143032_loc_check_TGN.xlsx


## Import data

In [None]:
latest_picke_file = load_latest_file('data_dumps/*tgn_country.pickle')

# Open the file in binary mode
with open(latest_picke_file, 'rb') as file:
      
    # Call load method to deserialze
    df_tgn = pickle.load(file)
    
# enrich lats, lons
df_tgn = join_lats_lons(df_tgn)
df_tgn.shape



data_dumps\results_df_tgn_country.pickle


# Subset data

In [None]:
# subset data
df_nl = df_tgn[(df_tgn['broader_parentstring'].str.contains(f'{country}') == True)]
df_europe = df_tgn[(df_tgn['broader_parentstring'].str.contains(f'{europe}') == True)]
df_nl.shape, df_europe.shape

# Geo data-check
 1. Reverse city look-up on geo lats + lons 
 1. Retrieve lats, lons based on reversed look-up
 1. Compare distances between two geo-points

In [7]:
%%time

import multiprocessing as mp
pools = mp.cpu_count()

# Start pool
thread_pool = ThreadPoolExecutor(max_workers=pools, thread_name_prefix = 'thread')

# reate futures
futures = [thread_pool.submit(get_country, df_nl, i) for i in range(len(df_nl))]

# submit tasks
results = [future.result() for future in tqdm(futures, total=len(futures), desc='find reverse geo codes')]

# reverse geo lookup to a df
df_geo_data = parse_reverse_geo_lookup(results)

# merge results geo lookop with main tgn dataset
df_nl = (pd.merge(df_nl, df_geo_data, left_on='tgn_id', right_on='tgn_id_lookup', how='left')
         .drop(columns=['tgn_id_lookup'])
         )

# parse tgn dataset into seperate cols
df_nl = parse_parentstring(df_nl)
df_nl = parse_broader_parentstring(df_nl)

# change country code into country name to allign with tgn data
df_nl['country_name_lookup'] = (df_nl['country_code_lookup']
                                .apply(country_code_lookup)
                                )

del df_nl['country_code_lookup']

# look up lat lons based on reverse geo lookup and return lat lons
results_lookup = municipality_look_up(df_nl)

# parse geo lookup into df
df_results = parse_geo_lookup(results_lookup)

# merge geo lookup with main tgn dataset
df_nl = (pd.merge(df_nl, df_results, left_on='municipality_lookup', right_on='query_city_name', how='left')
         .drop(columns=['query_city_name'])
         )

# creates a col with coordinate tuples for haversine distance measurements
df_nl = data_haversine_tuple_creator(df_nl)

# measures differences between two geo locations
df_nl = haversine_distance_calculator(df_nl)

df_nl.head()

Loading formatted geocoded file...
Loading formatted geocoded file...
Loading formatted geocoded file...
Loading formatted geocoded file...
Loading formatted geocoded file...
Loading formatted geocoded file...
Loading formatted geocoded file...
Loading formatted geocoded file...


find reverse geo codes:  53%|█████▎    | 1032/1947 [37:17<22:06,  1.45s/it] 

In [30]:
df_nl['lat_diff'] = ''
for i in tqdm(range(len(df_nl)), total=len(df_nl), desc='find geo info based on city names'):
    df_nl['lat_diff'].iloc[i] = float(df_nl['query_look_up'][i][0]) - float(df_nl['query_city_geo_data'][i][0])

find geo info based on city names: 100%|██████████| 50/50 [00:00<00:00, 1704.16it/s]


# Back-up data

In [None]:
with open(f'data_dumps/{filename_excel_export}.pickle', 'wb') as handle:
    pickle.dump(df_nl, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Geo location checks

In [43]:
df_nl[(df_nl['km_difference'] > 5) & (df_nl['query_look_up'].astype(str).str.contains('52') == False)]

Unnamed: 0,tgn_id,city_name,inferred_city_name,parentstring_province,parentstring_country,parentstring_continent,parentstring_world,broader_parentstring_country,broader_parentstring_continent,broader_parentstring_world,...,lon,coordinates,query_look_up,municipality_lookup,country_name_lookup,query_city_city_info,query_city_geo_data,coordinates_looked_up,km_difference,lat_diff
5,http://vocab.getty.edu/tgn/7270017,Aabeek,Limburg,Limburg,Netherlands,Europe,World,Netherlands,Europe,World,...,6.0,"(51.25, 6)","(51.25, 6)",Leeuwen,Netherlands,"Leeuwen, Wageningen-Hoog, Wageningen, Gelderland, Nederland, 6704 AN, Nederland","(51.9826484, 5.6758503)","(51.21032, 5.99862)",88.73,-0.732648
26,http://vocab.getty.edu/tgn/7006756,Arnemuiden,Zuid Beveland,Zuid Beveland,Zeeland,Netherlands,Europe,Zeeland,Netherlands,Europe,...,3.666667,"(51.5, 3.666667)","(51.5, 3.666667)",Veere,Netherlands,"Veere, Zeeland, Nederland","(51.55640385, 3.577269355747127)","(51.54833, 3.66667)",6.25,-0.056404


# Other options
- with geopy package

In [47]:
import concurrent.futures
import multiprocessing as mp
from functools import partial
from pprint import pprint

cpu_count_laptop = mp.cpu_count()
geolocator = Nominatim(user_agent="Nancy Amandi", timeout=10)
geocode = RateLimiter(geolocator, swallow_exceptions=True, min_delay_seconds=0.1, return_value_on_exception=None) 

with concurrent.futures.ThreadPoolExecutor(max_workers=cpu_count_laptop, thread_name_prefix='thread') as e:
    locations = list(e.map(partial(geocode, language='en', exactly_one=True), tqdm(list(set(df_nl["municipality_lookup"]))), chunksize=100))
    
for i in range(len(locations)):
    # pprint(locations[i].raw)
    print(locations[i].raw['address']['country'])
    # print(locations[i].raw['address']['city'])

100%|██████████| 5/5 [00:00<00:00, 1670.64it/s]


TypeError: 'Nominatim' object is not callable

In [46]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
tqdm.pandas()

# option 1
geolocator = Nominatim(user_agent="geopy.geocoders.Nominatim")
geocode = RateLimiter(geolocator, min_delay_seconds=1)
# df_nl['new_location'] = df_nl['municipality_lookup'].progress_apply(geocode)

# option 2
geolocator = Nominatim(user_agent="Nancy Amandi", timeout=10)
geocode = RateLimiter(geolocator.reverse, min_delay_seconds=0.1)
df_nl['new_location'] = df_nl['municipality_lookup'].apply(geocode, language='en', exactly_one=True )

# optione 3
df_tgn['address'] = df_tgn.progress_apply(lambda row: geocode((row['lat.value'], row['long.value']), language='en', exactly_one=True), axis=1)
df_tgn['country'] = df_tgn['address'].astype(str).str.split(',').str[-1]

# option 4
geolocator = Nominatim(user_agent="Nancy Amandi", timeout= 10)
rgeocode = RateLimiter(geolocator.reverse, min_delay_seconds=0.1)
df_tgn["location"] = df_tgn["coordinates"].progress_apply(rgeocode)

# func to work with try and except on multi-threaded geocode, doube apply
def eval_results(x):
    try:
        return (x.latitude, x.longitude)
    except:
        return (None, None)


TypeError: 'Nominatim' object is not callable