In [2]:
import xml.etree.ElementTree as ET

# Load XML file
tree = ET.parse('../data/WikToR.xml')
root = tree.getroot()

# List to store dictionaries
pages_list = []

# Iterate over each <page> element
for page in root.findall('.//page'):
    page_dict = {}

    # Extract data from each <page> element
    page_dict['number'] = page.get('number')
    page_dict['pageTitle'] = page.find('pageTitle').text
    page_dict['toponymName'] = page.find('toponymName').text
    page_dict['text'] = page.find('text').text
    page_dict['url'] = page.find('url').text
    page_dict['lat'] = page.find('lat').text
    page_dict['lon'] = page.find('lon').text
    page_dict['feature'] = page.find('feature').text
    page_dict['country'] = page.find('country').text

    # Extract data from <toponymIndices>
    toponym_indices = []
    for toponym in page.findall('.//toponymIndices/toponym'):
        index_dict = {
            'start': int(toponym.find('start').text),
            'end': int(toponym.find('end').text)
        }
        toponym_indices.append(index_dict)
    page_dict['toponymIndices'] = toponym_indices

    # Append page dictionary to the list
    pages_list.append(page_dict)

In [3]:
len(pages_list)

5000

In [5]:
import pickle
from genre.trie import Trie

# load the prefix tree (trie)
with open("../data/kilt_titles_trie_dict.pkl", "rb") as f:
    trie = Trie.load_from_dict(pickle.load(f))

from genre.hf_model import GENRE

model = GENRE.from_pretrained("../models/hf_entity_disambiguation_aidayago").eval()

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [21]:
import requests


def get_coordinates(place_name):
    S = requests.Session()

    URL = "https://en.wikipedia.org/w/api.php"

    PARAMS = {
        "action": "query",
        "format": "json",
        "titles": place_name,
        "prop": "coordinates"
    }

    try:
        R = S.get(url=URL, params=PARAMS)
        DATA = R.json()
        PAGES = DATA['query']['pages']

        for k, v in PAGES.items():
            coordinates = v.get('coordinates')
            if coordinates:
                lat = round(float(coordinates[0]['lat']), 4)
                lon = round(float(coordinates[0]['lon']), 4)
                return lat, lon
        return 0, 0
    except Exception as e:
        print("An error occurred:", e)
        return 0, 0

In [22]:
from math import radians, sin, cos, sqrt, atan2


def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points
    on the Earth (specified in decimal degrees)
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    radius_of_earth = 6371  # Radius of the Earth in kilometers
    distance = radius_of_earth * c

    return distance

In [25]:
# Iterate over the modified list of dictionaries
for page_dict in pages_list[11:13]:
    print(page_dict['pageTitle'])
    text = page_dict['text']
    for index_pair in page_dict['toponymIndices']:
        start_index = index_pair['start']
        end_index = index_pair['end']
        modified_text = text[:start_index] + " [START_ENT] " + text[start_index:end_index] + " [END_ENT] " + text[
                                                                                                             end_index:]

        sentences = [modified_text]
        res = model.sample(
            sentences,
            prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist()),
        )
        # pprint(res)
        # print("\n")

        if res[0][0]["text"] == page_dict['pageTitle']:
            print("\tCorrect")
        else:
            print(f"\tIncorrect. Got: {res[0][0]['text']}")
        
        lat, lon = get_coordinates(res[0][0]["text"])
        haversine_dist = haversine_distance(float(page_dict['lat']), float(page_dict['lon']), lat, lon)
        print("\t"+str(haversine_dist))

Victoria, Seychelles
	Incorrect. Got: Victoria, Chile
	12838.35504879085
	Incorrect. Got: Port Victoria
	6180.510501800203
	Incorrect. Got: Port Victoria
	6180.510501800203
	Correct
	0.21779862287022636
Victoria, Gozo
	Correct
	4269.793132696058
	Correct
	4269.793132696058
	Correct
	4269.793132696058


In [27]:
get_coordinates("Victoria, Chile")

(-38.2333, -72.3333)