In [2]:
import xml.etree.ElementTree as ET

# Parse the WikToR XML corpus and keep a handle on the document root
tree = ET.parse('../data/WikToR.xml')
root = tree.getroot()

# Child elements of <page> whose text is copied unchanged into each record
text_fields = ('pageTitle', 'toponymName', 'text', 'url', 'lat', 'lon', 'feature', 'country')

# One dictionary per <page> element, in document order
pages_list = []
for page in root.findall('.//page'):
    # 'number' is an attribute of <page>; every other field is a child element
    page_dict = {'number': page.get('number')}
    page_dict.update((tag, page.find(tag).text) for tag in text_fields)

    # Integer start/end offsets of each <toponym> listed under <toponymIndices>
    page_dict['toponymIndices'] = [
        {'start': int(toponym.find('start').text), 'end': int(toponym.find('end').text)}
        for toponym in page.findall('.//toponymIndices/toponym')
    ]

    pages_list.append(page_dict)

In [3]:
# Sanity check: number of <page> records collected by the parsing cell.
len(pages_list)

5000

In [4]:
import pickle
from genre.trie import Trie

# Rebuild the prefix tree (trie) from its pickled dictionary form.
# NOTE(review): pickle.load can execute arbitrary code, so this file must come
# from a trusted source.
with open("../data/kilt_titles_trie_dict.pkl", "rb") as trie_file:
    trie = Trie.load_from_dict(pickle.load(trie_file))

from genre.hf_model import GENRE

# Load the local GENRE checkpoint and call .eval() on it before use.
model = GENRE.from_pretrained("../models/hf_entity_disambiguation_aidayago").eval()

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [5]:
import requests


def get_coordinates(place_name, timeout=10):
    """
    Look up the coordinates of a Wikipedia page through the MediaWiki API.

    Sends an ``action=query`` / ``prop=coordinates`` request for the given
    title to the English Wikipedia endpoint.

    Args:
        place_name: Page title to query.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        A ``(lat, lon)`` tuple taken from the first coordinate entry of the
        first returned page that has one. ``(0, 0)`` is returned, after
        printing a message, when no page carries coordinates or when the
        request or the decoding of its response fails.
    """
    URL = "https://en.wikipedia.org/w/api.php"

    PARAMS = {
        "action": "query",
        "format": "json",
        "titles": place_name,
        "prop": "coordinates"
    }

    try:
        # The context manager releases the session's connections on exit.
        with requests.Session() as session:
            response = session.get(url=URL, params=PARAMS, timeout=timeout)
            # Surface HTTP errors explicitly instead of trying to parse an error page.
            response.raise_for_status()
            data = response.json()

        for page in data['query']['pages'].values():
            coordinates = page.get('coordinates')
            if coordinates:
                return coordinates[0]['lat'], coordinates[0]['lon']

        print(f"Coordinates not found for {place_name}")
        return 0, 0
    except Exception as e:
        # Deliberately best-effort: any failure is reported and mapped to the
        # (0, 0) fallback rather than propagated to the caller.
        print("An error occurred:", e)
        return 0, 0

In [6]:
from math import radians, sin, cos, sqrt, atan2


def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points
    on the Earth (specified in decimal degrees).

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The distance in kilometers, using an Earth radius of 6371 km.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Floating-point rounding can leave `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))

    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    radius_of_earth = 6371  # Radius of the Earth in kilometers
    distance = radius_of_earth * c

    return distance

In [10]:
# Error in km recorded when no coordinates are found for the predicted entity;
# also the normaliser used inside compute_auc. Presumably about half the
# Earth's circumference, i.e. the largest possible error — TODO confirm source.
max_error = 20039

In [11]:
# One error distance (km) per evaluated toponym, in iteration order. The
# evaluation loop skips as many toponyms as this list already holds, so
# re-running this cell discards that progress.
res_distances = []

In [1]:
# Evaluate the model on every annotated toponym of every page. Distances are
# appended to res_distances in iteration order, so toponyms already covered by
# an earlier (interrupted) run are skipped by comparing the running counter
# with the current length of that list.


def allowed_tokens(batch_id, sent):
    """Restrict generation to continuations stored in the trie."""
    return trie.get(sent.tolist())


iter_count = 0
for page_dict in pages_list:
    gold_title = page_dict['pageTitle']
    print(gold_title)
    text = page_dict['text']

    for index_pair in page_dict['toponymIndices']:
        iter_count += 1
        if iter_count > len(res_distances):
            start_index, end_index = index_pair['start'], index_pair['end']

            # Wrap the mention in the entity markers
            modified_text = "".join((
                text[:start_index],
                " [START_ENT] ",
                text[start_index:end_index],
                " [END_ENT] ",
                text[end_index:],
            ))

            res = model.sample(
                [modified_text],
                prefix_allowed_tokens_fn=allowed_tokens,
            )
            predicted_title = res[0][0]["text"]

            if predicted_title == gold_title:
                dist = 0.0
            else:
                print(f"\tIncorrect. Got: {predicted_title}")
                lat, lon = get_coordinates(predicted_title)
                if lat != 0 or lon != 0:
                    # Both points are rounded to two decimals before measuring
                    dist = haversine_distance(
                        round(float(page_dict['lat']), 2),
                        round(float(page_dict['lon']), 2),
                        round(lat, 2),
                        round(lon, 2),
                    )
                else:
                    # use max_error if predicted entity is not a place
                    dist = max_error

            res_distances.append(dist)

NameError: name 'pages_list' is not defined

In [15]:
import pickle

# Serialize the list of distances to disk with pickle
with open("../data/wiktor_distances.pkl", 'wb') as out_file:
    pickle.dump(res_distances, out_file)

In [ ]:
def compute_accuracy_at_161km(distances, threshold_km=161):
    """
    Compute Accuracy@161km from a list of distances.

    Args:
        distances: Sized collection of error distances.
        threshold_km: Largest distance still counted as a hit (inclusive).
            Defaults to 161, the threshold the function is named after.

    Returns:
        The fraction of distances that are <= ``threshold_km``.

    Raises:
        ValueError: If ``distances`` is empty, since the fraction is undefined.
    """
    total = len(distances)
    if total == 0:
        raise ValueError("distances must contain at least one value")

    count_within_threshold = sum(1 for distance in distances if distance <= threshold_km)
    return count_within_threshold / total

In [ ]:
def compute_mean_error(distances):
    """
    Compute the mean error from a list of distances.

    Args:
        distances: Sized collection of error distances.

    Returns:
        The arithmetic mean of ``distances``.

    Raises:
        ValueError: If ``distances`` is empty, since the mean is undefined.
    """
    total = len(distances)
    if total == 0:
        raise ValueError("distances must contain at least one value")

    return sum(distances) / total

In [ ]:
from numpy import trapz
import numpy as np


def compute_auc(distances, max_distance=None):
    """
    Compute the Area Under the Curve (AUC) given list of distance.

    The distances are sorted, mapped to ``log(x + 1) / log(max_distance)`` and
    integrated with the composite trapezoidal rule at unit spacing; the area
    is then divided by the number of distances.

    Args:
        distances: Sized collection of error distances. It is not modified.
        max_distance: Value whose logarithm normalises the log-errors. When
            omitted, the module-level ``max_error`` is used.

    Returns:
        The normalised area as a float.

    Raises:
        ValueError: If ``distances`` is empty.
    """
    if len(distances) == 0:
        raise ValueError("distances must contain at least one value")

    upper = max_error if max_distance is None else max_distance

    # Sort a copy: sorting in place would reorder the caller's list.
    ordered = np.sort(np.asarray(distances, dtype=float))
    y = np.log(ordered + 1) / np.log(upper)

    # numpy.trapz was renamed to numpy.trapezoid in NumPy 2.0; use whichever exists.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    # Compute the area using the composite trapezoidal rule.
    area = integrate(y) / len(ordered)
    return area

In [ ]:
# Share of recorded distances that fall within 161 km.
compute_accuracy_at_161km(res_distances)

In [ ]:
# Arithmetic mean of the recorded distances.
compute_mean_error(res_distances)

In [ ]:
# Normalised area under the log-error curve of the recorded distances.
compute_auc(res_distances)