In [1]:
import os
import xml.dom.minidom

# xml.etree.cElementTree was removed in Python 3.9; fall back to the
# equivalent xml.etree.ElementTree under the same alias.
try:
    import xml.etree.cElementTree as et
except ImportError:
    import xml.etree.ElementTree as et

import numpy as np
import pandas as pd
from geopy.distance import great_circle

In [2]:
## Locations of the Edinburgh Geoparser outputs, relative to the parent of the
## current working directory. Built with os.path.join so the notebook is not
## tied to Windows path separators.
dir_geoparsed_results = os.path.join(os.path.dirname(os.getcwd()), 'geoparsed-results')
## The empty last component keeps a trailing separator on each directory:
## the reader functions append '<index>.out.xml' by plain string concatenation.
lgl_dir = os.path.join(dir_geoparsed_results, 'lgl_geocoded_results_Edinburgh_Geoparser', '')
geovirus_dir = os.path.join(dir_geoparsed_results, 'geovirus_geocoded_results_Edinburgh_Geoparser', '')
wiktor_dir = os.path.join(dir_geoparsed_results, 'wiktor_geocoded_results_Edinburgh_Geoparser', '')

In [3]:
## Functions to read output xmls produced by Edinburgh Geoparser
def _read_edinburgh_geoparser_output(xml_path):
    """Extract the geocoded entities from one Edinburgh Geoparser output file.

    Parameters
    ----------
    xml_path : str
        Path of the ``.out.xml`` file to parse.

    Returns
    -------
    list of dict
        One ``{'name': str, 'lat': float, 'lon': float}`` dict per entity, in
        document order. Entities whose ``lat``/``long`` attributes are missing
        or not parseable as floats are skipped.
    """
    document_root = et.parse(xml_path).getroot()
    # The entity list is looked up under the third child of the document root.
    entities = list(document_root)[2].find('ents')
    geoparsed_list = []
    for ent in entities:
        try:
            lat = float(ent.attrib['lat'])
            lon = float(ent.attrib['long'])
        except (KeyError, ValueError):
            # No usable coordinates on this entity: nothing to evaluate.
            continue
        toponym = ent.find('parts').find('part').text
        geoparsed_list.append({'name': toponym, 'lat': lat, 'lon': lon})
    return geoparsed_list


def getGeoparsedResults_lgl(index):
    """Return the geocoded entities read from ``<index>.out.xml`` under ``lgl_dir``."""
    return _read_edinburgh_geoparser_output(lgl_dir + str(index) + '.out.xml')


def getGeoparsedResults_geovirus(index):
    """Return the geocoded entities read from ``<index>.out.xml`` under ``geovirus_dir``."""
    return _read_edinburgh_geoparser_output(geovirus_dir + str(index) + '.out.xml')


def getGeoparsedResults_wiktor(index):
    """Return the geocoded entities read from ``<index>.out.xml`` under ``wiktor_dir``."""
    return _read_edinburgh_geoparser_output(wiktor_dir + str(index) + '.out.xml')

In [4]:
## Functions to calculate keys for annotated locations in data patches
def calc_mlg_key(toponym, old_coord):
    """Build the join key ``'<toponym> <lat>,<lon>'`` for a data-patch row.

    ``old_coord`` is a ``'lat,lon'`` string; surrounding spaces are removed
    from the toponym and from both coordinate parts.
    """
    clean_name = toponym.strip(' ')
    coord_parts = old_coord.split(',')
    lat_text, lon_text = (coord_parts[pos].strip(' ') for pos in (0, 1))
    return '{} {},{}'.format(clean_name, lat_text, lon_text)

def _calc_name_coord_key(name, lat, lon):
    """Build the join key ``'<name> <lat>,<lon>'`` from string fields.

    Surrounding spaces are removed from every part. Returns ``None`` when
    either coordinate is ``None``, i.e. there is nothing to build a key from.
    """
    name = name.strip(' ')
    if lat is None or lon is None:
        return None
    return name + ' ' + lat.strip(' ') + ',' + lon.strip(' ')


def calc_lgl_key(name, lat, lon):
    """Key for an annotated LGL location; ``None`` if a coordinate is missing."""
    return _calc_name_coord_key(name, lat, lon)


def calc_geovirus_key(name, lat, lon):
    """Key for an annotated GeoVirus location; ``None`` if a coordinate is missing."""
    return _calc_name_coord_key(name, lat, lon)


def calc_wiktor_key(name, lat, lon):
    """Key for an annotated WikToR location; ``None`` if a coordinate is missing."""
    return _calc_name_coord_key(name, lat, lon)

## Function to calculate median error distances
def calc_median_error_distance(new_coord, geocoded_coordinates_list):
    """Median great-circle distance (km) between a reference point and predictions.

    ``new_coord`` is a ``'lat,lon'`` string. ``geocoded_coordinates_list``
    holds ``(lat, lon)`` pairs; ``(None, None)`` pairs are ignored. Returns
    ``None`` when no pair is left to measure.
    """
    coord_parts = new_coord.split(',')
    reference_point = (float(coord_parts[0].strip(' ')), float(coord_parts[1].strip(' ')))
    distances_km = [
        great_circle(reference_point, (float(pred_lat), float(pred_lon))).km
        for (pred_lat, pred_lon) in geocoded_coordinates_list
        if (pred_lat, pred_lon) != (None, None)
    ]
    if not distances_km:
        return None
    return np.median(distances_km)

## Integrate Edinburgh Geoparser's Geoparsing Results on LGL

In [5]:
## read and preprocess LGL
dir_lgl = os.path.join(os.path.dirname(os.getcwd()), 'data', 'evaluation-corpora', 'original-datasets', 'lgl.xml')
parsed_xml_lgl = et.parse(dir_lgl)

url_list = []
text_list = []
toponyms_list = []
index_list = []
## articles are numbered from 1, in document order
for i, article in enumerate(parsed_xml_lgl.getroot(), start=1):
    toponym_list = []
    for toponym in article.find('toponyms'):
        start = toponym.find('start')
        end = toponym.find('end')
        gaztag = toponym.find('gaztag')
        try:
            name = gaztag.find('name')
            lat = gaztag.find('lat')
            lon = gaztag.find('lon')
            toponym_list.append({'name': name.text, 'start': start.text, 'end': end.text, 'lat': lat.text, 'lon': lon.text})
        except AttributeError:
            ## no (complete) gaztag: fall back to the phrase, without coordinates
            name = toponym.find('phrase')
            toponym_list.append({'name': name.text, 'start': start.text, 'end': end.text, 'lat': None, 'lon': None})

    url_list.append(article.find('url').text)
    text_list.append(article.find('text').text)
    toponyms_list.append(toponym_list)
    index_list.append(i)

## one row per article; 'url' must be the per-article list, not the last url seen
df_lgl = pd.DataFrame({'url': url_list, 'text': text_list, 'toponyms': toponyms_list, 'index': index_list})

## extract annotated locations from LGL
## Reuse the toponyms parsed above instead of walking the XML a second time,
## so name/lat/lon always stay aligned.
annotated_toponyms = [toponym for toponym_list in toponyms_list for toponym in toponym_list]
df_lgl_poi = pd.DataFrame({column: [toponym[column] for toponym in annotated_toponyms] for column in ('name', 'lat', 'lon')})

df_lgl_poi = df_lgl_poi.drop_duplicates()

## read LGL data patches
dir_mlg_lgl = os.path.join(os.path.dirname(os.getcwd()), 'data', 'evaluation-corpora', 'data-patches', 'lgl_patches.tsv')
mlg_lgl = pd.read_csv(dir_mlg_lgl, sep='\t', header=None)

mlg_lgl = mlg_lgl.rename(columns={0: 'toponym', 1: 'old_coord', 2: 'new_coord'})

mlg_lgl['mlg_key'] = mlg_lgl.apply(lambda x: calc_mlg_key(x['toponym'], x['old_coord']), axis=1)

df_lgl_poi['lgl_key'] = df_lgl_poi.apply(lambda x: calc_lgl_key(x['name'], x['lat'], x['lon']), axis=1)

## unifying LGL
## outer merge on the keys, then keep only rows that carry a patched coordinate
df_lgl_poi_unified = pd.merge(df_lgl_poi, mlg_lgl, how='outer', left_on='lgl_key', right_on='mlg_key')

df_lgl_poi_unified = df_lgl_poi_unified.dropna(subset=['new_coord'])

In [6]:
## get geoparsed results of every article in LGL
df_lgl['geoparsed_result'] = df_lgl['index'].apply(getGeoparsedResults_lgl)

## every unified location starts with its own empty list of predictions
df_lgl_poi_unified['geocoded_coordinates_list'] = df_lgl_poi_unified['name'].apply(lambda x: [])

## (name, lat, lon) -> label of the first matching row, built once instead of
## filtering the whole frame for every annotated toponym
lgl_poi_row_by_key = {}
for row_label, poi_name, poi_lat, poi_lon in zip(df_lgl_poi_unified.index, df_lgl_poi_unified['name'], df_lgl_poi_unified['lat'], df_lgl_poi_unified['lon']):
    lgl_poi_row_by_key.setdefault((poi_name, poi_lat, poi_lon), row_label)

for toponyms, geoparsed_result in zip(df_lgl['toponyms'], df_lgl['geoparsed_result']):
    for toponym in toponyms:
        toponym_key = (toponym['name'], toponym['lat'], toponym['lon'])
        if None in toponym_key or toponym_key not in lgl_poi_row_by_key:
            continue ## no coordinate information for this annotated toponym in LGL
        ## Toponyms are matched by name only (character offsets are not compared);
        ## the first geoparsed entity with the same name wins.
        geoparsed_match = next((geoparsed_toponym for geoparsed_toponym in geoparsed_result if geoparsed_toponym['name'] == toponym['name']), None)
        if geoparsed_match is None:
            continue ## no coordinate information for this annotated toponym in GeoNames
        toponym['geoparsed_lat'] = geoparsed_match['lat']
        toponym['geoparsed_lon'] = geoparsed_match['lon']
        ## .at returns the list stored in the cell, so append mutates it in place
        df_lgl_poi_unified.at[lgl_poi_row_by_key[toponym_key], 'geocoded_coordinates_list'].append((geoparsed_match['lat'], geoparsed_match['lon']))

## MdnED calculation
df_lgl_poi_unified['median_error_distance'] = df_lgl_poi_unified.apply(lambda x: calc_median_error_distance(x['new_coord'], x['geocoded_coordinates_list']), axis=1)

## save toponym resolution results
dir_lgl_results = os.path.join(os.path.dirname(os.getcwd()), 'geoparsed-results')
df_lgl_poi_unified.to_csv(os.path.join(dir_lgl_results, 'lgl_geocoded_results_Edinburgh_Geoparser.csv'))

## Integrate Edinburgh Geoparser's Geoparsing Results on GeoVirus

In [7]:
## read and preprocess GeoVirus
dir_geovirus = os.path.join(os.path.dirname(os.getcwd()), 'data', 'evaluation-corpora', 'original-datasets', 'GeoVirus.xml')
parsed_xml_geovirus = et.parse(dir_geovirus)

source_list = []
text_list = []
locations_list = []
index_list = []

## articles are numbered from 1, in document order
for i, article in enumerate(parsed_xml_geovirus.getroot(), start=1):
    location_list = []
    for location in article.find('locations'):
        name = location.find('name')
        start = location.find('start')
        end = location.find('end')
        lat = location.find('lat')
        lon = location.find('lon')
        page = location.find('page')

        ## last path segment of the page URL, with '_' and '%27' decoded
        wikipedia_name = page.text.split('/')[-1].replace("_", " ").replace("%27", "'")

        location_list.append({'name': name.text, 'wikipedia_name': wikipedia_name, 'start': start.text, 'end': end.text, 'lat': lat.text, 'lon': lon.text, 'page': page.text})

    source_list.append(article.find('source').text)
    text_list.append(article.find('text').text)
    locations_list.append(location_list)
    index_list.append(i)

df_geovirus = pd.DataFrame({'source': source_list, 'text': text_list, 'locations': locations_list, 'index': index_list})

## extract annotated locations from GeoVirus
## Reuse the locations parsed above instead of walking the XML a second time.
annotated_locations = [location for location_list in locations_list for location in location_list]
df_geovirus_poi = pd.DataFrame({column: [location[column] for location in annotated_locations] for column in ('name', 'wikipedia_name', 'lat', 'lon', 'page')})

df_geovirus_poi = df_geovirus_poi.drop_duplicates()

## read GeoVirus data patches
dir_mlg_geovirus = os.path.join(os.path.dirname(os.getcwd()), 'data', 'evaluation-corpora', 'data-patches', 'GeoVirus_patches.tsv')
mlg_geovirus = pd.read_csv(dir_mlg_geovirus, sep='\t', header=None)

mlg_geovirus = mlg_geovirus.rename(columns={0: 'toponym', 1: 'old_coord', 2: 'new_coord'})

mlg_geovirus['mlg_key'] = mlg_geovirus.apply(lambda x: calc_mlg_key(x['toponym'], x['old_coord']), axis=1)

## the key uses the Wikipedia page name rather than the annotated name
df_geovirus_poi['geovirus_key'] = df_geovirus_poi.apply(lambda x: calc_geovirus_key(x['wikipedia_name'], x['lat'], x['lon']), axis=1)

## unifying GeoVirus
## outer merge on the keys, then keep only rows that carry a patched coordinate
df_geovirus_poi_unified = pd.merge(df_geovirus_poi, mlg_geovirus, how='outer', left_on='geovirus_key', right_on='mlg_key')

df_geovirus_poi_unified = df_geovirus_poi_unified.dropna(subset=['new_coord'])

In [8]:
## get geoparsed results of every article in GeoVirus
df_geovirus['geoparsed_result'] = df_geovirus['index'].apply(getGeoparsedResults_geovirus)

## every unified location starts with its own empty list of predictions
df_geovirus_poi_unified['geocoded_coordinates_list'] = df_geovirus_poi_unified['name'].apply(lambda x: [])

## (name, lat, lon) -> label of the first matching row, built once instead of
## filtering the whole frame for every annotated toponym
geovirus_poi_row_by_key = {}
for row_label, poi_name, poi_lat, poi_lon in zip(df_geovirus_poi_unified.index, df_geovirus_poi_unified['name'], df_geovirus_poi_unified['lat'], df_geovirus_poi_unified['lon']):
    geovirus_poi_row_by_key.setdefault((poi_name, poi_lat, poi_lon), row_label)

for toponyms, geoparsed_result in zip(df_geovirus['locations'], df_geovirus['geoparsed_result']):
    for toponym in toponyms:
        toponym_key = (toponym['name'], toponym['lat'], toponym['lon'])
        if None in toponym_key or toponym_key not in geovirus_poi_row_by_key:
            continue ## no coordinate information for this annotated toponym in geovirus
        ## Toponyms are matched by name only (character offsets are not compared);
        ## the first geoparsed entity with the same name wins.
        geoparsed_match = next((geoparsed_toponym for geoparsed_toponym in geoparsed_result if geoparsed_toponym['name'] == toponym['name']), None)
        if geoparsed_match is None:
            continue ## no coordinate information for this annotated toponym in GeoNames
        toponym['geoparsed_lat'] = geoparsed_match['lat']
        toponym['geoparsed_lon'] = geoparsed_match['lon']
        ## .at returns the list stored in the cell, so append mutates it in place
        df_geovirus_poi_unified.at[geovirus_poi_row_by_key[toponym_key], 'geocoded_coordinates_list'].append((geoparsed_match['lat'], geoparsed_match['lon']))

## MdnED calculation
df_geovirus_poi_unified['median_error_distance'] = df_geovirus_poi_unified.apply(lambda x: calc_median_error_distance(x['new_coord'], x['geocoded_coordinates_list']), axis=1)

## save toponym resolution results
dir_geovirus_results = os.path.join(os.path.dirname(os.getcwd()), 'geoparsed-results')
df_geovirus_poi_unified.to_csv(os.path.join(dir_geovirus_results, 'geovirus_geocoded_results_Edinburgh_Geoparser.csv'))

## Integrate Edinburgh Geoparser's Geoparsing Results on WikToR

In [9]:
## read and preprocess WikToR
dir_wiktor = os.path.join(os.path.dirname(os.getcwd()), 'data', 'evaluation-corpora', 'original-datasets', 'WikToR.xml')
parsed_xml_wiktor = et.parse(dir_wiktor)

url_list = []
text_list = []
toponyms_list = []
index_list = []

## per-page annotated location (one entry per page, collected in the same pass)
name_list = []
lat_list = []
lon_list = []
page_list = []
wikipedia_name_list = []

## pages are numbered from 1, in document order
for i, page in enumerate(parsed_xml_wiktor.getroot(), start=1):
    url = page.find('url')
    text = page.find('text')
    name = page.find('toponymName')
    wikipedia_name = page.find('pageTitle')
    lat = page.find('lat')
    lon = page.find('lon')

    ## every occurrence listed under toponymIndices shares the page-level name and coordinates
    toponym_list = []
    for toponym in page.find('toponymIndices'):
        start = toponym.find('start')
        end = toponym.find('end')
        toponym_list.append({'name': name.text, 'wikipedia_name': wikipedia_name.text, 'start': start.text, 'end': end.text, 'lat': lat.text, 'lon': lon.text, 'page': url.text})

    url_list.append(url.text)
    text_list.append(text.text)
    toponyms_list.append(toponym_list)
    index_list.append(i)

    name_list.append(name.text)
    lat_list.append(lat.text)
    lon_list.append(lon.text)
    page_list.append(url.text)
    wikipedia_name_list.append(wikipedia_name.text)

df_wiktor = pd.DataFrame({'url': url_list, 'text': text_list, 'toponyms': toponyms_list, 'index': index_list})

## extract annotated locations from WikToR
df_wiktor_poi = pd.DataFrame({'name': name_list, 'wikipedia_name': wikipedia_name_list, 'lat': lat_list, 'lon': lon_list, 'page': page_list})

df_wiktor_poi = df_wiktor_poi.drop_duplicates()

## read WikToR data patches
dir_mlg_wiktor = os.path.join(os.path.dirname(os.getcwd()), 'data', 'evaluation-corpora', 'data-patches', 'WikToR_patches.tsv')
mlg_wiktor = pd.read_csv(dir_mlg_wiktor, sep='\t', header=None)

mlg_wiktor = mlg_wiktor.rename(columns={0: 'toponym', 1: 'old_coord', 2: 'new_coord'})

mlg_wiktor['mlg_key'] = mlg_wiktor.apply(lambda x: calc_mlg_key(x['toponym'], x['old_coord']), axis=1)

## the key uses the Wikipedia page title rather than the toponym name
df_wiktor_poi['wiktor_key'] = df_wiktor_poi.apply(lambda x: calc_wiktor_key(x['wikipedia_name'], x['lat'], x['lon']), axis=1)

## unifying WikToR
## outer merge on the keys, then keep only rows that carry a patched coordinate
df_wiktor_poi_unified = pd.merge(df_wiktor_poi, mlg_wiktor, how='outer', left_on='wiktor_key', right_on='mlg_key')

df_wiktor_poi_unified = df_wiktor_poi_unified.dropna(subset=['new_coord'])

In [10]:
## get geoparsed results of every article in WikToR
df_wiktor['geoparsed_result'] = df_wiktor['index'].apply(getGeoparsedResults_wiktor)

## every unified location starts with its own empty list of predictions
df_wiktor_poi_unified['geocoded_coordinates_list'] = df_wiktor_poi_unified['name'].apply(lambda x: [])

## (name, lat, lon) -> label of the first matching row, built once instead of
## filtering the whole frame for every annotated toponym
wiktor_poi_row_by_key = {}
for row_label, poi_name, poi_lat, poi_lon in zip(df_wiktor_poi_unified.index, df_wiktor_poi_unified['name'], df_wiktor_poi_unified['lat'], df_wiktor_poi_unified['lon']):
    wiktor_poi_row_by_key.setdefault((poi_name, poi_lat, poi_lon), row_label)

for toponyms, geoparsed_result in zip(df_wiktor['toponyms'], df_wiktor['geoparsed_result']):
    for toponym in toponyms:
        toponym_key = (toponym['name'], toponym['lat'], toponym['lon'])
        if None in toponym_key or toponym_key not in wiktor_poi_row_by_key:
            continue ## no coordinate information for this annotated toponym in wiktor
        ## Toponyms are matched by name only; the first geoparsed entity with
        ## the same name wins.
        geoparsed_match = next((geoparsed_toponym for geoparsed_toponym in geoparsed_result if geoparsed_toponym['name'] == toponym['name']), None)
        if geoparsed_match is None:
            continue ## no coordinate information for this annotated toponym in GeoNames
        toponym['geoparsed_lat'] = geoparsed_match['lat']
        toponym['geoparsed_lon'] = geoparsed_match['lon']
        ## .at returns the list stored in the cell, so append mutates it in place
        df_wiktor_poi_unified.at[wiktor_poi_row_by_key[toponym_key], 'geocoded_coordinates_list'].append((geoparsed_match['lat'], geoparsed_match['lon']))

## MdnED calculation
df_wiktor_poi_unified['median_error_distance'] = df_wiktor_poi_unified.apply(lambda x: calc_median_error_distance(x['new_coord'], x['geocoded_coordinates_list']), axis=1)

## save toponym resolution results
dir_wiktor_results = os.path.join(os.path.dirname(os.getcwd()), 'geoparsed-results')
df_wiktor_poi_unified.to_csv(os.path.join(dir_wiktor_results, 'wiktor_geocoded_results_Edinburgh_Geoparser.csv'))