In [1]:
# Retrieve a Random Wikipedia Abstract Article 

import urllib.request as ur
import lxml.etree as et
import re

try:
    with ur.urlopen('https://en.wikipedia.org/wiki/Special:Random') as response:
        random_url = response.geturl()
        title = re.sub('^.*/', '', random_url)

    url  = "http://en.wikipedia.org/w/api.php?"
    url += "format=xml&action=query&prop=extracts&explaintext&exintro&titles=" + title

    with ur.urlopen(url) as response:
        xml = response.read()
   
        abstract = str(et.fromstring(xml).xpath("/api/query/pages/page/extract/text()"))
        abstract = str.replace(abstract, '\\n', ' ')
        abstract = str.replace(abstract, '\\', '')

    print(title,  '\n\n', url, '\n\n', abstract) 
    
except:
    print("Check your Internet connection...")

Davenport_Community_School_District 

 http://en.wikipedia.org/w/api.php?format=xml&action=query&prop=extracts&explaintext&exintro&titles=Davenport_Community_School_District 

 ['The Davenport Community School District Is a public school district in Scott County, Iowa. The school district covers 109 square miles (280 km2) that includes the city of Davenport, where it is based, and the western Scott County communities of Blue Grass, Buffalo and Walcott. Founded in 1858 it established one of the first publicly funded high schools in the United States, the third teachers’ training school and hired the first female superintendent in the country. It serves nearly 16,000 students in 31 school buildings.']


In [2]:
# Use Spacy, Da Python NLP Library: https://spacy.io/
import spacy as sp

# Load two already trained models
nlp_en_lg = sp.load('en_core_web_lg')   # EN document large one
nlp_xx_sm = sp.load('xx_ent_wiki_sm')   # Multilingual small one

In [3]:
# NER classification with an EN Large model

ner_en_lg = nlp_en_lg(abstract)
sp.displacy.render(ner_en_lg, jupyter=True, style='ent')

In [4]:
# NER classification with a multilingual small model

ner_xx_sm = nlp_xx_sm(abstract)
sp.displacy.render(ner_xx_sm, jupyter=True, style='ent')

In [5]:
# We really want to avoid any false positive in spatial entities 
# (too easy to find any related toponym somewhere on earth) 
# so we only keep toponym found by both models

spatial_ent_en_lg = [ ent for ent in ner_en_lg.ents if ent.label_ == 'LOC' or ent.label_ == 'GPE' ]
spatial_ent_xx_sm = [ ent for ent in ner_xx_sm.ents if ent.label_ == 'LOC' ]
        
spatial_ent = list( { str(val) for val in spatial_ent_en_lg if val in spatial_ent_xx_sm } )

print(spatial_ent)

['Iowa', 'Davenport', 'Buffalo', 'Scott County']


In [6]:
# Now we call GeoNames to find best candidate for each toponym
# and retrieve lat/lon coordinates
#
# Imply -before-, to create a GeoName account 
# *and* activate free WebService usage

from geopy import geocoders
g = geocoders.GeoNames(username="datapink")  

poi = []
for toponym in spatial_ent:
    try:
        loc = g.geocode(toponym)
        poi.append([toponym, loc, loc.latitude, loc.longitude])
        print(toponym, '->', loc, (loc.latitude, loc.longitude))
        
    except:
        print(toponym, ': NO GEOCODING\n')

Iowa -> Iowa, IA, US (42.00027, -93.50049)
Davenport -> Davenport, IA, US (41.52364, -90.57764)
Buffalo -> Buffalo, NY, US (42.88645, -78.87837)
Scott County -> Davenport, IA, US (41.52364, -90.57764)


In [7]:
# And Map them all, with folium

import folium 
map = folium.Map(tiles='CartoDB dark_matter')

for p in poi:
    folium.CircleMarker( location =  [ p[2], p[3] ], 
                         popup=str(p[0]), 
                         color='#ff1493', 
                         fill_color='#ff1493', 
                         fill=True           
                       ).add_to(map)   
    
display(map)

In [8]:
# An another, and more elaborate, approach to perform a TextToMap, 
# is to use Mordecai.
# cf: https://github.com/openeventdata/mordecai

# Mordecai use also Spacy NER model, to retrieve spatial entities,
# but add several treatments to improve toponym/Geonames matching
# and will also rank geocoding confidence (country based) 

# Mordecai models are TensorFlow based

from mordecai import Geoparser
geo = Geoparser()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [9]:
# Perform Text2Geo Treatment 

res = geo.geoparse(abstract)

import pprint as pp
pp.pprint(res)

[{'country_conf': 0.93927664,
  'country_predicted': 'USA',
  'geo': {'admin1': 'Arkansas',
          'country_code3': 'USA',
          'feature_class': 'P',
          'feature_code': 'PPL',
          'geonameid': '4130357',
          'lat': '34.69648',
          'lon': '-92.09626',
          'place_name': 'Scott'},
  'spans': [{'end': 5, 'start': 0}],
  'word': 'Scott'},
 {'country_conf': 0.9971506,
  'country_predicted': 'USA',
  'geo': {'admin1': 'Iowa',
          'country_code3': 'USA',
          'feature_class': 'A',
          'feature_code': 'ADM1',
          'geonameid': '4862182',
          'lat': '42.00027',
          'lon': '-93.50049',
          'place_name': 'Iowa'},
  'spans': [{'end': 4, 'start': 0}],
  'word': 'Iowa'},
 {'country_conf': 0.9971506,
  'country_predicted': 'USA',
  'geo': {'admin1': 'Iowa',
          'country_code3': 'USA',
          'feature_class': 'P',
          'feature_code': 'PPLA2',
          'geonameid': '4853423',
          'lat': '41.52364',
     

In [10]:
# We keep only the ones with a decent confidence rate

poi = {}
for r in res:
    try:
        if r['country_conf'] > 0.8 and 'County' not in r['geo']['place_name']:
            poi[r['word']] = (r['geo']['lat'], r['geo']['lon'])     
    except:
          continue
            
pp.pprint(poi)

{'Buffalo': ('42.88645', '-78.87837'),
 'Davenport': ('41.52364', '-90.57764'),
 'Iowa': ('42.00027', '-93.50049'),
 'Scott': ('34.69648', '-92.09626'),
 'United States': ('39.76', '-98.5')}


In [11]:
#And Map them

import folium 

map = folium.Map(tiles='CartoDB dark_matter')

for toponym, lonlat in poi.items():
    folium.CircleMarker( location =  [ float(lonlat[0]), float(lonlat[1]) ], 
                         popup=str(toponym), 
                         color='#ff1493', 
                         fill_color='#ff1493', 
                         fill=True
                        
                       ).add_to(map)   
    
display(map)