# Extracting and Mapping Locations from Around the World in Eighty Days

* [Gutenberg Book Text](https://www.gutenberg.org/files/103/103-0.txt)

In [1]:
import re

#import altair as alt
import spacy

from pathlib import Path

from geopy.geocoders import Nominatim
from vega_datasets import data


geolocator = Nominatim(user_agent='Around The World in Eighty Days')
nlp = spacy.load('en_core_web_lg')
text = Path('data/around-the-world-in-eighty-days.txt').read_text()
chapters = re.split(re.compile(r'\nCHAPTER\s+[IVX]+\.\n'), text)[1:]
assert len(chapters) == 37

In [2]:
doc = nlp(chapters[0])

In [22]:
gpes = []

for ent in doc.ents:
    if 'GPE' == ent.label_:
        gpes.append({
            'name': ent.lemma_,
            'sentence': ent.sent,
            'start_char': ent.start_char
        })

In [25]:
locations = {}

for gpe in gpes:
    name = gpe['name']
    if name in locations:
        locations[name]['references'].append(gpe)
    else:
        meta = geolocator.geocode(name)
        locations[name] = {
            'references': [gpe],
            'class': meta.raw['class'],
            'display_name': meta.address,
            'latitude': meta.latitude,
            'longitude': meta.longitude,
            'type': meta.raw['type']
        }

In [26]:
locations

{'London': {'references': [{'label': 'GPE',
    'name': 'London',
    'sentence': He was never seen on ’Change, nor at the Bank, nor in the
    counting-rooms of the “City”; no ships ever came into London docks of
    which he was the owner; he had no public employment; he had never been
    entered at any of the Inns of Court, either at the Temple, or Lincoln’s
    Inn, or Gray’s Inn; nor had his voice ever resounded in the Court of
    Chancery, or in the Exchequer, or the Queen’s Bench, or the
    Ecclesiastical Courts.,
    'start_char': 796},
   {'label': 'GPE',
    'name': 'London',
    'sentence': It was at least certain that Phileas Fogg had not absented himself from
    London for many years.,
    'start_char': 3212}],
  'class': 'place',
  'display_name': 'London, Greater London, England, United Kingdom',
  'latitude': 51.5073219,
  'longitude': -0.1276474,
  'type': 'city'},
 'Inn': {'references': [{'label': 'GPE',
    'name': 'Inn',
    'sentence': He was never seen on ’Cha

In [62]:
# Source of land data
source = alt.topo_feature(data.world_110m.url, 'land')

# airport positions on background
airports = data.airports.url

points = alt.Chart(airports).transform_aggregate(
    latitude='mean(latitude)',
    longitude='mean(longitude)',
    count='count()',
    groupby=['state']
).mark_circle().encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    size=alt.Size('count:Q', title='Number of Mentions'),
    color=alt.value('steelblue'),
    tooltip=['state:N','count:Q']
).properties(
    title='Locations in Around the World in Eighty Days'
)

# Layering and configuring the components
background = alt.layer(
    alt.Chart(source).mark_geoshape(fill=None, stroke='black')
).project(
    'naturalEarth1'
).properties(width=600, height=400).configure_view(stroke=None)

background + points

In [71]:
points.to_dict()

{'config': {'view': {'continuousWidth': 400, 'continuousHeight': 300}},
 'data': {'url': 'https://cdn.jsdelivr.net/npm/vega-datasets@v1.29.0/data/airports.csv'},
 'mark': 'circle',
 'encoding': {'color': {'value': 'steelblue'},
  'latitude': {'field': 'latitude', 'type': 'quantitative'},
  'longitude': {'field': 'longitude', 'type': 'quantitative'},
  'size': {'field': 'count',
   'title': 'Number of Mentions',
   'type': 'quantitative'},
  'tooltip': [{'field': 'state', 'type': 'nominal'},
   {'field': 'count', 'type': 'quantitative'}]},
 'title': 'Locations in Around the World in Eighty Days',
 'transform': [{'aggregate': [{'op': 'mean',
     'field': 'latitude',
     'as': 'latitude'},
    {'op': 'mean', 'field': 'longitude', 'as': 'longitude'},
    {'op': 'count', 'as': 'count'}],
   'groupby': ['state']}],
 '$schema': 'https://vega.github.io/schema/vega-lite/v4.17.0.json'}