# Extracting and Mapping Locations from Around the World in Eighty Days

* [Gutenberg Book Text](https://www.gutenberg.org/files/103/103-0.txt)

In [53]:
import re
from pathlib import Path

import altair as alt
import spacy
from geopy.geocoders import Nominatim
from vega_datasets import data


# --- Configuration -----------------------------------------------------------
# Plain-text copy of the novel, relative to the notebook's working directory
# (presumably the Project Gutenberg text linked in the header — confirm).
BOOK_PATH = Path('data/around-the-world-in-eighty-days.txt')
# A chapter heading on a line of its own, e.g. "CHAPTER XII." — the numeral may
# only use the letters I, V and X.
CHAPTER_HEADING = re.compile(r'\nCHAPTER\s+[IVX]+\.\n')
# Number of chapters the split is expected to produce; guards against a
# truncated file or a heading format the pattern does not match.
EXPECTED_CHAPTERS = 37

# --- Shared objects ----------------------------------------------------------
geolocator = Nominatim(user_agent='Around The World in Eighty Days')
nlp = spacy.load('en_core_web_lg')

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default (NOTE(review): assumes the local copy is UTF-8, as the
# Gutenberg "-0.txt" downloads are — confirm).
text = BOOK_PATH.read_text(encoding='utf-8')

# Splitting on the headings yields [front matter, chapter 1, chapter 2, ...];
# the leading front-matter element is discarded.
chapters = CHAPTER_HEADING.split(text)[1:]
assert len(chapters) == EXPECTED_CHAPTERS, (
    f'expected {EXPECTED_CHAPTERS} chapters, found {len(chapters)}'
)

In [54]:
# Run the spaCy pipeline over the first chapter only; the later cells read
# their entities from this `doc`.
first_chapter = chapters[0]
doc = nlp(first_chapter)

In [50]:
# Keep only the entities spaCy labelled 'GPE' and record, for each mention,
# its label, lemma, containing sentence and character offset within `doc`.
locations = [
    {
        'label': entity.label_,
        'name': entity.lemma_,
        'sentence': entity.sent,
        'start_char': entity.start_char,
    }
    for entity in doc.ents
    if entity.label_ == 'GPE'
]

In [58]:
# Scratch left from exploration: show 20 characters either side of a mention,
# or dump the extracted list.
#location.doc.text[location.start_char-20:location.start_char+20]
#locations
# NOTE(review): `gc` is not defined or imported in any visible cell, so this
# line fails on a fresh kernel. The record shape in the output (geonameid,
# countrycode, alternatenames, ...) suggests a geonamescache.GeonamesCache
# instance — confirm, then import and construct it alongside the other setup.
# The lookup returns more than one city named "London" (the output lists both
# a CA and a GB entry), so a name alone does not identify a place.
gc.search_cities('London')

[{'geonameid': 6058560,
  'name': 'London',
  'latitude': 42.98339,
  'longitude': -81.23304,
  'countrycode': 'CA',
  'population': 346765,
  'timezone': 'America/Toronto',
  'admin1code': '08',
  'alternatenames': ['Landona',
   'London',
   'Londonas',
   'Londono',
   'YXU',
   'leondeon',
   'lndn',
   'lndn  antaryw',
   'londoni',
   'lun dui',
   'lun dun',
   'lwndwn',
   'rondon',
   'Лондон',
   'לונדון',
   'لندن',
   'لندن، انتاریو',
   'لندن، اونٹاریو',
   'ლონდონი',
   'ロンドン',
   '伦敦',
   '런던']},
 {'geonameid': 2643743,
  'name': 'London',
  'latitude': 51.50853,
  'longitude': -0.12574,
  'countrycode': 'GB',
  'population': 7556900,
  'timezone': 'Europe/London',
  'admin1code': 'ENG',
  'alternatenames': ['ILondon',
   'LON',
   'Lakana',
   'Landan',
   'Landen',
   'Ljondan',
   'Llundain',
   'Lodoni',
   'Londain',
   'Londan',
   'Londar',
   'Londe',
   'Londen',
   'Londin',
   'Londinium',
   'Londino',
   'Londn',
   'London',
   'London osh',
   'Londona',
 

In [62]:
# Land outlines: the 'land' feature of the vega_datasets 110m world TopoJSON.
source = alt.topo_feature(data.world_110m.url, 'land')

# NOTE(review): placeholder data. The points layer still reads the
# vega_datasets airports CSV and groups it by `state`; it does not yet plot
# the `locations` extracted from the book, despite the title and legend text.
airports = data.airports.url

# One circle per `state`: positioned at the mean latitude/longitude of its
# rows and sized by the number of rows.
points = (
    alt.Chart(airports)
    .transform_aggregate(
        latitude='mean(latitude)',
        longitude='mean(longitude)',
        count='count()',
        groupby=['state'],
    )
    .mark_circle()
    .encode(
        longitude='longitude:Q',
        latitude='latitude:Q',
        size=alt.Size('count:Q', title='Number of Mentions'),
        color=alt.value('steelblue'),
        tooltip=['state:N', 'count:Q'],
    )
    .properties(title='Locations in Around the World in Eighty Days')
)

# Unfilled land outline in the Natural Earth projection; this layer also sets
# the size of the combined figure.
background = alt.layer(
    alt.Chart(source).mark_geoshape(fill=None, stroke='black')
).project(
    'naturalEarth1'
).properties(width=600, height=400)

# `configure_*` is a top-level setting, so it is applied to the final layered
# chart rather than to `background`. A sub-chart that carries config is
# rejected when passed to `alt.layer(...)` or added on the right of `+`, so
# keeping it off the layers lets them be combined in any order.
(background + points).configure_view(stroke=None)

In [71]:
# Inspect the Vega-Lite spec generated for `points` (data URL, encodings and
# the aggregate transform) as a plain dict.
points.to_dict()

{'config': {'view': {'continuousWidth': 400, 'continuousHeight': 300}},
 'data': {'url': 'https://cdn.jsdelivr.net/npm/vega-datasets@v1.29.0/data/airports.csv'},
 'mark': 'circle',
 'encoding': {'color': {'value': 'steelblue'},
  'latitude': {'field': 'latitude', 'type': 'quantitative'},
  'longitude': {'field': 'longitude', 'type': 'quantitative'},
  'size': {'field': 'count',
   'title': 'Number of Mentions',
   'type': 'quantitative'},
  'tooltip': [{'field': 'state', 'type': 'nominal'},
   {'field': 'count', 'type': 'quantitative'}]},
 'title': 'Locations in Around the World in Eighty Days',
 'transform': [{'aggregate': [{'op': 'mean',
     'field': 'latitude',
     'as': 'latitude'},
    {'op': 'mean', 'field': 'longitude', 'as': 'longitude'},
    {'op': 'count', 'as': 'count'}],
   'groupby': ['state']}],
 '$schema': 'https://vega.github.io/schema/vega-lite/v4.17.0.json'}