# Extracting and Mapping Locations from Around the World in Eighty Days

* [Gutenberg Book Text](https://www.gutenberg.org/files/103/103-0.txt)

In [10]:
import re
from pathlib import Path

import altair as alt
import spacy
from geonamescache import GeonamesCache
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from vega_datasets import data


# Offline gazetteer; min_city_population=500 sets the city population cutoff.
gc = GeonamesCache(min_city_population=500)
country_names = {country.get('name') for country in gc.get_countries().values()}

# The user_agent string identifies this notebook to the Nominatim service.
geolocator = Nominatim(user_agent='Around The World in Eighty Days')
nlp = spacy.load('en_core_web_lg')

# Decode explicitly instead of relying on the platform default encoding
# (presumably the file is a copy of the UTF-8 Gutenberg text linked above).
text = Path('data/around-the-world-in-eighty-days.txt').read_text(encoding='utf-8')

# Split on the "CHAPTER <roman numeral>." heading lines; the first piece is
# everything before the first heading, so it is dropped.
chapters = re.split(r'\nCHAPTER\s+[IVX]+\.\n', text)[1:]
assert len(chapters) == 37, f'expected 37 chapters, found {len(chapters)}'

In [11]:
# `chapters` excludes the text before the first chapter heading, so index 10
# selects the eleventh chapter of the book.
doc = nlp(chapters[10])

In [12]:
# One record per entity that spaCy labelled as GPE, in document order: the
# lemmatised entity text, the sentence span containing it, and the entity's
# starting character offset.
gpes = [
    {
        'name': entity.lemma_,
        'sentence': entity.sent,
        'start_char': entity.start_char
    }
    for entity in doc.ents
    if entity.label_ == 'GPE'
]

In [39]:
countries = {}  # reserved for the country/city split described in the TODO below
locations = {}  # name -> {'references': [sentence, ...], 'latitude', 'longitude'}
unresolved = set()  # names the geocoder could not place; never queried twice

# Throttle calls to the public Nominatim service to at most one per second.
# With its default settings RateLimiter also retries failed requests and
# returns None if they keep failing, which is handled like "not found" below.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

for gpe in gpes:
    name = gpe['name']
    sentence = gpe['sentence']

    if name in locations:
        # Already geocoded: just record the additional mention.
        locations[name]['references'].append(sentence)
        continue
    if name in unresolved:
        # A previous lookup for this name came back empty; skip the request.
        continue

    resp = geocode(name, featuretype='city')
    if not resp:
        unresolved.add(name)
        continue

    locations[name] = {
        'references': [sentence],
        'latitude': resp.latitude,
        'longitude': resp.longitude
    }

# TODO: check whether a location is a country rather than a city (e.g. via
# `country_names`, the geocoder's raw 'type', or gc.search_cities), collect
# country mentions in `countries`, colour countries by frequency of mentions,
# and draw a circle for every other location.


In [42]:
# Probe: with featuretype='city' the country name 'England' still resolves,
# but to city-level matches rather than to the country itself (see the output).
geolocator.geocode('England', exactly_one=False, featuretype='city')

[Location(Worcester, Worcestershire, England, United Kingdom, (52.1911849, -2.2206585, 0.0)),
 Location(England, Lonoke County, Arkansas, United States, (34.5442609, -91.9690285, 0.0))]

In [5]:
# Source of land data
source = alt.topo_feature(data.world_110m.url, 'land')

# One record per geocoded place name; `count` is the number of sentences in
# which the name was mentioned.
mention_records = [
    {
        'name': place,
        'latitude': info['latitude'],
        'longitude': info['longitude'],
        'count': len(info['references'])
    }
    for place, info in locations.items()
]

# Circles for the book's locations, sized by how often each one is mentioned.
# Inline records carry no dtype information, so every encoding states its type.
points = alt.Chart(alt.Data(values=mention_records)).mark_circle().encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    size=alt.Size('count:Q', title='Number of Mentions'),
    color=alt.value('steelblue'),
    tooltip=['name:N', 'count:Q']
).properties(
    title='Locations in Around the World in Eighty Days'
)

# Layering and configuring the components. `background` is built with
# alt.layer, so adding `points` appends a layer and the view configuration
# stays on the top-level chart.
background = alt.layer(
    alt.Chart(source).mark_geoshape(fill=None, stroke='black')
).project(
    'naturalEarth1'
).properties(width=600, height=400).configure_view(stroke=None)

background + points

In [19]:
# Probe: the country name 'India' also matches a city-level entry (the output
# shows one with countrycode 'GM' and population 529), so a city-name lookup
# alone cannot distinguish countries from cities.
gc.get_cities_by_name('India')

[{'2413391': {'geonameid': 2413391,
   'name': 'India',
   'latitude': 13.56667,
   'longitude': -15.75,
   'countrycode': 'GM',
   'population': 529,
   'timezone': 'Africa/Banjul',
   'admin1code': '07',
   'alternatenames': ['']}}]

In [20]:
# Number of distinct place names for which the geocoder returned a result.
len(locations)

7