In [131]:
import spacy
from spacy import displacy
from spacy.tokens import Span
import pandas as pd
import nltk
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/jacobo/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [132]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

NLTK provides a very small version of the gutenberg corpus. So, I downloaded and cleaned manually a text from Gutenberg  that suited better my needs, namely "From Pole to Pole" by Sven Anders Hedin.

In [133]:
text = open("FromPoleToPole.txt").read()

Now I load the English language model and process the tex with it, what result into an annotated document that I name 'doc'.

In [134]:

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

Since *From Pole to Pole* mentions a lot of places, and locattions, I want to extract all Named Entities. 

In [135]:
for ent in doc.ents:
    print(ent, ent.label_)

Stockholm GPE
all night TIME
next morning TIME
Sweden GPE
Trelleborg ORG
the Baltic Sea LOC
Stockholm GPE
yesterday DATE
evening TIME
the Baltic Sea LOC
Berlin GPE
Germany GPE
quay GPE
Swedish NORP
the Baltic Sea LOC
370 feet QUANTITY
first ORDINAL
Swedish NORP
German NORP
Swedish NORP
German NORP
the day DATE
Trelleborg ORG
Baltic LOC
Swedes NORP
Danes NORP
Russians NORP
Germans NORP
summer days DATE
Four hours TIME
Rügen ORG
Germany GPE
Sassnitz ORG
German NORP
a few minutes TIME
German NORP
Rügen ORG
Rügen ORG
the Baltic Sea LOC
Sweden GPE
Rügen PERSON
Stralsund PERSON
Swedish NORP
Gustavus Adolphus PERSON
Stralsund Charles XII. PERSON
a year DATE
Pomerania GPE
Brandenburg PERSON
autumn evening TIME
North German NORP
Sweden GPE
Swedish NORP
the winter DATE
North German NORP
Swedish NORP
Scandinavian NORP
Europe LOC
the Ice Age EVENT
Scandinavia LOC
the Baltic Sea LOC
northern Germany GPE
Scandinavia LOC
the night TIME
one CARDINAL
third ORDINAL
Europe LOC
Berlin GPE
BERLIN ORG
Europ

|Type Label|Description|
|:---:|:---:|
|PERSON|People, including fictional.|
|NORP|Nationalities or religious or political groups.|
|FAC|Buildings, airports, highways, bridges, etc.|
|ORG|Companies, agencies, institutions, etc.|
|GPE|Countries, cities, states.|
|LOC|Non-GPE locations, mountain ranges, bodies of water.|
|PRODUCT|Objects, vehicles, foods, etc. (Not services.)|
|EVENT|Named hurricanes, battles, wars, sports events, etc.|
|WORK_OF_ART|Titles of books, songs, etc.|
|LAW|Named documents made into laws.|
|LANGUAGE|Any named language.|
|DATE|Absolute or relative dates or periods.|
|TIME|Times smaller than a day.|
|PERCENT|Percentage, including ”%“.|
|MONEY|Monetary values, including unit.|
|QUANTITY|Measurements, as of weight or distance.|
|ORDINAL|“first”, “second”, etc.|
|CARDINAL|Numerals that do not fall under another type.|

We can display the Named Entities using spaCy displacy module. 

In [138]:

options = {"compact": True}


#displacy.render(doc, style='dep', options = {"compact": True})
displacy.render(list(doc.sents)[0:10], style='ent', options=options)



I proceed to filter the GPE entities only. 

In [140]:
places = []
for ent in doc.ents:
    if ent.label_ == "GPE":
        places.append(f'{ent}')
       

print(places[0:20])



['Stockholm', 'Sweden', 'Stockholm', 'Berlin', 'Germany', 'quay', 'Germany', 'Sweden', 'Pomerania', 'Sweden', 'northern Germany', 'Berlin', 'North Germany', 'Berlin', 'Berlin', 'Germany', 'Russia', 'Berlin', 'London', 'Paris']


I could try and turn the list of places into a pandas dataframe.

In [141]:

df = pd.DataFrame()

df['place'] = places


and remove duplicates.

In [142]:
df = df.drop_duplicates()
df

Unnamed: 0,place
0,Stockholm
1,Sweden
3,Berlin
4,Germany
5,quay
...,...
757,Formosa
778,Manchuria
779,the Japanese Korea
791,overran


As we can see there are some word that were misinterpreted as places like quay. I could fix this by removing all words in lowercase as names of places are capitalized. 

In [None]:
#This is the short way to do it with a list:

places = list(set([ent.text for ent in doc.ents if ent.label_ == "GPE" and ent.text[0].isupper()]))

In [143]:
#But I prefer loops because they are easier to read. 
places = []
for ent in doc.ents:
    if ent.label_ == "GPE" and ent.text[0].isupper():
        if ent.text not in places:
            places.append(ent.text)


In [144]:
print(places)

['Stockholm', 'Sweden', 'Berlin', 'Germany', 'Pomerania', 'North Germany', 'Russia', 'London', 'Paris', 'St.\nPetersburg', 'Rome', 'Hamburg', 'Vienna', 'France', 'Prussia', 'China', 'Japan', 'Tibet', 'Bavaria', 'Saxony', 'Lübeck', 'Munich', 'Dresden', 'Leipzig', 'Cologne', 'England', 'Scotland', 'Austria', 'Brandenburg', 'Bohemia', 'Prague', "St.\nStephen's Church", 'Chechs', 'Magyars', 'Hungary', 'Polacks', 'Galicia', 'Baden', 'Rumania', 'Bulgaria', 'Budapest', 'South Germany', 'Alps', 'Magyar', 'Belgrade', 'Servia', 'Sofia', 'States', 'Albania', 'Macedonia', 'Greece', 'Turkey', 'Constantinople', 'Stambul', 'St.\nSophia', 'Asia Minor', 'Caucasia', 'Hellespont', 'Dardanelles', 'Asiatic', 'Jerusalem', 'Mecca', 'Persia', 'Turkestan', 'Arabia', 'India', 'Nishapur', 'Badakshan', 'Bahrein', 'North Africa', 'Afghanistan', 'Cairo', 'Tatars', 'Crimea', 'Samarkand', 'Bokhara', 'Terapia', 'Wales', 'Scania', 'Batum', 'Trebizond', 'Tabriz', 'Teheran', 'Turkish Armenia', 'Erzerum', 'BAGHDAD', 'Meso

Since I am going to study this geographical fiction, I will plot it as a map in simple way (there are many ways to do this).

In [145]:
#First I get the coordinates of the places with the service NOMINATIM from OpenStreet
from geopy.geocoders import Nominatim

# create a geocoder object for the Nominatim service
geolocator = Nominatim(user_agent="myMap")

# create an empty list to store the coordinates
coordinates = []

# iterate over each place in the list
for place in places:
    # geocode the place using Nominatim
    location = geolocator.geocode(place, exactly_one=True)

    # extract the coordinates from the location object
    if location is not None:
        lat = location.latitude
        lng = location.longitude
        coordinates.append((place, lat, lng))

print(coordinates)


[('Stockholm', 59.3251172, 18.0710935), ('Sweden', 59.6749712, 14.5208584), ('Berlin', 52.5170365, 13.3888599), ('Germany', 51.1638175, 10.4478313), ('Pomerania', 54.24556, 18.1099), ('North Germany', 52.404886950000005, 9.712372836460514), ('Russia', 64.6863136, 97.7453061), ('London', 51.5073359, -0.12765), ('Paris', 48.8588897, 2.3200410217200766), ('St.\nPetersburg', 27.7703796, -82.6695085), ('Rome', 41.8933203, 12.4829321), ('Hamburg', 53.550341, 10.000654), ('Vienna', 48.2083537, 16.3725042), ('France', 46.603354, 1.8883335), ('Prussia', 41.3738765, -94.506357), ('China', 35.000074, 104.999927), ('Japan', 36.5748441, 139.2394179), ('Tibet', 29.8556443, 90.8749807), ('Bavaria', 48.9467562, 11.4038717), ('Saxony', 50.9295798, 13.4585052), ('Lübeck', 53.866444, 10.684738), ('Munich', 48.1371079, 11.5753822), ('Dresden', 51.0493286, 13.7381437), ('Leipzig', 51.3406321, 12.3747329), ('Cologne', 50.938361, 6.959974), ('England', 52.5310214, -1.2649062), ('Scotland', 56.7861112, -4.114

Now that I have the coordinate, I plot the map. 

In [146]:
import folium

# create a map centered on the first coordinate in the list
map_center = coordinates[0][1], coordinates[0][2]
m = folium.Map(location=map_center, zoom_start=10)

# add a marker for each coordinate in the list
for place, lat, lng in coordinates:
    tooltip = place
    marker = folium.Marker([lat, lng], tooltip=tooltip)
    marker.add_to(m)

# display the map
m


But I want more than the map. It would be nice I could track the movement in space of people and objects.  For this I will use other annotations provided by the spaCy doc. Let's take a look at the syntax of one sentence from the book which describes movement in space:

In [147]:
text = 'The traveller goes from Paris to St. Petersburg'

doc = nlp(text)

options = {"compact": True, "bg": "#9FA6B2",
           "color": "black"}


displacy.render(doc, style='dep', options = {"compact": True})
#displacy.render(list(doc.sents)[0], style='dep', options=options)

In [151]:
#We can use spacy.explain() function to know what the technical termns mean

spacy.explain("nsubj")
#spacy.explain("pobj")
#spacy.explain("PROPN")

'proper noun'


| POS | Description |
| --- | ----------- |
| ADJ | adjective |
| ADP| adposition |
| ADV | adverb |
| AUX | auxiliary |
| CCONJ|  coordinating conjunction |
| DET | determiner |
| INTJ | interjection |
| NOUN | noun |
| NUM | numeral |
| PART | particle |
| PRON | pronoun |
| PROPN | proper noun |
| PUNCT | punctuation |
| SCONJ | subordinating conjunction |
| SYM | symbol |
| VERB | verb |
| X | other |

The terminology comes from the Universal Dependencies project. UD treebanks are used to train language models. Take a look at a sample of the training data:

```
# sent_id = GUM_academic_art-8
# s_prominence = 2
# s_type = wh
# transition = establishment
# text = Which elements of specific artworks do they focus on?
1	Which	which	DET	WDT	PronType=Int	2	det	2:det	Discourse=joint-list_m:8->7:0|Entity=(21-abstract-new-cf2-2-sgl
2	elements	element	NOUN	NNS	Number=Plur	8	obl	8:obl:on	_
3	of	of	ADP	IN	_	5	case	5:case	_
4	specific	specific	ADJ	JJ	Degree=Pos	5	amod	5:amod	Entity=(22-object-new-cf3-2-sgl
5	artworks	artwork	NOUN	NNS	Number=Plur	2	nmod	2:nmod:of	Entity=22)21)
6	do	do	AUX	VBP	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	8	aux	8:aux	_
7	they	they	PRON	PRP	Case=Nom|Number=Plur|Person=3|PronType=Prs	8	nsubj	8:nsubj	Entity=(19-person-giv:act-cf1*-1-ana)
8	focus	focus	VERB	VB	VerbForm=Inf	0	root	0:root	_
9	on	on	ADP	IN	_	2	case	2:case	SpaceAfter=No
10	?	?	PUNCT	.	_	8	punct	8:punct	_
```

Now that we have an idea of the syntax of the sentence, we can proceed to extract some travel information using a syntantic machter:

In [153]:
import spacy
from spacy.matcher import DependencyMatcher
from spacy.tokens import Span



# Define the pattern for matching the "go to" construction
pattern = [
    {
        "RIGHT_ID": "verb",
        "RIGHT_ATTRS": {"LEMMA": "go"},
        #"RIGHT_ATTRS": {"POS": "VERB"},
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
    {   "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "preposition",
        "RIGHT_ATTRS": {"DEP": "prep"},
        "RIGHT_ATTRS": {"LEMMA": "to"},
    },
    {   "LEFT_ID": "preposition",
        "REL_OP": ">",
        "RIGHT_ID": "place",
        "RIGHT_ATTRS": {"ENT_TYPE": "GPE"},
    }
]

# Create a dependency matcher object
matcher = DependencyMatcher(nlp.vocab)
matcher.add("go_to", [pattern])

# Define a sample text
#text = "John went to Paris. Mary will go to Rome tomorrow."
text = open("FromPoleToPole.txt").read()

# Process the text with the nlp object
doc = nlp(text)

dep_matches = matcher(doc)

#match_id, token_ids = matches[1]
#for i in range(len(token_ids)):
#    print(pattern[i]["RIGHT_ID"] + ":", doc[token_ids[i]].text)

print(dep_matches)

[(14076869952056337076, [1698, 1697, 1701, 1704]), (14076869952056337076, [68321, 68315, 68323, 68324])]


In [154]:
# Loop over each tuple in the list 'dep_matches'
for match in dep_matches:
    
    # Take the first item in the tuple at [0] and assign it under
    # the variable 'pattern_name'. This item is a spaCy Lexeme object.
    pattern_name = match[0]
    
    # Take the second item in the tuple at [1] and assign it under
    # the variable 'matches'. This is a list of indices referring to the
    # Doc object under 'doc' that we just matched.
    matches = match[1]
    
    # Let's unpack the matches list into variables for clarity
    verb, subject, prep, place = matches[0], matches[1], matches[2], matches[3]
    4
    # Print the matches by first fetching the name of the pattern from the 
    # Vocabulary object. Next, use the 'subject' and 'verb' variables to 
    # index the Doc object. This gives us the actual Tokens matched. Use a
    # tabulator ('\t') and some stops ('...') to separate the output.
    print(nlp.vocab[pattern_name].text, '\t', doc[subject], doc[verb], doc[prep], doc[place])

go_to 	 traveller goes to Petersburg
go_to 	 opium goes to China


Since we are getting to few matches we need to consider adding more verbs or making the patterns more flexible.  Let take a look at the verbs in the texts:

In [155]:
verbs = []
for token in doc:
    if token.pos_ == "VERB":
       verbs.append(token.lemma_)

#turn list to set to eliminate duplicates:
verbs = sorted(set(verbs))       

print(verbs)

['ARABIA', 'BUKHARA', 'abandon', 'abash', 'abdicate', 'abound', 'absorb', 'abstain', 'accommodate', 'accompany', 'accord', 'accustom', 'achieve', 'acknowledge', 'acquaint', 'acquire', 'act', 'adhere', 'admire', 'adopt', 'adorn', 'advance', 'affect', 'affix', 'afford', 'agree', 'aim', 'allow', 'ally', 'alter', 'amount', 'amuse', 'anchor', 'animate', 'annex', 'announce', 'annul', 'answer', 'appear', 'apply', 'appoint', 'appreciate', 'approach', 'arise', 'arm', 'arouse', 'arrange', 'arrive', 'articulate', 'ascend', 'ascertain', 'ask', 'assassinate', 'assemble', 'assert', 'assume', 'assure', 'attach', 'attack', 'attain', 'attempt', 'attend', 'attract', 'augment', 'avail', 'avoid', 'await', 'awake', 'awaken', 'back', 'baku', 'balance', 'bar', 'barebacke', 'barefoote', 'bareheade', 'barge', 'bark', 'barter', 'bathe', 'be', 'bear', 'beat', 'beckon', 'become', 'befall', 'beg', 'begin', 'behave', 'behold', 'believe', 'belong', 'bend', 'benefit', 'benumb', 'beseech', 'beset', 'bestow', 'betake',

Since there are too many verbs, we filter them using a list of synonyms that I can extract from Wordnet, using nltk. 

In [156]:

from nltk.corpus import wordnet
nltk.download('wordnet')

synonyms = []

for syn in wordnet.synsets("go"):
    for lemma in syn.lemmas():
        if lemma.name() in verbs:
            synonyms.append(lemma.name())

print(set(synonyms))


{'lead', 'pass', 'perish', 'move', 'choke', 'fling', 'offer', 'work', 'start', 'turn', 'belong', 'live', 'travel', 'get', 'run', 'become', 'fit', 'go', 'whirl', 'sound', 'depart', 'crack', 'die', 'extend', 'survive', 'last', 'fail', 'break', 'proceed', 'blend'}


[nltk_data] Downloading package wordnet to /Users/jacobo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [157]:
import spacy
from spacy.matcher import DependencyMatcher
from spacy.tokens import Span


# Define the pattern for matching the "go to" construction
pattern = [
    {
        "RIGHT_ID": "verb",
    #Here I add the list of verbs to the lemma the matcher should check. 
        "RIGHT_ATTRS": {"LEMMA": {"IN": synonyms,"POS": "VERB"}}, 

    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
    {   "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "preposition",
        "RIGHT_ATTRS": {"DEP": "prep"},
        "RIGHT_ATTRS": {"LEMMA": "to"},
    },
    {   "LEFT_ID": "preposition",
        "REL_OP": ">",
        "RIGHT_ID": "place",
        "RIGHT_ATTRS": {"ENT_TYPE": "GPE"},
    }
]

# Create a dependency matcher object
matcher = DependencyMatcher(nlp.vocab)
matcher.add("go_to", [pattern])

# Define a sample text
#text = "John went to Paris. Mary will go to Rome tomorrow."
text = open("FromPoleToPole.txt").read()

# Process the text with the nlp object
doc = nlp(text)

dep_matches = matcher(doc)

#match_id, token_ids = matches[1]
#for i in range(len(token_ids)):
#    print(pattern[i]["RIGHT_ID"] + ":", doc[token_ids[i]].text)

# Loop over each tuple in the list 'dep_matches'
for match in dep_matches:
    
    # Take the first item in the tuple at [0] and assign it under
    # the variable 'pattern_name'. This item is a spaCy Lexeme object.
    pattern_name = match[0]
    
    # Take the second item in the tuple at [1] and assign it under
    # the variable 'matches'. This is a list of indices referring to the
    # Doc object under 'doc' that we just matched.
    matches = match[1]
    
    # Let's unpack the matches list into variables for clarity
    verb, subject, prep, place = matches[0], matches[1], matches[2], matches[3]
    4
    # Print the matches by first fetching the name of the pattern from the 
    # Vocabulary object. Next, use the 'subject' and 'verb' variables to 
    # index the Doc object. This gives us the actual Tokens matched. Use a
    # tabulator ('\t') and some stops ('...') to separate the output.
    print(nlp.vocab[pattern_name].text, '\t', doc[subject], doc[verb], doc[prep], doc[place])

go_to 	 traveller goes to Petersburg
go_to 	 which runs to Teheran
go_to 	 I travelled to Teheran
go_to 	 he offered to Nereids
go_to 	 opium goes to China
