# Filter place references using the Wikidata results

Using the data on countries provided by Wikidata, we can filter the place references extracted from the texts to only include references to countries. 

In [4]:
import pandas as pd
import spacy
import re
from pathlib import Path
import itertools
from tqdm.auto import tqdm
from spacy.matcher import Matcher, PhraseMatcher
nlp = spacy.load("en_core_web_sm")

In [5]:
# Load the NER references
refs = pd.read_csv("spacy_ner.txt", delimiter="\t", header=None, names=["book_id", "text"], dtype={"book_id": str, "text": str})

In [6]:
refs

Unnamed: 0,book_id,text
0,00000003,Scotland
1,00000003,New Jersey
2,00000003,Los Angeles
3,00000003,Aberdeen
4,00000003,Joannes
...,...,...
7762987,12040473,Orel
7762988,12040473,las
7762989,12040473,English Languages
7762990,12040473,Vorwort


In [7]:
# Load the countries data
countries = pd.read_json("countries.ndjson", lines=True)

In [8]:
countries

Unnamed: 0,text,country,startDate,endDate,countryLabel,countryTypeLabel,lat,lon
0,Japan,http://www.wikidata.org/entity/Q205662,1603-01-01T00:00:00Z,1868-01-01T00:00:00Z,Tokugawa Shogunate,historical country,,
1,Scotland,http://www.wikidata.org/entity/Q230791,0843-01-01T00:00:00Z,1707-04-30T00:00:00Z,Kingdom of Scotland,historical country,,
2,England,http://www.wikidata.org/entity/Q179876,0927-01-01T00:00:00Z,1707-05-11T00:00:00Z,Kingdom of England,historical country,52.066667,-1.316667
3,Japan,http://www.wikidata.org/entity/Q188712,1868-01-03T00:00:00Z,1947-05-03T00:00:00Z,Empire of Japan,historical country,35.683333,139.766667
4,Ireland,http://www.wikidata.org/entity/Q215530,1542-01-01T00:00:00Z,1801-01-01T00:00:00Z,Kingdom of Ireland,historical country,53.333333,-6.250000
...,...,...,...,...,...,...,...,...
2589,NL,http://www.wikidata.org/entity/Q29999,1815-03-16T00:00:00Z,,Kingdom of the Netherlands,sovereign state,52.366667,4.883333
2590,NL,http://www.wikidata.org/entity/Q29999,1815-03-16T00:00:00Z,,Kingdom of the Netherlands,sovereign state,52.366667,4.883333
2591,NL,http://www.wikidata.org/entity/Q29999,1815-03-16T00:00:00Z,,Kingdom of the Netherlands,country,52.366667,4.883333
2592,NL,http://www.wikidata.org/entity/Q29999,1815-03-16T00:00:00Z,,Kingdom of the Netherlands,country,52.366667,4.883333


In [9]:
# Remove duplicates from the countries data
countries.sort_values(by="endDate", inplace=True, na_position='last')
# If there are duplicate entries with different end dates, this should get the latest or Nan (ie not ended)
countries.drop_duplicates(subset=["text", "country"], keep="last", inplace=True)
countries.shape

(975, 8)

In [10]:
# Remove country refs that are only two letters long except for US and UK
# In some cases Wikidata has matched character combinations of ISO two letter country codes.
countries_gt2 = countries.loc[(countries["text"].str.len() > 2) & (~countries["text"].isin(["US", "UK"]))]

In [11]:
# Because WD can return multiple results for a single country 
# if there are multiple values in fields like instance type of date,
# we need to try and select the 'best' candidate.
countries_selected = []
# Group country records by the place name reference
for text, group in countries_gt2.groupby(by="text"):
    #print(f"\n{text}")
    # If there's only one record for a country, then keep it
    if group.shape[0] == 1:
        countries_selected.append(group.iloc[0])
    else:
        # Get countries that have no end date
        countries_current = group.loc[(group["countryTypeLabel"].isin(["country", "sovereign state"]))]
        # Get coutnries that have geocoordinates
        countries_with_geo = group.loc[(group["countryTypeLabel"].isin(["country", "sovereign state"])) & (group["lat"].notnull())]
        # Get countries that have geocoords and no end date
        countries_nodate_with_geo = group.loc[(group["countryTypeLabel"].isin(["country", "sovereign state"])) & (group["lat"].notnull()) & (group["endDate"].isnull())]
        # If there are records with geocoords & no end date, select the first
        if not countries_nodate_with_geo.empty:
            countries_selected.append(countries_nodate_with_geo.iloc[0])
        # Else if there are countries with geocoords select the first
        elif not countries_with_geo.empty:
            countries_selected.append(countries_with_geo.iloc[0])
        # Else if there are countries with no end dates
        elif not countries_current.empty:
            countries_selected.append(countries_current.iloc[0])
        # Anything else
        else:
            countries_selected.append(group.iloc[-1])
df_countries = pd.concat(countries_selected, axis=1).T

In [12]:
df_countries

Unnamed: 0,text,country,startDate,endDate,countryLabel,countryTypeLabel,lat,lon
2308,AUS,http://www.wikidata.org/entity/Q408,1901-01-01T00:00:00Z,,Australia,country,-28.0,137.0
242,Abyssinia,http://www.wikidata.org/entity/Q207521,,1974-01-01T00:00:00Z,Ethiopian Empire,historical country,12.6,37.466667
1834,Accad,http://www.wikidata.org/entity/Q4461035,-2333-01-01T00:00:00Z,-2153-01-01T00:00:00Z,Akkadian empire,historical country,33.1,44.1
2279,Adamawa,http://www.wikidata.org/entity/Q775550,1809-01-01T00:00:00Z,1903-07-29T00:00:00Z,Adamawa Emirate,historical country,9.15,10.0
604,Afghanistan,http://www.wikidata.org/entity/Q889,1709-01-01T00:00:00Z,,Afghanistan,country,33.0,66.0
...,...,...,...,...,...,...,...,...
1585,sri,http://www.wikidata.org/entity/Q854,1972-05-22T00:00:00Z,,Sri Lanka,country,7.0,81.0
1457,the United States,http://www.wikidata.org/entity/Q30,1784-05-12T00:00:00Z,,United States of America,country,39.828175,-98.5795
2322,tza,http://www.wikidata.org/entity/Q924,1964-04-26T00:00:00Z,,Tanzania,country,-6.306944,34.853889
1483,van,http://www.wikidata.org/entity/Q686,1980-07-30T00:00:00Z,,Vanuatu,country,-16.633331,168.016669


In [13]:
# Save to CSV for further processing
df_countries.to_csv("cleaned_countries.csv", index=False)

In [14]:
# Merge the refs and the selected country records
df = pd.merge(refs, df_countries, on="text")

In [15]:
# Save to file for further processing
df.to_csv('refs_countries.csv', index=False)

In [16]:
df.shape

(466053, 9)

In [17]:
#df = df.loc[df["text"].str.len() > 2]

In [18]:
# How many books include country names
df["book_id"].nunique()

31764

In [19]:
# Books with most country names
df["book_id"].value_counts()

05017326    315
00000649    240
08022453    239
04018649    222
00705379    203
           ... 
06045758      1
06046481      1
07008541      1
07013449      1
05010219      1
Name: book_id, Length: 31764, dtype: int64

In [20]:
df.drop_duplicates(subset=["book_id", "text"], inplace=True, keep="last")

In [21]:
# Most common place name references
df["text"].value_counts()[:50]

England              20801
the United States    20567
America              19411
France               17914
Rome                 12503
Spain                11935
Germany              11933
United States        11604
Great Britain        10841
Italy                10751
India                10498
Canada               10492
Scotland              9565
Ireland               9498
China                 9135
Mexico                8907
Egypt                 8824
Holland               8743
Russia                8405
Georgia               7836
Greece                7765
Switzerland           6066
Israel                5999
Britain               5999
Japan                 5676
Cuba                  5303
Turkey                5301
Austria               5225
Sweden                4973
Venice                4890
Jerusalem             4762
Wales                 4760
Portugal              4638
Prussia               4244
Peru                  4243
Norway                4222
Denmark               4094
A

In [22]:
# Most common 'countries'
df["countryLabel"].value_counts()[:50]

United States of America      52526
Kingdom of England            20805
France                        18324
United Kingdom                17410
Roman Republic                12555
Germany                       12182
Spain                         11935
Italy                         10906
India                         10793
Canada                        10492
Republic of Ireland            9773
Kingdom of Scotland            9565
People's Republic of China     9145
Mexico                         8925
Egypt                          8824
Netherlands                    8771
Greece                         8412
Russia                         8408
Georgia                        7836
Switzerland                    6077
Israel                         6051
Japan                          5773
Iceland                        5749
Cuba                           5313
Wales                          5310
Turkey                         5301
Austria                        5233
Sweden                      