# Look for countries in Wikidata matching the place names extracted through NER

This notebook feeds the place names extracted from the books via NER to Wikidata to see if there's a country by that name. It searches the main label field as well as alternative labels, and accepts entities that are instances of `country`, `historical country`, or `sovereign state`.

The original place name, the matched country name, the instance type, and Wikidata links are saved to an `ndjson` file (one JSON object per line). Dates and geocoordinates are included if available.

In [1]:
from SPARQLWrapper import JSON, SPARQLWrapper
import pandas as pd
import re
import json
from pathlib import Path
from tqdm.auto import tqdm
import time

In [2]:
# Client for the Wikidata SPARQL endpoint; the agent string identifies this notebook to the service
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(
    WIKIDATA_SPARQL_ENDPOINT,
    agent="GLAMWorkbench notebook",
)

In [3]:
def df_clean(results, is_int=()):
    """
    Convert SPARQL query results into a Pandas Dataframe.

    Flattens the nested structure of each binding and keeps only the
    `<variable>_value` fields, renamed to the bare variable name; the
    datatype/language metadata columns are dropped.

    Parameters:
        results: the decoded SPARQL JSON response, i.e. a dict with the
            result rows under results["results"]["bindings"].
        is_int: names of columns to convert to the nullable `Int64` dtype.

    Returns:
        A new DataFrame with one column per variable present in the bindings.
        If there are no bindings the DataFrame is empty.
    """
    # Use json_normalize to import and flatten the JSON
    flattened = pd.json_normalize(results["results"]["bindings"], sep="_")
    # Keep only the columns holding values, not the type/lang metadata
    value_columns = [c for c in flattened.columns if c.endswith("_value")]
    # rename() returns a new frame, so later column assignments never touch a
    # view of the intermediate frame
    df = flattened[value_columns].rename(
        columns=lambda name: re.sub(r"_value$", "", name)
    )
    # Make sure columns containing integers have an integer data type.
    for int_col in is_int:
        # A variable that is unbound in every row produces no column at all,
        # so skip requested columns that are not present
        if int_col in df.columns:
            df[int_col] = df[int_col].astype("Int64")
    return df

In [4]:
# Read the tab-separated place names file; it has no header row, so columns get integer labels
ner_path = "spacy_ner.txt"
df = pd.read_csv(ner_path, sep="\t", header=None)
df.head()

Unnamed: 0,0,1
0,3,Scotland
1,3,New Jersey
2,3,Los Angeles
3,3,Aberdeen
4,3,Joannes


In [5]:
# Pull column 1 (the place names) out into a single-column dataframe named 'label'
ents = df[1].to_frame(name="label")
ents.shape

(4170275, 1)

In [6]:
# Keep only the first occurrence of each place name
ents = ents.drop_duplicates()
ents.shape

(1060910, 1)

In [None]:
# SPARQL query used for each batch of place names.
# The template is filled in with str.format(): the single `{}` receives the
# space-separated list of language-tagged literals, and every literal brace
# belonging to the SPARQL syntax is doubled (`{{` / `}}`).
# DISTINCT is required because both ?class and ?countryType are matched against
# wdt:P31: an entity that has more than one of the listed instance types would
# otherwise be returned once per (class, countryType) combination, giving
# identical duplicate rows.
query_template = """
SELECT DISTINCT ?text ?country ?countryLabel ?countryTypeLabel ?startDate ?endDate ?lat ?lon WHERE {{
  # Get country, historical country, or soverign state -- historical country is a subclass of country but is not preferred, so can't get it with P31/P279*
  VALUES ?class {{wd:Q6256 wd:Q3024240 wd:Q3624078}}.
  # This will get populated with a batch of placenames
  VALUES ?text {{{}}}
  # Match either label or altLabel
  ?country rdfs:label|skos:altLabel ?text;
           wdt:P31 ?class;
           wdt:P31 ?countryType.
  # Only include the specified instance types in results
  FILTER(?countryType IN (wd:Q6256, wd:Q3024240, wd:Q3624078)).
  # Include dates if available
  OPTIONAL {{?country wdt:P571 ?startDate.}}
  OPTIONAL {{?country wdt:P576 ?endDate.}}
  # include geocoords if available
  OPTIONAL {{?country p:P625/psv:P625 [wikibase:geoLatitude ?lat; wikibase:geoLongitude ?lon].}}
  SERVICE wikibase:label {{
    bd:serviceParam wikibase:language "en" .
  }}
}}
"""

def chunker(seq, size):
    """
    Break a sequence into consecutive slices for batched processing.

    Returns a generator of slices of `seq`, each holding at most `size`
    items; the last slice may be shorter. `seq` must support `len()`
    and slicing.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

ent_list = ents["label"].to_list()

processed_path = Path("countries_processed.txt")
results_path = Path("countries.ndjson")

# We'll save processed place names in case we have to restart.
# On the very first run the log file doesn't exist yet, so start with an empty list.
try:
    countries_processed = processed_path.read_text().split("\n")
except FileNotFoundError:
    countries_processed = []
# Set copy of the log for constant-time membership tests; checking the list
# directly rescans it for every one of the ~1M place names.
processed_lookup = set(countries_processed)

# Strips a leading lowercase "the " from a place name before it's sent to Wikidata
leading_the = re.compile(r"^the ")
batch_size = 100

# The return format is the same for every request, so set it once
sparql.setReturnFormat(JSON)

with tqdm(total=len(ent_list)) as pbar:
    # We can give the WD SPARQL query multiple values,
    # so we'll split the list of place names up into batches
    for chunk in chunker(ent_list, batch_size):
        # Make sure each place name is a string
        chunk = [str(e) for e in chunk]
        # Submit multiple values in one SPARQL query for efficiency.
        # Values containing non-alphabetical characters (except for spaces and hyphens)
        # are left out, as are values that were processed in a previous run.
        query_vals = " ".join(
            f'"{leading_the.sub("", e)}"@en'
            for e in chunk
            if e.replace(" ", "").replace("-", "").isalpha() and e not in processed_lookup
        )
        if query_vals:
            # Format the SPARQL query and get the data!
            sparql.setQuery(query_template.format(query_vals))
            results = sparql.query().convert()
            df_country = df_clean(results)
            # If we've found some matches save them to an ndjson file
            if not df_country.empty:
                countries = df_country.to_dict(orient="records")
                with results_path.open("a") as countries_file:
                    for country in countries:
                        # Optional fields missing from a row come through as NaN, which
                        # json.dumps would write as the non-standard token `NaN`; leave
                        # those keys out so every line is valid JSON.
                        record = {k: v for k, v in country.items() if pd.notna(v)}
                        countries_file.write(f"{json.dumps(record)}\n")
            # Keep track of the values we've processed
            countries_processed.extend(chunk)
            processed_lookup.update(chunk)
            with processed_path.open("a") as countries_processed_file:
                processed = "\n".join(chunk)
                countries_processed_file.write(f"{processed}\n")
            # Pause between requests to the query service
            time.sleep(1)
        # The final batch may hold fewer than batch_size names
        pbar.update(len(chunk))