# Harvest links from Wikidata

This notebook takes AusStage and DAAO identifiers, finds any associated Wikidata entries, then harvests details of any external identifiers attached to those entries. Of course it's possible to just download selected identifiers, but it's hard to know what identifiers are in use so I thought it would be useful to get everything. This can also help us explore unexpected connections between identification systems.

I haven't yet been able to figure out how to get all this data using SPARQL alone. In the end I decided to break the processing down into a few steps:

- Use a SPARQL query to find the WD entity related to a specific DAAO or AusStage identifier
- Download a complete RDF representation of the entity from the Linked Data interface
- Load the RDF into a graph
- Find all of the properties with the type `externalid`
- For each of these properties, get the label and the value
- Use SPARQL to get the URL format string for the identifier (this string includes a `$1` marker that you replace with the identifier to produce a url to the identification system) – rather than query for the same properties over and over, I'm saving the results in a file that is checked before a query is sent
- Create the url from the format string and value

I'm sure there must be ways of simplifying this. I noticed that the RDF does include a `wdtn` value for some of the properties; this is a normalised value that seems to be a full url. However, this only appears on some identifiers and not others. I don't know why. I thought it safest to get the url formatters.

In [None]:
import json
import math
import re

import altair as alt
import pandas as pd
import requests
from slugify import slugify
from SPARQLWrapper import JSON, SPARQLWrapper
from tqdm.auto import tqdm
from pathlib import Path
import datetime
from rdflib import Graph, URIRef, Literal, BNode
from rdflib.namespace import RDF, RDFS

In [None]:
# Shared client for the public Wikidata SPARQL endpoint; the query functions
# in this notebook set a query on it and read back JSON results.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# Maps a human-readable identifier name to its Wikidata property id.
# The keys are what callers pass as `source` / `wd_source`; the values are
# substituted into SPARQL queries as `wdt:<property id>`.
people_ids = {
    "Libraries Australia ID": "P409",
    "NLA Trove people ID": "P1315",
    "People Australia ID": "P9159",
    "Obituaries Australia ID": "P9232",
    "Australian Dictionary of Biography ID": "P1907",
    "Labour Australia ID": "P9245",
    "Indigenous Australia ID": "P9246",
    "Women Australia ID": "P9244",
    "Encyclopedia of Australian Science ID": "P4228",
    "AusStage person ID": "P8292",
    "AustLit ID": "P8295",
    "DAAO ID": "P1707",
    "National Archives of Australia entity ID": "P10856",
    "Encyclopedia of Melbourne ID": "P9304",
    "Dictionary of Sydney ID": "P3794",
    "Australian Women's Register ID": "P4186",
    "Art Gallery of South Australia creator ID": "P6804",
    "Parliament of Australia MP ID": "P10020",
    "Re-Member ID": "P8633",
    "NSW Parliament member ID": "P10012",
    "Australian War Memorial ID": "P6713",
    "National Gallery of Victoria artist ID": "P2041",
    "Australian Prints + Printmaking artist ID": "P10086",
    "Australian Music Centre artist ID": "P9575",
    "Australian National Maritime Museum person ID": "P7769",
    "Australian Poetry Library poet ID": "P5465",
    "VIAF ID": "P214"
}

In [None]:
# SPARQL template used to find the entity that carries a given external id.
# It is filled with str.format(): the first `{}` takes the Wikidata property
# id (e.g. "P8292") and the second `{}` takes the identifier value.
# Literal SPARQL braces are doubled (`{{` / `}}`) so that format() leaves
# them in place.
query_template = """
    SELECT ?person ?personLabel WHERE {{
        ?person wdt:{} "{}".
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE], en". }}
    }}
"""

def get_links_from_externalid(source, source_id):
    """
    Find the Wikidata entity that carries an external identifier and
    collect every external identifier attached to that entity.

    `source` is a key of `people_ids` (a KeyError is raised for an unknown
    key) and `source_id` is the identifier value to search for.

    Returns the list of dicts produced by `extract_id_values`, each extended
    with `wd_url` and `wd_label` for the matched entity. Returns None when
    the query matches no entity, or when it matches more than one (in which
    case the competing bindings are printed instead of harvested).
    """
    # Build and submit the SPARQL lookup for this property/value pair
    sparql.setQuery(query_template.format(people_ids[source], source_id))
    sparql.setReturnFormat(JSON)
    matches = sparql.query().convert()["results"]["bindings"]

    if not matches:
        return None

    # An id SHOULD resolve to one entity; report ambiguity rather than guess
    if len(matches) > 1:
        print(f"Multiple results for {source}: {source_id}:")
        for match in matches:
            print(f"  -- {match}")
        return None

    only_match = matches[0]
    # WD url and label of the matched entity
    entity_url = only_match["person"]["value"]
    entity_label = only_match["personLabel"]["value"]
    # The Special:EntityData url serves a Turtle serialisation of the entity
    data_url = f"{entity_url.replace('entity', 'wiki/Special:EntityData')}.ttl"
    id_values = extract_id_values(entity_url, data_url)
    # Stamp each harvested identifier with the WD item it came from
    for id_value in id_values:
        id_value["wd_url"] = entity_url
        id_value["wd_label"] = entity_label
    return id_values
        
def get_formatter(prop):
    """
    Look up the url format string (Wikidata property P1630) for an
    external id property such as "P8292".

    The returned string contains a '$1' placeholder; substituting the
    identifier value for '$1' gives a url in the external id source.
    Only the first binding returned by the query is used. Returns None
    when the query yields no usable binding.
    """
    formatter_query = """
    SELECT ?formatter WHERE {{
      wd:{} wdt:P1630 ?formatter
    }}
    """.format(prop)
    sparql.setQuery(formatter_query)
    sparql.setReturnFormat(JSON)
    response = sparql.query().convert()
    try:
        first_binding = response["results"]["bindings"][0]
        return first_binding["formatter"]["value"]
    except (KeyError, IndexError):
        # No formatter recorded for this property
        return None
        
def extract_id_values(entity_url, data_url):
    """
    Extract details and values of all external identifiers in a WD rdf file.

    `entity_url` is the Wikidata entity URI whose identifier values are read,
    and `data_url` is the location of the RDF that gets parsed into a graph.

    Returns a list of dicts with the keys `related_source` (property label),
    `related_id` (identifier value) and `related_url` (the url built from the
    property's format string, or None when the property has no formatter).

    Format strings are cached in `formatters.json` in the working directory:
    the cache is consulted before querying, and rewritten after every new
    lookup so that an interrupted harvest keeps what it has learned. A
    missing cache file is treated as an empty cache.
    """
    cache_path = Path("formatters.json")
    try:
        formatters = json.loads(cache_path.read_text())
    except FileNotFoundError:
        # First run: nothing cached yet
        formatters = {}

    property_type = URIRef("http://wikiba.se/ontology#propertyType")
    external_id = URIRef("http://wikiba.se/ontology#ExternalId")
    entity = URIRef(entity_url)

    id_values = []
    g = Graph()
    g.parse(data_url)
    # Every property described in the graph as having the ExternalId type
    for prop in g.subjects(property_type, external_id):
        prop_label = g.value(prop, RDFS.label)
        prop_id = prop.split("/")[-1]
        # The entity's value is stored under the 'prop/direct' form of the
        # property URI. g.value() yields a single value per property.
        value = g.value(entity, URIRef(prop.replace("entity", "prop/direct")))
        if not value:
            continue
        if prop_id in formatters:
            # A cached None means "already looked up, no formatter exists"
            formatter = formatters[prop_id]
        else:
            formatter = get_formatter(prop_id)
            formatters[prop_id] = formatter
            cache_path.write_text(json.dumps(formatters))
        # Reset the url for every identifier: without a formatter there is
        # no url, and a url from a previous property must never leak through.
        url = formatter.replace("$1", str(value)) if formatter else None
        id_values.append({
            "related_source": str(prop_label),
            "related_id": str(value),
            "related_url": url
        })
    return id_values

def harvest_all_wd_links(acde_source, wd_source, ids):
    """
    Harvest Wikidata external-identifier links for a collection of ids and
    save the results to two JSON files in the working directory.

    `acde_source` is the name of the originating dataset; it is recorded in
    every link and, lower-cased, prefixes the output file names. `wd_source`
    is the identifier name passed to `get_links_from_externalid`, and `ids`
    is an iterable of identifier values to look up.

    Files written (both stamped with the same YYYYMMDD date):
      - `<source>_wd_all_links_<date>.json`: one record per harvested
        identifier, combining `acde_source`, `or_id` and the link details
      - `<source>_wd_all_not_found_<date>.json`: the ids for which the
        lookup returned nothing (no match, an ambiguous match, or an entity
        with no external identifier values)
    """
    links = []
    not_found = []
    for source_id in tqdm(ids):
        results = get_links_from_externalid(wd_source, source_id)
        if not results:
            not_found.append(source_id)
            continue
        ori_data = {
            "acde_source": acde_source,
            "or_id": source_id
        }
        links.extend({**ori_data, **result} for result in results)

    # Take the datestamp once so that both files always share the same date,
    # even if the harvest finishes right on midnight.
    datestamp = datetime.datetime.now().strftime('%Y%m%d')
    file_prefix = acde_source.lower()
    with Path(f"{file_prefix}_wd_all_links_{datestamp}.json").open('w') as json_file:
        json.dump(links, json_file)
    with Path(f"{file_prefix}_wd_all_not_found_{datestamp}.json").open('w') as json_file:
        json.dump(not_found, json_file)

Rather than look for every AusStage identifier in the ACDEngine dataset, it's much quicker to download a set of AusStage ids attached to people in Wikidata from the query interface. Run this query and download the results as a CSV file: https://w.wiki/5miL

In [None]:
# Every whitespace-separated token in the downloaded file is treated as an
# AusStage id; passing them through a set removes duplicates.
# NOTE(review): if the CSV has a header row, the header text will be treated
# as an id too (and end up in the not-found file) -- confirm against the file.
ausstage_ids = list(set(Path("ausstage_ids_in_wd.csv").read_text().split()))
# Writes ausstage_wd_all_links_<date>.json and ausstage_wd_all_not_found_<date>.json
harvest_all_wd_links("AusStage", "AusStage person ID", ausstage_ids)

In [None]:
# Alternative source of ids, kept for reference: read them from the
# "daao_path" column of daao_urls.csv instead.
#daao_ids = pd.read_csv("daao_urls.csv")["daao_path"].to_list()
# Every whitespace-separated token in the file is treated as a DAAO id;
# passing them through a set removes duplicates.
# NOTE(review): if the CSV has a header row, the header text will be treated
# as an id too -- confirm against the file.
daao_ids = list(set(Path("daao_wd.csv").read_text().split()))
# Writes daao_wd_all_links_<date>.json and daao_wd_all_not_found_<date>.json
harvest_all_wd_links("DAAO", "DAAO ID", daao_ids)