# Harvest potential Trove matches for DAQA

DAQA identifiers are not used in Trove or Wikidata, so we need to search on names, collect matches and then manually check results.

Here we query the Trove People and Organisations SRU interface using the `pa.firstname` and `pa.surname` fields. It might be worth reprocessing the records with no matches and searching only on the surname – although this would introduce a lot of noise and make matching harder.

I've filtered out matches where there's only one source and that source is the Prosecution Project. These links are saved to the `daqa_trove_ignore...json` file and could be manually checked.

In [9]:
import json
import re
import time
from pathlib import Path
import datetime

import requests_cache
from bs4 import BeautifulSoup
from IPython.display import JSON, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
import pandas as pd

# Shared HTTP session for the whole notebook. Requests that fail with a
# 502, 503 or 504 status are retried up to five times with backoff.
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s = requests_cache.CachedSession()
for url_prefix in ("https://", "http://"):
    s.mount(url_prefix, HTTPAdapter(max_retries=retries))

In [4]:
# SRU endpoint
api_url = "http://www.nla.gov.au/apps/srw/search/peopleaustralia"

# Available SRU parameters
params = dict(
    # A query can name a particular property, e.g.
    #   rec.identifier="http://nla.gov.au/nla.party-641680"
    # if it does not, all (?) fields are searched.
    query="513fe28ad707ff6bcd001f56",
    version="1.1",
    operation="searchRetrieve",
    recordSchema="urn:isbn:1-931666-33-4",  # This specifies records in EAC-CPF format
    maximumRecords=100,
    startRecord=1,
    resultSetTTL=300,
    recordPacking="xml",
    recordXPath="",
    sortKeys="",
)

In [5]:
def get_total_results(params):
    """
    Return the number of records reported for an SRU query.

    The request is sent with `maximumRecords` set to 0 and the count is read
    from the `numberOfRecords` element of the response. The override is
    applied to a copy, so the `params` dict passed in is not modified.
    """
    count_params = {**params, "maximumRecords": 0}
    response = s.get(api_url, params=count_params)
    soup = BeautifulSoup(response.content, "xml")
    return int(soup.find("numberOfRecords").string)

In [6]:
def soup_string(elem, prop):
    """
    Return the stripped text of the first `prop` element inside `elem`.

    Saves on memory by not keeping the BS navigable string: the value is
    converted to a plain `str`. When the element has no single string of its
    own (`.string` is None), the output of `get_text()` is used instead.
    Returns None when `elem` has no `prop` element.
    """
    value = elem.find(prop)
    if not value:
        return None
    # Test for a missing string directly rather than comparing against the
    # text "None", and strip both paths so the result is consistent.
    if value.string is None:
        return value.get_text().strip()
    return str(value.string).strip()


def get_attr(elem, prop, attr):
    """
    Return attribute `attr` of the first `prop` element inside `elem`.

    Returns None when the element or the attribute is missing.
    """
    found = elem.find(prop)
    if not found:
        return None
    return found.attrs.get(attr)


def get_date(elem, prop):
    """
    Return the date held by the first `prop` element inside `elem`.

    The `standardDateTime` attribute is preferred, then `standardDate`, and
    finally the element's text. Returns None when there is no such element.
    """
    node = elem.find(prop)
    try:
        return node["standardDateTime"]
    except KeyError:
        pass
    except TypeError:
        # find() returned None: there is no such element
        return None
    try:
        return node["standardDate"]
    except KeyError:
        return soup_string(elem, prop)


def get_dates(history):
    """
    Extract creation and modification dates from a maintenance history.

    Each `maintenanceEvent` whose `eventType` is "created" or "updated" sets
    `date_created` or `date_modified`; a later event of the same type
    overwrites an earlier one. Returns an empty dict when `history` is empty
    or missing.
    """
    if not history:
        return {}
    keys_by_event_type = {"created": "date_created", "updated": "date_modified"}
    dates = {}
    for event in history.find_all("maintenanceEvent"):
        kind = soup_string(event, "eventType")
        when = get_date(event, "eventDateTime")
        if kind in keys_by_event_type:
            dates[keys_by_event_type[kind]] = when
    return dates


def get_names(identity):
    """
    Return one dict per `nameEntry` element inside `identity`.

    Each dict maps a name part's `localType` ("name" when the part has none)
    to the list of part strings, and has an `authorized` flag that is True
    when the entry contains an `authorizedForm` element.
    """
    names = []
    for entry in identity.find_all("nameEntry"):
        parts = {}
        for part in entry.find_all("part"):
            key = part["localType"] if part.has_attr("localType") else "name"
            parts.setdefault(key, []).append(str(part.string))
        parts["authorized"] = bool(entry.find("authorizedForm"))
        names.append(parts)
    return names


def get_exist_dates(description):
    """
    Return `date_from` / `date_to` read from the `existDates` element.

    Returns an empty dict when `description` has no `existDates` element.
    """
    dates = description.find("existDates")
    if not dates:
        return {}
    return {
        "date_from": get_date(dates, "fromDate"),
        "date_to": get_date(dates, "toDate"),
    }


def get_places(description):
    """
    Return one dict per `place` inside the `places` element.

    Each dict holds the place role, the place name and its from/to dates.
    Returns an empty list when `description` has no `places` element.
    """
    container = description.find("places")
    if not container:
        return []
    return [
        {
            "place_type": soup_string(entry, "placeRole"),
            "name": soup_string(entry, "placeEntry"),
            "date_from": get_date(entry, "fromDate"),
            "date_to": get_date(entry, "toDate"),
        }
        for entry in container.find_all("place")
    ]


def get_events(description):
    """
    Return one dict per `chronItem` found in any `chronList` element.

    Each dict holds the event text, its single date and its from/to dates.
    """

    def as_event(item):
        return {
            "name": soup_string(item, "event"),
            "date": get_date(item, "date"),
            "date_from": get_date(item, "fromDate"),
            "date_to": get_date(item, "toDate"),
        }

    return [
        as_event(item)
        for chron_list in description.find_all("chronList")
        for item in chron_list.find_all("chronItem")
    ]


def get_occupations(description):
    """
    Return the `term` text of every `occupation` inside `occupations`.

    Returns an empty list when `description` has no `occupations` element.
    """
    container = description.find("occupations")
    if not container:
        return []
    return [soup_string(item, "term") for item in container.find_all("occupation")]


def get_related_entities(eac):
    """
    Return one dict per `cpfRelation` element found in `eac`.

    Each dict holds the relation type, link, name, entity type, from/to
    dates and the stripped text of the `descriptiveNote` (None when the
    relation has no note).
    """
    entities = []
    for relation in eac.find_all("cpfRelation"):
        note = relation.find("descriptiveNote")
        entities.append(
            {
                "relation_type": relation.attrs.get("cpfRelationType"),
                "href": relation.attrs.get("href"),
                "name": soup_string(relation, "relationEntry"),
                "entity_type": get_attr(relation, "relationEntry", "localType"),
                "date_from": get_date(relation, "fromDate"),
                "date_to": get_date(relation, "toDate"),
                "description": note.get_text().strip() if note else None,
            }
        )
    return entities


def get_related_resources(eac):
    """
    Return one dict per `resourceRelation` element found in `eac`.

    Relations that contain a `dc` element are read from the `identifier`,
    `title`, `contributor`, `date` and `description` elements inside the
    relation. All other relations are read from the relation's `href`
    attribute and its `relationEntry`, `name`, `date` and `abstract`
    elements.

    `description` is always a string or None. A `dc` relation without any
    `description` element yields None rather than the empty result of
    `find_all`.
    """
    related = []
    for relation in eac.find_all("resourceRelation"):
        relation_type = relation.attrs.get("resourceRelationType")
        if relation.find("dc"):
            descriptions = relation.find_all("description")
            if descriptions:
                description = " ".join(d.get_text() for d in descriptions)
            else:
                description = None
            related.append(
                {
                    "relation_type": relation_type,
                    "href": soup_string(relation, "identifier"),
                    "name": soup_string(relation, "title"),
                    "resource_type": None,
                    "contributor": soup_string(relation, "contributor"),
                    "date": soup_string(relation, "date"),
                    "description": description,
                }
            )
        else:
            abstract = relation.find("abstract")
            related.append(
                {
                    "relation_type": relation_type,
                    "href": relation.attrs.get("href"),
                    "name": soup_string(relation, "relationEntry"),
                    "resource_type": get_attr(relation, "relationEntry", "localType"),
                    "contributor": soup_string(relation, "name"),
                    "date": soup_string(relation, "date"),
                    "description": abstract.get_text() if abstract else None,
                }
            )
    return related


def get_biog(description):
    """
    Return the biography text as a single space-joined string.

    Collects every `p` element inside every `biogHist` element. A paragraph
    with no single string of its own (`.string` is None) contributes its
    `get_text()` output instead, so the literal text "None" is never added.
    Returns an empty string when there are no paragraphs.
    """
    paragraphs = []
    for bio in description.find_all("biogHist"):
        for para in bio.find_all("p"):
            text = para.get_text() if para.string is None else str(para.string)
            paragraphs.append(text.strip())
    return " ".join(paragraphs)


def get_sources(eac):
    """
    Return a processed record for every `eac-cpf` element nested in `eac`.

    Each record is built by `process_eac` and extended with the related
    entities and related resources of that nested element.
    """
    return [
        {
            **process_eac(nested),
            "related_entities": get_related_entities(nested),
            "related_resources": get_related_resources(nested),
        }
        for nested in eac.find_all("eac-cpf")
    ]


def get_agency_details(agency_element):
    """
    Return the `agencyCode` and `agencyName` text of an agency element
    as `agency_id` and `agency_name`.
    """
    return dict(
        agency_id=soup_string(agency_element, "agencyCode"),
        agency_name=soup_string(agency_element, "agencyName"),
    )


def get_eac_meta(eac):
    """
    Return the record-level metadata of an EAC-CPF record.

    Combines the `recordId` text with the maintenance agency details and the
    maintenance history dates found in the `control` element.
    """
    record_id = soup_string(eac, "recordId")
    control = eac.find("control")
    return {
        "record_id": record_id,
        **get_agency_details(control.find("maintenanceAgency")),
        **get_dates(control.find("maintenanceHistory")),
    }


def format_name(names, entity_type):
    """
    Build a display name from a list of name dicts.

    The first name flagged as authorized is used; if none is, the first name
    in the list is used. Its forename, surname, name and parent parts are
    joined with spaces, in that order. Returns an empty string when there is
    nothing to use. `entity_type` is accepted but not used.
    """
    chosen = next((name for name in names if name["authorized"] is True), None)
    if not chosen and names:
        chosen = names[0]
    if not chosen:
        return ""
    parts = []
    for name_type in ("forename", "surname", "name", "parent"):
        parts.extend(chosen.get(name_type, []))
    return " ".join(parts)


def process_eac(eac):
    """
    Convert an `eac-cpf` element into a plain dict.

    The dict starts with the metadata from `get_eac_meta` and adds the names,
    entity type and id, a formatted display name, existence dates, places,
    occupations, abstract, biography, events and nested source records.
    Descriptive details are read from the `description` element, or from
    `cpfDescription` when there is no `description`.
    """
    record = get_eac_meta(eac)
    identity = eac.find("identity")
    names = get_names(identity)
    entity_type = soup_string(identity, "entityType")
    entity_id = soup_string(identity, "entityId")
    display_name = format_name(names, entity_type)
    description = eac.find("description") or eac.find("cpfDescription")
    record.update(
        {
            "names": names,
            "entity_type": entity_type,
            "entity_id": entity_id,
            "name": display_name,
            "dates": get_exist_dates(description),
            "places": get_places(description),
            "occupations": get_occupations(description),
            "abstract": soup_string(description, "abstract"),
            "description": get_biog(description),
            "events": get_events(description),
            "sources": get_sources(eac),
        }
    )
    return record


def get_records(params):
    """
    Send one SRU request and return the processed records it contains.

    Every `record` element in the response is converted with `process_eac`
    and given a `trove_url` built from its record id.
    """

    def with_trove_url(record):
        record["trove_url"] = f"https://nla.gov.au/nla.party-{record['record_id']}"
        return record

    response = s.get(api_url, params=params)
    soup = BeautifulSoup(response.content, "xml")
    return [
        with_trove_url(process_eac(result.find("eac-cpf")))
        for result in soup.find_all("record")
    ]


def harvest_results(params):
    """
    Harvest every record that matches an SRU query, one page at a time.

    Paging advances the `startRecord` parameter, the key used in the SRU
    parameter template, in steps of the page size. The page size is taken
    from `maximumRecords` in `params`, falling back to 100 when it is missing
    or zero. The `params` dict passed in is not modified.

    Returns the list of processed records from all pages.
    """
    page_params = params.copy()
    page_size = int(page_params.get("maximumRecords") or 100)
    page_params["maximumRecords"] = page_size
    records = []
    # Pass a copy because the count request overrides maximumRecords.
    total = get_total_results(page_params.copy())
    with tqdm(total=total) as pbar:
        for start in range(1, total + 1, page_size):
            page_params["startRecord"] = start
            new_records = get_records(page_params)
            records += new_records
            pbar.update(len(new_records))
    return records

In [8]:
def _cql_term(value):
    """
    Return `value` as a double-quoted CQL search term.

    Quoting keeps names that contain spaces or reserved characters from
    breaking the query; backslashes and double quotes are escaped.
    """
    escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


matches = []  # one dict per DAQA record / Trove record pair
not_found = []  # ori_ids of DAQA records with no Trove results
ignore = []  # Trove urls skipped because their only source is AU-QPRO
search_params = params.copy()
with Path("ACDE_Merged_Normalized_202206031344.json").open("r") as json_file:
    acde_json = json.load(json_file)
for record in tqdm(acde_json):
    if record["data_source"] != "DAQA":
        continue
    search_params["query"] = (
        f"pa.surname={_cql_term(record['last_name'])}"
        f" AND pa.firstname={_cql_term(record['first_name'])}"
    )
    results = get_records(search_params)
    if not results:
        not_found.append(record["ori_id"])
        continue
    # Flag set from the full result list, before any result is ignored
    multiple = len(results) > 1
    for link in results:
        sources = [source["agency_id"] for source in link["sources"]]
        # Ignore if it's just from Prosecution Project
        if len(sources) == 1 and sources[0] == "AU-QPRO":
            ignore.append(link["trove_url"])
            continue
        match = {
            "acde_source": "DAQA",
            "or_id": record["ori_id"],
            "display_name": record["display_name"],
            "related_source": "Trove",
            "related_source_id": link["record_id"],
            "related_source_url": link["trove_url"],
            "related_source_name": link["name"],
            "related_source_birth_date": link["dates"].get("date_from"),
            "related_source_death_date": link["dates"].get("date_to"),
            "related_source_sources": " | ".join(sources),
            "related_source_occupations": " | ".join(link["occupations"]),
            "multiple_matches": multiple,
        }
        matches.append(match)

# Compute the date stamp once so all three files share the same suffix
date_stamp = datetime.datetime.now().strftime("%Y%m%d")
with Path(f"daqa_trove_matches_{date_stamp}.json").open("w") as json_file:
    json.dump(matches, json_file)
with Path(f"daqa_trove_ignore_{date_stamp}.json").open("w") as json_file:
    json.dump(ignore, json_file)
with Path(f"daqa_trove_not_found_{date_stamp}.json").open("w") as json_file:
    json.dump(not_found, json_file)

  0%|          | 0/174097 [00:00<?, ?it/s]

## Some quick analysis

In [15]:
# Load a saved matches file (date-stamped 20221006) into a DataFrame.
# The trailing bare expression displays the table in the notebook.
matches = pd.read_json("daqa_trove_matches_20221006.json")
matches

Unnamed: 0,acde_source,or_id,display_name,related_source,related_source_id,related_source_url,related_source_name,related_source_birth_date,related_source_death_date,related_source_sources,related_source_occupations,multiple_matches
0,DAQA,1,Tom Heath,Trove,463497,https://nla.gov.au/nla.party-463497,Tom Heath,,,AuCNLKIN,,False
1,DAQA,9,David Roessler,Trove,575523,https://nla.gov.au/nla.party-575523,David M Roessler,,,AuCNLKIN,,False
2,DAQA,11,Leon Trotsky,Trove,994832,https://nla.gov.au/nla.party-994832,Leon Trotsky,1879,1940,AuCNLKIN,,False
3,DAQA,23,Roy Churcher,Trove,551983,https://nla.gov.au/nla.party-551983,Roy Churcher,1933,,AuCNLKIN | AU-NUN:DAAO,,False
4,DAQA,24,Peter Harvey,Trove,1629843,https://nla.gov.au/nla.party-1629843,Peter Harvey,,,AU-YORCID,,True
...,...,...,...,...,...,...,...,...,...,...,...,...
1415,DAQA,4006,James Gibson,Trove,1297541,https://nla.gov.au/nla.party-1297541,J Douglas Gibson,1909,,AuCNLKIN,,True
1416,DAQA,4006,James Gibson,Trove,632631,https://nla.gov.au/nla.party-632631,James Gibson,1862,1942,AuCNLKIN | AuCNLKIN,,True
1417,DAQA,4006,James Gibson,Trove,833013,https://nla.gov.au/nla.party-833013,James J Gibson,1904,,AuCNLKIN,,True
1418,DAQA,4006,James Gibson,Trove,833016,https://nla.gov.au/nla.party-833016,J T R Gibson,,,AuCNLKIN,,True


In [16]:
# Number of matches (rows in the matches table)
matches.shape[0]

1420

In [21]:
# Number of DAQA records with matches (distinct `or_id` values)
matches["or_id"].nunique()

370

In [22]:
# Number of DAQA records with single Trove matches
# (rows where `multiple_matches` is False)
matches.loc[matches["multiple_matches"] == False].shape[0]

203

In [19]:
# How many DAQA records could not be matched?
# (counts the entries in the saved not-found file)
misses = pd.read_json("daqa_trove_not_found_20221006.json")
misses.shape[0]

454