In [103]:
import httpx
import xmltodict

In [None]:
# Base URL of the OAI-PMH endpoint queried by this notebook.
OAI_ENDPOINT = "https://data.rijksmuseum.nl/oai"

# Query-string arguments sent with the request.
# NOTE(review): only this single request is made; if the endpoint pages its
# results (OAI-PMH resumptionToken), later pages are not fetched — confirm
# whether the first page is enough for this analysis.
params = {
    "verb": "ListRecords",
    "metadataPrefix": "edm",
}

with httpx.Client() as client:
    response = client.get(OAI_ENDPOINT, params=params)
    # Fail loudly on a 4xx/5xx answer so an error page is never handed to the
    # XML parser further down as if it were a record list.
    response.raise_for_status()

response

<Response [200 OK]>

In [99]:
from pydantic import BaseModel
from typing import List
import xmltodict

class Record(BaseModel):
    """Flat representation of one harvested record.

    Instances are built by ``extract_records_from_xml``. Text fields are set
    to an empty string and list fields to an empty list when the source
    element is not found.
    """
    # Value of the ``identifier`` element in the record header.
    identifier: str
    # ``rdf:resource`` attribute of the ``dc:creator`` element.
    creator_identifier: str
    # ``dc:description`` text; the English value is preferred over the Dutch one.
    description: str
    # ``dc:title`` text.
    title: str
    # ``dcterms:created`` text; English preferred over Dutch when both exist.
    created: str
    # ``rdf:resource`` values of the ``dcterms:isPartOf`` elements.
    # NOTE(review): the name looks like a typo for "is_part_of"; it is part of
    # the serialised output, so renaming it would change the JSON keys.
    is_part_off: list[str]
    # ``rdf:resource`` values of the ``dcterms:isReferencedBy`` elements.
    is_referenced_by: list[str]
    # ``mrel:spn`` text; the English value is preferred over the Dutch one.
    provenance: str
    # ``rdf:about`` of the ``edm:WebResource`` nested under ``edm:object``.
    img_url: str
    # Plain-string values of the ``dcterms:alternative`` elements.
    alternative_descriptions: list[str]
    # Dutch ``skos:prefLabel`` of the creator's ``rdaGr2:placeOfDeath``.
    place: str


def _as_list(value) -> list:
    """Normalise an xmltodict value to a list.

    xmltodict returns a bare dict/str for an element that occurs once and a
    list only when the element repeats; ``None`` stands for an absent or
    empty element and becomes an empty list.
    """
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _dig(node, *keys):
    """Follow ``keys`` through nested dicts; return None once a level is missing or not a dict."""
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def _localized_text(value, *, keep_last: bool = False, any_language: bool = False) -> str:
    """Pick the English text of a literal, falling back to Dutch.

    Args:
        value: ``None``, a plain string, a dict carrying ``@xml:lang`` and
            ``#text``, or a list of such strings/dicts.
        keep_last: When a language occurs more than once, use its last
            occurrence instead of the first.
        any_language: When neither an English nor a Dutch text is available,
            fall back to the first non-empty text regardless of language tag.

    Returns:
        The selected text, or ``''`` when nothing suitable is found. A plain
        string is returned unchanged.
    """
    if isinstance(value, str):
        return value

    by_language = {}
    fallback = ''
    for node in _as_list(value):
        if isinstance(node, dict):
            language = node.get('@xml:lang')
            text = node.get('#text') or ''
        elif isinstance(node, str):
            language, text = None, node
        else:
            continue
        if keep_last or language not in by_language:
            by_language[language] = text
        if not fallback:
            fallback = text

    preferred = by_language.get('en') or by_language.get('nl') or ''
    if preferred or not any_language:
        return preferred
    return fallback


def _resource_uris(value) -> List[str]:
    """Collect the ``@rdf:resource`` attribute of every dict node in ``value``."""
    return [
        node['@rdf:resource']
        for node in _as_list(value)
        if isinstance(node, dict) and '@rdf:resource' in node
    ]


def _find_provided_cho(metadata: dict):
    """Return ``(aggregation, provided_cho)`` for the first usable aggregation.

    An aggregation is usable when it is a dict with an ``@rdf:about``
    attribute and a non-empty ``edm:aggregatedCHO/edm:ProvidedCHO`` dict.
    ``(None, None)`` is returned when there is none.
    """
    for aggregation in _as_list(metadata.get('ore:Aggregation')):
        if not (isinstance(aggregation, dict) and '@rdf:about' in aggregation):
            continue
        provided_cho = _dig(aggregation, 'edm:aggregatedCHO', 'edm:ProvidedCHO')
        if isinstance(provided_cho, dict) and provided_cho:
            return aggregation, provided_cho
    return None, None


def _image_url(aggregation: dict) -> str:
    """Return ``@rdf:about`` of the first ``edm:object/edm:WebResource``, or ``''``."""
    for resource in _as_list(_dig(aggregation, 'edm:object', 'edm:WebResource')):
        if isinstance(resource, dict) and '@rdf:about' in resource:
            return resource['@rdf:about']
    return ''


def _creator_place(metadata: dict, creator_identifier: str) -> str:
    """Return the Dutch place-of-death label of the creator, or ``''``.

    The creator is looked up among the top-level ``rdf:Description`` nodes of
    ``metadata`` by comparing ``@rdf:about`` with ``creator_identifier``.
    """
    if not creator_identifier:
        return ''
    creator = next(
        (
            node
            for node in _as_list(metadata.get('rdf:Description'))
            if isinstance(node, dict) and node.get('@rdf:about') == creator_identifier
        ),
        None,
    )
    labels = _dig(creator, 'rdaGr2:placeOfDeath', 'edm:Place', 'skos:prefLabel')
    for label in _as_list(labels):
        if isinstance(label, dict) and label.get('@xml:lang') == 'nl':
            return label.get('#text') or ''
    return ''


def extract_records_from_xml(xml_string: str) -> List[Record]:
    """
    Extracts a list of Record instances from an OAI-PMH ListRecords document.

    Records without an ``rdf:RDF`` metadata payload (for example header-only
    records) and records without a usable ``edm:ProvidedCHO`` are skipped.
    A response holding a single ``record`` element is handled the same way
    as one holding many.

    Args:
        xml_string: The XML document to parse; handed unchanged to
            ``xmltodict.parse``.

    Returns:
        A list of Record instances, in document order.

    Raises:
        KeyError: If the document has no ``OAI-PMH``/``ListRecords`` element,
            or a kept record has no ``header``/``identifier``.
    """
    parsed = xmltodict.parse(xml_string)
    list_records = parsed['OAI-PMH']['ListRecords']
    record_list: List[Record] = []

    # xmltodict yields a dict (not a one-item list) for a single <record>.
    for record_data in _as_list(_dig(list_records, 'record')):
        metadata = _dig(record_data, 'metadata', 'rdf:RDF')
        if not isinstance(metadata, dict):
            continue  # header-only record: nothing to extract

        aggregation, provided_cho = _find_provided_cho(metadata)
        if provided_cho is None:
            continue  # Skip if no ProvidedCHO found

        # With several creators the first one carrying a resource URI is used.
        creator_uris = _resource_uris(provided_cho.get('dc:creator'))
        creator_identifier = creator_uris[0] if creator_uris else ''

        record_list.append(
            Record(
                identifier=record_data['header']['identifier'],
                creator_identifier=creator_identifier,
                description=_localized_text(
                    provided_cho.get('dc:description'), keep_last=True
                ),
                title=_localized_text(provided_cho.get('dc:title'), any_language=True),
                created=_localized_text(
                    provided_cho.get('dcterms:created'), any_language=True
                ),
                is_part_off=_resource_uris(provided_cho.get('dcterms:isPartOf')),
                is_referenced_by=_resource_uris(
                    provided_cho.get('dcterms:isReferencedBy')
                ),
                provenance=_localized_text(
                    provided_cho.get('mrel:spn'), keep_last=True
                ),
                img_url=_image_url(aggregation),
                # Only plain-string alternatives are kept.
                alternative_descriptions=[
                    alt
                    for alt in _as_list(provided_cho.get('dcterms:alternative'))
                    if isinstance(alt, str)
                ],
                place=_creator_place(metadata, creator_identifier),
            )
        )

    return record_list

In [100]:
# Parse the body of the `response` fetched earlier into a list of Record models.
records = extract_records_from_xml(response.content)

In [101]:
# Display the parsed records as the cell output.
# NOTE(review): this renders every record; a slice such as records[:5] would keep the output short.
records

[Record(identifier='https://id.rijksmuseum.nl/2001', creator_identifier='https://id.rijksmuseum.nl/21029638', description='Links naast de vrouw staat een paal waaraan een opgespannen doek met blote billen erop hangt. De vrouw leunt op een blok met de titel.', title='Before, Behind, Between, Above, Below', created='1973', is_part_off=['https://id.rijksmuseum.nl/26051'], is_referenced_by=['https://id.rijksmuseum.nl/301102015'], provenance='Piet Clement Collection', img_url='https://iiif.micr.io/KMFvF/full/max/0/default.jpg', alternative_descriptions=['John Donne, Elegie XIX: To his Mistris going to Bed'], place='Laren'),
 Record(identifier='https://id.rijksmuseum.nl/20010', creator_identifier='https://id.rijksmuseum.nl/21029638', description='In het midden trekt een jongen zijn hemd over zijn hoofd uit.', title='Badende jongens', created='1947', is_part_off=['https://id.rijksmuseum.nl/26051'], is_referenced_by=['https://id.rijksmuseum.nl/301102015', 'https://id.rijksmuseum.nl/301267971']

In [102]:
# Show the first record serialised as JSON with 2-space indentation.
print(records[0].model_dump_json(indent=2))

{
  "identifier": "https://id.rijksmuseum.nl/2001",
  "creator_identifier": "https://id.rijksmuseum.nl/21029638",
  "description": "Links naast de vrouw staat een paal waaraan een opgespannen doek met blote billen erop hangt. De vrouw leunt op een blok met de titel.",
  "title": "Before, Behind, Between, Above, Below",
  "created": "1973",
  "is_part_off": [
    "https://id.rijksmuseum.nl/26051"
  ],
  "is_referenced_by": [
    "https://id.rijksmuseum.nl/301102015"
  ],
  "provenance": "Piet Clement Collection",
  "img_url": "https://iiif.micr.io/KMFvF/full/max/0/default.jpg",
  "alternative_descriptions": [
    "John Donne, Elegie XIX: To his Mistris going to Bed"
  ],
  "place": "Laren"
}
