In [67]:
import httpx
import pandas as pd
from pydantic import BaseModel, model_validator
from tqdm.auto import tqdm


In [2]:
# Load the record index (one row per object) and report its dimensions.
records_csv = "../data/bare_records.csv"
df = pd.read_csv(records_csv)
df.shape

(814225, 3)

In [3]:
# Discard every row that has at least one missing value, then re-check the size.
df = df.dropna(axis=0, how="any")
df.shape

(814225, 3)

In [5]:
# Preview the first five rows of the index.
df.head(n=5)

Unnamed: 0,object_id,datestamp,set_spec
0,https://id.rijksmuseum.nl/2001,2025-01-14T19:57:56Z,['26051']
1,https://id.rijksmuseum.nl/20010,2025-01-14T19:34:32Z,['26051']
2,https://id.rijksmuseum.nl/20010000,2025-01-14T12:12:07Z,"['260239', '26051']"
3,https://id.rijksmuseum.nl/200100000,2025-01-16T21:45:21Z,"['260106', '260239']"
4,https://id.rijksmuseum.nl/200100001,2025-01-16T21:45:22Z,"['260106', '260239']"


In [10]:
# Fetch a single record so the structure of the JSON payload can be explored.
# `follow_redirects=True` matches the bulk-download cell further down; without
# it httpx returns a redirect response as-is instead of following it.
with httpx.Client(follow_redirects=True) as client:
    response = client.get(df.object_id.iloc[0])
    # Surface HTTP failures here rather than as an opaque JSON decoding error.
    response.raise_for_status()
    data = response.json()

# Only a trailing top-level expression is rendered as the cell's output, so the
# payload is displayed outside the `with` block.
data

{'@context': 'https://linked.art/ns/v1/linked-art.json',
 'id': 'https://id.rijksmuseum.nl/2001',
 'type': 'HumanMadeObject',
 'produced_by': {'type': 'Production',
  'technique': [{'id': 'https://id.rijksmuseum.nl/220922',
    'type': 'Type',
    'identified_by': [{'type': 'Identifier',
      'assigned_by': [{'type': 'AttributeAssignment',
        'assigned': ['https://id.rijksmuseum.nl/2001'],
        'assigned_property': 'influenced_by'}],
      'content': '3',
      'classified_as': [{'id': 'http://vocab.getty.edu/aat/300456575',
        'type': 'Type'}]}]},
   {'id': 'https://id.rijksmuseum.nl/2202942',
    'type': 'Type',
    'identified_by': [{'type': 'Identifier',
      'assigned_by': [{'type': 'AttributeAssignment',
        'assigned': ['https://id.rijksmuseum.nl/2001'],
        'assigned_property': 'influenced_by'}],
      'content': '1',
      'classified_as': [{'id': 'http://vocab.getty.edu/aat/300456575',
        'type': 'Type'}]}]},
   {'id': 'https://id.rijksmuseum.nl/22

In [None]:
# List the top-level keys of the sample payload fetched above.
data.keys()

dict_keys(['@context', 'id', 'type', 'produced_by', 'subject_of', 'assigned_by', 'identified_by', 'classified_as', 'dimension', 'made_of', 'shows', 'referred_to_by', 'equivalent', 'member_of'])

In [64]:
# Classification id of the first `classified_as` entry on the reference at index 6.
sample_reference = data['referred_to_by'][6]
sample_reference['classified_as'][0]['id']

'http://vocab.getty.edu/aat/300435452'

In [45]:
# Label of the nested classification on the same reference (index 6).
outer_classification = data['referred_to_by'][6]['classified_as'][0]
outer_classification['classified_as'][0]['_label']

'brief text'

In [65]:
class Assigned(BaseModel):
    """Entity listed under an ``assigned`` key.

    ``Record._get_artist_id`` returns the ``id`` of the first entry whose
    ``type`` equals ``"Person"``.
    """
    id: str
    type: str

class AssignedBy(BaseModel):
    """One ``assigned_by`` entry; only its ``assigned`` list is modelled."""
    assigned: list[Assigned]

class Part(BaseModel):
    """One element of ``produced_by.part``; only ``assigned_by`` is modelled."""
    assigned_by: list[AssignedBy]

class ProducedBy(BaseModel):
    """The ``produced_by`` section of a record; ``part`` is a required field."""
    part: list[Part]

class _ClassifiedAs(BaseModel):
    """Nested classification entry whose ``_label`` key is exposed as ``label``.

    The payload key starts with an underscore, so it is copied to the public
    field name ``label`` before field validation runs.
    """
    label: str

    @model_validator(mode="before")
    @classmethod
    def validator(cls, data):
        """Copy ``_label`` to ``label`` ahead of field validation.

        Args:
            data: Raw input handed to the model, normally a dict taken from
                the JSON payload.

        Returns:
            A new dict with ``label`` set from ``_label`` when that key is
            present; otherwise ``data`` unchanged, so pydantic reports a
            missing ``label`` as a regular validation error instead of a raw
            ``KeyError``/``TypeError`` escaping from this hook.
        """
        if isinstance(data, dict) and "_label" in data:
            return {**data, "label": data["_label"]}
        return data
    

class ClassifiedAs(BaseModel):
    """Classification with an ``id`` and its own nested ``classified_as`` list.

    ``Record._get_description`` matches on ``id`` and then on the ``label`` of
    the nested entries.
    """
    id: str
    classified_as: list[_ClassifiedAs]

class ReferredToBy(BaseModel):
    """One ``referred_to_by`` entry: text ``content`` plus its classifications."""
    content: str
    classified_as: list[ClassifiedAs]
    
class Record(BaseModel):
    """Parsed object record with two derived fields.

    ``produced_by`` and ``referred_to_by`` are validated from the payload.
    ``artist_id`` and ``description`` are filled in by the ``after`` validator,
    which overwrites whatever value they held; validation fails when either
    cannot be derived.
    """
    produced_by: ProducedBy
    referred_to_by: list[ReferredToBy]
    artist_id: str = ''
    description: str = ''

    def _get_artist_id(self) -> str:
        """Return the id of the first assigned entity typed ``"Person"``.

        Walks ``produced_by.part[*].assigned_by[*].assigned[*]`` in order.

        Raises:
            ValueError: If no entry has ``type == "Person"``.
        """
        for part in self.produced_by.part:
            for assigned_by in part.assigned_by:
                for assigned in assigned_by.assigned:
                    if assigned.type == "Person":
                        return assigned.id
        raise ValueError("No `artist_id` found.")

    def _get_description(self) -> str:
        """Return the content of the first reference classified as "brief text".

        A reference matches when one of its classifications has the id
        ``http://vocab.getty.edu/aat/300435452`` and that classification has a
        nested entry labelled ``"brief text"``. This is a pure lookup: the
        model is not mutated here; the validator stores the result.

        Raises:
            ValueError: If no reference matches.
        """
        for referred_to_by in self.referred_to_by:
            for classified_as in referred_to_by.classified_as:
                if classified_as.id != 'http://vocab.getty.edu/aat/300435452':
                    continue
                if any(
                    nested.label == "brief text"
                    for nested in classified_as.classified_as
                ):
                    return referred_to_by.content
        raise ValueError("No description found")

    @model_validator(mode="after")
    def validator(self):
        """Populate the derived fields once the raw fields have been validated.

        A ``ValueError`` raised by either helper is reported by pydantic as a
        validation error for the record.
        """
        self.artist_id = self._get_artist_id()
        self.description = self._get_description()
        return self

# Parse the sample payload and show the two derived fields as a tuple.
record = Record.model_validate(data)
(record.artist_id, record.description)

('https://id.rijksmuseum.nl/21029638',
 'Links naast de vrouw staat een paal waaraan een opgespannen doek met blote billen erop hangt. De vrouw leunt op een blok met de titel.')

In [None]:
# Download every record listed in `df` and collect the artist id of each one.
# NOTE(review): the run is fail-fast. An HTTP error, or a record for which
# `Record` validation fails (e.g. no "Person" entry), aborts the whole loop.
artist_ids = []

with httpx.Client(follow_redirects=True) as client:
    for object_id in tqdm(df.object_id.to_list(), desc="Fetching records"):
        # Request the URL of the current row; previously the first row's URL
        # was fetched on every iteration.
        response = client.get(object_id)
        # Fail with the HTTP status rather than an unrelated JSON decode error.
        response.raise_for_status()
        data = response.json()
        record = Record.model_validate(data)
        artist_ids.append(record.artist_id)

  0%|          | 0/814225 [00:00<?, ?it/s]