In [7]:
import zipfile, json, pandas as pd, pathlib

zipfile.ZipFile("data.json.zip").extractall("data")

json_path = pathlib.Path("data/data.json")

with json_path.open(encoding="utf-8") as f:
    obj = json.load(f)

raw = obj["data"] if isinstance(obj, dict) else obj      # Supports both dictionary and list to avoid situations where some collaborators work with the complete API response while others work with a pure list.
print(f"{len(raw)} records in total")

df = pd.json_normalize(raw)
df.head()

68076 records in total


Unnamed: 0,id,accession_number,share_license_status,tombstone,current_location,title,title_in_original_language,series,series_in_original_language,creation_date,...,dimensions.chain.height,dimensions.overall as mounted (both pieces).height,dimensions.overall as mounted (both pieces).width,dimensions.sheet (irregular).height,dimensions.sheet (irregular).width,dimensions.portfolio box.height,dimensions.portfolio box.width,dimensions.portfolio box.depth,dimensions.with mounting.height,dimensions.with mounting.width
0,74228,2020.113,CC0,"Fishmarket, 1902. Camille Pissarro (French, 18...",,Fishmarket,,,,1902,...,,,,,,,,,,
1,74539,2015.449,CC0,"A Miller's Carriage, c. 1895. Albert-Charles L...",,A Miller's Carriage,,,,c. 1895,...,,,,,,,,,,
2,74540,2015.451,CC0,"Leda and the Swan, c. 1846–83. Adolphe Yvon (F...",,Leda and the Swan,,,,c. 1846–83,...,,,,,,,,,,
3,74551,2018.1059,CC0,"The Monks, c. 1802–30. François Marius Granet ...",,The Monks,,,,c. 1802–30,...,,,,,,,,,,
4,74553,2018.106,CC0,"Study Sheet, c. 1870–80. Alfred Dehodencq (Fre...",,Study Sheet,,,,c. 1870–80,...,,,,,,,,,,


In [8]:
pip install rdflib SPARQLWrapper pandas tqdm requests

Note: you may need to restart the kernel to use updated packages.


In [12]:
df = pd.json_normalize(raw)
print("total columns: ", len(df.columns))
df.columns.tolist()  # Preview the columns and output as list

total columns:  954


['id',
 'accession_number',
 'share_license_status',
 'tombstone',
 'current_location',
 'title',
 'title_in_original_language',
 'series',
 'series_in_original_language',
 'creation_date',
 'creation_date_earliest',
 'creation_date_latest',
 'artists_tags',
 'culture',
 'technique',
 'support_materials',
 'department',
 'collection',
 'type',
 'measurements',
 'state_of_the_work',
 'edition_of_the_work',
 'copyright',
 'inscriptions',
 'provenance',
 'find_spot',
 'related_works',
 'former_accession_numbers',
 'did_you_know',
 'description',
 'citations',
 'catalogue_raisonne',
 'url',
 'alternate_images',
 'creditline',
 'sketchfab_id',
 'sketchfab_url',
 'gallery_donor_text',
 'creators',
 'updated_at',
 'dimensions.unframed.height',
 'dimensions.unframed.width',
 'dimensions.framed.height',
 'dimensions.framed.width',
 'dimensions.framed.depth',
 'exhibitions.current',
 'exhibitions.legacy',
 'external_resources.wikidata',
 'external_resources.internet_archive',
 'images.annotation

In [50]:
from __future__ import annotations
from typing import List, Dict
import re, time, requests
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, DC, FOAF, OWL
from SPARQLWrapper import SPARQLWrapper, JSON

EX   = Namespace("http://example.org/cma/")
WDQS = "https://query.wikidata.org/sparql"

class Artifact:
    BASE = "https://openaccess-api.clevelandart.org/api/artworks"
    
    def __init__(self, rec: Dict):
        self.raw = rec
        self.id = rec["id"]
        self.title = rec.get("title", "(no title)")
        
        # For simplicity, only read the description of the first creator when there are multiple creators
        creators = rec.get("creators") or []
        if creators:
            self.creator = creators[0].get("description", "unknown artist")
        else:
            self.creator = "unknown artist" # If the list is empty, the art object is from unknown creator, output "unknown artist".
        
        # exact creation date or timeframe
        cdate = rec.get("creation_date")
        if cdate and isinstance(cdate, str) and cdate.strip():
            self.date_text = cdate.strip()
        else:
            earl = rec.get("creation_date_earliest")
            late = rec.get("creation_date_latest")
            if earl is not None and late is not None:
                self.date_text = f"{earl}–{late}"    
            elif earl is None and late is not None:
                self.date_text = f"?–{late}"    
            elif earl is not None and late is None:
                self.date_text = f"{earl}–?"
            else:
                self.date_text = "unknown date"
        m = re.search(r"\b(-?\d{3,4})\b", self.date_text)
        self.date_num: int | None = int(m.group()) if m else None
            
        wd_list = rec.get("external_resources", {}).get("wikidata", [])
        if wd_list:
            self.wd_url = wd_list[0]
            m = re.search(r"Q\d+$", self.wd_url)
            self.qid = m.group(0) if m else None
        else:
            self.wd_url = None
            self.qid = None
            
        self.img = rec.get("images", {}).get("web", {}).get("url")

    # 1) RDF
    def to_rdf(self) -> Graph:
        g = Graph()
        g.bind("ex", EX); g.bind("dc", DC); g.bind("foaf", FOAF)
        
        art = EX[f"artwork/{self.id}"]
        
        g.add((art, RDF.type, EX.Artwork)) # S, P, O
        g.add((art, RDFS.label, Literal(self.title)))
        if self.creator != "unknown artist":
            g.add((art, DC.creator, Literal(self.creator)))
        g.add((art, DC.date, Literal(self.date_text)))
        if self.wd_url:
            g.add((art, FOAF.isPrimaryTopicOf, URIRef(self.wd_url)))

        return g

    # 2) Wikidata enrichment
    def wikidata_enrich(self, sleep: float = 1.0) -> Dict[str, List[str]]:
    
        if not self.qid:
            return {}
        
        agent_str = (
            "ACTH25-project/0.1 "
            "(https://github.com/yfgann/ACTH25_project; "
            "mailto:annyufeiguo@gmail.com)"
        )
        
        sparql = SPARQLWrapper(WDQS, agent=agent_str)
        sparql.setReturnFormat(JSON)
        sparql.setQuery(f"""
        SELECT ?pid ?valLabel WHERE {{
            VALUES ?propPid {{ "P136" "P186" "P180" }}
            BIND(IRI(CONCAT("http://www.wikidata.org/prop/direct/", ?propPid))  AS ?propDir)
            BIND(IRI(CONCAT("http://www.wikidata.org/prop/statement/", ?propPid)) AS ?propStmt)
            BIND(IRI(CONCAT("http://www.wikidata.org/prop/statement/value/", ?propPid)) AS ?propPS)

            {{
                wd:{self.qid} ?propDir ?val .
            }}
            UNION
            {{
                wd:{self.qid} ?propStmt ?st .
                ?st ?propPS ?val .
            }}

            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
            BIND(?propPid AS ?pid)
        }}
        """)
        
        data = sparql.query().convert()
        
        pid2key = {"P136": "P136", "P186": "P186", "P180": "P180"}
        bag = {"P136": [], "P186": [], "P180": []}
        
        for row in data["results"]["bindings"]:
            pid = row["pid"]["value"]
            lbl = row.get("valLabel", {}).get("value")
            
            if pid in bag and lbl:
                bag[pid].append(lbl)
                
        clean = {k: sorted(set(v)) for k, v in bag.items()}
        self.raw["wikidata_enrichment"] = clean

        return clean


    # 3) Look for similar artworks
    def similar_artworks(
        self,
        limit: int = 10,          # up to 10 similar artworks
        year_window: int = 20,    # timeframe ±20
        use_material: bool = False,
        pause: float = 0.3
    ) -> list:
        
        m = re.search(r"\b(-?\d{3,4})\b", self.date_text)
        if m:
            center_year = int(m.group(1))
        else:
            print("not enough information, unable to find similar artworkds")
            return []
        
        params = {
            "q": self.creator,                       # same artist
            "created_after": center_year - year_window,
            "created_before": center_year + year_window,
            "limit": limit
        }
        
        if use_material and "wikidata_enrichment" in self.raw:
            mats = self.raw["wikidata_enrichment"].get("P186", [])
            if mats:
                params["q"] += " " + " ".join(mats)
                
        r = requests.get(self.BASE, params=params, timeout=30)
        r.raise_for_status()
        time.sleep(pause)
        return r.json().get("data", [])   

In [52]:
art = Artifact(raw[0])

print("\n=== Basic fields ===")
print("ID        :", art.id)
print("Title     :", art.title)
print("Creator   :", art.creator)
print("Date text :", art.date_text)
print("QID       :", art.qid)
print("Image URL :", art.img)

g: Graph = art.to_rdf()
ttl_snippet = g.serialize(format="turtle").splitlines()[:10]
print("\n=== Turtle preview ===")
print("\n".join(ttl_snippet))

enrich = art.wikidata_enrich(sleep=0) 
print("\n=== Enrichment (Wikidata) ===")
pprint.pp(enrich)

sim = art.similar_artworks(limit=3, pause=0)
print("\n=== Similar artworks (3) ===")
for s in sim:
    print(f"{s['id']:>8}  {s['title']}")


=== Basic fields ===
ID        : 74228
Title     : Fishmarket
Creator   : Camille Pissarro (French, 1830–1903)
Date text : 1902
QID       : Q87480807
Image URL : https://openaccess-cdn.clevelandart.org/2020.113/2020.113_web.jpg

=== Turtle preview ===
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix ex: <http://example.org/cma/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://example.org/cma/artwork/74228> a ex:Artwork ;
    rdfs:label "Fishmarket" ;
    dc:creator "Camille Pissarro (French, 1830–1903)" ;
    dc:date "1902" ;
    foaf:isPrimaryTopicOf <https://www.wikidata.org/wiki/Q87480807> .

=== Enrichment (Wikidata) ===
{'P136': ['cityscape'],
 'P186': ['canvas', 'oil paint'],
 'P180': ['fish market']}

=== Similar artworks (3) ===
