In [7]:
import zipfile, json, pandas as pd, pathlib

# Unpack the archive; the context manager closes the zip handle even if extraction fails.
with zipfile.ZipFile("data.json.zip") as archive:
    archive.extractall("data")

json_path = pathlib.Path("data/data.json")

with json_path.open(encoding="utf-8") as f:
    obj = json.load(f)

# Accept both a dict (the complete API response, records under "data") and a bare
# list of records, so collaborators working with either shape can run this cell.
raw = obj["data"] if isinstance(obj, dict) else obj
print(f"{len(raw)} records in total")

# Flatten nested fields into dotted column names (e.g. "dimensions.framed.height").
df = pd.json_normalize(raw)
df.head()

68076 records in total


Unnamed: 0,id,accession_number,share_license_status,tombstone,current_location,title,title_in_original_language,series,series_in_original_language,creation_date,...,dimensions.chain.height,dimensions.overall as mounted (both pieces).height,dimensions.overall as mounted (both pieces).width,dimensions.sheet (irregular).height,dimensions.sheet (irregular).width,dimensions.portfolio box.height,dimensions.portfolio box.width,dimensions.portfolio box.depth,dimensions.with mounting.height,dimensions.with mounting.width
0,74228,2020.113,CC0,"Fishmarket, 1902. Camille Pissarro (French, 18...",,Fishmarket,,,,1902,...,,,,,,,,,,
1,74539,2015.449,CC0,"A Miller's Carriage, c. 1895. Albert-Charles L...",,A Miller's Carriage,,,,c. 1895,...,,,,,,,,,,
2,74540,2015.451,CC0,"Leda and the Swan, c. 1846–83. Adolphe Yvon (F...",,Leda and the Swan,,,,c. 1846–83,...,,,,,,,,,,
3,74551,2018.1059,CC0,"The Monks, c. 1802–30. François Marius Granet ...",,The Monks,,,,c. 1802–30,...,,,,,,,,,,
4,74553,2018.106,CC0,"Study Sheet, c. 1870–80. Alfred Dehodencq (Fre...",,Study Sheet,,,,c. 1870–80,...,,,,,,,,,,


In [8]:
pip install rdflib SPARQLWrapper pandas tqdm requests

Note: you may need to restart the kernel to use updated packages.


In [12]:
# Flatten the raw records again, report how many columns that yields,
# and show every column name as the cell's output.
df = pd.json_normalize(raw)
print("total columns: ", df.shape[1])
list(df.columns)

total columns:  954


['id',
 'accession_number',
 'share_license_status',
 'tombstone',
 'current_location',
 'title',
 'title_in_original_language',
 'series',
 'series_in_original_language',
 'creation_date',
 'creation_date_earliest',
 'creation_date_latest',
 'artists_tags',
 'culture',
 'technique',
 'support_materials',
 'department',
 'collection',
 'type',
 'measurements',
 'state_of_the_work',
 'edition_of_the_work',
 'copyright',
 'inscriptions',
 'provenance',
 'find_spot',
 'related_works',
 'former_accession_numbers',
 'did_you_know',
 'description',
 'citations',
 'catalogue_raisonne',
 'url',
 'alternate_images',
 'creditline',
 'sketchfab_id',
 'sketchfab_url',
 'gallery_donor_text',
 'creators',
 'updated_at',
 'dimensions.unframed.height',
 'dimensions.unframed.width',
 'dimensions.framed.height',
 'dimensions.framed.width',
 'dimensions.framed.depth',
 'exhibitions.current',
 'exhibitions.legacy',
 'external_resources.wikidata',
 'external_resources.internet_archive',
 'images.annotation

In [68]:
from __future__ import annotations
from typing import List, Dict
import re, time, requests
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, DC, FOAF, OWL
from SPARQLWrapper import SPARQLWrapper, JSON

# Base namespace for the URIs this notebook mints (artwork nodes, classes, custom properties)
EX   = Namespace("http://example.org/cma/")
# SPARQL endpoint URL handed to SPARQLWrapper for the Wikidata queries
WDQS = "https://query.wikidata.org/sparql"

class Artifact:
    """One artwork record with RDF export, Wikidata enrichment and a similarity search.

    ``rec`` may be a nested record (as loaded from the JSON file) or a flattened
    one (a ``pd.json_normalize`` row converted to a dict, where nested fields
    live under dotted keys such as ``"images.web.url"``).

    Attributes set by ``__init__``:
        raw        the record passed in (``wikidata_enrich`` adds a key to it)
        id, title  taken from the record; title falls back to "(no title)"
        creator    description of the first creator, or "unknown artist"
        date_text  creation date text, or a range built from earliest/latest
        date_num   a single year (int) derived from the date, or None
        wd_url     first Wikidata URL of the record, or None
        qid        trailing "Q<digits>" of ``wd_url``, or None
        img        web image URL, or None
    """

    BASE = "https://openaccess-api.clevelandart.org/api/artworks"

    @staticmethod
    def _is_missing(value) -> bool:
        """True for None and for float NaN (how pandas marks absent cells)."""
        return value is None or (isinstance(value, float) and value != value)

    @staticmethod
    def _as_year(value):
        """Return ``value`` as an int year, or None if it is absent or not a whole number."""
        if value is None or isinstance(value, bool):
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        # NaN and infinities are not integers, so they fall through to None.
        return int(number) if number.is_integer() else None

    @staticmethod
    def _lookup(rec, *path):
        """Read a nested field, falling back to its flattened dotted key; None if absent."""
        node = rec
        for key in path:
            if not isinstance(node, dict) or key not in node:
                node = None
                break
            node = node[key]
        if Artifact._is_missing(node):
            node = rec.get(".".join(path))
        return None if Artifact._is_missing(node) else node

    def __init__(self, rec: Dict):
        """Extract the fields used by the other methods from ``rec``.

        Raises:
            KeyError: if ``rec`` has no "id" key.
        """
        self.raw = rec
        self.id = rec["id"]

        title = rec.get("title")
        self.title = "(no title)" if self._is_missing(title) else title

        # For simplicity, only read the description of the first creator when there are multiple creators
        creators = rec.get("creators")
        if isinstance(creators, (list, tuple)) and creators:
            self.creator = creators[0].get("description") or "unknown artist"
        else:
            # No creators listed: the art object is from an unknown creator.
            self.creator = "unknown artist"

        # exact creation date or timeframe
        earl = self._as_year(rec.get("creation_date_earliest"))
        late = self._as_year(rec.get("creation_date_latest"))
        first_bound = earl if earl is not None else late

        cdate = rec.get("creation_date")
        if isinstance(cdate, str) and cdate.strip():
            self.date_text = cdate.strip()
            m = re.search(r"\b(-?\d{1,4})\b", self.date_text)
            # Texts such as "1100s–1200s" contain no standalone number; use the
            # numeric bounds instead of giving up.
            self.date_num = int(m.group()) if m else first_bound
        else:
            if earl is not None and late is not None:
                self.date_text = f"{earl}–{late}"
            elif late is not None:
                self.date_text = f"?–{late}"
            elif earl is not None:
                self.date_text = f"{earl}–?"
            else:
                self.date_text = "unknown date"
            # Take the year straight from the bound so negative years keep their sign.
            self.date_num = first_bound

        wd_list = self._lookup(rec, "external_resources", "wikidata") or []
        if isinstance(wd_list, str):
            wd_list = [wd_list]
        self.wd_url = wd_list[0] if wd_list else None
        # A URL that does not end in "Q<digits>" yields qid=None instead of an AttributeError.
        qid_match = re.search(r"Q\d+$", self.wd_url) if isinstance(self.wd_url, str) else None
        self.qid = qid_match.group(0) if qid_match else None

        self.img = self._lookup(rec, "images", "web", "url")

    # 1) RDF
    def to_rdf(self) -> "Graph":
        """Build a small rdflib graph describing this artwork.

        The artwork node is ``EX["artwork/<id>"]`` typed ``EX.Artwork`` with a
        label and a date; creator and Wikidata links are added only when known.
        """
        g = Graph()
        g.bind("ex", EX); g.bind("dc", DC); g.bind("foaf", FOAF); g.bind("owl", OWL)

        art = EX[f"artwork/{self.id}"]

        g.add((art, RDF.type, EX.Artwork)) # S, P, O
        g.add((art, RDFS.label, Literal(self.title)))
        if self.creator != "unknown artist":
            g.add((art, DC.creator, Literal(self.creator)))
        g.add((art, DC.date, Literal(self.date_text)))
        if self.wd_url:
            g.add((art, FOAF.isPrimaryTopicOf, URIRef(self.wd_url)))
            # The /entity/ form of the same URL is asserted as owl:sameAs.
            g.add((art, OWL.sameAs,
                   URIRef(self.wd_url.replace("/wiki/", "/entity/"))))
        return g

    # 2) Wikidata enrichment
    def wikidata_enrich(self, sleep: float = 1.0) -> Dict[str, List[str]]:
        """Query Wikidata for P136 / P180 / P186 values of this artwork.

        Args:
            sleep: seconds to pause after the request (rate limiting when
                enriching many records in a row); 0 disables the pause.

        Returns:
            ``{property label: sorted unique value labels}``; ``{}`` when the
            record has no QID. The result is also stored on
            ``self.raw["wikidata_enrichment"]``.

        Errors raised by SPARQLWrapper while querying are not caught here.
        """
        if not self.qid:
            return {}

        agent_str = (
            "ACTH25-project/0.1 "
            "(https://github.com/yfgann/ACTH25_project; "
            "mailto:annyufeiguo@gmail.com)"
        )

        sparql = SPARQLWrapper(WDQS, agent=agent_str)
        sparql.setReturnFormat(JSON)
        # NOTE(review): ?propStmt is built from the prop/statement/ namespace and ?propPS
        # from prop/statement/value/; Wikidata's statement path is normally p: (prop/)
        # followed by ps: (prop/statement/), so the second UNION branch may never match.
        # Confirm against the Wikibase RDF format before relying on it.
        sparql.setQuery(f"""
        SELECT ?pid ?pidLabel ?valLabel WHERE {{
            VALUES ?propPid {{ "P136" "P180" "P186" }}
            BIND(IRI(CONCAT("http://www.wikidata.org/entity/", ?propPid)) AS ?pidEntity)
            BIND(IRI(CONCAT("http://www.wikidata.org/prop/direct/", ?propPid)) AS ?propDir)
            BIND(IRI(CONCAT("http://www.wikidata.org/prop/statement/", ?propPid)) AS ?propStmt)
            BIND(IRI(CONCAT("http://www.wikidata.org/prop/statement/value/", ?propPid)) AS ?propPS)

            {{
                wd:{self.qid} ?propDir ?val .
            }}
            UNION
            {{
                wd:{self.qid} ?propStmt ?st .
                ?st ?propPS ?val .
            }}

            SERVICE wikibase:label {{
                bd:serviceParam wikibase:language "en". 
                ?pidEntity rdfs:label ?pidLabel .
                ?val       rdfs:label ?valLabel .
            }}
            
            BIND(?propPid AS ?pid)
        }}
        """)

        data = sparql.query().convert()
        if sleep and sleep > 0:
            # Honour the documented pause; previously the parameter was ignored.
            time.sleep(sleep)

        bag: Dict[str, List[str]] = {}   # {'genre': [...], 'depicts': [...], …}

        for row in data["results"]["bindings"]:
            prop  = row.get("pidLabel", {}).get("value") # genre / depicts / made from material
            value = row.get("valLabel", {}).get("value")

            if prop and value:
                bag.setdefault(prop, []).append(value)

        # Both UNION branches can return the same value, so de-duplicate.
        clean = {k: sorted(set(v)) for k, v in bag.items()}
        self.raw["wikidata_enrichment"] = clean

        return clean


    # 3) Look for similar artworks
    def similar_artworks(
        self,
        limit: int = 10,          # up to 10 similar artworks
        year_window: int = 20,    # timeframe ±20
        filter_author: bool = False,
        filter_material: bool = False,
        extra_terms: "List[str] | None" = None,
        pause: float = 0.3
    ) -> list:
        """Search the artworks API for works created around the same year.

        Args:
            limit: maximum number of records requested.
            year_window: the search spans ``date_num ± year_window``.
            filter_author: add the creator description to the keyword query.
            filter_material: add the "made from material" values stored by
                ``wikidata_enrich`` (empty unless that method ran first).
            extra_terms: additional keywords for the query.
            pause: seconds to sleep after a successful request.

        Returns:
            The "data" list of the API response; ``[]`` when no year could be
            derived for this artwork or when the request fails (the error is
            printed, not raised).
        """
        if self.date_num is None:
            return []

        query_terms: List[str] = []

        # NOTE(review): a later cell in this notebook gets [] with filter_author=True
        # for a record that has a creator; the full description (name plus
        # nationality/dates) may be too specific for `q` — confirm.
        if filter_author and self.creator != "unknown artist":
            query_terms.append(self.creator)

        if filter_material:
            mats = self.raw.get("wikidata_enrichment", {}).get("made from material", [])
            query_terms.extend(mats)

        if extra_terms:
            query_terms.extend(extra_terms)

        q_string = " ".join(query_terms) if query_terms else None

        center_year = self.date_num

        params = {
            "created_after": center_year - year_window,
            "created_before": center_year + year_window,
            "limit": limit
        }

        if q_string:
            params["q"] = q_string

        try:
            r = requests.get(self.BASE, params=params, timeout=15)
            r.raise_for_status()
            time.sleep(pause)
            return r.json().get("data", [])
        except requests.RequestException as e:
            print("[similar_artworks_by_year] API error:", e)
            return []

In [69]:
# pprint is not imported anywhere else in the notebook; without this line the
# cell raises NameError on a fresh kernel.
import pprint

art = Artifact(raw[0])

print("\n=== Basic information ===")
print("ID        :", art.id)
print("Title     :", art.title)
print("Creator   :", art.creator)
print("Date      :", art.date_text)
print("QID       :", art.qid)
print("Image URL :", art.img)

# Show only the first ten lines of the Turtle serialization.
g: Graph = art.to_rdf()
ttl_snippet = g.serialize(format="turtle").splitlines()[:10]
print("\n=== Turtle preview ===")
print("\n".join(ttl_snippet))

enrich = art.wikidata_enrich(sleep=0) 
print("\n=== Wikidata Enrichment ===")
pprint.pp(enrich)

sim = art.similar_artworks(limit=3, pause=0)
print("\n=== Similar artworks ===")
pprint.pp(sim)


=== Basic information ===
ID        : 74228
Title     : Fishmarket
Creator   : Camille Pissarro (French, 1830–1903)
Date      : 1902
QID       : Q87480807
Image URL : https://openaccess-cdn.clevelandart.org/2020.113/2020.113_web.jpg

=== Turtle preview ===
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix ex: <http://example.org/cma/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://example.org/cma/artwork/74228> a ex:Artwork ;
    rdfs:label "Fishmarket" ;
    dc:creator "Camille Pissarro (French, 1830–1903)" ;
    dc:date "1902" ;

=== Wikidata Enrichment ===
{'genre': ['cityscape'],
 'depicts': ['fish market'],
 'made from material': ['canvas', 'oil paint']}

=== Similar artworks ===
[{'id': 102578,
  'accession_number': '1921.1239',
  'share_license_status': 'CC0',
  'tombstone': 'Portrait of Dora Wheeler, 1882–83. William Merritt Chase '
               '(American,

In [77]:
art = Artifact(raw[0])

# Similar artworks by creation year only (default ±20-year window, no keyword filter)
arts = art.similar_artworks(limit=5)
print(arts)

[{'id': 102578, 'accession_number': '1921.1239', 'share_license_status': 'CC0', 'tombstone': 'Portrait of Dora Wheeler, 1882–83. William Merritt Chase (American, 1849–1916). Oil on canvas; framed: 180.6 x 188.6 x 11 cm (71 1/8 x 74 1/4 x 4 5/16 in.); unframed: 159.8 x 166.4 cm (62 15/16 x 65 1/2 in.). The Cleveland Museum of Art, Gift of Mrs. Boudinot Keith in memory of  Mr. and Mrs. J. H. Wade, 1921.1239', 'current_location': '208 American Gilded Age and Realism', 'title': 'Portrait of Dora Wheeler', 'creation_date': '1882–83', 'creation_date_earliest': 1882, 'creation_date_latest': 1883, 'artists_tags': ['male'], 'culture': ['America'], 'technique': 'oil on canvas', 'support_materials': [], 'department': 'American Painting and Sculpture', 'collection': 'American - Painting', 'type': 'Painting', 'measurements': 'Framed: 180.6 x 188.6 x 11 cm (71 1/8 x 74 1/4 x 4 5/16 in.); Unframed: 159.8 x 166.4 cm (62 15/16 x 65 1/2 in.)', 'dimensions': {'framed': {'height': 1.806, 'width': 1.886, '

In [78]:
# Same year-based search, additionally sending the creator description as the keyword query
arts = art.similar_artworks(limit=10, filter_author=True)
print(arts)

[]


In [80]:
# Run the enrichment first: filter_material reads the "made from material" list
# that wikidata_enrich() stores on the record.
art.wikidata_enrich()
arts = art.similar_artworks(
    year_window=10,
    filter_material=True,
)
print(arts)

[{'id': 140161, 'accession_number': '1964.160', 'share_license_status': 'CC0', 'tombstone': "A Woman's Work, 1912. John Sloan (American, 1871–1951). Oil on canvas; framed: 97.2 x 82.2 x 6.4 cm (38 1/4 x 32 3/8 x 2 1/2 in.); unframed: 80.3 x 65.4 cm (31 5/8 x 25 3/4 in.). The Cleveland Museum of Art, Gift of Amelia Elizabeth White 1964.160", 'current_location': '208 American Gilded Age and Realism', 'title': "A Woman's Work", 'creation_date': '1912', 'creation_date_earliest': 1912, 'creation_date_latest': 1912, 'artists_tags': ['male'], 'culture': ['America'], 'technique': 'oil on canvas', 'support_materials': [], 'department': 'American Painting and Sculpture', 'collection': 'American - Painting', 'type': 'Painting', 'measurements': 'Framed: 97.2 x 82.2 x 6.4 cm (38 1/4 x 32 3/8 x 2 1/2 in.); Unframed: 80.3 x 65.4 cm (31 5/8 x 25 3/4 in.)', 'dimensions': {'framed': {'height': 0.972, 'width': 0.822, 'depth': 0.064}, 'unframed': {'height': 0.803, 'width': 0.654}}, 'state_of_the_work': No

In [98]:
# Keywords looked for in the "type" and "technique" columns (currently the same four terms).
type_kw = ["porcelain", "ceramic", "pottery", "earthenware"]
tech_kw = list(type_kw)

# One alternation pattern per column.
type_regex, tech_regex = ("|".join(words) for words in (type_kw, tech_kw))

# Case-insensitive matches; rows with a missing value count as non-matching.
mask_type = df["type"].str.contains(type_regex, regex=True, na=False, case=False)
mask_tech = df["technique"].str.contains(tech_regex, regex=True, na=False, case=False)

# Keep only rows that match on both columns.
ceramic_df = df.loc[mask_type & mask_tech]
print("Number of ceramic:", ceramic_df.shape[0])

Number of ceramic: 2708


In [99]:
# Preview the first rows of the ceramic subset
ceramic_df.head()

Unnamed: 0,id,accession_number,share_license_status,tombstone,current_location,title,title_in_original_language,series,series_in_original_language,creation_date,...,dimensions.chain.height,dimensions.overall as mounted (both pieces).height,dimensions.overall as mounted (both pieces).width,dimensions.sheet (irregular).height,dimensions.sheet (irregular).width,dimensions.portfolio box.height,dimensions.portfolio box.width,dimensions.portfolio box.depth,dimensions.with mounting.height,dimensions.with mounting.width
41,74788,2022.107,CC0,"Vase, c. 1890–99. House of Fabergé (Russian, 1...",211 Fabergé,Vase,,,,c. 1890–99,...,,,,,,,,,,
54,75573,2020.18,CC0,"Yellow-Glazed Bowl, 1505–21. China, Jiangxi pr...",,Yellow-Glazed Bowl,,,,1505–21,...,,,,,,,,,,
55,75575,2020.179,CC0,"Baluster Jar and Cover, 1600–1650. China, Jian...",,Baluster Jar and Cover,,,,1600–1650,...,,,,,,,,,,
59,76522,2020.183,CC0,"Dish with Carved Floral Design, 1100s–1200s. S...",,Dish with Carved Floral Design,,,,1100s–1200s,...,,,,,,,,,,
61,76524,2020.185,CC0,"White-Glazed Jar with Hidden Design, 1500s–160...",,White-Glazed Jar with Hidden Design,,,,1500s–1600s,...,,,,,,,,,,


In [89]:
class Ceramic(Artifact):
    """Artifact specialised for ceramics.

    Adds the record's ``technique`` and ``type``, types the RDF node as
    ``EX.Ceramic``, remembers the first Wikidata material, and always adds
    ceramic keywords to the similarity search.
    """

    def __init__(self, rec: Dict):
        """Read the common fields via ``Artifact`` plus ``technique`` and ``type``."""
        super().__init__(rec)
        self.technique = rec.get("technique")
        self.type = rec.get("type")
        # Filled by wikidata_enrich(); defined here so reading it earlier does not
        # raise AttributeError.
        self.material = None

    def to_rdf(self):
        """Return the parent's graph with the node typed ``EX.Ceramic`` instead of ``EX.Artwork``.

        ``EX.technique`` / ``EX.type`` literals are added when the record has them.
        """
        g = super().to_rdf()
        art = EX[f"artwork/{self.id}"]
        # Swap the generic type for the specific one.
        g.remove((art, RDF.type, EX.Artwork))
        g.add((art, RDF.type, EX.Ceramic))

        if self.technique:
            g.add((art, EX.technique, Literal(self.technique)))
        if self.type:
            g.add((art, EX.type, Literal(self.type)))

        return g

    def wikidata_enrich(self, sleep=0.5):
        """Run the parent's enrichment and keep the first material in ``self.material``.

        Returns the parent's result unchanged; ``self.material`` is None when no
        "made from material" values came back.
        """
        data = super().wikidata_enrich(sleep)
        mats = data.get("made from material", [])
        self.material = mats[0] if mats else None
        return data

    def similar_artworks(
        self,
        limit=10,
        year_window=20,
        filter_author=False,
        filter_material=False,
        extra_terms=None,
        pause=0.3,
    ):
        """Same as ``Artifact.similar_artworks`` with "Ceramic" and "porcelain" appended to the keywords.

        The parent's signature is mirrored so positional calls and an explicit
        ``extra_terms=None`` work the same way as on the parent.
        """
        # Copy the caller's terms so their list is never mutated; None means no extras.
        terms = list(extra_terms) if extra_terms else []
        terms += ["Ceramic", "porcelain"]
        return super().similar_artworks(
            limit=limit,
            year_window=year_window,
            filter_author=filter_author,
            filter_material=filter_material,
            extra_terms=terms,
            pause=pause,
        )

In [105]:
# Use the original nested record rather than the flattened DataFrame row: the row
# only has dotted keys ("external_resources.wikidata", "images.web.url"), so the
# nested lookups for the Wikidata link and image would find nothing.
# df was built from `raw` with a default index, so the row label is the position in `raw`.
rec = raw[ceramic_df.index[0]]
cer = Ceramic(rec)

print("ID       :", cer.id)
print("Title    :", cer.title)
print("Technique:", cer.technique)
print("Type     :", cer.type)

ID       : 74788
Title    : Vase
Technique: ceramic, silver-gilt
Type     : Ceramic


In [107]:
enrich = cer.wikidata_enrich(sleep=0)    # single test call, so no pause is needed
print("Wikidata Enrichment:", enrich)

similar = cer.similar_artworks(limit=5, year_window=10, pause=0)
# similar_artworks returns [] on an API error or a missing date; reindex keeps the
# three columns without raising KeyError on that empty result.
pd.json_normalize(similar).reindex(columns=["id", "title", "technique"])

Wikidata Enrichment: {}


Unnamed: 0,id,title,technique
0,144614,Plate with Bird and Flower,Porcelain with overglaze enamel (Ko-Kutani rev...
1,447729,Incense Burner,Porcelain with tea-leaf colored glaze and mold...
2,151896,Vase,porcelain
3,168486,Cabinet Plate,porcelain with enamel pâte-sure-pâte and gilt ...
4,151897,Covered Vase,porcelain with pâte-sur-pâte decoration
