In [3]:
import requests
import json
import time
import os
from pathlib import Path
import logging
from typing import List, Dict
import pandas as pd
import csv

In [5]:
# Root URL of the Met Museum collection API (v1); the request URLs used in
# this notebook are built by appending a path to it.
met_base_url = "https://collectionapi.metmuseum.org/public/collection/v1"

In [6]:
# Department whose objects are harvested; sent to the API as the
# `departmentIds` query parameter.
department_id = 17 # Medieval Art

In [7]:
def get_object_ids() -> List[int]:
    """Return the IDs of every object in the configured department.

    Requests ``{met_base_url}/objects?departmentIds={department_id}`` and
    collects the ``objectIDs`` array of each response, following a ``next``
    link for as long as a response supplies one.

    Returns:
        The collected object IDs in the order the API returned them; an
        empty list when a response carries no IDs.

    Raises:
        requests.HTTPError: If a response has an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    object_ids = []
    url = f"{met_base_url}/objects?departmentIds={department_id}"
    while url:
        # Bound the wait so a stalled connection cannot hang the notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        # Guard against a missing or null `objectIDs` (e.g. an empty result
        # set) instead of failing inside extend().
        object_ids.extend(data.get("objectIDs") or [])
        url = data.get("next", None)
    return object_ids


# Fetch the department's object IDs once; later cells iterate over this list.
object_ids = get_object_ids()

# Report how many IDs came back, plus the first five as a quick sanity check.
id_count = len(object_ids)
id_preview = object_ids[:5]
print("Found {} object IDs".format(id_count))
print("{}...".format(id_preview))

Found 7136 object IDs
[32830, 32831, 32832, 32833, 32834]...


In [8]:
# Keys copied out of each object's JSON record. The same list, in this order,
# also forms the header row of the CSV file.
object_fields = [
    "objectID",
    "accessionNumber",
    "primaryImage",
    "primaryImageSmall",
    "objectName",
    "title",
    "artistDisplayName",
    "objectDate",
    "medium",
    "classification",
    "objectURL",
]

In [9]:
# Make sure the CSV that accumulates object records exists: a new file is
# seeded with just the header row, an existing file is left untouched.
csv_file = Path("met_objects.csv")
if not csv_file.exists():
    header_row = ",".join(object_fields) + "\n"
    csv_file.write_text(header_row)

In [10]:
def get_object_info(object_id: str) -> Dict:
    """Fetch one object's record and keep only the fields in `object_fields`.

    Args:
        object_id: Identifier of the object. It is only interpolated into
            the request URL, so an int works as well as a str.

    Returns:
        A dict with one entry per name in `object_fields`, in that order;
        fields missing from the API response map to None.

    Raises:
        requests.HTTPError: If the response has an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    url = f"{met_base_url}/objects/{object_id}"
    # Bound the wait so a stalled connection cannot hang the caller.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Named `record` rather than `json` so the imported json module is not
    # shadowed inside this function.
    record = response.json()

    # only return the fields we want
    return {field: record.get(field, None) for field in object_fields}

In [12]:
# Spot-check the helper on a single object; the returned dict is displayed as
# the cell's output.
get_object_info("32838")

{'objectID': 32838,
 'accessionNumber': '23.21.10',
 'primaryImage': 'https://images.metmuseum.org/CRDImages/md/original/DP163290.jpg',
 'primaryImageSmall': 'https://images.metmuseum.org/CRDImages/md/web-large/DP163290.jpg',
 'objectName': 'Manuscript leaf cutting perhaps from a breviary',
 'title': 'Manuscript Leaf Cutting showing an Illuminated Initial R with St. Protasius and St. Gervasius',
 'artistDisplayName': 'Olivetan Master',
 'objectDate': 'mid-15th century',
 'medium': 'Tempera, gold, and ink on parchment',
 'classification': 'Manuscripts and Illuminations',
 'objectURL': 'https://www.metmuseum.org/art/collection/search/32838'}

In [None]:
def download_image(url: str, directory: str, filename: str):
    """Download `url` and save the response body as `directory/filename`.

    Args:
        url: Address of the file to fetch.
        directory: Folder to save into; created (including parents) when it
            does not exist yet.
        filename: Name of the file to write inside `directory`.

    Raises:
        requests.HTTPError: If the response has an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    # Bound the wait so a stalled connection cannot hang the caller.
    response = requests.get(url, timeout=30)
    # Check the status before touching the disk so a failed request never
    # leaves a file behind.
    response.raise_for_status()

    target_dir = Path(directory)
    # Create the folder on demand; without this the first download into a
    # fresh directory fails with FileNotFoundError.
    target_dir.mkdir(parents=True, exist_ok=True)
    filepath = target_dir / filename

    with open(filepath, "wb") as f:
        f.write(response.content)

In [51]:
# Fetch every object that is not yet in the CSV and append it as a new row.
# The file is read once up front and rows are appended as they arrive, so an
# interrupted run can be resumed without re-reading and rewriting the whole
# file for every object.
existing = pd.read_csv(csv_file)
known_ids = set(existing["objectID"].tolist())

# newline="" leaves line endings under the csv writer's control.
with open(csv_file, "a", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(
        f,
        # Reuse the file's own column order so appended rows line up with
        # the header that is already on disk.
        fieldnames=list(existing.columns),
        quotechar='"',
        quoting=csv.QUOTE_NONNUMERIC,
        lineterminator="\n",
    )

    for object_id in object_ids:
        # skip if the object is already in the csv file
        if object_id in known_ids:
            logging.info(f"Skipping object {object_id}")
            continue

        try:
            info = get_object_info(object_id)
            writer.writerow(info)
            # Flush per row so a crash loses at most the row in flight.
            f.flush()
        except Exception as e:
            logging.error(f"Error getting object {object_id}: {e}")
        else:
            # Report success only when the row was actually written.
            known_ids.add(object_id)
            logging.info(f"Created object {object_id}")

        # Short pause between requests to avoid hammering the API.
        time.sleep(0.05)

In [7]:
# Sanity-check the CSV: report any objectID that occurs more than once.
df = pd.read_csv(csv_file)

# keep=False flags every row of a repeated objectID, not only the later copies.
is_duplicate = df.duplicated(subset=["objectID"], keep=False)
duplicates = df[is_duplicate]

if len(duplicates) > 0:
    print(f"Found {len(duplicates)} duplicate rows")
    print(duplicates)

# Finish with the overall row count of the csv file.
print(f"Total rows: {len(df)}")

Total rows: 7136
