In [4]:
# coding: utf-8
import os, re, json, time, math
from urllib.parse import unquote
import requests

# MediaWiki Action API endpoint of the English-language Wikipedia.
WIKI_API = "https://en.wikipedia.org/w/api.php"
# Title of the article whose media files are listed and downloaded.
PAGE_TITLE = "Wolfgang_Amadeus_Mozart"   # English article
# Output directory, relative to the working directory; created up front so
# later cells can write into it without checking.
OUTDIR = "../../mozart_wikipedia_images"
os.makedirs(OUTDIR, exist_ok=True)

# Shared HTTP session: every request identifies this script via User-Agent
# and asks for JSON responses.
S = requests.Session()
S.headers["User-Agent"] = "mozart-wiki-downloader/1.0 (personal archiving script)"
S.headers["Accept"] = "application/json"

def slugify(text: str, max_len: int = 80) -> str:
    """Turn free text (possibly containing HTML) into a file-name-safe slug.

    The result is lowercase, uses ``-`` as the word separator and contains only
    ASCII letters, digits and dashes. Any other character (including accented
    letters) is dropped rather than transliterated.

    Args:
        text: Raw text; HTML tags and entities are removed first.
        max_len: Maximum length of the slug before edge dashes are trimmed.

    Returns:
        The slug, or ``"image"`` when nothing usable remains.
    """
    text = re.sub(r"<[^>]+>", " ", text)            # HTML tags -> space
    text = re.sub(r"&[^;\s]+;", " ", text)          # HTML entities (&amp; ...) -> space
    text = re.sub(r"[_\s]+", " ", text).strip()     # underscores / whitespace runs -> one space
    text = re.sub(r"[^A-Za-z0-9\-\s]+", "", text)   # keep ASCII alphanumerics, dashes, spaces
    text = re.sub(r"\s+", "-", text)
    # Trim dashes *before* applying the fallback: truncation can leave a
    # trailing dash, and dash-only input would otherwise yield an empty slug.
    slug = text[:max_len].strip("-").lower()
    return slug or "image"

def download(url: str, dest: str, session=S, chunk=65536):
    """Stream ``url`` to the file ``dest``.

    The body is written to ``dest + ".part"`` first and moved into place only
    after the whole transfer succeeded, so an interrupted or failed download
    never leaves a truncated file at ``dest``.

    Args:
        url: Address to fetch.
        dest: Destination file path.
        session: Object providing ``get``; defaults to the shared session ``S``.
        chunk: Number of bytes read per iteration.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
        OSError: If the file cannot be written or moved.
    """
    tmp = dest + ".part"
    try:
        # The context manager releases the streamed connection even when an
        # error interrupts the transfer.
        with session.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            with open(tmp, "wb") as f:
                for c in r.iter_content(chunk):
                    if c:
                        f.write(c)
        os.replace(tmp, dest)
    finally:
        # After a successful os.replace the temp file is gone; anything still
        # present here is a partial download that must not be kept.
        if os.path.exists(tmp):
            os.remove(tmp)


In [5]:
# Use action=parse to list images used on the article
params = {
    "action": "parse",
    "format": "json",
    "page": PAGE_TITLE,
    "prop": "images",
}
r = S.get(WIKI_API, params=params, timeout=60)
r.raise_for_status()
data = r.json()

# The API reports failures (e.g. a missing page) inside a regular HTTP 200
# body, so raise_for_status() alone would let them pass as "0 files found".
if "error" in data:
    raise RuntimeError(
        f"MediaWiki API error {data['error'].get('code')}: {data['error'].get('info')}"
    )

# The returned names are prefixed with the "File:" namespace unless they
# already start with it, so they can be used as page titles.
file_titles = [f"File:{t}" if not t.startswith("File:") else t
               for t in (data.get("parse", {}).get("images") or [])]

print(f"Found {len(file_titles)} files referenced on the article.")
file_titles[:10]


Found 33 files referenced on the article.


['File:Semi-protection-shackle.svg',
 'File:De-Wolfgang_Amadeus_Mozart.ogg',
 'File:Wolfgang_Amadeus_Mozart_(1756-1791)_-_Quaerite_primum_regnum_Dei_à4,_K.86_73v_(1770).ogg',
 'File:Wolfgang_Amadeus_Mozart_-_Symphony_40_g-moll_-_1._Molto_allegro.ogg',
 'File:Wolfgang_Amadeus_Mozart_-_Don_Giovanni_-_Overtüre.ogg',
 'File:Mozart_Portrait_Croce.jpg',
 'File:Wolfgang_Amadeus_Mozart_Signature.svg',
 'File:Wolfgang_Amadeus_Mozart_baptism_record.jpg',
 'File:Casa_natale_di_Mozart.jpg',
 'File:Louis_Carrogis_dit_Carmontelle_-_Portrait_de_Wolfgang_Amadeus_Mozart_(Salzbourg,_1756-Vienne,_1791)_jouant_à_Paris_avec_son_père_Jean..._-_Google_Art_Project.jpg']

In [6]:
# extmetadata keys requested from the imageinfo API and copied into each record.
EXTMETA_FIELDS = [
    "ImageDescription",
    "ObjectName",
    "Artist",
    "DateTimeOriginal",
    "Credit",
    "LicenseShortName",
    "LicenseUrl",
    "UsageTerms",
    "AttributionRequired",
    "Copyrighted",
]

def chunked(seq, n):
    """Yield consecutive slices of ``seq`` holding at most ``n`` items each.

    ``seq`` must support ``len()`` and slicing; the last slice may be shorter.
    """
    yield from (seq[start:start + n] for start in range(0, len(seq), n))

# Resolve every file title to its original URL plus selected metadata.
records = []
BATCH = 50  # number of titles sent per imageinfo request

for batch in chunked(file_titles, BATCH):
    params = {
        "action": "query",
        "format": "json",
        "prop": "imageinfo",
        "titles": "|".join(batch),
        "iiprop": "url|size|mime|sha1|timestamp|extmetadata",
        "iiextmetadatafilter": "|".join(EXTMETA_FIELDS),
        "iiextmetadatalanguage": "en"
    }
    seen_titles = set()   # guards against a page being reported twice across continuations
    continuation = {}
    while True:
        rr = S.get(WIKI_API, params={**params, **continuation}, timeout=90)
        rr.raise_for_status()
        payload = rr.json()
        # API-level failures arrive in an HTTP 200 body; surface them instead
        # of silently resolving nothing.
        if "error" in payload:
            raise RuntimeError(
                f"MediaWiki API error {payload['error'].get('code')}: {payload['error'].get('info')}"
            )
        q = payload.get("query", {}).get("pages", {})
        for page in q.values():
            title = page.get("title")
            ii = (page.get("imageinfo") or [{}])[0]
            # Pages without a resolvable original URL (or already recorded) are skipped.
            if not ii.get("url") or title in seen_titles:
                continue
            seen_titles.add(title)
            extmeta = ii.get("extmetadata") or {}
            rec = {
                "pageid": page.get("pageid"),
                "title": title,
                "original_url": ii.get("url"),
                "description_page": ii.get("descriptionurl"),
                "timestamp": ii.get("timestamp"),
                "mime": ii.get("mime"),
                "size": {"width": ii.get("width"), "height": ii.get("height")},
                "sha1": ii.get("sha1"),
                # Each extmetadata entry is a dict; only its "value" is kept.
                "extmetadata": {k: (extmeta.get(k, {}) or {}).get("value")
                                for k in EXTMETA_FIELDS}
            }
            records.append(rec)
        # A "continue" element means the response was truncated; its values
        # are sent back with the same query to fetch the remainder.
        continuation = payload.get("continue") or {}
        if not continuation:
            break
        time.sleep(0.2)  # be polite
    time.sleep(0.2)  # be polite

print(f"Resolved {len(records)} originals.")
records[:2]


Resolved 33 originals.


[{'pageid': None,
  'title': 'File:De-Wolfgang Amadeus Mozart.ogg',
  'original_url': 'https://upload.wikimedia.org/wikipedia/commons/9/90/De-Wolfgang_Amadeus_Mozart.ogg',
  'description_page': 'https://commons.wikimedia.org/wiki/File:De-Wolfgang_Amadeus_Mozart.ogg',
  'timestamp': '2021-07-01T18:23:23Z',
  'mime': 'application/ogg',
  'size': {'width': 0, 'height': 0},
  'sha1': '9f08c750e05f8e3d5f9ac2a451e5c5ee88122519',
  'extmetadata': {'ImageDescription': 'Pronunciation recording of German noun "<a href="https://en.wikipedia.org/wiki/Wolfgang_Amadeus_Mozart" class="extiw" title="w:Wolfgang Amadeus Mozart">Wolfgang Amadeus Mozart</a>", IPA: /ˌvɔlfɡaŋ amaˌdeːʊs ˈmoːt͡saʁt/. Male voice, recorded by native German speaker from Berlin, Germany.',
   'ObjectName': 'De-Wolfgang Amadeus Mozart',
   'Artist': '<a href="//commons.wikimedia.org/wiki/User:Jeuwre" title="User:Jeuwre">Jeuwre</a>',
   'DateTimeOriginal': '2017-08-02',
   'Credit': '<span class="int-own-work" lang="en">Own work</s

In [7]:
def best_description(meta: dict) -> str:
    """Return the first non-blank human-readable label found in ``meta``.

    ``ObjectName`` is preferred over ``ImageDescription``. Each candidate is
    stripped before it is tested, so a whitespace-only ``ObjectName`` no longer
    hides a usable ``ImageDescription``.

    Args:
        meta: Mapping of extmetadata field names to string values (or None).

    Returns:
        The stripped label, or ``""`` when neither field holds text.
    """
    for key in ("ObjectName", "ImageDescription"):
        candidate = (meta.get(key) or "").strip()
        if candidate:
            return candidate
    return ""

# Download every resolved file next to a JSON sidecar holding its record.
saved = 0
for i, rec in enumerate(records, 1):
    # Friendly label: metadata description when available, otherwise the
    # file title without its namespace prefix.
    label = best_description(rec["extmetadata"])
    if not label:
        label = rec["title"].replace("File:", "")
    slug = slugify(label, 70)

    # The extension is taken from the last path segment of the (decoded) URL.
    url = rec["original_url"]
    remote_name = unquote(url.rsplit("/", 1)[-1])
    ext = os.path.splitext(remote_name)[1]
    if not ext:
        ext = ".bin"

    base = f"mozart_{i:03d}_{slug}"
    img_path = os.path.join(OUTDIR, base + ext.lower())
    meta_path = os.path.join(OUTDIR, base + ".json")

    # Only work on entries where the media file or its sidecar is missing.
    if not os.path.exists(img_path) or not os.path.exists(meta_path):
        try:
            download(url, img_path)
            with open(meta_path, "w", encoding="utf-8") as fh:
                json.dump(rec, fh, ensure_ascii=False, indent=2)
            saved += 1
            print(f"[{i}/{len(records)}] Saved {os.path.basename(img_path)}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(f"[{i}/{len(records)}] ERROR {url}: {e}")

print(f"Done. Saved {saved} files into {os.path.abspath(OUTDIR)}")


[1/33] Saved mozart_001_de-wolfgang-amadeus-mozart.ogg
[2/33] Saved mozart_002_wolfgang-amadeus-mozart-1756-1791---quaerite-primum-regnum-dei-4-k86-7.ogg
[3/33] Saved mozart_003_wolfgang-amadeus-mozart---symphony-40-g-moll---1-molto-allegro.ogg
[4/33] Saved mozart_004_wolfgang-amadeus-mozart---don-giovanni---overtre.ogg
[5/33] Saved mozart_005_mozart-family-label-qslfrportrait-de-la-famille-mozart-label-qslenmoza.jpg
[6/33] Saved mozart_006_wolfgang-amadeus-mozart-signature.svg
[7/33] Saved mozart_007_wolfgang-amadeus-mozart-baptism-record.jpg
[8/33] Saved mozart_008_casa-natale-di-mozart.jpg
[9/33] Saved mozart_009_portrait-de-wolfgang-amadeus-mozart-salzbourg-1756-vienne-1791-jouant.jpg
[10/33] Saved mozart_010_portrait-of-wolfgang-amadeus-mozart-at-the-age-of-13-in-verona-label-q.jpg
[11/33] Saved mozart_011_mozarts-old-home.jpg
[12/33] Saved mozart_012_constanze-mozart-by-lange-1782.jpg
[13/33] Saved mozart_013_mozartv-klavr-1.jpg
[14/33] Saved mozart_014_portrait-of-wolfgang-amade