# Wikidata: The state of the art (CHARTS)

This notebook consolidates all Wikidata processing (SPARQL queries, entity fetching, and aggregation) needed to generate the charts.

In [None]:
!pip3 install SPARQLWrapper

In [None]:
import os
import json
import time
import hashlib
import urllib.request
from collections import Counter, defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from SPARQLWrapper import SPARQLWrapper, JSON
from datetime import datetime

# =============================================================================
# CONFIGURATION
# =============================================================================

# Directory holding cached SPARQL results and per-entity JSON files
CACHE_DIR = "wikidata_cache"
# Directory the Vega-Lite chart specifications are written to
VIS_DIR = "vis/"
# Both directories are created up front so later cells can write without checks
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(VIS_DIR, exist_ok=True)

# SPARQL endpoint, User-Agent sent with every request, and the per-entity
# JSON URL template ({} is replaced with the Q-ID)
ENDPOINT_URL = "https://query.wikidata.org/sparql"
USER_AGENT = "Wikidata: State of the Art; Marian Dörk"
ENTITY_URL_TEMPLATE = "https://www.wikidata.org/wiki/Special:EntityData/{}.json"

# Rate limiting (values are seconds; used for the HTTP 429 back-off)
CURRENT_DELAY = 0.1
# NOTE(review): MIN_DELAY is not referenced in the cells visible here — confirm it is still needed
MIN_DELAY = 0.1
MAX_DELAY = 60.0

# Parallel fetching
MAX_WORKERS = 10  # Concurrent HTTP requests

def timestamp():
    """Return the current local time as an ``HH:MM:SS`` string (used as a log prefix)."""
    now = datetime.now()
    return f"{now:%H:%M:%S}"

def format_time_remaining(seconds):
    """Format a duration in seconds as a short human-readable string.

    Returns ``"<n>s"`` below one minute, ``"<n>m"`` below one hour, and
    ``"<h>h <m>m"`` otherwise.
    """
    if seconds < 60:
        return f"{seconds:.0f}s"
    if seconds < 3600:
        return f"{seconds / 60:.0f}m"

    # divmod yields the same quotient/remainder as the // and % operators
    whole_hours, leftover = divmod(seconds, 3600)
    return f"{int(whole_hours)}h {int(leftover // 60)}m"

In [None]:
# =============================================================================
# SPARQL UTILITIES
# =============================================================================

def query_hash(query_template: str) -> str:
    """Return the first 12 hex characters of the SHA-1 digest of the UTF-8 encoded query."""
    digest = hashlib.sha1(query_template.encode("utf-8")).hexdigest()
    return digest[:12]

def get_sparql(query, max_retries=3):
    """Execute a SPARQL query with disk caching and retry logic.

    The result is cached in CACHE_DIR under a file named after the query
    hash; a cached result is returned without contacting the endpoint.

    Args:
        query: SPARQL query text.
        max_retries: Maximum number of attempts against the endpoint (>= 1).

    Returns:
        The parsed JSON result of the query.

    Raises:
        ValueError: If the query is not cached and ``max_retries`` is < 1.
        Exception: The last query error once all attempts are exhausted, or
            any error raised while writing the cache file.
    """
    qhash = query_hash(query)
    cache_file = os.path.join(CACHE_DIR, f"{qhash}.json")

    if os.path.exists(cache_file):
        with open(cache_file, "r") as f:
            print(f"[cache] {qhash}")
            return json.load(f)

    if max_retries < 1:
        # Previously the loop was skipped and None was returned silently
        raise ValueError("max_retries must be at least 1")

    sparql = SPARQLWrapper(ENDPOINT_URL, agent=USER_AGENT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    for attempt in range(max_retries):
        # Only the network call is retried; a failing cache write must not
        # trigger another (expensive) query.
        try:
            print(f"[query] {qhash}...")
            result = sparql.query().convert()
        except Exception as e:
            if attempt >= max_retries - 1:
                raise  # bare raise keeps the original traceback
            wait = (attempt + 1) * 5
            print(f"[retry] Attempt {attempt + 1} failed: {e}")
            time.sleep(wait)
            continue

        # Write to a temporary file and rename it into place so an interrupted
        # write can never leave a truncated JSON file behind in the cache.
        tmp_file = f"{cache_file}.tmp"
        with open(tmp_file, "w") as f:
            json.dump(result, f)
        os.replace(tmp_file, cache_file)

        print(f"[saved] {qhash}")
        time.sleep(2)  # pause after a live query before the caller issues the next one
        return result

def get_count(query):
    """Run a SPARQL query and return the ``count`` value of its first result row as an int."""
    rows = get_sparql(query)["results"]["bindings"]
    first_row = rows[0]
    return int(first_row["count"]["value"])

def get_bindings(query):
    """Run a SPARQL query and return its list of result bindings."""
    response = get_sparql(query)
    result_section = response["results"]
    return result_section["bindings"]

In [None]:
# =============================================================================
# ENTITY FETCHING WITH PARALLEL SUPPORT
# =============================================================================

# In-memory entity cache for fast repeated access
ENTITY_CACHE = {}

def get_entity_from_disk(qid):
    """Load an entity from the disk cache only.

    Returns the decoded JSON content of ``<CACHE_DIR>/<qid>.json``, or None
    when that file does not exist. A cached JSON ``null`` also decodes to
    None, so callers that must tell the two apart check the file themselves.
    """
    path = os.path.join(CACHE_DIR, f"{qid}.json")
    if not os.path.exists(path):
        return None
    with open(path, "r") as handle:
        return json.load(handle)

def fetch_entity_http(qid):
    """Fetch a single entity over HTTP (for parallel fetching).

    The outcome is written to ``<CACHE_DIR>/<qid>.json``. Entities that are
    absent from the response, flagged ``missing``, or answered with HTTP 404
    are cached as JSON ``null`` so they are not requested again.

    Up to five attempts are made. HTTP 429 backs off exponentially (capped
    at MAX_DELAY); any other HTTP error or exception waits one second
    between attempts.

    Args:
        qid: Entity ID to fetch.

    Returns:
        A ``(qid, entity)`` tuple; ``entity`` is None when the entity does not
        exist or every attempt failed. Failed attempts are not cached.
    """
    cache_file = os.path.join(CACHE_DIR, f"{qid}.json")
    url = ENTITY_URL_TEMPLATE.format(qid)

    def cache_and_return(value):
        # Persist the outcome (None is stored as JSON null) and hand it back
        with open(cache_file, "w") as f:
            json.dump(value, f)
        return qid, value

    max_retries = 5
    for attempt in range(max_retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
            with urllib.request.urlopen(req, timeout=30) as response:
                data = json.loads(response.read().decode("utf-8"))

            if "entities" not in data or qid not in data["entities"]:
                return cache_and_return(None)

            entity = data["entities"][qid]
            if "missing" in entity:
                return cache_and_return(None)

            return cache_and_return(entity)

        except urllib.error.HTTPError as e:
            if e.code == 404:
                return cache_and_return(None)
            if e.code == 429:
                # Rate limited: exponential back-off, capped at MAX_DELAY
                time.sleep(min(MAX_DELAY, CURRENT_DELAY * (2 ** attempt)))
            elif attempt < max_retries - 1:
                # Other HTTP errors (e.g. 5xx) used to retry with no pause;
                # wait briefly like the generic error path does.
                time.sleep(1)
        except Exception:
            if attempt < max_retries - 1:
                time.sleep(1)
                continue
            return qid, None

    return qid, None

def get_entity(qid):
    """Return an entity, trying the memory cache, then the disk cache, then HTTP.

    Whatever is found (including None for a cached "missing" entity or a
    failed fetch) is stored in ENTITY_CACHE before being returned.
    """
    # 1) In-memory cache
    try:
        return ENTITY_CACHE[qid]
    except KeyError:
        pass

    # 2) Disk cache: a None result only counts as a hit when the file exists
    entity = get_entity_from_disk(qid)
    cached_on_disk = entity is not None or os.path.exists(
        os.path.join(CACHE_DIR, f"{qid}.json")
    )

    # 3) Network
    if not cached_on_disk:
        _, entity = fetch_entity_http(qid)

    ENTITY_CACHE[qid] = entity
    return entity

def batch_fetch_entities(qids, desc="entities", max_workers=MAX_WORKERS):
    """
    Fetch multiple entities in parallel.

    Entities already in ENTITY_CACHE or in the disk cache are reused; the
    rest are fetched with a thread pool. Every result is stored in
    ENTITY_CACHE.

    Args:
        qids: Iterable of entity IDs; duplicates are fetched only once.
        desc: Noun used in progress messages.
        max_workers: Number of concurrent fetch threads.

    Returns dict of qid -> entity (None for missing or failed entities).
    """
    results = {}
    to_fetch = []

    # De-duplicate (keeping first-seen order) so a repeated Q-ID is never
    # submitted to the pool more than once.
    unique_qids = list(dict.fromkeys(qids))

    # Check what we already have
    for qid in unique_qids:
        if qid in ENTITY_CACHE:
            results[qid] = ENTITY_CACHE[qid]
            continue

        entity = get_entity_from_disk(qid)
        # A None entity counts as cached only if its cache file exists
        if entity is not None or os.path.exists(os.path.join(CACHE_DIR, f"{qid}.json")):
            ENTITY_CACHE[qid] = entity
            results[qid] = entity
        else:
            to_fetch.append(qid)

    if not to_fetch:
        print(f"[{timestamp()}] All {len(unique_qids):,} {desc} already cached")
        return results

    print(f"[{timestamp()}] Fetching {len(to_fetch):,} {desc} ({len(results):,} cached)...")

    start_time = time.time()
    fetched = 0
    errors = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fetch_entity_http, qid): qid for qid in to_fetch}

        for future in as_completed(futures):
            try:
                qid, entity = future.result()
            except Exception as exc:
                # One failing worker must not abort the whole batch; record
                # the entity as unavailable and keep going.
                qid, entity = futures[future], None
                print(f"[{timestamp()}] [error] {qid}: {exc}")

            ENTITY_CACHE[qid] = entity
            results[qid] = entity

            # None covers both non-existent entities and failed fetches
            if entity is None:
                errors += 1

            fetched += 1
            if fetched % 100 == 0 or fetched == len(to_fetch):
                elapsed = time.time() - start_time
                rate = fetched / elapsed if elapsed > 0 else 0
                remaining = (len(to_fetch) - fetched) / rate if rate > 0 else 0
                print(f"[{timestamp()}] [fetch] {fetched:,}/{len(to_fetch):,} @ {rate:.1f}/s | ~{format_time_remaining(remaining)} remaining")

    elapsed = time.time() - start_time
    print(f"[{timestamp()}] [done] Fetched {len(to_fetch):,} {desc} in {elapsed/60:.1f} minutes ({errors:,} errors)")

    return results

In [None]:
# =============================================================================
# ENTITY DATA EXTRACTION UTILITIES
# =============================================================================

def get_label(qid, lang="en"):
    """Return a display label for a Q-ID.

    Preference order: the requested language, English, then the first label
    available. Falls back to the Q-ID itself when the entity is unavailable
    or has no labels.
    """
    entity = get_entity(qid)
    if entity is None:
        return qid

    labels = entity.get("labels", {})
    for language_code in (lang, "en"):
        if language_code in labels:
            return labels[language_code]["value"]

    if labels:
        first_label = next(iter(labels.values()))
        return first_label["value"]
    return qid

def get_claim_values(entity, property_id):
    """Return the Q-IDs referenced by an entity's claims for one property.

    Only claims whose main value has type ``wikibase-entityid`` contribute;
    a None entity yields an empty list.
    """
    if entity is None:
        return []

    datavalues = (
        statement.get("mainsnak", {}).get("datavalue", {})
        for statement in entity.get("claims", {}).get(property_id, [])
    )
    return [
        datavalue["value"]["id"]
        for datavalue in datavalues
        if datavalue.get("type") == "wikibase-entityid"
    ]

def get_claim_time_values(entity, property_id):
    """Return the raw time strings of an entity's claims for one property.

    Only claims whose main value has type ``time`` contribute; a None entity
    yields an empty list.
    """
    if entity is None:
        return []

    times = []
    for statement in entity.get("claims", {}).get(property_id, []):
        datavalue = statement.get("mainsnak", {}).get("datavalue", {})
        if datavalue.get("type") != "time":
            continue
        times.append(datavalue["value"]["time"])
    return times

def get_creation_year(entity):
    """Extract creation year from P571 (inception) or P577 (publication date).

    The first claim carrying a parseable time value wins, with P571 checked
    before P577. The leading sign of the time string is honoured, so BCE
    dates come back as negative years, and the year may have any number of
    digits.

    Args:
        entity: Entity dict, or None.

    Returns:
        The year as an int, or None when the entity is None or has no usable
        time value.
    """
    if entity is None:
        return None

    claims_by_property = entity.get("claims", {})
    for prop in ("P571", "P577"):
        for claim in claims_by_property.get(prop, []):
            value = claim.get("mainsnak", {}).get("datavalue", {}).get("value")
            time_value = value.get("time") if isinstance(value, dict) else None
            if not isinstance(time_value, str) or not time_value:
                continue

            # Time strings look like "+1503-00-00T00:00:00Z": a sign, the
            # year digits, then "-MM-DD...". Anything unsigned is skipped.
            sign = time_value[0]
            if sign not in "+-":
                continue
            try:
                year = int(time_value[1:].split("-", 1)[0])
            except ValueError:
                continue
            return -year if sign == "-" else year
    return None

def parse_date(time_str):
    """Parse Wikidata time format to (year, month, day) tuple.

    Expected format: ``+1503-00-00T00:00:00Z`` or ``+1503-01-01T00:00:00Z``.
    A month or day of ``00`` is mapped to 1. The year may have any number of
    digits and is negative when the string starts with ``-``.

    Args:
        time_str: Time string starting with ``+`` or ``-``.

    Returns:
        ``(year, month, day)`` as ints, or None when the input is not a
        string in the expected format.
    """
    try:
        sign_char = time_str[0]
        if sign_char not in ("+", "-"):
            # An unsigned string is malformed; do not guess a sign for it
            return None
        sign = 1 if sign_char == "+" else -1

        # Split on the first two dashes after the sign: year, month, rest
        year_part, month_part, rest = time_str[1:].split("-", 2)
        year = int(year_part) * sign
        month = int(month_part) or 1
        day = int(rest[:2]) or 1
        return (year, month, day)
    except (TypeError, ValueError, IndexError, AttributeError):
        # Non-string input, empty string, or non-numeric components
        return None

In [None]:
# =============================================================================
# STEP 1: GET PAINTING QIDs FROM SPARQL
# =============================================================================

print(f"[{timestamp()}] Fetching painting Q-IDs from SPARQL...")

# Total paintings count
# NOTE: the count query is kept unchanged (so its cached result stays valid);
# it counts result rows, which may include repeated paintings.
query_total = """
SELECT (COUNT(?painting) AS ?count)
WHERE {
  ?painting wdt:P31/wdt:P279* wd:Q3305213.
}
"""
total_paintings = get_count(query_total)
print(f"[{timestamp()}] Total paintings: {total_paintings:,}")

# Get all painting QIDs
query_all_paintings = """
SELECT ?painting WHERE {
  ?painting wdt:P31/wdt:P279* wd:Q3305213.
}
"""
bindings = get_bindings(query_all_paintings)

# The query has no DISTINCT, so a painting reachable through more than one
# "instance of" value can appear in several rows. De-duplicate here (keeping
# first-seen order) so later scans count every painting exactly once.
painting_qids = list(dict.fromkeys(
    b["painting"]["value"].split("/")[-1] for b in bindings
))
duplicate_rows = len(bindings) - len(painting_qids)
print(f"[{timestamp()}] Loaded {len(painting_qids):,} painting QIDs")
if duplicate_rows:
    print(f"[{timestamp()}] Dropped {duplicate_rows:,} duplicate result rows")

In [None]:
# =============================================================================
# STEP 2: BATCH FETCH ALL PAINTING ENTITIES
# =============================================================================

# Banner announcing the painting-entity loading phase
print()
for banner_line in ("=" * 60, "BATCH LOADING PAINTING ENTITIES", "=" * 60):
    print(f"[{timestamp()}] {banner_line}")

# qid -> entity for every painting (None where the entity is unavailable)
painting_entities = batch_fetch_entities(painting_qids, desc="paintings")

In [None]:
# =============================================================================
# STEP 3: SINGLE-PASS PAINTING SCAN
# Extract ALL painting properties in ONE loop!
# =============================================================================

# Walk every painting entity once and collect, in a single loop, all
# per-property tallies that the later chart cells read.
print(f"\n[{timestamp()}] {'='*60}")
print(f"[{timestamp()}] SINGLE-PASS PAINTING SCAN")
print(f"[{timestamp()}] {'='*60}")

# Initialize counters and collections
movement_counts = Counter()        # P135: movement
material_counts = Counter()        # P186: material used
genre_counts = Counter()           # P136: genre
collection_counts = Counter()      # P195: collection

# Sets of painting Q-IDs that have at least one value for the given property
paintings_with_creator = set()
paintings_with_movement = set()
paintings_with_material = set()
paintings_with_genre = set()
paintings_with_collection = set()
paintings_with_year = set()

all_creator_qids = set()           # Collect all unique creators
creator_painting_count = Counter() # How many paintings per creator

# For timeline: painting year + movement pairs
painting_year_movement = []        # List of (year, movement_qid) tuples
painting_years = []                # All painting years for histogram
decade_counts = Counter()          # For inception chart

# For genre timelines
TIMELINE_GENRES = {
    "Q134307": "portrait",      # portrait
    "Q2864737": "religious",    # religious art
    "Q191163": "landscape",     # landscape painting
}
# genre name -> Counter mapping decade -> number of paintings
genre_decade_counts = {genre: Counter() for genre in TIMELINE_GENRES.values()}

start_time = time.time()

for i, qid in enumerate(painting_qids):
    # Progress report every 100,000 paintings
    if (i + 1) % 100000 == 0:
        elapsed = time.time() - start_time
        rate = (i + 1) / elapsed
        remaining = (len(painting_qids) - i - 1) / rate
        print(f"[{timestamp()}] [scan] {i + 1:,}/{len(painting_qids):,} @ {rate:.0f}/s | ~{format_time_remaining(remaining)}")
    
    # Paintings whose entity is None (or absent from the dict) are skipped
    entity = painting_entities.get(qid)
    if entity is None:
        continue
    
    # --- MOVEMENTS (P135) ---
    movements = get_claim_values(entity, "P135")
    if movements:
        paintings_with_movement.add(qid)
        for m in movements:
            movement_counts[m] += 1
    
    # --- MATERIALS (P186) ---
    materials = get_claim_values(entity, "P186")
    if materials:
        paintings_with_material.add(qid)
        for m in materials:
            material_counts[m] += 1
    
    # --- GENRES (P136) ---
    genres = get_claim_values(entity, "P136")
    if genres:
        paintings_with_genre.add(qid)
        for g in genres:
            genre_counts[g] += 1
    
    # --- COLLECTIONS (P195) ---
    collections = get_claim_values(entity, "P195")
    if collections:
        paintings_with_collection.add(qid)
        for c in collections:
            collection_counts[c] += 1
    
    # --- CREATORS (P170) ---
    creators = get_claim_values(entity, "P170")
    for c in creators:
        all_creator_qids.add(c)
        creator_painting_count[c] += 1
    if creators:
        paintings_with_creator.add(qid)

    # --- CREATION YEAR (P571/P577) ---
    # Only years within 1000-2030 are kept; anything else is ignored
    year = get_creation_year(entity)
    if year is not None and 1000 <= year <= 2030:
        paintings_with_year.add(qid)
        painting_years.append(year)
        
        # Decade for inception chart
        decade = (year // 10) * 10
        decade_counts[decade] += 1
        
        # Store year + movements for timeline
        # (movements is the P135 list extracted earlier in this iteration)
        for m in movements:
            painting_year_movement.append((year, m))
        
        # Store year + genres for genre timeline
        for genre_qid, genre_name in TIMELINE_GENRES.items():
            if genre_qid in genres:
                genre_decade_counts[genre_name][decade] += 1

# Summary of coverage per property
elapsed = time.time() - start_time
print(f"\n[{timestamp()}] PAINTING SCAN COMPLETE in {elapsed/60:.1f} minutes")
print(f"[{timestamp()}] Unique creators found: {len(all_creator_qids):,}")
print(f"[{timestamp()}] Paintings with movement: {len(paintings_with_movement):,}")
print(f"[{timestamp()}] Paintings with material: {len(paintings_with_material):,}")
print(f"[{timestamp()}] Paintings with genre: {len(paintings_with_genre):,}")
print(f"[{timestamp()}] Paintings with collection: {len(paintings_with_collection):,}")
print(f"[{timestamp()}] Paintings with year: {len(paintings_with_year):,}")
print(f"[{timestamp()}] Paintings with creator: {len(paintings_with_creator):,}")

In [None]:
# Re-prints the creator-coverage figure that the painting scan already reports in its summary
print(f"[{timestamp()}] Paintings with creator: {len(paintings_with_creator):,}")

In [None]:
# =============================================================================
# STEP 4: BATCH FETCH ALL CREATOR ENTITIES
# =============================================================================

# Banner announcing the creator-entity loading phase
print()
for banner_line in ("=" * 60, "BATCH LOADING CREATOR ENTITIES", "=" * 60):
    print(f"[{timestamp()}] {banner_line}")

# Later cells enumerate the creators, so the set becomes a list from here on
all_creator_qids = list(all_creator_qids)  # Convert to list for indexing

# qid -> entity for every creator (None where the entity is unavailable)
creator_entities = batch_fetch_entities(all_creator_qids, desc="creators")

In [None]:
# =============================================================================
# STEP 5: SINGLE-PASS CREATOR SCAN
# Extract ALL creator properties in ONE loop!
# =============================================================================

# Set up the accumulators filled by the creator scan loop further down in this cell.
print(f"\n[{timestamp()}] {'='*60}")
print(f"[{timestamp()}] SINGLE-PASS CREATOR SCAN")
print(f"[{timestamp()}] {'='*60}")

# Initialize counters
gender_counts = Counter()          # P21: sex or gender
birthplace_counts = Counter()      # P19: place of birth
nationality_counts = Counter()     # P27: country of citizenship

# Number of creators having at least one value for the property
creators_with_gender = 0
creators_with_birthplace = 0
creators_with_nationality = 0

# For movement inheritance
creator_movements = {}             # creator_qid -> list of movement qids

# For country/continent analysis
creator_birthplaces = {}           # creator_qid -> birthplace_qid
birthplace_qids = set()            # All unique birthplaces

# For lifespan analysis
artist_data = []                   # List of dicts with birth/death/lifespan info
missing_birth = 0                  # Creators without a parseable birth date
missing_death = 0                  # Creators without a parseable death date

def calculate_lifespan(birth_date, death_date):
    """Approximate a lifespan in years from two (year, month, day) tuples.

    Uses 365.25 days per year and 30.44 days per month. Returns the result
    rounded to two decimals, or None when either date is None.
    """
    if birth_date is None or death_date is None:
        return None

    def approx_days(date_tuple):
        # Convert a (year, month, day) tuple to an approximate day count
        year, month, day = date_tuple
        return year * 365.25 + (month - 1) * 30.44 + day

    birth_days = approx_days(birth_date)
    death_days = approx_days(death_date)
    return round((death_days - birth_days) / 365.25, 2)

# Walk every creator entity once and fill the counters initialised earlier in this cell.
start_time = time.time()

for i, creator_qid in enumerate(all_creator_qids):
    # Progress report every 10,000 creators
    if (i + 1) % 10000 == 0:
        elapsed = time.time() - start_time
        rate = (i + 1) / elapsed
        remaining = (len(all_creator_qids) - i - 1) / rate
        print(f"[{timestamp()}] [scan] {i + 1:,}/{len(all_creator_qids):,} @ {rate:.0f}/s | ~{format_time_remaining(remaining)}")
    
    # Creators whose entity is None (or absent from the dict) are skipped
    entity = creator_entities.get(creator_qid)
    if entity is None:
        continue
    
    # --- GENDER (P21) ---
    genders = get_claim_values(entity, "P21")
    if genders:
        creators_with_gender += 1
        for g in genders:
            gender_counts[g] += 1
    
    # --- BIRTHPLACE (P19) ---
    # Only the first birthplace is kept for the country/continent analysis,
    # while birthplace_counts tallies every listed birthplace.
    birthplaces = get_claim_values(entity, "P19")
    if birthplaces:
        creators_with_birthplace += 1
        creator_birthplaces[creator_qid] = birthplaces[0]
        birthplace_qids.add(birthplaces[0])
        for b in birthplaces:
            birthplace_counts[b] += 1
    
    # --- NATIONALITY (P27) ---
    nationalities = get_claim_values(entity, "P27")
    if nationalities:
        creators_with_nationality += 1
        for n in nationalities:
            nationality_counts[n] += 1
    
    # --- MOVEMENTS (P135) for inheritance ---
    movements = get_claim_values(entity, "P135")
    if movements:
        creator_movements[creator_qid] = movements
    
    # --- BIRTH/DEATH DATES for lifespan ---
    # Only the first P569 / P570 value is used
    birth_dates = get_claim_time_values(entity, "P569")
    death_dates = get_claim_time_values(entity, "P570")
    
    birth_date = parse_date(birth_dates[0]) if birth_dates else None
    death_date = parse_date(death_dates[0]) if death_dates else None
    
    if birth_date is None:
        missing_birth += 1
    if death_date is None:
        missing_death += 1
    
    lifespan = calculate_lifespan(birth_date, death_date)
    
    # Lifespans outside 10-120 years are discarded
    if lifespan is not None and 10 <= lifespan <= 120:
        notable_works = get_claim_values(entity, "P800")
        works_in_dataset = creator_painting_count.get(creator_qid, 0)
        
        artist_data.append({
            "qid": creator_qid,
            "birth_year": birth_date[0],
            "death_year": death_date[0],
            "lifespan": lifespan,
            "notable_works": len(notable_works) if notable_works else 0,
            "works_in_dataset": works_in_dataset
        })

# Summary of coverage per property
elapsed = time.time() - start_time
print(f"\n[{timestamp()}] CREATOR SCAN COMPLETE in {elapsed/60:.1f} minutes")
print(f"[{timestamp()}] Creators with gender: {creators_with_gender:,}")
print(f"[{timestamp()}] Creators with birthplace: {creators_with_birthplace:,}")
print(f"[{timestamp()}] Creators with nationality: {creators_with_nationality:,}")
print(f"[{timestamp()}] Creators with movements: {len(creator_movements):,}")
print(f"[{timestamp()}] Artists with lifespan data: {len(artist_data):,}")
print(f"[{timestamp()}] Unique birthplaces: {len(birthplace_qids):,}")

In [None]:
# =============================================================================
# STEP 6: RESOLVE BIRTHPLACES TO COUNTRIES AND CONTINENTS
# =============================================================================

# Resolve each birthplace to a country (P17) and each country to a continent (P30).
print(f"\n[{timestamp()}] {'='*60}")
print(f"[{timestamp()}] RESOLVING BIRTHPLACES TO COUNTRIES")
print(f"[{timestamp()}] {'='*60}")

# Fetch birthplace entities
birthplace_entities = batch_fetch_entities(list(birthplace_qids), desc="birthplaces")

# Resolve birthplace -> country
birthplace_to_country = {}
country_qids = set()

for bp_qid in birthplace_qids:
    bp_entity = birthplace_entities.get(bp_qid)
    if bp_entity:
        countries = get_claim_values(bp_entity, "P17")  # P17 = country
        if countries:
            # Only the first listed country is used
            country_qid = countries[0]
            birthplace_to_country[bp_qid] = country_qid
            country_qids.add(country_qid)

print(f"[{timestamp()}] Birthplaces with country: {len(birthplace_to_country):,}")
print(f"[{timestamp()}] Unique countries: {len(country_qids):,}")

# Fetch country entities to get continents
country_entities = batch_fetch_entities(list(country_qids), desc="countries")

# Resolve country -> continent
country_to_continent = {}
for country_qid in country_qids:
    country_entity = country_entities.get(country_qid)
    if country_entity:
        continents = get_claim_values(country_entity, "P30")  # P30 = continent
        if continents:
            # Only the first listed continent is used
            country_to_continent[country_qid] = continents[0]

print(f"[{timestamp()}] Countries with continent: {len(country_to_continent):,}")

In [None]:
# =============================================================================
# STEP 7: COUNT CREATORS BY COUNTRY AND CONTINENT
# =============================================================================

# Tally creators per birth country and per continent using the lookups built above.
print(f"\n[{timestamp()}] Counting creators per country and continent...")

country_counts = Counter()
continent_counts = Counter()
creators_with_country = 0
no_country_count = 0

for creator_qid, birthplace_qid in creator_birthplaces.items():
    country_qid = birthplace_to_country.get(birthplace_qid)
    if country_qid:
        country_counts[country_qid] += 1
        creators_with_country += 1
        
        # Also count by continent
        # (creators whose country has no continent are counted by country only)
        continent_qid = country_to_continent.get(country_qid)
        if continent_qid:
            continent_counts[continent_qid] += 1
    else:
        # Birthplace known, but it could not be resolved to a country
        no_country_count += 1

# Add creators without any birthplace
no_country_count += len(all_creator_qids) - len(creator_birthplaces)

print(f"[{timestamp()}] Creators with country: {creators_with_country:,}")
print(f"[{timestamp()}] Creators without country: {no_country_count:,}")

In [None]:
# =============================================================================
# STEP 8: SECOND PAINTING PASS - Add inherited movements
# =============================================================================

# Second pass over the paintings: combine each painting's own movements (P135)
# with the movements of its creators (P170) and rebuild the movement tallies.
print(f"\n[{timestamp()}] {'='*60}")
print(f"[{timestamp()}] ADDING INHERITED MOVEMENTS")
print(f"[{timestamp()}] {'='*60}")

# Rebuild movement counts including creator movements
movement_counts_with_inheritance = Counter()
paintings_with_movement_inherited = set()
painting_year_movement_inherited = []   # List of (year, movement_qid) tuples

start_time = time.time()

for i, qid in enumerate(painting_qids):
    # Progress report every 100,000 paintings
    if (i + 1) % 100000 == 0:
        elapsed = time.time() - start_time
        rate = (i + 1) / elapsed
        print(f"[{timestamp()}] [inherit] {i + 1:,}/{len(painting_qids):,} @ {rate:.0f}/s")
    
    entity = painting_entities.get(qid)
    if entity is None:
        continue
    
    # Direct movements
    # (a set, so a movement shared by painting and creator is counted once)
    all_movements = set(get_claim_values(entity, "P135"))
    
    # Add creator movements
    for creator_qid in get_claim_values(entity, "P170"):
        if creator_qid in creator_movements:
            all_movements.update(creator_movements[creator_qid])
    
    if all_movements:
        paintings_with_movement_inherited.add(qid)
        for m in all_movements:
            movement_counts_with_inheritance[m] += 1
        
        # For timeline
        # Only years within 1000-2030 are kept
        year = get_creation_year(entity)
        if year is not None and 1000 <= year <= 2030:
            for m in all_movements:
                painting_year_movement_inherited.append((year, m))

print(f"[{timestamp()}] Paintings with movement (with inheritance): {len(paintings_with_movement_inherited):,}")

In [None]:
# =============================================================================
# VISUALIZATION GENERATION UTILITIES
# =============================================================================

def clean_movement_label(label):
    """Strip a trailing " painting" (case-insensitive) from a label and title-case it."""
    suffix = " painting"
    has_suffix = label.lower().endswith(suffix)
    trimmed = label[:-len(suffix)] if has_suffix else label
    return trimmed.title()

def create_bar_chart(data, field_name, output_name, title=""):
    """Build a horizontal Vega-Lite bar chart spec and write it to VIS_DIR.

    Args:
        data: List of row dicts, each holding ``field_name`` and ``"count"``.
            Bars are drawn in the order the rows are given.
        field_name: Key of the category shown on the y axis.
        output_name: File stem; the spec is saved as ``<output_name>.json``.
        title: Accepted for compatibility; not used by the spec.

    Returns:
        The spec dict that was written.
    """
    # Explicit sort order keeps the bars in input order
    category_axis = {
        "field": field_name,
        "type": "nominal",
        "sort": [row[field_name] for row in data],
        "title": "",
    }
    value_axis = {"field": "count", "type": "quantitative", "title": ""}

    spec = {
        "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
        "data": {"values": data},
        "mark": {"type": "bar", "color": "#999999"},
        "encoding": {"y": category_axis, "x": value_axis},
        "width": 300,
        "height": 400,
    }

    output_path = os.path.join(VIS_DIR, f"{output_name}.json")
    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(spec, out_file, ensure_ascii=False, indent=2)

    print(f"[{timestamp()}] Output: {output_path}")
    return spec

def create_chart_from_counts(counts, field_name, output_name, top_n=20, label_transform=None):
    """Turn a Counter of Q-IDs into a labelled top-N bar chart.

    Labels are looked up for the ``top_n + 10`` most common Q-IDs, entries
    whose labels match case-insensitively are merged, the ``top_n`` largest
    are kept, and everything else in ``counts`` is summed into an "Other"
    bar.

    Args:
        counts: Counter mapping Q-ID -> count.
        field_name: Key used for the label in each chart row.
        output_name: File stem passed on to ``create_bar_chart``.
        top_n: Number of bars to keep before "Other".
        label_transform: Optional callable applied to each label.

    Returns:
        The spec returned by ``create_bar_chart``.
    """
    print(f"\n[{timestamp()}] Creating {output_name} chart...")

    # Resolve labels for a few more entries than needed so that merging
    # duplicates still leaves top_n bars
    labelled = []
    for qid, count in counts.most_common(top_n + 10):
        label = get_label(qid)
        if label_transform:
            label = label_transform(label)
        labelled.append((label, count))
        print(f"[{timestamp()}]   {qid} -> {label}: {count:,}")

    # Merge entries whose labels differ only by case; the first spelling wins
    merged = {}
    for label, count in labelled:
        key = label.lower()
        row = merged.get(key)
        if row is None:
            merged[key] = {field_name: label, "count": count}
        else:
            row["count"] += count

    ranked = sorted(merged.values(), key=lambda row: row["count"], reverse=True)
    final_data = ranked[:top_n]

    # Everything not shown as its own bar goes into "Other"
    remainder = sum(counts.values()) - sum(row["count"] for row in final_data)
    if remainder > 0:
        final_data.append({field_name: "Other", "count": remainder})

    return create_bar_chart(final_data, field_name, output_name)

In [None]:
# =============================================================================
# GENERATE BASIC BAR CHARTS
# =============================================================================

# Banner announcing the chart-generation phase
print()
for banner_line in ("=" * 60, "GENERATING VISUALIZATIONS", "=" * 60):
    print(f"[{timestamp()}] {banner_line}")

# --- MOVEMENTS --- (uses the tallies that include creator-inherited movements)
create_chart_from_counts(
    counts=movement_counts_with_inheritance,
    field_name="movement",
    output_name="movements",
    label_transform=clean_movement_label,
)

# --- MATERIALS ---
create_chart_from_counts(
    counts=material_counts,
    field_name="material",
    output_name="materials",
    label_transform=lambda label: label.title(),
)

# --- GENRES ---
create_chart_from_counts(
    counts=genre_counts,
    field_name="genre",
    output_name="genre",
    label_transform=lambda label: label.title(),
)

# --- COLLECTIONS --- (labels are used as-is)
create_chart_from_counts(
    counts=collection_counts,
    field_name="collection",
    output_name="collection",
)

In [None]:
# =============================================================================
# COUNTRY BAR CHART
# =============================================================================

# Bar chart of the 20 most common creator birth countries.
print(f"\n[{timestamp()}] Creating country chart...")

TOP_N = 20
top_countries = country_counts.most_common(TOP_N)

# One row per country, labelled with its display name
country_data = []
for qid, count in top_countries:
    label = get_label(qid)
    print(f"[{timestamp()}]   {qid} -> {label}: {count:,}")
    country_data.append(dict(country=label, count=count))

# Axis definitions: countries on y (sorted by descending count), counts on x
country_axis = {"field": "country", "type": "nominal", "sort": "-x", "title": ""}
count_axis = {"field": "count", "type": "quantitative", "title": ""}

country_spec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    "data": {"values": country_data},
    "mark": {"type": "bar", "color": "#999999"},
    "encoding": {"y": country_axis, "x": count_axis},
    "width": 300,
    "height": 300,
}

output_path = os.path.join(VIS_DIR, "country.json")
with open(output_path, "w", encoding="utf-8") as out_file:
    json.dump(country_spec, out_file, ensure_ascii=False, indent=2)
print(f"[{timestamp()}] Output: {output_path}")

In [None]:
# =============================================================================
# CONTINENT BAR CHART
# =============================================================================
# Bar chart of creators per continent; continents other than Europe and
# North America are drawn in a highlight colour.
print(f"\n[{timestamp()}] Creating continent chart...")

# Exclude non-standard continent entries
EXCLUDE_CONTINENT_QIDS = {
    "Q5401",    # Eurasia
    "Q27611",   # Central America
    "Q828",     # Americas
}

# Western continents (not highlighted)
WESTERN_CONTINENT_QIDS = {
    "Q46",      # Europe
    "Q49",      # North America
}

# The exclusion is applied after taking the 10 most common entries, so fewer
# than 10 bars may remain
top_continents = continent_counts.most_common(10)
continent_data = []
for qid, count in top_continents:
    if qid in EXCLUDE_CONTINENT_QIDS:
        print(f"[{timestamp()}]   {qid} -> (excluded): {count:,}")
        continue
    
    label = get_label(qid)
    # Handle Oceania special case (Q55643 sometimes shows as "Australia")
    # NOTE(review): the comment above names Q55643 but the condition checks Q538 — confirm which Q-ID is intended
    if qid == "Q538":
        label = "Oceania"
    
    # Highlight non-Western continents
    # (i.e. every continent that is not in WESTERN_CONTINENT_QIDS)
    is_global_south = qid not in WESTERN_CONTINENT_QIDS
    
    continent_data.append({
        "continent": label,
        "count": count,
        "global_south": is_global_south
    })
    print(f"[{timestamp()}]   {qid} -> {label}: {count:,} {'(highlighted)' if is_global_south else ''}")

continent_spec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    "data": {"values": continent_data},
    "mark": {"type": "bar"},
    "encoding": {
        "y": {
            "field": "continent",
            "type": "nominal",
            "sort": "-x",
            "title": ""
        },
        "x": {
            "field": "count",
            "type": "quantitative",
            "title": ""
        },
        # Highlight colour for global_south == True, grey otherwise
        "color": {
            "field": "global_south",
            "type": "nominal",
            "scale": {
                "domain": [True, False],
                "range": ["#FF00D3", "#999999"]
            },
            "legend": None
        }
    },
    "width": 300,
    "height": 200
}

output_path = os.path.join(VIS_DIR, "continent.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(continent_spec, f, ensure_ascii=False, indent=2)
print(f"[{timestamp()}] Output: {output_path}")

In [None]:
# =============================================================================
# GLOBAL SOUTH PAINTER RATIO BY BIRTH DECADE
# =============================================================================
print(f"\n[{timestamp()}] Calculating Global South painter ratio by birth decade...")

# Continents treated as "Western"; a creator born on any other continent is
# tallied as Global South.
EUROPE_QID = "Q46"
NORTH_AMERICA_QID = "Q49"
WESTERN_CONTINENTS = {EUROPE_QID, NORTH_AMERICA_QID}

# Per-birth-decade tallies: every creator with a resolvable continent, and
# the non-Western subset of those.
birth_decade_total_geo = Counter()
birth_decade_global_south = Counter()

for creator_qid in all_creator_qids:
    entity = creator_entities.get(creator_qid)
    if entity is None:
        continue

    # Only the first P569 (birth date) value is used; creators without a
    # parseable one are skipped.
    birth_dates = get_claim_time_values(entity, "P569")
    birth_date = parse_date(birth_dates[0]) if birth_dates else None
    if birth_date is None:
        continue

    birth_year = birth_date[0]
    if not (1500 <= birth_year <= 1980):
        continue

    # Resolve birthplace -> country -> continent; a missing link at any step
    # drops the creator from this analysis.
    birthplace_qid = creator_birthplaces.get(creator_qid)
    country_qid = birthplace_to_country.get(birthplace_qid) if birthplace_qid else None
    continent_qid = country_to_continent.get(country_qid) if country_qid else None
    if not continent_qid:
        continue

    decade = (birth_year // 10) * 10
    birth_decade_total_geo[decade] += 1
    if continent_qid not in WESTERN_CONTINENTS:
        birth_decade_global_south[decade] += 1

# One row per decade, in chronological order; decades with fewer than ten
# creators are left out as too sparse.
geo_timeline_data = []
for decade, total in sorted(birth_decade_total_geo.items()):
    if total < 10:
        continue
    global_south = birth_decade_global_south.get(decade, 0)
    pct = (global_south / total) * 100
    geo_timeline_data.append(
        dict(decade=decade, percentage=round(pct, 2), global_south=global_south, total=total)
    )

print(f"[{timestamp()}] Decades with data: {len(geo_timeline_data)}")

# Per-century roll-up of the decade rows (console output only).
print(f"\n[{timestamp()}] Global South painter percentage by century:")
for century_start in range(1500, 2000, 100):
    century_data = [
        d for d in geo_timeline_data
        if century_start <= d["decade"] < century_start + 100
    ]
    if not century_data:
        continue
    total_gs = sum(d["global_south"] for d in century_data)
    total_all = sum(d["total"] for d in century_data)
    pct = (total_gs / total_all) * 100 if total_all > 0 else 0
    print(f"[{timestamp()}]   {century_start}s: {pct:.1f}% ({total_gs:,} of {total_all:,})")

# Line chart of the Global South percentage per birth decade. The transform
# drops the 1980 decade from the drawing; it stays in the embedded data.
geo_timeline_spec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    "data": {"values": geo_timeline_data},
    "transform": [{"filter": "datum.decade < 1980"}],
    "mark": {"type": "line", "color": "#FF00D3", "strokeWidth": 2, "interpolate": "monotone"},
    "encoding": {
        "x": {
            "field": "decade",
            "type": "quantitative",
            "title": "",
            "scale": {"domain": [1500, 2000]},
            "axis": {"format": "d", "tickCount": 6},
        },
        "y": {
            "field": "percentage",
            "type": "quantitative",
            "title": "",
            "axis": {"format": ".0f"},
            "scale": {"domain": [0, 20]},
        },
        "tooltip": [
            {"field": "decade", "type": "ordinal", "title": "Decade"},
            {"field": "percentage", "type": "quantitative", "title": "% Global South", "format": ".1f"},
            {"field": "global_south", "type": "quantitative", "title": "Non-Western painters", "format": ","},
            {"field": "total", "type": "quantitative", "title": "Total painters", "format": ","},
        ],
    },
    "width": 400,
    "height": 300,
}

output_path = os.path.join(VIS_DIR, "geo_timeline.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(geo_timeline_spec, f, ensure_ascii=False, indent=2)

print(f"[{timestamp()}] Output: {output_path}")

# ---------------------------------------------------------------------------
# Summary statistics
# ---------------------------------------------------------------------------
print(f"\n[{timestamp()}] {'='*50}")
print(f"[{timestamp()}] GLOBAL SOUTH TIMELINE SUMMARY")
print(f"[{timestamp()}] {'='*50}")

if geo_timeline_data:
    # Rows are in chronological order, so the last row at or before 1980 is
    # the most recent decade.
    earliest = min(geo_timeline_data, key=lambda row: row["decade"])
    latest = [row for row in geo_timeline_data if row["decade"] <= 1980][-1]
    peak = max(geo_timeline_data, key=lambda row: row["percentage"])

    for heading, row in (("Earliest decade", earliest), ("Latest decade", latest), ("Peak", peak)):
        print(f"[{timestamp()}] {heading}: {row['decade']}s ({row['percentage']:.1f}% Global South)")
print(f"[{timestamp()}] {'='*50}")

In [None]:
# =============================================================================
# GENDER VISUALIZATION
# =============================================================================

print(f"\n[{timestamp()}] Creating gender chart...")

# One entry per gender item, most frequent first, with a title-cased label.
gender_data = [
    {"gender": get_label(qid).title(), "count": count}
    for qid, count in gender_counts.most_common()
]

# Collapse small categories (1000 creators or fewer) into a single "Other" slice.
processed_data = [entry for entry in gender_data if entry["count"] > 1000]
other_total = sum(entry["count"] for entry in gender_data if entry["count"] <= 1000)
if other_total > 0:
    processed_data.append({"gender": "Other", "count": other_total})

processed_data = sorted(processed_data, key=lambda x: x["count"], reverse=True)

color_map = {
    "Female": "#FF00D3",
    "Male": "#CCCCCC",
}

# The colour scale uses an explicit domain, and a category that is missing
# from an explicit domain is not assigned any range colour. Keep "Female" and
# "Male" first (stable legend order), then append every other category that is
# actually present (e.g. "Other") so it picks up the grey fallback below.
gender_domain = ["Female", "Male"]
for entry in processed_data:
    if entry["gender"] not in gender_domain:
        gender_domain.append(entry["gender"])
gender_range = [color_map.get(g, "#999999") for g in gender_domain]

# Donut chart: one arc per category, sized by count.
gender_spec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    "data": {"values": processed_data},
    "mark": {"type": "arc", "innerRadius": 75},
    "encoding": {
        "theta": {"field": "count", "type": "quantitative"},
        "color": {
            "field": "gender",
            "type": "nominal",
            "legend": {"title": "Gender"},
            "scale": {
                "domain": gender_domain,
                "range": gender_range
            }
        },
    },
    "width": 300,
    "height": 300
}

output_path = os.path.join(VIS_DIR, "gender.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(gender_spec, f, ensure_ascii=False, indent=2)
print(f"[{timestamp()}] Output: {output_path}")

print(f"\n[{timestamp()}] GENDER SUMMARY")
for d in processed_data:
    print(f"  {d['gender']}: {d['count']:,}")

In [None]:
# =============================================================================
# INCEPTION TIMELINE (Paintings by decade)
# =============================================================================

print(f"\n[{timestamp()}] Creating inception timeline...")

# One point per decade in chronological order; the decade is written as a
# "<decade>-01-01" date string because the x axis is temporal.
decade_data = [
    {"decade": f"{decade}-01-01", "count": decade_counts[decade]}
    for decade in sorted(decade_counts.keys())
]

inception_spec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    "data": {"values": decade_data},
    "mark": {"type": "line", "color": "#000000", "strokeWidth": 2},
    "encoding": {
        "x": {"field": "decade", "type": "temporal", "title": "", "axis": {"tickCount": 7}},
        "y": {"field": "count", "type": "quantitative", "title": ""},
    },
    "width": 300,
    "height": 300,
}

output_path = os.path.join(VIS_DIR, "inception.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(inception_spec, f, ensure_ascii=False, indent=2)
print(f"[{timestamp()}] Output: {output_path}")

# Console summary: share of paintings that carry an inception year.
print(f"\n[{timestamp()}] INCEPTION SUMMARY")
print(f"[{timestamp()}] Total paintings: {len(painting_qids):,}")
print(
    f"[{timestamp()}] With inception date: {len(paintings_with_year):,} "
    f"({100*len(paintings_with_year)/len(painting_qids):.1f}%)"
)

In [None]:
# =============================================================================
# MOVEMENTS TIMELINES (One per major movement)
# =============================================================================

print(f"\n[{timestamp()}] Creating movement timelines...")

# The six movements that each get their own highlighted timeline.
TIMELINE_MOVEMENTS = ["Baroque", "Impressionism", "Expressionism", "Realism", "Symbolism", "Romanticism"]

# Cleaned label for each of the 100 most frequent movement QIDs.
movement_qid_to_label = {
    qid: clean_movement_label(get_label(qid))
    for qid, _ in movement_counts_with_inheritance.most_common(100)
}

# QIDs whose cleaned label is one of the target movements.
target_movement_qids = {
    qid for qid, label in movement_qid_to_label.items() if label in TIMELINE_MOVEMENTS
}

# movement label -> Counter(decade -> number of paintings)
movement_decade_counts = defaultdict(Counter)
for year, movement_qid in painting_year_movement_inherited:
    if movement_qid not in target_movement_qids:
        continue
    movement_decade_counts[movement_qid_to_label[movement_qid]][(year // 10) * 10] += 1

# Every decade seen for any target movement, in chronological order.
all_decades = sorted(
    {decade for counts in movement_decade_counts.values() for decade in counts}
)

# Dense table: one row per (decade, movement), with 0 where a movement has no
# paintings in that decade.
timeline_data = [
    {
        "decade": decade,
        "count": movement_decade_counts[movement].get(decade, 0),
        "movement": movement,
    }
    for decade in all_decades
    for movement in TIMELINE_MOVEMENTS
]

def create_movement_timeline_spec(highlight_movement, data, movements):
    """Build a Vega-Lite line chart with one movement highlighted.

    Args:
        highlight_movement: Name of the movement drawn in the highlight
            colour with a slightly thicker stroke.
        data: Iterable of row dicts with "decade", "count" and "movement"
            keys. Only rows whose decade lies in 1500..2000 (inclusive) are
            embedded in the spec.
        movements: Movement names, in the order used for the colour scale
            domain.

    Returns:
        dict: The Vega-Lite v5 specification, ready for ``json.dump``.
    """
    domain = list(movements)
    color_range = [
        "#FF00D3" if m == highlight_movement else "rgba(119, 119, 119, 0.5)"
        for m in domain
    ]
    # Filter the rows that were passed in (not a module-level table) so the
    # function honours its `data` argument.
    filtered_timeline_data = [d for d in data if 1500 <= d["decade"] <= 2000]

    return {
        "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
        "data": {"values": filtered_timeline_data},
        "mark": {
            "type": "line",
            "interpolate": "monotone",
        },
        "encoding": {
            "x": {
                "field": "decade",
                "type": "quantitative",
                "title": "",
                "axis": {"format": "d", "tickCount": 10},
                "scale": {"domain": [1500, 2000]}
            },
            "y": {
                "field": "count",
                "type": "quantitative",
                "title": "",
                "axis": {"format": "d", "tickCount": 10},
            },
            "color": {
                "field": "movement",
                "type": "nominal",
                "title": "Movement",
                "scale": {
                    "domain": domain,
                    "range": color_range
                },
                "legend": None
            },
            # The highlighted line is drawn slightly thicker than the rest.
            "strokeWidth": {
                "condition": {
                    "test": f"datum.movement === '{highlight_movement}'",
                    "value": 2
                },
                "value": 1.5
            }
        },
        "width": 400,
        "height": 300
    }

# Write one spec file per movement; each file highlights that movement.
for movement in TIMELINE_MOVEMENTS:
    spec = create_movement_timeline_spec(movement, timeline_data, TIMELINE_MOVEMENTS)

    # File name is derived from the lower-cased movement name.
    filename = "movements_timeline_" + movement.lower() + ".json"
    output_path = os.path.join(VIS_DIR, filename)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(spec, f, ensure_ascii=False, indent=2)

    print(f"[{timestamp()}] Output: {output_path}")

print(f"[{timestamp()}] Created {len(TIMELINE_MOVEMENTS)} movement timeline specs")

In [None]:
# =============================================================================
# GENRE TIMELINES (Portrait, Religious, Landscape)
# =============================================================================

print(f"\n[{timestamp()}] Creating genre timelines...")

# Every decade that occurs for any genre, in chronological order.
all_genre_decades = sorted(
    {decade for counts in genre_decade_counts.values() for decade in counts}
)

# Dense table: one row per (decade, genre), with 0 where a genre has no
# paintings in that decade. Decades are written as "<decade>-01-01" strings.
genre_timeline_data = [
    {
        "decade": f"{decade}-01-01",
        "count": genre_decade_counts[genre_name].get(decade, 0),
        "genre": genre_name,
    }
    for decade in all_genre_decades
    for genre_name in TIMELINE_GENRES.values()
]

def create_genre_timeline_spec(highlight_genre, all_data, genres):
    """Build a Vega-Lite line chart with one genre highlighted.

    Args:
        highlight_genre: Genre name drawn in the highlight colour at full
            opacity; every other genre is grey at reduced opacity.
        all_data: Row dicts with "decade", "count" and "genre" keys; embedded
            in the spec unchanged.
        genres: Iterable of genre names (any iterable, e.g. ``dict.values()``),
            in the order used for the colour scale domain.

    Returns:
        dict: The Vega-Lite v5 specification, ready for ``json.dump``.
    """
    highlight_color = "#FF00D3"
    # Plain rgb(): the previous "rgba(119, 119, 119)" had no alpha component.
    # Transparency for the muted lines comes from the opacity channel below.
    muted_color = "rgb(119, 119, 119)"

    # Materialise once so the same ordering feeds both domain and range.
    domain = list(genres)
    range_colors = [
        highlight_color if g == highlight_genre else muted_color
        for g in domain
    ]

    return {
        "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
        "data": {"values": all_data},
        "mark": {
            "type": "line",
            "strokeWidth": 2,
            "clip": True,
            "interpolate": "monotone"
        },
        "encoding": {
            "x": {
                "field": "decade",
                "type": "temporal",
                "title": "",
                "scale": {"domain": ["1201", "2000"]}
            },
            "y": {
                "field": "count",
                "type": "quantitative",
                "title": ""
            },
            "color": {
                "field": "genre",
                "type": "nominal",
                "legend": None,
                "scale": {
                    "domain": domain,
                    "range": range_colors
                }
            },
            "opacity": {
                "condition": {
                    "test": f"datum.genre === '{highlight_genre}'",
                    "value": 1
                },
                "value": 0.3
            },
            "order": {
                "condition": {
                    "test": f"datum.genre === '{highlight_genre}'",
                    "value": 1
                },
                "value": 0
            }
        },
        "width": 300,
        "height": 200
    }

# Write one spec file per genre; each file highlights that genre.
for genre_name in TIMELINE_GENRES.values():
    spec = create_genre_timeline_spec(genre_name, genre_timeline_data, TIMELINE_GENRES.values())

    output_path = os.path.join(VIS_DIR, f"genre_timeline_{genre_name}.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(spec, f, ensure_ascii=False, indent=2)

    print(f"[{timestamp()}] Output: {output_path}")

# Console summary: total and peak decade per genre.
print(f"\n[{timestamp()}] GENRE TIMELINE SUMMARY")
for genre_name, counts in genre_decade_counts.items():
    print(f"[{timestamp()}] {genre_name.capitalize()}:")
    print(f"[{timestamp()}]   Total: {sum(counts.values()):,}")
    if counts:
        peak_decade = max(counts, key=counts.get)
        peak_count = counts[peak_decade]
        print(f"[{timestamp()}]   Peak: {peak_decade}s ({peak_count:,} paintings)")
    else:
        # No data for this genre: print the placeholder without the decade
        # suffix (previously rendered as "N/As").
        peak_decade = "N/A"
        peak_count = 0
        print(f"[{timestamp()}]   Peak: {peak_decade} ({peak_count:,} paintings)")

In [None]:
# =============================================================================
# ARTIST PRODUCTIVITY VISUALIZATION
# =============================================================================

print(f"\n[{timestamp()}] Creating productivity visualization...")

# Keep only artists with at least 100 works in the dataset.
artists_with_works = [
    artist for artist in artist_data if artist["works_in_dataset"] >= 100
]
print(f"[{timestamp()}] Artists with 100+ works: {len(artists_with_works):,}")

# Enrich the selected records in place with a display label and a
# works-per-year rate (works divided by lifespan).
for artist in artists_with_works:
    artist["label"] = get_label(artist["qid"])
    artist["works_per_year"] = artist["works_in_dataset"] / artist["lifespan"]

# Scatterplot rows: lifespan (x) against number of works (y).
productivity_data = []
for artist in artists_with_works:
    productivity_data.append(
        {"lifespan": artist["lifespan"], "works": artist["works_in_dataset"], "label": artist["label"]}
    )

productivity_spec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    "data": {"values": productivity_data},
    "mark": {"type": "circle", "opacity": 0.5, "color": "#000000", "size": 10},
    "encoding": {
        "x": {
            "field": "lifespan",
            "type": "quantitative",
            "title": "Lifespan (years)",
            "scale": {"domain": [20, 105]},
        },
        "y": {
            "field": "works",
            "type": "quantitative",
            "title": "Paintings in dataset",
            "scale": {"domain": [0, 4000]},
        },
        "tooltip": [
            {"field": "label", "type": "nominal", "title": "Artist"},
            {"field": "lifespan", "type": "quantitative", "title": "Lifespan"},
            {"field": "works", "type": "quantitative", "title": "Paintings"},
        ],
    },
    "width": 400,
    "height": 300,
}

output_path = os.path.join(VIS_DIR, "productivity.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(productivity_spec, f, ensure_ascii=False, indent=2)
print(f"[{timestamp()}] Output: {output_path}")

# Report the extremes among the selected artists (console only).
if artists_with_works:
    youngest = min(artists_with_works, key=lambda rec: rec["lifespan"])
    oldest = max(artists_with_works, key=lambda rec: rec["lifespan"])
    most_works = max(artists_with_works, key=lambda rec: rec["works_in_dataset"])
    most_productive = max(artists_with_works, key=lambda rec: rec["works_per_year"])

    print(f"\n[{timestamp()}] OUTLIERS:")
    print(f"  Shortest lifespan: {youngest['label']} ({youngest['lifespan']:.1f} years, {youngest['works_in_dataset']} works)")
    print(f"  Longest lifespan: {oldest['label']} ({oldest['lifespan']:.1f} years, {oldest['works_in_dataset']} works)")
    print(f"  Most works: {most_works['label']} ({most_works['works_in_dataset']} works)")
    print(f"  Most productive: {most_productive['label']} ({most_productive['works_per_year']:.2f} works/year)")

In [None]:
# =============================================================================
# FEMALE PAINTER RATIO BY BIRTH DECADE
# =============================================================================
print(f"\n[{timestamp()}] Calculating female painter ratio by birth decade...")

# Gender items counted as female.
FEMALE_QIDS = {"Q6581072", "Q1052281"}  # female, transgender female

# Per-birth-decade tallies: all creators with a usable birth year, and the
# female subset of those.
birth_decade_total = Counter()
birth_decade_female = Counter()

for creator_qid in all_creator_qids:
    entity = creator_entities.get(creator_qid)
    if entity is None:
        continue

    # Only the first P569 (birth date) value is used; creators without a
    # parseable one are skipped.
    birth_dates = get_claim_time_values(entity, "P569")
    birth_date = parse_date(birth_dates[0]) if birth_dates else None
    if birth_date is None:
        continue

    birth_year = birth_date[0]
    if not (1500 <= birth_year <= 2000):
        continue

    decade = (birth_year // 10) * 10
    birth_decade_total[decade] += 1

    # A creator counts as female when any P21 value is in FEMALE_QIDS.
    genders = get_claim_values(entity, "P21")
    if not FEMALE_QIDS.isdisjoint(genders):
        birth_decade_female[decade] += 1

# One row per decade, in chronological order.
gender_timeline_data = []
for decade, total in sorted(birth_decade_total.items()):
    if total < 1:
        continue
    female = birth_decade_female.get(decade, 0)
    pct = (female / total) * 100
    gender_timeline_data.append(
        dict(decade=decade, percentage=round(pct, 2), female=female, total=total)
    )

print(f"[{timestamp()}] Decades with data: {len(gender_timeline_data)}")

# Per-century roll-up of the decade rows (console output only).
print(f"\n[{timestamp()}] Female painter percentage by century:")
for century_start in range(1500, 2000, 100):
    century_data = [
        d for d in gender_timeline_data
        if century_start <= d["decade"] < century_start + 100
    ]
    if not century_data:
        continue
    total_female = sum(d["female"] for d in century_data)
    total_all = sum(d["total"] for d in century_data)
    pct = (total_female / total_all) * 100 if total_all > 0 else 0
    print(f"[{timestamp()}]   {century_start}s: {pct:.1f}% ({total_female:,} of {total_all:,})")

# Line chart of the female percentage per birth decade. The transform drops
# the 2000 decade from the drawing; it stays in the embedded data.
gender_timeline_spec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    "data": {"values": gender_timeline_data},
    "transform": [{"filter": "datum.decade < 2000"}],
    "mark": {"type": "line", "color": "#8700F9", "interpolate": "monotone"},
    "encoding": {
        "x": {
            "field": "decade",
            "type": "quantitative",
            "title": "",
            "scale": {"domain": [1500, 2000]},
            "axis": {"format": "d", "tickCount": 6},
        },
        "y": {
            "field": "percentage",
            "type": "quantitative",
            "title": "",
            "axis": {"format": ".0f"},
            "scale": {"domain": [0, 50]},
        },
        "tooltip": [
            {"field": "decade", "type": "ordinal", "title": "Decade"},
            {"field": "percentage", "type": "quantitative", "title": "% Female", "format": ".1f"},
            {"field": "female", "type": "quantitative", "title": "Female painters", "format": ","},
            {"field": "total", "type": "quantitative", "title": "Total painters", "format": ","},
        ],
    },
    "width": 400,
    "height": 300,
}

output_path = os.path.join(VIS_DIR, "gender_timeline.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(gender_timeline_spec, f, ensure_ascii=False, indent=2)

print(f"[{timestamp()}] Output: {output_path}")

# ---------------------------------------------------------------------------
# Summary statistics
# ---------------------------------------------------------------------------
print(f"\n[{timestamp()}] {'='*50}")
print(f"[{timestamp()}] GENDER TIMELINE SUMMARY")
print(f"[{timestamp()}] {'='*50}")

if gender_timeline_data:
    # First decade, last decade, and the decade with the highest female share.
    earliest = min(gender_timeline_data, key=lambda row: row["decade"])
    latest = max(gender_timeline_data, key=lambda row: row["decade"])
    peak = max(gender_timeline_data, key=lambda row: row["percentage"])

    for heading, row in (("Earliest decade", earliest), ("Latest decade", latest), ("Peak", peak)):
        print(f"[{timestamp()}] {heading}: {row['decade']}s ({row['percentage']:.1f}% female)")
print(f"[{timestamp()}] {'='*50}")

In [None]:
# =============================================================================
# ARTIST SITELINKS VS PAINTINGS SCATTERPLOT
# =============================================================================
print(f"\n[{timestamp()}] Analyzing artist sitelinks vs paintings...")

PAINTER_QID = "Q1028181"  # painter occupation

# Exclude entities that aren't primarily artists
EXCLUDE_QIDS = {
    "Q302",      # Jesus Christ
    "Q352",      # Adolf Hitler
    "Q8016",     # Winston Churchill
    "Q5879",     # Walt Disney
    "Q7243",     # Le Corbusier (primarily architect)
    "Q529",      # Louis Pasteur
    "Q7241",     # Rabindranath Tagore (primarily writer)
}

# Sitelink keys that end in "wiki" but are not Wikipedia language editions.
# A bare `endswith("wiki")` test would count them as Wikipedias; sister
# projects such as "enwikiquote" or "frwikisource" do not end in "wiki" and
# need no special handling.
NON_WIKIPEDIA_SITES = {
    "commonswiki",
    "specieswiki",
    "metawiki",
    "wikidatawiki",
    "mediawikiwiki",
    "sourceswiki",
    "outreachwiki",
    "wikimaniawiki",
    "incubatorwiki",
    "foundationwiki",
    "wikifunctionswiki",
}

sitelink_data = []
non_painters_skipped = 0
excluded_skipped = 0

for creator_qid in all_creator_qids:
    if creator_qid in EXCLUDE_QIDS:
        excluded_skipped += 1
        continue

    entity = creator_entities.get(creator_qid)
    if entity is None:
        continue

    # Keep only creators whose occupation (P106) includes painter
    occupations = get_claim_values(entity, "P106")
    if PAINTER_QID not in occupations:
        non_painters_skipped += 1
        continue

    sitelinks = entity.get("sitelinks", {})
    # Count only Wikipedia language editions
    wiki_count = sum(
        1 for key in sitelinks
        if key.endswith("wiki") and key not in NON_WIKIPEDIA_SITES
    )

    # Creators without a recorded painting count are kept with 0
    paintings = creator_painting_count.get(creator_qid, 0)

    sitelink_data.append({
        "qid": creator_qid,
        "sitelinks": wiki_count,
        "paintings": paintings
    })

print(f"[{timestamp()}] Painters found: {len(sitelink_data):,}")
print(f"[{timestamp()}] Non-painters skipped: {non_painters_skipped:,}")
print(f"[{timestamp()}] Excluded manually: {excluded_skipped:,}")

# Add labels for chart
for item in sitelink_data:
    item["label"] = get_label(item["qid"])

# ---------------------------------------------------------------------------
# Scatterplot: Sitelinks (x) vs Paintings (y)
# ---------------------------------------------------------------------------
# Chart rows carry only the plotted fields and the tooltip label (no QID).
scatter_data = [
    {field: d[field] for field in ("sitelinks", "paintings", "label")}
    for d in sitelink_data
]

# Scatterplot: number of Wikipedia sitelinks (x) against number of paintings (y).
sitelinks_scatter_spec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    "data": {"values": scatter_data},
    "mark": {"type": "circle", "opacity": 0.6, "color": "#000000", "size": 10},
    "encoding": {
        "x": {
            "field": "sitelinks",
            "type": "quantitative",
            "title": "",
            "scale": {"domain": [0, 250]},
        },
        "y": {
            "field": "paintings",
            "type": "quantitative",
            "title": "",
            "scale": {"domain": [0, 4000]},
        },
        "tooltip": [
            {"field": "label", "type": "nominal", "title": "Artist"},
            {"field": "sitelinks", "type": "quantitative", "title": "Wikipedias"},
            {"field": "paintings", "type": "quantitative", "title": "Paintings"},
        ],
    },
    "width": 400,
    "height": 300,
}

output_path = os.path.join(VIS_DIR, "sitelinks_paintings.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(sitelinks_scatter_spec, f, ensure_ascii=False, indent=2)

print(f"[{timestamp()}] Output: {output_path}")

# ---------------------------------------------------------------------------
# Interesting outliers
# ---------------------------------------------------------------------------
print(f"\n[{timestamp()}] OUTLIERS:")

# High sitelink count but few paintings: top ten by sitelinks.
famous_few_works = sorted(
    (d for d in sitelink_data if d["sitelinks"] >= 50 and d["paintings"] <= 20),
    key=lambda row: row["sitelinks"],
    reverse=True,
)[:10]
print(f"\n[{timestamp()}] Famous but few works (50+ Wikipedias, ≤20 paintings):")
for d in famous_few_works:
    print(f"[{timestamp()}]   {d['label']}: {d['sitelinks']} Wikipedias, {d['paintings']} paintings")

# Many paintings but few sitelinks: top ten by painting count.
prolific_obscure = sorted(
    (d for d in sitelink_data if d["paintings"] >= 100 and d["sitelinks"] <= 10),
    key=lambda row: row["paintings"],
    reverse=True,
)[:10]
print(f"\n[{timestamp()}] Prolific but obscure (100+ paintings, ≤10 Wikipedias):")
for d in prolific_obscure:
    print(f"[{timestamp()}]   {d['label']}: {d['paintings']} paintings, {d['sitelinks']} Wikipedias")

# The ten painters with the most sitelinks overall.
most_famous = sorted(sitelink_data, key=lambda row: row["sitelinks"], reverse=True)[:10]
print(f"\n[{timestamp()}] Most famous painters:")
for d in most_famous:
    print(f"[{timestamp()}]   {d['label']}: {d['sitelinks']} Wikipedias, {d['paintings']} paintings")

In [None]:
# =============================================================================
# FINAL SUMMARY
# =============================================================================
# Console-only recap of the figures computed by the earlier cells.
print(f"\n[{timestamp()}] {'='*60}")
print(f"[{timestamp()}] PROCESSING COMPLETE")
print(f"[{timestamp()}] {'='*60}")
print(f"[{timestamp()}] Total paintings: {len(painting_qids):,}")
print(f"[{timestamp()}] Total creators: {len(all_creator_qids):,}")
print(f"[{timestamp()}]")
print(f"[{timestamp()}] Paintings with:")
print(f"[{timestamp()}]   Movement: {len(paintings_with_movement_inherited):,} ({100*len(paintings_with_movement_inherited)/len(painting_qids):.1f}%)")
print(f"[{timestamp()}]   Material: {len(paintings_with_material):,} ({100*len(paintings_with_material)/len(painting_qids):.1f}%)")
print(f"[{timestamp()}]   Genre: {len(paintings_with_genre):,} ({100*len(paintings_with_genre)/len(painting_qids):.1f}%)")
print(f"[{timestamp()}]   Collection: {len(paintings_with_collection):,} ({100*len(paintings_with_collection)/len(painting_qids):.1f}%)")
print(f"[{timestamp()}]   Year: {len(paintings_with_year):,} ({100*len(paintings_with_year)/len(painting_qids):.1f}%)")
print(f"[{timestamp()}]")
print(f"[{timestamp()}] Creators with:")
print(f"[{timestamp()}]   Gender: {creators_with_gender:,}")
print(f"[{timestamp()}]   Birthplace: {creators_with_birthplace:,}")
print(f"[{timestamp()}]   Country: {creators_with_country:,}")
print(f"[{timestamp()}]   Lifespan data: {len(artist_data):,}")
print(f"[{timestamp()}]")
print(f"[{timestamp()}] Gender timeline:")
total_female = sum(d["female"] for d in gender_timeline_data)
total_in_timeline = sum(d["total"] for d in gender_timeline_data)
# Guard the share against an empty timeline (same fallback as the per-century
# summaries) instead of raising ZeroDivisionError.
female_share = 100 * total_female / total_in_timeline if total_in_timeline > 0 else 0
print(f"[{timestamp()}]   Painters in timeline: {total_in_timeline:,}")
print(f"[{timestamp()}]   Female painters: {total_female:,} ({female_share:.1f}%)")
if gender_timeline_data:
    earliest = min(gender_timeline_data, key=lambda x: x["decade"])
    latest = max(gender_timeline_data, key=lambda x: x["decade"])
    peak = max(gender_timeline_data, key=lambda x: x["percentage"])
    print(f"[{timestamp()}]   Earliest: {earliest['decade']}s ({earliest['percentage']:.1f}% female)")
    print(f"[{timestamp()}]   Latest: {latest['decade']}s ({latest['percentage']:.1f}% female)")
    print(f"[{timestamp()}]   Peak: {peak['decade']}s ({peak['percentage']:.1f}% female)")
print(f"[{timestamp()}]")
# Files written by the cells above; geo_timeline.json and
# sitelinks_paintings.json were previously missing from this list.
print(f"[{timestamp()}] Output files generated:")
print(f"[{timestamp()}]   - movements.json")
print(f"[{timestamp()}]   - movements_timeline_*.json (6 files)")
print(f"[{timestamp()}]   - materials.json")
print(f"[{timestamp()}]   - genre.json")
print(f"[{timestamp()}]   - genre_timeline_*.json (3 files)")
print(f"[{timestamp()}]   - collection.json")
print(f"[{timestamp()}]   - gender.json")
print(f"[{timestamp()}]   - gender_timeline.json")
print(f"[{timestamp()}]   - country.json")
print(f"[{timestamp()}]   - continent.json")
print(f"[{timestamp()}]   - geo_timeline.json")
print(f"[{timestamp()}]   - inception.json")
print(f"[{timestamp()}]   - productivity.json")
print(f"[{timestamp()}]   - sitelinks_paintings.json")
print(f"[{timestamp()}] {'='*60}")

In [None]:
# =============================================================================
# COVERAGE / DATA GAPS VISUALIZATION
# =============================================================================
print(f"\n[{timestamp()}] Creating coverage chart...")

total = len(painting_qids)

# (display name, collection of paintings that have the property)
coverage_sources = [
    ("Inception", paintings_with_year),
    ("Creator", paintings_with_creator),
    ("Material", paintings_with_material),
    ("Genre", paintings_with_genre),
    ("Movement", paintings_with_movement_inherited),
]

# Share of all paintings that carry each property, as a percentage rounded
# to one decimal place.
coverage_data = [
    {"property": name, "percentage": round(100 * len(covered) / total, 1)}
    for name, covered in coverage_sources
]

# Best-covered property first.
coverage_data.sort(key=lambda row: row["percentage"], reverse=True)

for d in coverage_data:
    print(f"[{timestamp()}]   {d['property']}: {d['percentage']}%")

# Horizontal bar chart; the y axis follows the sorted order of coverage_data.
coverage_spec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    "data": {"values": coverage_data},
    "mark": {"type": "bar", "color": "#999999", "height": 20},
    "encoding": {
        "y": {
            "field": "property",
            "type": "nominal",
            "sort": [d["property"] for d in coverage_data],
            "title": "",
        },
        "x": {
            "field": "percentage",
            "type": "quantitative",
            "title": "",
            "scale": {"domain": [0, 100]},
            "axis": {"format": ".0f"},
        },
    },
    "width": 300,
    "height": 200,
}

output_path = os.path.join(VIS_DIR, "coverage.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(coverage_spec, f, ensure_ascii=False, indent=2)
print(f"[{timestamp()}] Output: {output_path}")