In [6]:
# If running in a fresh environment, install Playwright and friends:
!pip -q install playwright bs4 pandas requests
!python -m playwright install chromium



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Downloading Chromium 140.0.7339.16 (playwright build v1187)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip[22m
Chromium 140.0.7339.16 (playwright build v1187) downloaded to /home/codespace/.cache/ms-playwright/chromium-1187
Downloading FFMPEG playwright build v1011[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/ffmpeg/1011/ffmpeg-linux.zip[22m
FFMPEG playwright build v1011 downloaded to /home/codespace/.cache/ms-playwright/ffmpeg-1011
Downloading Chromium Headless Shell 140.0.7339.16 (playwright build v1187)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-headless-shell-linux.zip[22m
Chromium Headless

In [7]:
import random, time, csv, re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from playwright.sync_api import sync_playwright

# --- Target site -------------------------------------------------------------
BASE = "https://kv.mozarteum.at"
# Listing URL template; the {page} placeholder is filled via str.format(page=...).
LIST_URL_TPL = "https://kv.mozarteum.at/en/search?koechel_works_en%5Bpage%5D={page}"

# --- Listing pages to crawl (END_PAGE is inclusive) --------------------------
START_PAGE, END_PAGE = 1, 35

# --- Politeness: bounds of the random pause between page fetches (seconds) ---
MIN_DELAY, MAX_DELAY = 1.5, 3.5

# --- Output file --------------------------------------------------------------
OUT_CSV = "koechel.csv"

def human_delay(a=MIN_DELAY, b=MAX_DELAY):
    """Block for a random number of seconds drawn uniformly between a and b.

    Args:
        a: One bound of the pause, in seconds (defaults to MIN_DELAY).
        b: The other bound of the pause, in seconds (defaults to MAX_DELAY).
    """
    pause_seconds = random.uniform(a, b)
    time.sleep(pause_seconds)

def make_requests_session():
    """Create a requests.Session with custom headers and retrying adapters.

    The session sends a fixed User-Agent and Accept-Language header, and both
    the http:// and https:// prefixes get an HTTPAdapter configured with the
    same Retry policy: up to 5 retries in total, backoff factor 0.6, GET only,
    retrying on status 429/500/502/503/504, with raise_on_status disabled.

    Returns:
        The configured requests.Session.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.6,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=("GET",),
        raise_on_status=False,
    )

    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (compatible; KV-Notebook/1.0; +https://example.org/polite)",
        "Accept-Language": "en-US,en;q=0.8",
    })

    # One adapter instance per URL scheme, each using the shared retry policy.
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

# Create the session object once; later HTTP requests in this notebook go
# through req_sess so they share its headers and retry configuration.
req_sess = make_requests_session()
print("Config ready.")


Config ready.


In [8]:
# Heuristic pattern for a Köchel number written in running text.
# Matches, case-insensitively: "K", an optional "ö"/"o", then a mandatory "V"
# or ".", optional whitespace, one or more digits and at most one trailing
# letter -- e.g. "KV 525", "K. 626", "KV 385a".
# A bare "K 525" (no "V" and no ".") is NOT matched.
KNUM_RE = re.compile(r"\bK[öo]?(?:V|\.)\s*\d+[a-z]?\b", re.IGNORECASE)

def text_or_none(node):
    """Return the node's text (pieces joined by a space, stripped), or None.

    Args:
        node: An object exposing get_text(separator, strip=...), or a falsy
            value such as None.

    Returns:
        node.get_text(" ", strip=True) when node is truthy, otherwise None.
    """
    if not node:
        return None
    return node.get_text(" ", strip=True)

def parse_detail_page(html: str, url: str) -> dict:
    """Extract the fields of interest from the HTML of one work detail page.

    All lookups are heuristic: headings/descriptions are taken from the first
    candidate CSS selector that yields non-empty text, and labelled values are
    read from <dt>/<dd> pairs whose <dt> label contains one of a set of
    keywords (case-insensitive substring match).

    Args:
        html: Raw HTML of the detail page.
        url: URL the HTML was fetched from; copied into the result unchanged.

    Returns:
        A dict with the keys "detail_url", "detail_title", "kv_official",
        "key", "genre", "date" and "description". Every value except
        "detail_url" is None when nothing non-empty could be found.
        "kv_official" falls back to a KNUM_RE match in the title or the
        description when no labelled KV field exists.
    """
    soup = BeautifulSoup(html, "html.parser")

    def first_text(selectors):
        """Text of the first selector whose match has non-empty text, else None."""
        for sel in selectors:
            node = soup.select_one(sel)
            if node is None:
                continue
            text = node.get_text(" ", strip=True)
            # An empty element (e.g. a blank <h1>) must not block the
            # remaining fallback selectors.
            if text:
                return text
        return None

    # Collect the <dt> nodes once instead of re-scanning the tree per lookup.
    dt_nodes = soup.find_all("dt")

    def dl_lookup(keys):
        """Non-empty <dd> text for the first <dt> whose label contains a key."""
        for dt in dt_nodes:
            label = dt.get_text(" ", strip=True).lower()
            if not any(k in label for k in keys):
                continue
            dd = dt.find_next_sibling("dd")
            if dd is None:
                continue
            value = dd.get_text(" ", strip=True)
            # Skip empty values so a later, populated pair can still match.
            if value:
                return value
        return None

    # Title/KV header
    kv_header = first_text(["h1", ".work-title", ".title"])

    # Explicit KV field, plus key / genre / date, all via <dt>/<dd> pairs
    kv_official = dl_lookup(["köchel", "kv", "k."])
    musical_key = dl_lookup(["key", "tonality"])
    genre       = dl_lookup(["genre", "work type", "category"])
    date        = dl_lookup(["date", "composition date", "year"])

    # Short description / first paragraph if any
    description = first_text([".work-description", ".entry-content p", "article p"])

    # Fallback: guess a K number from visible text (title first, then description)
    knum_guess = None
    for cand in filter(None, [kv_header, description]):
        m = KNUM_RE.search(cand)
        if m:
            knum_guess = m.group(0)
            break

    return {
        "detail_url": url,
        "detail_title": kv_header,
        "kv_official": kv_official or knum_guess,
        "key": musical_key,
        "genre": genre,
        "date": date,
        "description": description,
    }


In [10]:
# Cell 4: Use the ASYNC Playwright API (works inside Jupyter)

import asyncio
from playwright.async_api import async_playwright

async def collect_work_urls_with_playwright_async(start_page=START_PAGE, end_page=END_PAGE):
    """
    Collect the URLs of individual works from the paginated search listing.

    Opens each listing page in headless Chromium, waits for JS-rendered results,
    and extracts links to individual works (/en/work/...). A listing page that
    cannot be loaded, or that shows no work links in time, is reported and
    skipped; the crawl then continues with the next page.

    Args:
        start_page: First listing page number to visit.
        end_page: Last listing page number to visit (inclusive).

    Returns:
        Sorted list of unique absolute work URLs.
    """
    unique_urls = set()
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            ctx = await browser.new_context(user_agent="Mozilla/5.0 (KV-Notebook/1.0)")
            try:
                page = await ctx.new_page()
                for page_no in range(start_page, end_page + 1):
                    unique_urls.update(await _work_urls_on_listing_page(page, page_no))
                    # Polite pause between listing pages. asyncio.sleep is used
                    # instead of the time.sleep-based human_delay(): a blocking
                    # sleep inside a coroutine stalls the event loop it runs on.
                    await asyncio.sleep(random.uniform(MIN_DELAY, MAX_DELAY))
            finally:
                # Close the context even when the crawl is aborted by an error.
                await ctx.close()
        finally:
            await browser.close()

    all_urls = sorted(unique_urls)
    print(f"Collected {len(all_urls)} unique work URLs.")
    return all_urls


async def _work_urls_on_listing_page(page, page_no):
    """Return the set of absolute work URLs linked from one listing page.

    An empty set is returned (after printing a notice) when the page cannot be
    loaded or no work link appears within the timeout.
    """
    # Imported locally so this helper carries its own exception names.
    from playwright.async_api import Error as PlaywrightError
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    work_link_selector = 'a[href^="/en/work/"]'
    url = LIST_URL_TPL.format(page=page_no)
    print(f"[list] {url}")

    try:
        await page.goto(url, wait_until="domcontentloaded")
    except PlaywrightError as exc:
        # A single failed navigation must not discard the URLs already found.
        print(f"  !! Could not load page {page_no}: {exc}")
        return set()

    # Let the JS fetch results; then wait for network to settle a bit
    try:
        await page.wait_for_load_state("networkidle", timeout=15000)
    except PlaywrightTimeoutError:
        pass  # keep going; some pages may not fully idle if analytics keep chattering

    # Results should contain anchors to /en/work/...
    try:
        await page.wait_for_selector(work_link_selector, timeout=10000)
    except PlaywrightTimeoutError:
        print(f"  !! No work links found on page {page_no} (maybe empty/slow).")
        return set()

    hrefs = set()
    for anchor in await page.query_selector_all(work_link_selector):
        href = await anchor.get_attribute("href")
        if href and href.startswith("/en/work/"):
            hrefs.add(urljoin(BASE, href))

    print(f"  -> {len(hrefs)} work URLs found")
    return hrefs


In [11]:
# Cell 4b (Option A): Directly await
# Runs the listing crawl for START_PAGE..END_PAGE and stores the result in
# work_urls, which the detail-page loop below iterates over.
work_urls = await collect_work_urls_with_playwright_async(START_PAGE, END_PAGE)
# Cell output: number of collected URLs and the first five as a sample.
len(work_urls), work_urls[:5]


Error: BrowserType.launch: 
╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Please install them with the following command:      ║
║                                                      ║
║     sudo playwright install-deps                     ║
║                                                      ║
║ Alternatively, use apt:                              ║
║     sudo apt-get install libatk1.0-0t64\             ║
║         libatk-bridge2.0-0t64\                       ║
║         libatspi2.0-0t64\                            ║
║         libxcomposite1\                              ║
║         libxdamage1\                                 ║
║         libxfixes3\                                  ║
║         libxrandr2\                                  ║
║         libgbm1\                                     ║
║         libxkbcommon0\                               ║
║         libasound2t64                                ║
║                                                      ║
║ <3 Playwright Team                                   ║
╚══════════════════════════════════════════════════════╝

In [None]:
# Fetch every work detail page, parse it, and keep one record per page.
rows = []
for i, wurl in enumerate(work_urls, 1):
    try:
        r = req_sess.get(wurl, timeout=30)
        if r.status_code == 200:
            detail = parse_detail_page(r.text, wurl)
            rows.append(detail)
        else:
            print(f"  !! HTTP {r.status_code} {wurl}")
    except Exception as e:
        # Best-effort crawl: report the failure and move on to the next URL.
        print(f"  !! Error on {wurl}: {e}")
    # Polite pause after every attempt, whether it succeeded or not.
    human_delay()

# Fix the column order explicitly so the CSV layout does not depend on the records.
df = pd.DataFrame(
    rows,
    columns=[
        "detail_url",
        "detail_title",
        "kv_official",
        "key",
        "genre",
        "date",
        "description",
    ],
)
df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"Saved {len(df)} rows to {OUT_CSV}")
df.head(10)
