In [29]:
! pip install selenium
! pip install requests beautifulsoup4



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [31]:
#!/usr/bin/env python3
"""
dw_cards_download_selenium.py

Uses Selenium+headless Chrome (or Firefox) to fetch DW Learn German
“card‐style” vocabulary lessons (which are rendered by JS), scrape
German–English pairs + native MP3 audio URLs, download audio, and
write an Anki‐ready CSV.

After running, you will have:
  - dw_lessons_5_6.csv   (fields: German, English, Audio, Tags)
  - audio_dw/            (folder full of downloaded .mp3 files)

Then import dw_lessons_5_6.csv into Anki, and copy the MP3 files inside
audio_dw/ (the files themselves, not the folder) into Anki’s media folder,
so that the filenames in the “Audio” column match actual MP3s.
"""

import os
import time
import csv
import requests
from urllib.parse import urljoin
from pathlib import Path

# You can switch between Chrome and Firefox here.
# (Uncomment the one you want and comment out the other.)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
# from selenium.webdriver.firefox.options import Options as FirefoxOptions

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ----------------------------
# 1) Configuration & constants
# ----------------------------

# List the lessons you want to scrape:
LESSON_URLS = [
    "https://learngerman.dw.com/en/5-time-to-leave/l-56612333/lv",
    "https://learngerman.dw.com/en/6-barking-up-the-wrong-tree/l-56612338/lv",
]

# Anki tag written into the "Tags" column of every exported card.
ANKI_TAG = "lesson5_6"

# Destinations: the CSV that gets imported into Anki, and the folder that
# receives the downloaded MP3 files (created here if it does not exist yet).
OUTPUT_CSV = "dw_lessons_5_6.csv"
AUDIO_DIR = Path("audio_dw")
AUDIO_DIR.mkdir(exist_ok=True)

# Browser-like User-Agent string, sent along with the MP3 requests in case
# the audio host refuses requests without one.
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/114.0.0.0 Safari/537.36"
)
REQUESTS_HEADERS = {"User-Agent": _BROWSER_USER_AGENT}


# -----------------------------------
# 2) Setup Selenium (headless Chrome)
# -----------------------------------
def make_driver():
    """
    Build and return a headless Chrome WebDriver.

    The driver is configured with a fixed 1920x1080 window and a 30-second
    page-load timeout. Chrome is the default; the commented-out Firefox
    block that follows this function's `return` shows the alternative.
    """
    options = ChromeOptions()
    # Applied in this order: headless mode, GPU disabled (often recommended
    # for CI), sandbox disabled (Linux permission issues), fixed viewport.
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--window-size=1920,1080",
    ):
        options.add_argument(flag)

    # Relies on ChromeDriver being discoverable (e.g. on PATH); if it is not,
    # point webdriver.Chrome at the driver binary explicitly.
    browser = webdriver.Chrome(options=options)
    browser.set_page_load_timeout(30)
    return browser

    # === Firefox Alternative ===
    # ff_opts = FirefoxOptions()
    # ff_opts.add_argument("--headless")
    # driver = webdriver.Firefox(options=ff_opts)
    # driver.set_page_load_timeout(30)
    # return driver


# ----------------------------
# 3) Scrape each lesson page
# ----------------------------
def _parse_card(card, lesson_url):
    """
    Extract one (German, English, audio_url) triple from a card element.

    Returns None when the card has no <strong> German term or no English
    translation. `audio_url` is "" when the card has no <audio><source>
    element or its `src` attribute is missing/empty.

    Lookup failures are caught with `except Exception` (not a bare
    `except:`) so that KeyboardInterrupt / SystemExit still propagate and
    a long scrape can be interrupted.
    """
    # 1) German term in <strong>; a card without one is skipped entirely.
    try:
        german = card.find_element(By.TAG_NAME, "strong").text.strip()
    except Exception:
        return None

    # 2) English translation lives in <span class="s1lm6hur"><p>…</p></span>
    try:
        span = card.find_element(By.CSS_SELECTOR, "span.s1lm6hur")
        english = span.find_element(By.TAG_NAME, "p").text.strip()
    except Exception:
        english = ""

    # 3) Optional audio: <audio><source src="…"></audio>
    audio_url = ""
    try:
        source = card.find_element(By.CSS_SELECTOR, "audio source")
        # get_attribute may return None when the attribute is absent.
        raw = (source.get_attribute("src") or "").strip()
        if raw:
            # Resolve relative URLs against the lesson page.
            audio_url = urljoin(lesson_url, raw)
    except Exception:
        audio_url = ""

    if german and english:
        return (german, english, audio_url)
    return None


def scrape_lesson(driver, lesson_url):
    """
    Loads `lesson_url` in Selenium, waits for ".srr46ge" cards to load,
    then returns a list of triples: (German_text, English_text, audio_url).

    Parameters:
        driver: a Selenium WebDriver (as returned by make_driver()).
        lesson_url: URL of the lesson's vocabulary page.

    Returns:
        A list of (german, english, audio_url) tuples in page order. Cards
        missing either the German term or the English translation are
        dropped; `audio_url` is "" when a card has no audio. Returns []
        when no card appears within 20 seconds.

    Note: `driver.get` is not guarded here, so navigation errors propagate
    to the caller.
    """
    print(f"\n→ Loading page: {lesson_url}")
    driver.get(lesson_url)

    # The "card‐style" vocabulary is contained in <div class="srr46ge">…</div>.
    # We wait until at least one such div appears (timeout = 20s).
    try:
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.srr46ge"))
        )
    except Exception as e:
        print(f"   ⚠️ Timeout waiting for cards on {lesson_url}: {e}")
        return []

    # Once loaded, grab all card DIVs:
    card_elems = driver.find_elements(By.CSS_SELECTOR, "div.srr46ge")
    print(f"   • Found {len(card_elems)} 'srr46ge' elements (vocab cards).")

    triples = []  # to hold (German, English, audio_url)
    for card in card_elems:
        parsed = _parse_card(card, lesson_url)
        if parsed is not None:
            triples.append(parsed)

    return triples


# -----------------------------------
# 4) Download MP3 & Build CSV rows
# -----------------------------------
def download_mp3(mp3_url: str) -> str:
    """
    Download `mp3_url` into AUDIO_DIR, if not already present.

    The response body is streamed into a temporary "<name>.part" file that
    is moved into place only after the whole body has been written. An
    interrupted or failed download therefore never leaves a truncated MP3
    behind that a later run would report as "Already have".

    Parameters:
        mp3_url: absolute URL of the audio file; any query string is
            ignored when deriving the local filename.

    Returns:
        The local filename (basename only), or "" on failure or when no
        filename can be derived from the URL.
    """
    # Drop the query string, then keep the last path segment as filename.
    filename = os.path.basename(mp3_url.split("?")[0])
    if not filename:
        # e.g. a URL ending in "/": AUDIO_DIR / "" would be AUDIO_DIR itself.
        print(f"    ⚠️ Failed to download {mp3_url}: no filename in URL")
        return ""

    local_path = AUDIO_DIR / filename
    if local_path.exists():
        print(f"  • Already have: {filename}")
        return filename

    part_path = AUDIO_DIR / (filename + ".part")
    try:
        # Context manager releases the streamed connection even on error.
        with requests.get(
            mp3_url, headers=REQUESTS_HEADERS, stream=True, timeout=20
        ) as r:
            r.raise_for_status()
            with open(part_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=4096):
                    if chunk:
                        f.write(chunk)
        # Publish the finished file under its real name in one step.
        os.replace(part_path, local_path)
    except Exception as exc:
        # Best-effort removal of the incomplete temporary file.
        try:
            part_path.unlink()
        except OSError:
            pass
        print(f"    ⚠️ Failed to download {mp3_url}: {exc}")
        return ""

    print(f"  • Downloaded audio: {filename}")
    return filename


# ----------------------------
# 5) Main: orchestrate all of it
# ----------------------------
def main():
    """
    Scrape every URL in LESSON_URLS, download the audio, and write the CSV.

    Steps:
      1. Create the driver and scrape each lesson. The driver is always
         shut down afterwards, even if scraping raises, so no headless
         browser process is left running.
      2. Deduplicate entries by (German, English), keeping an audio URL if
         any occurrence of the pair has one.
      3. Download each audio file and write OUTPUT_CSV with the columns
         German, English, Audio, Tags.

    Returns None. Exceptions raised while scraping propagate to the caller
    after the driver has been quit.
    """
    driver = make_driver()
    all_entries = []
    try:
        for url in LESSON_URLS:
            all_entries.extend(scrape_lesson(driver, url))
    finally:
        # Always release the browser, including on scrape errors.
        driver.quit()

    if not all_entries:
        print("\n⚠️ No vocabulary entries were found. Exiting.")
        return

    # Deduplicate by (German, English), preferring any audio_url if available
    deduped = {}
    for german, english, audio_url in all_entries:
        key = (german.strip(), english.strip())
        if key not in deduped or (audio_url and not deduped[key]):
            deduped[key] = audio_url

    print(f"\n→ Total unique (German, English) pairs: {len(deduped)}")

    # Build rows for CSV
    # NOTE(review): the Audio column holds the bare filename. Anki normally
    # plays field audio written as "[sound:<filename>]" — confirm that the
    # target note type / import step handles bare filenames as intended.
    csv_rows = []
    for (german, english), raw_mp3_url in deduped.items():
        audio_fn = download_mp3(raw_mp3_url) if raw_mp3_url else ""
        csv_rows.append([german, english, audio_fn, ANKI_TAG])

    # Write CSV
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["German", "English", "Audio", "Tags"])
        writer.writerows(csv_rows)

    print(f"\n✅ CSV written to: {OUTPUT_CSV}")
    print(f"✅ Audio files saved to: {AUDIO_DIR.resolve()}")


if __name__ == "__main__":
    main()



→ Loading page: https://learngerman.dw.com/en/5-time-to-leave/l-56612333/lv
   • Found 38 'srr46ge' elements (vocab cards).

→ Loading page: https://learngerman.dw.com/en/6-barking-up-the-wrong-tree/l-56612338/lv
   • Found 70 'srr46ge' elements (vocab cards).

→ Total unique (German, English) pairs: 107
  • Downloaded audio: Harry-achtung-interjektion.mp3
  • Downloaded audio: Harry-Auf-Wiederhoeren-1.mp3
  • Downloaded audio: Harry-bahnhof.mp3
  • Downloaded audio: Harry-das.mp3
  • Downloaded audio: Harry-dein-2.mp3
  • Downloaded audio: Harry-Die-Rechnung-bitte.mp3
  • Downloaded audio: Harry-einhundert-1.mp3
  • Downloaded audio: Harry-euro-1.mp3
  • Downloaded audio: Harry-geld-1.mp3
  • Downloaded audio: Harry-haben-1.mp3
  • Downloaded audio: Harry-hier-das-ist.mp3
  • Downloaded audio: Harry-Ich-moechte-zahlen.mp3
  • Downloaded audio: Harry-Ich-moechte-zum-Bahnhof.mp3
  • Downloaded audio: Harry-Ich-spreche-kein-Deutsch-1.mp3
  • Downloaded audio: Harry-ihr-possessivpronomen

In [37]:
import pathlib


def strip_trailing_semicolons(src, dst):
    """
    Copy the UTF-8 text file `src` to `dst`, cleaning the end of each line.

    For every line, trailing whitespace is removed first and then any run
    of trailing semicolons; the cleaned line is written followed by "\n".

    Parameters:
        src: path of the file to read.
        dst: path of the file to write (overwritten if it exists).

    Returns:
        The number of lines from which at least one semicolon was removed.
    """
    changed = 0
    with open(src, "r", encoding="utf-8") as fin, \
         open(dst, "w", encoding="utf-8", newline="") as fout:
        for line in fin:
            stripped = line.rstrip()
            cleaned = stripped.rstrip(";")
            if cleaned != stripped:
                changed += 1
            fout.write(cleaned + "\n")
    return changed


# 1. Path of the CSV to clean. Defaults to the file the scraping cell writes
#    into the working directory; change it if your CSV lives elsewhere.
csv_path = pathlib.Path("dw_lessons_5_6.csv")
output_path = csv_path.with_name("dw_lessons_5_6_clean.csv")

# In a notebook kernel __name__ is "__main__", so the cell still runs the
# clean-up; the guard only keeps an import of this code free of file I/O.
if __name__ == "__main__":
    strip_trailing_semicolons(csv_path, output_path)
    print(f"Written cleaned CSV to {output_path}")




Written cleaned CSV to /Users/xinruyu/Desktop/Job/dw_lessons_5_6_clean.csv
