From 46d4c8d48de3d730fbd1a6cc738275c3e1208389 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?=
Date: Tue, 8 Jul 2025 16:45:01 -0700
Subject: [PATCH 1/3] demo to transcribe the first 100 songs in the national
 jukebox

---
 2025/national-jukebox/.gitignore             |  1 +
 2025/national-jukebox/download_first_page.py | 49 ++++++++++
 2025/national-jukebox/extract_item_info.py   | 99 ++++++++++++++++++++
 2025/national-jukebox/extract_mp3.py         | 46 ++++++++++
 2025/national-jukebox/list_urls.py           | 63 +++++++++++++
 5 files changed, 258 insertions(+)
 create mode 100644 2025/national-jukebox/.gitignore
 create mode 100644 2025/national-jukebox/download_first_page.py
 create mode 100644 2025/national-jukebox/extract_item_info.py
 create mode 100644 2025/national-jukebox/extract_mp3.py
 create mode 100644 2025/national-jukebox/list_urls.py

diff --git a/2025/national-jukebox/.gitignore b/2025/national-jukebox/.gitignore
new file mode 100644
index 0000000..07f43b8
--- /dev/null
+++ b/2025/national-jukebox/.gitignore
@@ -0,0 +1 @@
+data/*
\ No newline at end of file
diff --git a/2025/national-jukebox/download_first_page.py b/2025/national-jukebox/download_first_page.py
new file mode 100644
index 0000000..878c061
--- /dev/null
+++ b/2025/national-jukebox/download_first_page.py
@@ -0,0 +1,49 @@
+import json
+import pathlib
+import requests
+import time
+
+import list_urls
+import extract_item_info
+import extract_mp3
+
+
+DATA_DIR = pathlib.Path(__file__).parent / "data"
+DATA_DIR.mkdir(parents=True, exist_ok=True)  # Ensure the output directory exists.
+
+
+target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
+item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)
+
+
+def download_and_extract_item(base_url):
+    print(f"Fetching content from: {base_url}")
+    # https://guides.loc.gov/digital-scholarship/faq
+    # Stay within the 20 requests per minute rate limit.
+    time.sleep(3)
+    response = requests.get(base_url)
+    while response.status_code == 429:
+        print("Too many requests, sleeping")
+        time.sleep(10)
+        response = requests.get(base_url)
+
+    try:
+        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching URL: {e}")
+        return None
+
+    item = extract_item_info.extract_subheadings_to_dict(response.text)
+    mp3_url = extract_mp3.extract_mp3_url(response.text)
+    item["MP3 URL"] = mp3_url
+    return item
+
+
+with open(DATA_DIR / "jukebox.jsonl", "w") as data_file:
+    for item_url in item_urls:
+        item = download_and_extract_item(item_url)
+        if item is None:
+            continue  # Skip items that failed to download.
+        json.dump(item, data_file, indent=None)
+        data_file.write("\n")
+        data_file.flush()
diff --git a/2025/national-jukebox/extract_item_info.py b/2025/national-jukebox/extract_item_info.py
new file mode 100644
index 0000000..5644458
--- /dev/null
+++ b/2025/national-jukebox/extract_item_info.py
@@ -0,0 +1,99 @@
+from bs4 import BeautifulSoup
+import requests
+import json
+
+def extract_subheadings_to_dict(html_content):
+    """
+    Extracts subheadings from the "About this item" section of HTML
+    and returns them as a dictionary.
+
+    Args:
+        html_content (str): The HTML content as a string.
+
+    Returns:
+        dict: A dictionary where each subheading is a key and its corresponding
+              value is a list of items under that subheading.
+              Returns an empty dict if the section is not found.
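+
+    Example (illustrative doctest; a page without an "About this item"
+    section yields an empty dict):
+        >>> extract_subheadings_to_dict("<html><body></body></html>")
+        {}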
+ """ + soup = BeautifulSoup(html_content, 'html.parser') + about_this_item_section = soup.find('div', id='about-this-item') + + if not about_this_item_section: + return json.dumps({}) + + subheadings_data = {} + + # Find the div that contains the actual cataloged data + item_cataloged_data = about_this_item_section.find('div', class_='item-cataloged-data') + + if item_cataloged_data: + # Iterate through each subheading (h3) within this div + for h3_tag in item_cataloged_data.find_all('h3'): + subheading_text = h3_tag.get_text(strip=True) + items = [] + # The items for each subheading are in the immediately following