1 change: 1 addition & 0 deletions 2025/national-jukebox/.gitignore
@@ -0,0 +1 @@
data/*
73 changes: 73 additions & 0 deletions 2025/national-jukebox/download_first_page.py
@@ -0,0 +1,73 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import pathlib
import time

import pandas
import requests

import list_urls
import extract_item_info
import extract_mp3


DATA_DIR = pathlib.Path(__file__).parent / "data"
DATA_DIR.mkdir(exist_ok=True)  # data/ is gitignored, so make sure it exists before writing.


target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)


def download_and_extract_item(base_url):
print(f"Fetching content from: {base_url}")
# https://guides.loc.gov/digital-scholarship/faq
# Stay within 20 requests per minute rate limit.
time.sleep(3)
response = requests.get(base_url)

try:
response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
except requests.exceptions.RequestException as e:
print(f"Error fetching URL: {e}")
return None

item = extract_item_info.extract_subheadings_to_dict(response.text)
mp3_url = extract_mp3.extract_mp3_url(response.text)
item["MP3 URL"] = mp3_url
item["URL"] = base_url
return item


visited_urls = set()
jukebox_path = DATA_DIR / "jukebox.jsonl"

if jukebox_path.exists():
    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
    # Skip items that were already downloaded on a previous run.
    visited_urls = set(jukebox["URL"].to_list()) if "URL" in jukebox.columns else set()


with open(jukebox_path, "a") as data_file:
for item_url in item_urls:
if item_url in visited_urls:
continue

item = download_and_extract_item(item_url)
if item is None:
continue

json.dump(item, data_file, indent=None)
data_file.write("\n")
data_file.flush()
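Note: each record is appended as a single line of JSON (`indent=None` keeps the object on one line), which is what lets the script resume by comparing URLs on the next run. A minimal sketch of reading the file back without pandas, assuming the default data/ location:

import json
import pathlib

data_path = pathlib.Path("2025/national-jukebox/data/jukebox.jsonl")

# One JSON object per line, so the file can be scanned lazily.
with open(data_path) as f:
    for line in f:
        item = json.loads(line)
        print(item["URL"], item.get("MP3 URL"))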
54 changes: 54 additions & 0 deletions 2025/national-jukebox/download_mp3s.py
@@ -0,0 +1,54 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pathlib
import time

import pandas
import requests


DATA_DIR = pathlib.Path(__file__).parent / "data"



def download_mp3(base_url):
print(f"Fetching content from: {base_url}")
# https://guides.loc.gov/digital-scholarship/faq
# Stay within 20 requests per minute rate limit.
time.sleep(3)
response = requests.get(base_url)

try:
response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
except requests.exceptions.RequestException as e:
print(f"Error fetching URL: {e}")
return None

return response.content


jukebox_path = DATA_DIR / "jukebox.jsonl"
jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")

for _, row in jukebox.iterrows():
jukebox_id = row["URL"].split("/")[-2]
mp3_path = (DATA_DIR / jukebox_id).with_suffix(".mp3")
if mp3_path.exists():
continue

    mp3_bytes = download_mp3(row["MP3 URL"])
    if mp3_bytes is None:
        continue

    with open(mp3_path, "wb") as mp3_file:
        mp3_file.write(mp3_bytes)
print(f"Wrote {mp3_path}")
110 changes: 110 additions & 0 deletions 2025/national-jukebox/extract_item_info.py
@@ -0,0 +1,110 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from bs4 import BeautifulSoup
import requests
import json

def extract_subheadings_to_dict(html_content):
    """
    Extracts subheadings from the "About this item" section of HTML
    and returns them as a dictionary.

    Args:
        html_content (str): The HTML content as a string.

    Returns:
        dict: A dictionary where each subheading is a key and its value is a
            list of items under that subheading. Returns an empty dictionary
            if the section is not found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    about_this_item_section = soup.find('div', id='about-this-item')

    if not about_this_item_section:
        return {}

subheadings_data = {}

# Find the div that contains the actual cataloged data
item_cataloged_data = about_this_item_section.find('div', class_='item-cataloged-data')

if item_cataloged_data:
# Iterate through each subheading (h3) within this div
for h3_tag in item_cataloged_data.find_all('h3'):
subheading_text = h3_tag.get_text(strip=True)
items = []
# The items for each subheading are in the immediately following <ul>
ul_tag = h3_tag.find_next_sibling('ul')
if ul_tag:
for li_tag in ul_tag.find_all('li'):
# Get text from list items, handling potential nested structures or links
item_text = li_tag.get_text(strip=True)
items.append(item_text)
subheadings_data[subheading_text] = items

# Extract "Part of" section as it's outside item-cataloged-data but still a subheading
part_of_section = about_this_item_section.find('div', id='part-of')
if part_of_section:
h3_tag = part_of_section.find('h3')
if h3_tag:
subheading_text = h3_tag.get_text(strip=True)
items = []
ul_tag = h3_tag.find_next_sibling('ul')
if ul_tag:
for li_tag in ul_tag.find_all('li'):
item_text = li_tag.get_text(strip=True)
# Remove the count in parentheses if present, e.g., "(10,009)"
if '(' in item_text and item_text.endswith(')'):
item_text = item_text.rsplit('(', 1)[0].strip()
items.append(item_text)
subheadings_data[subheading_text] = items

# Extract IIIF Presentation Manifest
iiif_manifest_section = about_this_item_section.find('h3', id='item-iiif-presentation-manifest')
if iiif_manifest_section:
subheading_text = iiif_manifest_section.get_text(strip=True)
items = []
ul_tag = iiif_manifest_section.find_next_sibling('ul')
if ul_tag:
for li_tag in ul_tag.find_all('li'):
item_text = li_tag.get_text(strip=True)
items.append(item_text)
subheadings_data[subheading_text] = items

return subheadings_data


def download_and_extract(base_url):
print(f"Fetching content from: {base_url}")
try:
response = requests.get(base_url)
response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
except requests.exceptions.RequestException as e:
print(f"Error fetching URL: {e}")
return None

item = extract_subheadings_to_dict(response.text)
item["URL"] = base_url
return item

if __name__ == "__main__":
    target_url = "https://www.loc.gov/item/jukebox-679643/"
    item = download_and_extract(target_url)
    if item:
        print("\nExtracted item info:")
        print(json.dumps(item, indent=4))
    else:
        print("No item info extracted or an error occurred.")
60 changes: 60 additions & 0 deletions 2025/national-jukebox/extract_mp3.py
@@ -0,0 +1,60 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from bs4 import BeautifulSoup
import requests

def extract_mp3_url(html_content):
"""
Extracts the MP3 download URL from the given HTML content.

Args:
html_content (str): The HTML content of the webpage.

Returns:
str or None: The MP3 download URL if found, otherwise None.
"""
soup = BeautifulSoup(html_content, 'html.parser')

# Find the select element that contains download options
# Based on the HTML, it has an ID of 'select-resource0'
download_select = soup.find('select', id='select-resource0')

if download_select:
# Find the option tag specifically for AUDIO download (MP3)
# It has a data-file-download attribute set to "AUDIO"
mp3_option = download_select.find('option', attrs={'data-file-download': 'AUDIO'})
if mp3_option:
return mp3_option['value'] # Return the value attribute which is the URL
return None # Return None if the select or option is not found

# Example Usage (assuming you've fetched the HTML using requests)
if __name__ == "__main__":
url = "https://www.loc.gov/item/jukebox-679643/"
try:
response = requests.get(url)
response.raise_for_status() # Raise an exception for HTTP errors
html_doc = response.text

mp3_url = extract_mp3_url(html_doc)

if mp3_url:
print(f"Extracted MP3 URL: {mp3_url}")
else:
print("MP3 URL not found in the HTML.")

except requests.exceptions.RequestException as e:
print(f"Error fetching the URL: {e}")

77 changes: 77 additions & 0 deletions 2025/national-jukebox/list_urls.py
@@ -0,0 +1,77 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_national_jukebox_song_detail_urls(base_url: str) -> list[str]:
"""
Scrapes the National Jukebox collection page to extract URLs for individual song detail pages.

Args:
base_url: The URL of the main collection page (e.g., "https://www.loc.gov/collections/national-jukebox/?sb=date_desc").

Returns:
A list of URLs for the song detail pages.
"""
print(f"Fetching content from: {base_url}")
try:
response = requests.get(base_url)
response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
except requests.exceptions.RequestException as e:
print(f"Error fetching URL: {e}")
return []

soup = BeautifulSoup(response.text, 'html.parser')
detail_urls = []

    # Song detail pages on loc.gov use URLs of the form /item/jukebox-{id}/.
    # Collect every <a> tag whose href matches that pattern and resolve it
    # against the collection page URL to get an absolute link, skipping
    # duplicates along the way.
    for link in soup.find_all('a', href=True):
        href = link['href']
if '/item/jukebox' in href and not href.startswith('#'):
full_url = urljoin(base_url, href)
# Avoid adding duplicates if the same item link appears multiple times
if full_url not in detail_urls:
detail_urls.append(full_url)

return detail_urls

if __name__ == "__main__":
target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
song_urls = get_national_jukebox_song_detail_urls(target_url)

if song_urls:
print("\nFound song detail page URLs:")
for url in song_urls:
print(url)
print(f"\nTotal URLs found: {len(song_urls)}")
else:
print("No song detail URLs found or an error occurred.")