# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Download item metadata for the National Jukebox URLs listed by target_url.

Each item page is fetched once and appended as one JSON object per line to
``data/jukebox.jsonl``. URLs already present in that file are skipped, so the
script can be re-run to resume an interrupted crawl.
"""

import json
import pathlib
import time

import pandas
import requests

import list_urls
import extract_item_info
import extract_mp3


DATA_DIR = pathlib.Path(__file__).parent / "data"

# https://guides.loc.gov/digital-scholarship/faq
# Stay within 20 requests per minute rate limit.
REQUEST_DELAY_SECONDS = 3

# Without a timeout, requests waits indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 60


target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)


def download_and_extract_item(base_url):
    """Fetch one item page and return its metadata as a dict.

    Args:
        base_url: URL of the item detail page to download.

    Returns:
        A dict of the fields extracted from the page, plus the keys
        "MP3 URL" and "URL" (set to ``base_url``). Returns None when the
        request fails, so the caller can skip the item and carry on.
    """
    print(f"Fetching content from: {base_url}")
    time.sleep(REQUEST_DELAY_SECONDS)

    try:
        # The request itself is inside the try block: connection errors and
        # timeouts are RequestException subclasses, just like HTTP errors.
        response = requests.get(base_url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    item = extract_item_info.extract_subheadings_to_dict(response.text)
    if isinstance(item, str):
        # extract_subheadings_to_dict returns json.dumps({}) when the
        # "about this item" section is missing; decode it so that the key
        # assignments below work on a dict rather than failing on a str.
        item = json.loads(item)

    item["MP3 URL"] = extract_mp3.extract_mp3_url(response.text)
    item["URL"] = base_url
    return item


jukebox_path = DATA_DIR / "jukebox.jsonl"
visited_urls = frozenset()

# An empty file (e.g. left behind by an interrupted first run) has no records
# to load, so it is treated the same as a missing file.
if jukebox_path.exists() and jukebox_path.stat().st_size > 0:
    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
    if "URL" in jukebox.columns:
        visited_urls = frozenset(jukebox["URL"].to_list())

# data/ is git-ignored, so it does not exist in a fresh checkout.
DATA_DIR.mkdir(parents=True, exist_ok=True)

with open(jukebox_path, "a", encoding="utf-8") as data_file:
    for item_url in item_urls:
        if item_url in visited_urls:
            continue

        item = download_and_extract_item(item_url)
        if item is None:
            continue

        json.dump(item, data_file, indent=None)
        data_file.write("\n")
        # Flush after every record so an interrupted run keeps its progress.
        data_file.flush()
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Download the MP3 for every record in ``data/jukebox.jsonl``.

Each file is saved as ``data/<item id>.mp3``, where the item id is the last
path segment of the record's "URL". Items whose MP3 already exists are
skipped, so the script can be re-run to resume an interrupted download.
"""

import pathlib
import time

import pandas
import requests


DATA_DIR = pathlib.Path(__file__).parent / "data"

# https://guides.loc.gov/digital-scholarship/faq
# Stay within 20 requests per minute rate limit.
REQUEST_DELAY_SECONDS = 3

# Without a timeout, requests waits indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 60


def download_mp3(base_url):
    """Download ``base_url`` and return the response body as bytes.

    Args:
        base_url: URL of the MP3 file to download.

    Returns:
        The raw response content, or None when the request fails.
    """
    print(f"Fetching content from: {base_url}")
    time.sleep(REQUEST_DELAY_SECONDS)

    try:
        # The request itself is inside the try block: connection errors and
        # timeouts are RequestException subclasses, just like HTTP errors.
        response = requests.get(base_url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    return response.content


jukebox_path = DATA_DIR / "jukebox.jsonl"
jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")

for _, row in jukebox.iterrows():
    # Take the last path segment whether or not the URL ends with a slash,
    # e.g. "https://www.loc.gov/item/jukebox-9538/" -> "jukebox-9538".
    jukebox_id = row["URL"].rstrip("/").rsplit("/", 1)[-1]
    mp3_path = DATA_DIR / f"{jukebox_id}.mp3"

    # A zero-byte file is not a finished download, so fetch it again.
    if mp3_path.exists() and mp3_path.stat().st_size > 0:
        continue

    mp3_url = row["MP3 URL"]
    if not isinstance(mp3_url, str) or not mp3_url:
        # Missing values load as None/NaN; there is nothing to fetch.
        print(f"No MP3 URL for {row['URL']}; skipping")
        continue

    mp3_bytes = download_mp3(mp3_url)
    if mp3_bytes is None:
        # The download failed; leave no file behind so a later run retries.
        continue

    # Write to a temporary name and rename, so an interrupted write never
    # leaves a truncated file under the final name.
    partial_path = mp3_path.with_name(mp3_path.name + ".part")
    partial_path.write_bytes(mp3_bytes)
    partial_path.replace(mp3_path)
    print(f"Wrote {mp3_path}")
+ +from bs4 import BeautifulSoup +import requests +import json + +def extract_subheadings_to_dict(html_content): + """ + Extracts subheadings from the "About this item" section of HTML + and returns them as a JSON object. + + Args: + html_content (str): The HTML content as a string. + + Returns: + str: A JSON string where each subheading is a key, and its corresponding + value is a list of items under that subheading. + Returns an empty JSON object string if the section is not found. + """ + soup = BeautifulSoup(html_content, 'html.parser') + about_this_item_section = soup.find('div', id='about-this-item') + + if not about_this_item_section: + return json.dumps({}) + + subheadings_data = {} + + # Find the div that contains the actual cataloged data + item_cataloged_data = about_this_item_section.find('div', class_='item-cataloged-data') + + if item_cataloged_data: + # Iterate through each subheading (h3) within this div + for h3_tag in item_cataloged_data.find_all('h3'): + subheading_text = h3_tag.get_text(strip=True) + items = [] + # The items for each subheading are in the immediately following
| \n", + " | URL | \n", + "Recording Repository | \n", + "Recording Label | \n", + "Recording Take Number | \n", + "Recording Date | \n", + "Part of | \n", + "Names | \n", + "Recording Matrix Number | \n", + "Recording Catalog Number | \n", + "Media Size | \n", + "MP3 URL | \n", + "IIIF Presentation Manifest | \n", + "Genre | \n", + "Other Title | \n", + "Recording Location | \n", + "Online Format | \n", + "Summary | \n", + "Rights Advisory | \n", + "Title | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 69 | \n", + "https://www.loc.gov/item/jukebox-9538/ | \n", + "['Source of original recording: Department of ... | \n", + "['Victor'] | \n", + "['1'] | \n", + "['1925-04-30'] | \n", + "['Department of Special Collections, Davidson ... | \n", + "['Collazo, Ramón -- Composer -- Composer'\n", + " 'Ca... | \n", + "['BA-591 (Matrix ID)'] | \n", + "['79553'] | \n", + "['10-in.'] | \n", + "https://tile.loc.gov/streaming-services/iiif/s... | \n", + "['Manifest (JSON/LD)'] | \n", + "['Spanish (Argentina)' 'Ethnic music' 'Spanish... | \n", + "['Tango'] | \n", + "['Buenos Aires, Argentina [unconfirmed]'] | \n", + "['audio' 'image'] | \n", + "['Instrumental ensemble'] | \n", + "['Inclusion of the recording in the National J... | \n", + "['Volveras'] | \n", + "
| 50 | \n", + "https://www.loc.gov/item/jukebox-767050/ | \n", + "['Source of original recording: Department of ... | \n", + "['Columbia'] | \n", + "['2'] | \n", + "['1925-12-31'] | \n", + "['Department of Special Collections, Davidson ... | \n", + "['Foster, Stephen Collins -- Composer -- Compo... | \n", + "['90141 (Matrix ID)'] | \n", + "['A3109'] | \n", + "['10-in.'] | \n", + "https://tile.loc.gov/streaming-services/iiif/s... | \n", + "['Manifest (JSON/LD)'] | \n", + "[] | \n", + "['My old Kentucky home'] | \n", + "[] | \n", + "['audio' 'image'] | \n", + "['Band'] | \n", + "['Inclusion of the recording in the National J... | \n", + "['Battle hymn of the Republic'] | \n", + "
| 72 | \n", + "https://www.loc.gov/item/jukebox-744502/ | \n", + "['Source of original recording: Department of ... | \n", + "['Columbia'] | \n", + "['2'] | \n", + "['1925-03-30'] | \n", + "['Department of Special Collections, Davidson ... | \n", + "['Leal, Gustavo -- Vocalist -- Tenor Vocal'] | \n", + "['105566 (Matrix ID)'] | \n", + "['1007-X'] | \n", + "['10-in.'] | \n", + "https://tile.loc.gov/streaming-services/iiif/s... | \n", + "['Manifest (JSON/LD)'] | \n", + "['Portuguese'] | \n", + "['Cancao popular'] | \n", + "[] | \n", + "['audio' 'image'] | \n", + "['Male vocal solo, with orchestra'] | \n", + "['Inclusion of the recording in the National J... | \n", + "['Fado do padeiro'] | \n", + "
| 89 | \n", + "https://www.loc.gov/item/jukebox-9648/ | \n", + "['Source of original recording: Department of ... | \n", + "['Victor'] | \n", + "['2'] | \n", + "['1925-11-09'] | \n", + "['Department of Special Collections, Davidson ... | \n", + "['Orquesta Típica Victor -- Musical Group -- M... | \n", + "['BA-703 (Matrix ID)'] | \n", + "['79608'] | \n", + "['10-in.'] | \n", + "https://tile.loc.gov/streaming-services/iiif/s... | \n", + "['Manifest (JSON/LD)'] | \n", + "['Spanish (Argentina)' 'Ethnic music' 'Spanish... | \n", + "['Tango'] | \n", + "['Buenos Aires, Argentina [unconfirmed]'] | \n", + "['audio' 'image'] | \n", + "['Instrumental ensemble'] | \n", + "['Inclusion of the recording in the National J... | \n", + "['Sarandi'] | \n", + "
| 48 | \n", + "https://www.loc.gov/item/jukebox-767143/ | \n", + "['Source of original recording: Department of ... | \n", + "['Columbia'] | \n", + "['2'] | \n", + "['1925-12-31'] | \n", + "['Department of Special Collections, Davidson ... | \n", + "['Columbia Band -- Musical Group -- Musical Gr... | \n", + "['90134 (Matrix ID)'] | \n", + "['A3114'] | \n", + "['10-in.'] | \n", + "https://tile.loc.gov/streaming-services/iiif/s... | \n", + "['Manifest (JSON/LD)'] | \n", + "['Educational'] | \n", + "['Come, Thou almighty King'] | \n", + "[] | \n", + "['audio' 'image'] | \n", + "['Band'] | \n", + "['Inclusion of the recording in the National J... | \n", + "[\"Love's old sweet song\"] | \n", + "
| \n", + " | URL | \n", + "Recording Repository | \n", + "Recording Label | \n", + "Recording Take Number | \n", + "Recording Date | \n", + "Part of | \n", + "Names | \n", + "Recording Matrix Number | \n", + "Recording Catalog Number | \n", + "Media Size | \n", + "MP3 URL | \n", + "IIIF Presentation Manifest | \n", + "Genre | \n", + "Other Title | \n", + "Recording Location | \n", + "Online Format | \n", + "Summary | \n", + "Rights Advisory | \n", + "Title | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20 | \n", + "https://www.loc.gov/item/jukebox-73006/ | \n", + "Source of original recording: Department of Sp... | \n", + "Victor | \n", + "2 | \n", + "1925-03-02 | \n", + "['Department of Special Collections, Davidson ... | \n", + "['Medoff, David -- Vocalist -- Tenor Vocal'\n", + " '... | \n", + "B-32034 (Matrix ID) | \n", + "77981 | \n", + "10-in. | \n", + "https://tile.loc.gov/streaming-services/iiif/s... | \n", + "['Manifest (JSON/LD)'] | \n", + "['Russian' 'Ethnic music' 'Russian'] | \n", + "['Zaitshek' \"The rabbit's footsteps\" 'Chiberia... | \n", + "New York, New York | \n", + "['audio' 'image'] | \n", + "Male vocal solo, with orchestra | \n", + "Inclusion of the recording in the National Juk... | \n", + "Zaichik | \n", + "
| 4 | \n", + "https://www.loc.gov/item/jukebox-675580/ | \n", + "Source of original recording: Department of Sp... | \n", + "Columbia | \n", + "1 | \n", + "1925-02-27 | \n", + "['Department of Special Collections, Davidson ... | \n", + "['Cavaliers, The -- Musical Group -- Musical G... | \n", + "140403 (Matrix ID) | \n", + "331-D | \n", + "10-in. | \n", + "https://tile.loc.gov/streaming-services/iiif/s... | \n", + "['Manifest (JSON/LD)'] | \n", + "[] | \n", + "[] | \n", + "New York, New York | \n", + "['audio' 'image'] | \n", + "Jazz/dance band | \n", + "Inclusion of the recording in the National Juk... | \n", + "The midnight waltz | \n", + "
| 6 | \n", + "https://www.loc.gov/item/jukebox-9599/ | \n", + "Source of original recording: Department of Sp... | \n", + "Victor | \n", + "1 | \n", + "1925-09-02 | \n", + "['Department of Special Collections, Davidson ... | \n", + "['Piana, Sebastián -- Composer -- Composer'\n", + " '... | \n", + "BA-652 (Matrix ID) | \n", + "79584 | \n", + "10-in. | \n", + "https://tile.loc.gov/streaming-services/iiif/s... | \n", + "['Manifest (JSON/LD)'] | \n", + "['Spanish (Argentina)' 'Ethnic music' 'Spanish... | \n", + "[] | \n", + "Buenos Aires, Argentina [unconfirmed] | \n", + "['audio' 'image'] | \n", + "Instrumental ensemble | \n", + "Inclusion of the recording in the National Juk... | \n", + "Sobre el pucho | \n", + "
| 12 | \n", + "https://www.loc.gov/item/jukebox-675604/ | \n", + "Source of original recording: Department of Sp... | \n", + "Columbia | \n", + "5 | \n", + "1925-03-21 | \n", + "['Department of Special Collections, Davidson ... | \n", + "['Mandoliers, The -- Musical Group -- Musical ... | \n", + "W140409 (Matrix ID) | \n", + "345-D | \n", + "10-in. | \n", + "https://tile.loc.gov/streaming-services/iiif/s... | \n", + "['Manifest (JSON/LD)'] | \n", + "[] | \n", + "['Waltz'] | \n", + "New York, New York | \n", + "['audio' 'image'] | \n", + "Jazz/dance band, with male vocal solo | \n", + "Inclusion of the recording in the National Juk... | \n", + "Only a weaver of dreams | \n", + "
| 63 | \n", + "https://www.loc.gov/item/jukebox-9670/ | \n", + "Source of original recording: Department of Sp... | \n", + "Victor | \n", + "1 | \n", + "1925-10-31 | \n", + "['Department of Special Collections, Davidson ... | \n", + "['Laina, José -- Composer -- Composer'\n", + " 'Orque... | \n", + "BA-725 (Matrix ID) | \n", + "79618 | \n", + "10-in. | \n", + "https://tile.loc.gov/streaming-services/iiif/s... | \n", + "['Manifest (JSON/LD)'] | \n", + "['Spanish (Argentina)' 'Ethnic music' 'Spanish... | \n", + "['Tango'] | \n", + "Buenos Aires, Argentina [unconfirmed] | \n", + "['audio' 'image'] | \n", + "Instrumental ensemble | \n", + "Inclusion of the recording in the National Juk... | \n", + "Y te deje llorando | \n", + "