diff --git a/2025/national-jukebox/.gitignore b/2025/national-jukebox/.gitignore
new file mode 100644
index 0000000..07f43b8
--- /dev/null
+++ b/2025/national-jukebox/.gitignore
@@ -0,0 +1 @@
+data/*
\ No newline at end of file
diff --git a/2025/national-jukebox/download_first_page.py b/2025/national-jukebox/download_first_page.py
new file mode 100644
index 0000000..9a984e2
--- /dev/null
+++ b/2025/national-jukebox/download_first_page.py
@@ -0,0 +1,75 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import pathlib
+import time
+
+import pandas
+import requests
+
+import list_urls
+import extract_item_info
+import extract_mp3
+
+
+DATA_DIR = pathlib.Path(__file__).parent / "data"
+DATA_DIR.mkdir(exist_ok=True)  # data/* is gitignored, so the directory may not exist yet.
+
+
+target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
+item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)
+
+
+def download_and_extract_item(base_url):
+    print(f"Fetching content from: {base_url}")
+    # https://guides.loc.gov/digital-scholarship/faq
+    # Stay within the 20 requests per minute rate limit.
+    time.sleep(3)
+    response = requests.get(base_url)
+
+    try:
+        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching URL: {e}")
+        return None
+
+    item = extract_item_info.extract_subheadings_to_dict(response.text)
+    mp3_url = extract_mp3.extract_mp3_url(response.text)
+    item["MP3 URL"] = mp3_url
+    item["URL"] = base_url
+    return item
+
+
+visited_urls = frozenset()
+jukebox_path = DATA_DIR / "jukebox.jsonl"
+
+if jukebox_path.exists():
+    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
+    if "URL" in jukebox.columns:
+        visited_urls = frozenset(jukebox["URL"].to_list())
+
+
+with open(jukebox_path, "a") as data_file:
+    for item_url in item_urls:
+        if item_url in visited_urls:
+            continue
+
+        item = download_and_extract_item(item_url)
+        if item is None:
+            continue
+
+        json.dump(item, data_file)
+        data_file.write("\n")
+        data_file.flush()
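Note: download_first_page.py imports two helper modules, list_urls and extract_mp3, that are not part of this diff. For reference, a minimal sketch of what they might look like, using the same requests/BeautifulSoup stack as extract_item_info.py below. The function names come from the imports above, but the link selectors are assumptions about the loc.gov markup, not the actual implementation:

# list_urls.py (hypothetical sketch)
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def get_national_jukebox_song_detail_urls(search_url):
    """Collects item detail-page URLs from a collection search results page."""
    response = requests.get(search_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    urls = []
    # Assumption: result links point at /item/... detail pages.
    for anchor in soup.find_all("a", href=True):
        href = urljoin(search_url, anchor["href"])
        if "/item/" in href and href not in urls:
            urls.append(href)
    return urls


# extract_mp3.py (hypothetical sketch)
def extract_mp3_url(html_content):
    """Returns the first .mp3 link found in an item detail page, or None."""
    soup = BeautifulSoup(html_content, "html.parser")
    for anchor in soup.find_all("a", href=True):
        if anchor["href"].endswith(".mp3"):
            return anchor["href"]
    return None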
diff --git a/2025/national-jukebox/download_mp3s.py b/2025/national-jukebox/download_mp3s.py
new file mode 100644
index 0000000..b4486f0
--- /dev/null
+++ b/2025/national-jukebox/download_mp3s.py
@@ -0,0 +1,56 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pathlib
+import time
+
+import pandas
+import requests
+
+
+DATA_DIR = pathlib.Path(__file__).parent / "data"
+
+
+def download_mp3(base_url):
+    print(f"Fetching content from: {base_url}")
+    # https://guides.loc.gov/digital-scholarship/faq
+    # Stay within the 20 requests per minute rate limit.
+    time.sleep(3)
+    response = requests.get(base_url)
+
+    try:
+        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching URL: {e}")
+        return None
+
+    return response.content
+
+
+jukebox_path = DATA_DIR / "jukebox.jsonl"
+jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
+
+for _, row in jukebox.iterrows():
+    jukebox_id = row["URL"].split("/")[-2]
+    mp3_path = (DATA_DIR / jukebox_id).with_suffix(".mp3")
+    if mp3_path.exists():
+        continue
+
+    mp3_bytes = download_mp3(row["MP3 URL"])
+    if mp3_bytes is None:
+        continue
+
+    with open(mp3_path, "wb") as mp3_file:
+        mp3_file.write(mp3_bytes)
+    print(f"Wrote {mp3_path}")
diff --git a/2025/national-jukebox/extract_item_info.py b/2025/national-jukebox/extract_item_info.py
new file mode 100644
index 0000000..a878c13
--- /dev/null
+++ b/2025/national-jukebox/extract_item_info.py
@@ -0,0 +1,110 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from bs4 import BeautifulSoup
+import requests
+import json
+
+def extract_subheadings_to_dict(html_content):
+    """
+    Extracts subheadings from the "About this item" section of HTML
+    and returns them as a dictionary.
+
+    Args:
+        html_content (str): The HTML content as a string.
+
+    Returns:
+        dict: A dictionary where each subheading is a key, and its
+              corresponding value is a list of items under that subheading.
+              Returns an empty dictionary if the section is not found.
+    """
+    soup = BeautifulSoup(html_content, 'html.parser')
+    about_this_item_section = soup.find('div', id='about-this-item')
+
+    if not about_this_item_section:
+        return {}
+
+    subheadings_data = {}
+
+    # Find the div that contains the actual cataloged data
+    item_cataloged_data = about_this_item_section.find('div', class_='item-cataloged-data')
+
+    if item_cataloged_data:
+        # Iterate through each subheading (h3) within this div
+        for h3_tag in item_cataloged_data.find_all('h3'):
+            subheading_text = h3_tag.get_text(strip=True)
+            items = []
+            # The items for each subheading are in the immediately following
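The extract_item_info.py hunk is truncated above, but the final comment suggests the function collects each subheading's items from the element(s) immediately following the h3. A self-contained sketch of that BeautifulSoup pattern, assuming the items live in a ul directly after each heading (the real loc.gov markup may differ):

from bs4 import BeautifulSoup

SAMPLE_HTML = """
<div id="about-this-item"><div class="item-cataloged-data">
  <h3>Contributor Names</h3><ul><li>Billy Murray</li></ul>
  <h3>Genre</h3><ul><li>Popular music</li></ul>
</div></div>
"""

soup = BeautifulSoup(SAMPLE_HTML, "html.parser")
subheadings = {}
for h3_tag in soup.find_all("h3"):
    # Assumption: a ul immediately follows each h3 and holds that
    # subheading's items; the truncated code may walk different siblings.
    items_list = h3_tag.find_next_sibling("ul")
    items = [li.get_text(strip=True) for li in items_list.find_all("li")] if items_list else []
    subheadings[h3_tag.get_text(strip=True)] = items

print(subheadings)
# {'Contributor Names': ['Billy Murray'], 'Genre': ['Popular music']}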