From 46d4c8d48de3d730fbd1a6cc738275c3e1208389 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?=
Date: Tue, 8 Jul 2025 16:45:01 -0700
Subject: [PATCH 1/3] demo to transcribe the first 100 songs in the national
 jukebox

---
 2025/national-jukebox/.gitignore             |  1 +
 2025/national-jukebox/download_first_page.py | 49 ++++++++++
 2025/national-jukebox/extract_item_info.py   | 99 ++++++++++++++++++++
 2025/national-jukebox/extract_mp3.py         | 46 ++++++++++
 2025/national-jukebox/list_urls.py           | 63 +++++++++++++
 5 files changed, 258 insertions(+)
 create mode 100644 2025/national-jukebox/.gitignore
 create mode 100644 2025/national-jukebox/download_first_page.py
 create mode 100644 2025/national-jukebox/extract_item_info.py
 create mode 100644 2025/national-jukebox/extract_mp3.py
 create mode 100644 2025/national-jukebox/list_urls.py

diff --git a/2025/national-jukebox/.gitignore b/2025/national-jukebox/.gitignore
new file mode 100644
index 0000000..07f43b8
--- /dev/null
+++ b/2025/national-jukebox/.gitignore
@@ -0,0 +1 @@
+data/*
\ No newline at end of file
diff --git a/2025/national-jukebox/download_first_page.py b/2025/national-jukebox/download_first_page.py
new file mode 100644
index 0000000..878c061
--- /dev/null
+++ b/2025/national-jukebox/download_first_page.py
@@ -0,0 +1,49 @@
+import json
+import pathlib
+import requests
+import time
+
+import list_urls
+import extract_item_info
+import extract_mp3
+
+
+DATA_DIR = pathlib.Path(__file__).parent / "data"
+DATA_DIR.mkdir(parents=True, exist_ok=True)  # Ensure the output directory exists.
+
+
+target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
+item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)
+
+
+def download_and_extract_item(base_url):
+    print(f"Fetching content from: {base_url}")
+    # https://guides.loc.gov/digital-scholarship/faq
+    # Stay within the 20 requests per minute rate limit.
+    time.sleep(3)
+    response = requests.get(base_url)
+    while response.status_code == 429:
+        print("Too many requests, sleeping")
+        time.sleep(10)
+        response = requests.get(base_url)
+
+    try:
+        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching URL: {e}")
+        return None
+
+    item = extract_item_info.extract_subheadings_to_dict(response.text)
+    mp3_url = extract_mp3.extract_mp3_url(response.text)
+    item["MP3 URL"] = mp3_url
+    return item
+
+
+with open(DATA_DIR / "jukebox.jsonl", "w") as data_file:
+    for item_url in item_urls:
+        item = download_and_extract_item(item_url)
+        if item is None:
+            continue  # Skip items that failed to download.
+        json.dump(item, data_file, indent=None)
+        data_file.write("\n")
+        data_file.flush()
diff --git a/2025/national-jukebox/extract_item_info.py b/2025/national-jukebox/extract_item_info.py
new file mode 100644
index 0000000..5644458
--- /dev/null
+++ b/2025/national-jukebox/extract_item_info.py
@@ -0,0 +1,99 @@
+from bs4 import BeautifulSoup
+import requests
+import json
+
+def extract_subheadings_to_dict(html_content):
+    """
+    Extracts subheadings from the "About this item" section of HTML
+    and returns them as a dictionary.
+
+    Args:
+        html_content (str): The HTML content as a string.
+
+    Returns:
+        dict: A dictionary where each subheading is a key and its corresponding
+              value is a list of items under that subheading.
+              Returns an empty dict if the section is not found.
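+
+    Example (illustrative doctest; a page without an "About this item"
+    section yields an empty dict):
+        >>> extract_subheadings_to_dict("<html><body></body></html>")
+        {}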
+ """ + soup = BeautifulSoup(html_content, 'html.parser') + about_this_item_section = soup.find('div', id='about-this-item') + + if not about_this_item_section: + return json.dumps({}) + + subheadings_data = {} + + # Find the div that contains the actual cataloged data + item_cataloged_data = about_this_item_section.find('div', class_='item-cataloged-data') + + if item_cataloged_data: + # Iterate through each subheading (h3) within this div + for h3_tag in item_cataloged_data.find_all('h3'): + subheading_text = h3_tag.get_text(strip=True) + items = [] + # The items for each subheading are in the immediately following