# 01 ‚Äî Jamendo CC Catalog Enumeration

Enumerate Jamendo music catalog via API and compute metadata statistics for CC-BY and CC-BY-SA licensed tracks only.

Outputs: summary JSON, CSV breakdowns, optional Parquet archive.

In [2]:
%%bash
pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.8-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.15-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.16-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.8-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.16-py3-none-any.whl (914 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ1m‚ï∏[0m[90m‚îÅ[0m [32m0.0/914.9 kB[0m [31m?[0m eta [36m-:--:--[0m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m524.3/914.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m914.9/914.9 kB[0m [31m2.9 MB/s[0m  [33m0:00:00[0m
[?25hDownloading widgetsnbextension-4.0.15-py3-none-any.whl (2.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

In [None]:
from pathlib import Path
import os
import json
from datetime import datetime

# Base paths
BASE_DIR = Path("/root/workspace")
OUTPUT_DIR = BASE_DIR / "data" / "jamendo_cc_catalog"
#OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
JAMENDO_CLIENT_ID="48ecf016"
# API Configuration

if not JAMENDO_CLIENT_ID:
    raise RuntimeError("Set JAMENDO_CLIENT_ID environment variable")

JAMENDO_API_BASE = "https://api.jamendo.com/v3.0"

# Processing Configuration
MAX_PAGES = int(os.environ.get("MAX_PAGES", "0"))  # 0 = all, 5 = dry run
PAGE_SIZE = 200  # Max allowed by Jamendo API
CHECKPOINT_INTERVAL = 10  # Save state every N pages
REQUEST_TIMEOUT = 30
RETRY_MAX_ATTEMPTS = 5
RETRY_BACKOFF_FACTOR = 2

# License Allowlist (strict)
ALLOWED_LICENSES = {"cc-by", "cc-by-sa"}

# Output files
STATE_FILE = OUTPUT_DIR / "state.json"
JSONL_FILE = OUTPUT_DIR / "jamendo_cc_tracks_metadata.jsonl"
SUMMARY_FILE = OUTPUT_DIR / "jamendo_cc_hours_summary.json"
LICENSE_CSV = OUTPUT_DIR / "jamendo_cc_hours_by_license.csv"
DURATION_CSV = OUTPUT_DIR / "jamendo_cc_duration_stats.csv"
PARQUET_FILE = OUTPUT_DIR / "jamendo_cc_tracks_metadata.parquet"

print(f"Output directory: {OUTPUT_DIR}")
print(f"Client ID configured: {'‚úì' if JAMENDO_CLIENT_ID else '‚úó'}")
print(f"Mode: {'DRY RUN (max ' + str(MAX_PAGES) + ' pages)' if MAX_PAGES > 0 else 'FULL CRAWL'}")

In [11]:
# Test: Fetch FULL CATALOG with robust CC license filtering and diagnostics
import httpx
import json
from pathlib import Path
from tqdm.auto import tqdm  # Auto-detects environment (notebook widgets or text-based)
import time
import sys

JAMENDO_CLIENT_ID = "48ecf016"
JAMENDO_API_BASE = "https://api.jamendo.com/v3.0"

# Output paths
OUTPUT_FILE = Path("/Users/cliftonwest/Documents/GitHub/Training/notebooks/jamendo/full_track_info.json")
STATE_FILE = Path("/Users/cliftonwest/Documents/GitHub/Training/notebooks/jamendo/fetch_state.json")
CHECKPOINT_FILE = Path("/Users/cliftonwest/Documents/GitHub/Training/notebooks/jamendo/tracks_checkpoint.jsonl")

# Rate limiting settings
REQUEST_DELAY = 0  # Delay between requests in seconds (100ms)
MAX_RETRIES = 5
RETRY_DELAY = 2  # Initial retry delay in seconds
CHECKPOINT_INTERVAL = 50  # Save checkpoint every N pages

def extract_canonical_license(track):
    """
    Robust license extractor - checks multiple fields and nested structures.
    Returns: 'cc-by', 'cc-by-sa', or None (rejected)
    """
    # Try multiple license fields
    license_url = None
    
    # Check license_ccurl first
    if track.get('license_ccurl'):
        license_url = track.get('license_ccurl')
    # Check licensecurl (alternate spelling)
    elif track.get('licensecurl'):
        license_url = track.get('licensecurl')
    # Check nested licenses array
    elif track.get('licenses') and isinstance(track.get('licenses'), list) and len(track.get('licenses')) > 0:
        first_license = track['licenses'][0]
        if isinstance(first_license, dict):
            license_url = first_license.get('url') or first_license.get('ccurl')
        elif isinstance(first_license, str):
            license_url = first_license
    
    if not license_url or not isinstance(license_url, str):
        return None
    
    # Normalize URL
    url_lower = license_url.lower().strip().rstrip('/')
    
    # Check for creativecommons.org pattern
    if 'creativecommons.org/licenses/' not in url_lower:
        return None
    
    # Extract license type
    parts = url_lower.split('creativecommons.org/licenses/')
    if len(parts) != 2:
        return None
    
    license_part = parts[1].split('/')[0]
    
    # Reject NC (NonCommercial) or ND (NoDerivatives)
    if 'nc' in license_part or 'nd' in license_part:
        return None
    
    # Accept only CC-BY or CC-BY-SA
    if license_part == 'by':
        return 'cc-by'
    elif license_part == 'by-sa':
        return 'cc-by-sa'
    else:
        return None

def get_license_flags(track):
    """
    Extract cc, ccnc, ccnd flags from track.
    Checks both top-level and nested licenses object.
    Returns tuple: (cc, ccnc, ccnd)
    """
    # Try top-level first
    cc_val = track.get('cc')
    ccnc_val = track.get('ccnc')
    ccnd_val = track.get('ccnd')
    
    # If not found, check inside licenses object
    licenses_obj = track.get('licenses')
    if licenses_obj and isinstance(licenses_obj, dict):
        if cc_val is None:
            cc_val = licenses_obj.get('cc')
        if ccnc_val is None:
            ccnc_val = licenses_obj.get('ccnc')
        if ccnd_val is None:
            ccnd_val = licenses_obj.get('ccnd')
    
    # Convert string "true"/"false" to boolean
    if isinstance(cc_val, str):
        cc_val = cc_val.lower() == 'true'
    if isinstance(ccnc_val, str):
        ccnc_val = ccnc_val.lower() == 'true'
    if isinstance(ccnd_val, str):
        ccnd_val = ccnd_val.lower() == 'true'
    
    return cc_val, ccnc_val, ccnd_val

def fetch_with_retry(client, url, params, max_retries=MAX_RETRIES):
    """Fetch with exponential backoff retry on rate limit errors."""
    for attempt in range(max_retries):
        try:
            response = client.get(url, params=params)
            response.raise_for_status()
            return response.json()
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 429:
                # Rate limited - wait and retry with exponential backoff
                wait_time = RETRY_DELAY * (2 ** attempt)
                print(f"\n‚ö† Rate limited (429). Waiting {wait_time}s before retry {attempt + 1}/{max_retries}...")
                time.sleep(wait_time)
                if attempt == max_retries - 1:
                    raise
            else:
                raise
        except httpx.TimeoutException:
            if attempt == max_retries - 1:
                raise
            wait_time = RETRY_DELAY * (2 ** attempt)
            print(f"\n‚ö† Timeout. Waiting {wait_time}s before retry {attempt + 1}/{max_retries}...")
            time.sleep(wait_time)
    
    raise RuntimeError(f"Failed after {max_retries} retries")

def load_checkpoint():
    """Load checkpoint state if exists."""
    if STATE_FILE.exists():
        with open(STATE_FILE, 'r') as f:
            state = json.load(f)
        print(f"‚úì Found checkpoint: resuming from offset {state['last_offset']:,} ({state['tracks_fetched']:,} tracks)")
        return state
    return None

def save_checkpoint(state):
    """Save checkpoint state."""
    with open(STATE_FILE, 'w') as f:
        json.dump(state, f, indent=2)

def load_existing_tracks():
    """Load tracks from checkpoint JSONL file."""
    if not CHECKPOINT_FILE.exists():
        return []
    
    tracks = []
    with open(CHECKPOINT_FILE, 'r') as f:
        for line in f:
            if line.strip():
                tracks.append(json.loads(line))
    return tracks

# Create client
client = httpx.Client(timeout=30)
checkpoint_file = None

try:
    # Check for existing checkpoint
    checkpoint = load_checkpoint()
    
    if checkpoint:
        offset = checkpoint['last_offset']
        all_tracks = load_existing_tracks()
        total_catalog_size = checkpoint['total_catalog_size']
        tracks_already_fetched = len(all_tracks)
        print(f"Resuming fetch: {tracks_already_fetched:,} tracks already fetched")
    else:
        # Get total catalog size first
        print("Starting fresh fetch...")
        print("Fetching catalog size...")
        first_data = fetch_with_retry(client, f"{JAMENDO_API_BASE}/tracks/", {
            "client_id": JAMENDO_CLIENT_ID,
            "format": "json",
            "limit": 1,
            "offset": 0,
            "audiodownload": "true",
            "include": "licenses",
            "fullcount": "true"
        })
        
        total_catalog_size = first_data.get("headers", {}).get("results_fullcount", 0)
        offset = 0
        all_tracks = []
        tracks_already_fetched = 0
        
        # Initialize checkpoint state
        checkpoint = {
            'last_offset': 0,
            'tracks_fetched': 0,
            'total_catalog_size': total_catalog_size
        }
        save_checkpoint(checkpoint)
        
        # Create empty checkpoint file
        CHECKPOINT_FILE.write_text('')
    
    print(f"Total catalog size: {total_catalog_size:,} tracks")
    total_pages = (total_catalog_size // PAGE_SIZE) + (1 if total_catalog_size % PAGE_SIZE else 0)
    print(f"Total pages to fetch: {total_pages:,}")
    print(f"Rate limit: {REQUEST_DELAY}s delay between requests")
    print(f"Checkpoint: saving every {CHECKPOINT_INTERVAL} pages")
    
    # Show resume progress
    remaining_tracks = total_catalog_size - tracks_already_fetched
    if tracks_already_fetched > 0:
        progress_pct = 100 * tracks_already_fetched / total_catalog_size
        print(f"\nüìä Resume Status:")
        print(f"   Already fetched: {tracks_already_fetched:,} tracks ({progress_pct:.1f}%)")
        print(f"   Remaining: {remaining_tracks:,} tracks")
    print()
    
    # Open checkpoint file in append mode
    checkpoint_file = open(CHECKPOINT_FILE, 'a')
    page_count = 0
    
    # Fetch all tracks with progress bar
    # Use tqdm.auto for automatic environment detection (works with or without ipywidgets)
    with tqdm(total=total_catalog_size, 
              initial=tracks_already_fetched, 
              desc="Fetching tracks", 
              unit="track",
              bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]') as pbar:
        
        while offset < total_catalog_size:
            params = {
                "client_id": JAMENDO_CLIENT_ID,
                "format": "json",
                "limit": PAGE_SIZE,
                "offset": offset,
                "audiodownload": "true",
                "include": "licenses",
            }
            
            # Fetch with retry logic
            data = fetch_with_retry(client, f"{JAMENDO_API_BASE}/tracks/", params)
            
            tracks = data.get("results", [])
            if not tracks:
                print(f"\nNo more tracks at offset {offset}")
                break
            
            # Append tracks to list and write to checkpoint file
            all_tracks.extend(tracks)
            for track in tracks:
                checkpoint_file.write(json.dumps(track) + '\n')
            
            offset += len(tracks)
            page_count += 1
            pbar.update(len(tracks))
            
            # Save checkpoint periodically
            if page_count % CHECKPOINT_INTERVAL == 0:
                checkpoint_file.flush()
                checkpoint['last_offset'] = offset
                checkpoint['tracks_fetched'] = len(all_tracks)
                save_checkpoint(checkpoint)
            
            # Rate limiting: wait between requests
            time.sleep(REQUEST_DELAY)
    
    # Close checkpoint file
    checkpoint_file.close()
    checkpoint_file = None
    
    print(f"\nTotal tracks fetched: {len(all_tracks):,}")
    
    # Save final JSON file
    print(f"Saving final file to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(all_tracks, f, indent=2)
    print(f"‚úì Saved full track info to: {OUTPUT_FILE}")
    print(f"  File size: {OUTPUT_FILE.stat().st_size / 1024 / 1024:.1f} MB\n")
    
    # Clean up checkpoint files
    print("Cleaning up checkpoint files...")
    if STATE_FILE.exists():
        STATE_FILE.unlink()
    if CHECKPOINT_FILE.exists():
        CHECKPOINT_FILE.unlink()
    print("‚úì Checkpoint files removed\n")
    
    # Analytics counters
    total_tracks = len(all_tracks)
    empty_license_count = 0
    content_id_restricted_count = 0
    cc_flags_condition_count = 0
    no_nc_nd_count = 0
    cc_flags_passed_examples = []
    no_nc_nd_examples = []
    passed_cc_filter = []
    rejected_examples = []
    
    print("Analyzing licenses...")
    
    # Process tracks with progress bar
    for track in tqdm(all_tracks, desc="Analyzing licenses", unit="track"):
        # Get CC flags from track (handles nested licenses object)
        cc_val, ccnc_val, ccnd_val = get_license_flags(track)
        
        # Check: cc==true && ccnc==false && ccnd==false
        if cc_val == True and ccnc_val == False and ccnd_val == False:
            cc_flags_condition_count += 1
            if len(cc_flags_passed_examples) < 5:
                licenses_obj = track.get('licenses', {})
                cc_flags_passed_examples.append({
                    'id': track.get('id'),
                    'name': track.get('name'),
                    'cc': licenses_obj.get('cc') if isinstance(licenses_obj, dict) else track.get('cc'),
                    'ccnc': licenses_obj.get('ccnc') if isinstance(licenses_obj, dict) else track.get('ccnc'),
                    'ccnd': licenses_obj.get('ccnd') if isinstance(licenses_obj, dict) else track.get('ccnd'),
                    'license_url': track.get('license_ccurl') or track.get('licensecurl') or '(empty)'
                })
        
        # Check: ccnc==false && ccnd==false (regardless of cc)
        if ccnc_val == False and ccnd_val == False:
            no_nc_nd_count += 1
            if len(no_nc_nd_examples) < 5:
                licenses_obj = track.get('licenses', {})
                no_nc_nd_examples.append({
                    'id': track.get('id'),
                    'name': track.get('name'),
                    'cc': licenses_obj.get('cc') if isinstance(licenses_obj, dict) else track.get('cc'),
                    'ccnc': licenses_obj.get('ccnc') if isinstance(licenses_obj, dict) else track.get('ccnc'),
                    'ccnd': licenses_obj.get('ccnd') if isinstance(licenses_obj, dict) else track.get('ccnd'),
                    'license_url': track.get('license_ccurl') or track.get('licensecurl') or '(empty)'
                })
        
        # Check if license URL is empty
        license_url = track.get('license_ccurl') or track.get('licensecurl') or ''
        if not license_url:
            empty_license_count += 1
            if len(rejected_examples) < 5:
                rejected_examples.append({
                    'id': track.get('id'),
                    'name': track.get('name'),
                    'content_id_free': track.get('content_id_free'),
                    'license_url': '(empty)',
                    'reason': 'empty_license'
                })
            continue
        
        # Check content_id_free
        if track.get('content_id_free') == False:
            content_id_restricted_count += 1
            if len(rejected_examples) < 5:
                rejected_examples.append({
                    'id': track.get('id'),
                    'name': track.get('name'),
                    'content_id_free': track.get('content_id_free'),
                    'license_url': license_url,
                    'reason': 'content_id_restricted'
                })
            continue
        
        # Try to extract canonical license
        canonical = extract_canonical_license(track)
        
        if canonical:
            passed_cc_filter.append({
                'id': track.get('id'),
                'name': track.get('name'),
                'content_id_free': track.get('content_id_free'),
                'license_url': license_url,
                'canonical': canonical
            })
        else:
            if len(rejected_examples) < 5:
                rejected_examples.append({
                    'id': track.get('id'),
                    'name': track.get('name'),
                    'content_id_free': track.get('content_id_free'),
                    'license_url': license_url,
                    'reason': 'license_not_cc_by_or_cc_by_sa'
                })
    
    # Print diagnostics
    print("\n" + "="*60)
    print("JAMENDO FULL CATALOG LICENSE DIAGNOSTICS")
    print("="*60)
    print(f"Total tracks in catalog: {total_tracks:,}")
    print(f"Tracks with cc==true && ccnc==false && ccnd==false: {cc_flags_condition_count:,} ({100*cc_flags_condition_count/total_tracks:.1f}%)")
    print(f"Tracks with ccnc==false && ccnd==false (any cc): {no_nc_nd_count:,} ({100*no_nc_nd_count/total_tracks:.1f}%)")
    print(f"Tracks with empty license URL: {empty_license_count:,} ({100*empty_license_count/total_tracks:.1f}%)")
    print(f"Tracks with content_id_free=false: {content_id_restricted_count:,} ({100*content_id_restricted_count/total_tracks:.1f}%)")
    print(f"Tracks passing CC-BY/CC-BY-SA filter: {len(passed_cc_filter):,} ({100*len(passed_cc_filter)/total_tracks:.1f}%)")
    
    # Show CC flags examples
    if cc_flags_passed_examples:
        print("\n" + "="*60)
        print("üîç CC FLAGS CONDITION EXAMPLES (first 5)")
        print("   Tracks where cc==true && ccnc==false && ccnd==false")
        print("="*60)
        for track in cc_flags_passed_examples:
            print(f"ID: {track['id']}")
            print(f"  Name: {track['name']}")
            print(f"  cc: {track['cc']}")
            print(f"  ccnc: {track['ccnc']}")
            print(f"  ccnd: {track['ccnd']}")
            print(f"  License URL: {track['license_url']}")
            print()
    
    # Show no NC/ND examples
    if no_nc_nd_examples:
        print("\n" + "="*60)
        print("üîç NO NC/ND CONDITION EXAMPLES (first 5)")
        print("   Tracks where ccnc==false && ccnd==false (any cc)")
        print("="*60)
        for track in no_nc_nd_examples:
            print(f"ID: {track['id']}")
            print(f"  Name: {track['name']}")
            print(f"  cc: {track['cc']}")
            print(f"  ccnc: {track['ccnc']}")
            print(f"  ccnd: {track['ccnd']}")
            print(f"  License URL: {track['license_url']}")
            print()
    
    print("\n" + "="*60)
    print("‚úÖ PASSED FILTER - CC-BY or CC-BY-SA (first 5)")
    print("="*60)
    for track in passed_cc_filter[:5]:
        print(f"ID: {track['id']}")
        print(f"  Name: {track['name']}")
        print(f"  content_id_free: {track['content_id_free']}")
        print(f"  License: {track['canonical'].upper()}")
        print(f"  URL: {track['license_url']}")
        print()
    
    print("="*60)
    print("‚ùå REJECTED (first 5)")
    print("="*60)
    for track in rejected_examples[:5]:
        print(f"ID: {track['id']}")
        print(f"  Name: {track['name']}")
        print(f"  content_id_free: {track['content_id_free']}")
        print(f"  License URL: {track['license_url']}")
        print(f"  Reason: {track['reason']}")
        print()

except KeyboardInterrupt:
    print("\n\n‚ö†Ô∏è  INTERRUPTED - Cleaning up resources...")
    # Save checkpoint on interruption
    if 'checkpoint' in locals() and 'offset' in locals():
        try:
            checkpoint['last_offset'] = offset
            checkpoint['tracks_fetched'] = len(all_tracks) if 'all_tracks' in locals() else 0
            save_checkpoint(checkpoint)
            print(f"‚úì Checkpoint saved at offset {offset:,}")
            print(f"‚úì You can resume by re-running this cell")
        except Exception as e:
            print(f"‚úó Failed to save checkpoint: {e}")
    
    # Close checkpoint file if open
    if checkpoint_file is not None:
        try:
            checkpoint_file.flush()
            checkpoint_file.close()
            print("‚úì Checkpoint file closed")
        except:
            pass
    
    # Close HTTP client
    try:
        client.close()
        print("‚úì HTTP client closed")
    except:
        pass
    
    print("\nüõë Fetch interrupted. Progress has been saved.")
    sys.exit(0)

except Exception as e:
    print(f"\n‚úó Error: {e}")
    # Save checkpoint on error
    if 'checkpoint' in locals() and 'offset' in locals():
        try:
            checkpoint['last_offset'] = offset
            checkpoint['tracks_fetched'] = len(all_tracks) if 'all_tracks' in locals() else 0
            save_checkpoint(checkpoint)
            print(f"‚úì Checkpoint saved at offset {offset:,}")
        except:
            pass
    raise

finally:
    # Ensure resources are always cleaned up
    if checkpoint_file is not None:
        try:
            checkpoint_file.close()
        except:
            pass
    
    try:
        client.close()
    except:
        pass

‚úì Found checkpoint: resuming from offset 302,400 (357,798 tracks)
Resuming fetch: 368,396 tracks already fetched
Total catalog size: 848,742 tracks
Total pages to fetch: 4,244
Rate limit: 0s delay between requests
Checkpoint: saving every 50 pages

üìä Resume Status:
   Already fetched: 368,396 tracks (43.4%)
   Remaining: 480,346 tracks



Fetching tracks:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 371796/848742 [01:24<3:16:35]



‚ö†Ô∏è  INTERRUPTED - Cleaning up resources...
‚úì Checkpoint saved at offset 305,800
‚úì You can resume by re-running this cell
‚úì Checkpoint file closed
‚úì HTTP client closed

üõë Fetch interrupted. Progress has been saved.





SystemExit: 0

In [10]:
# Test: Download a single audio file to verify download functionality
import httpx
import json
from pathlib import Path
import time

# Configuration
JAMENDO_CLIENT_ID = "48ecf016"
JAMENDO_API_BASE = "https://api.jamendo.com/v3.0"
TEST_DOWNLOAD_DIR = Path("/Users/cliftonwest/Documents/GitHub/Training/notebooks/jamendo/test_downloads")
TEST_DOWNLOAD_DIR.mkdir(exist_ok=True)

# Load tracks from checkpoint or final file
CHECKPOINT_FILE = Path("/Users/cliftonwest/Documents/GitHub/Training/notebooks/jamendo/tracks_checkpoint.jsonl")
OUTPUT_FILE = Path("/Users/cliftonwest/Documents/GitHub/Training/notebooks/jamendo/full_track_info.json")

print("Loading track data...")
if CHECKPOINT_FILE.exists():
    # Load first track from checkpoint
    with open(CHECKPOINT_FILE, 'r') as f:
        first_line = f.readline()
        if first_line.strip():
            test_track = json.loads(first_line)
            print(f"‚úì Loaded track from checkpoint file")
        else:
            print("‚úó Checkpoint file is empty")
            test_track = None
elif OUTPUT_FILE.exists():
    # Load first track from final file
    with open(OUTPUT_FILE, 'r') as f:
        all_tracks = json.load(f)
        if all_tracks:
            test_track = all_tracks[0]
            print(f"‚úì Loaded track from final output file")
        else:
            print("‚úó Output file has no tracks")
            test_track = None
else:
    print("‚úó No track data found. Run the fetch cell first.")
    test_track = None

if test_track:
    print("\n" + "="*60)
    print("TEST TRACK INFO")
    print("="*60)
    print(f"Track ID: {test_track.get('id')}")
    print(f"Name: {test_track.get('name')}")
    print(f"Artist: {test_track.get('artist_name')}")
    print(f"Album: {test_track.get('album_name')}")
    print(f"Duration: {test_track.get('duration')}s ({test_track.get('duration')/60:.1f} minutes)")
    
    # Get download URL from API
    print("\n" + "="*60)
    print("FETCHING DOWNLOAD URL")
    print("="*60)
    
    track_id = test_track.get('id')
    
    client = httpx.Client(timeout=30, follow_redirects=True)
    try:
        # Request track info with audiodownload format
        response = client.get(f"{JAMENDO_API_BASE}/tracks/", params={
            "client_id": JAMENDO_CLIENT_ID,
            "format": "json",
            "id": track_id,
            "audiodownload": "true"
        })
        response.raise_for_status()
        track_data = response.json()
        
        if track_data.get("results"):
            track_info = track_data["results"][0]
            download_url = track_info.get("audiodownload")
            
            if download_url:
                print(f"‚úì Download URL obtained: {download_url}")
                
                # Download the file
                print("\n" + "="*60)
                print("DOWNLOADING AUDIO FILE")
                print("="*60)
                
                # Generate filename
                safe_name = "".join(c for c in test_track.get('name', 'track') if c.isalnum() or c in (' ', '-', '_')).strip()
                safe_artist = "".join(c for c in test_track.get('artist_name', 'artist') if c.isalnum() or c in (' ', '-', '_')).strip()
                filename = f"{track_id}_{safe_artist}_{safe_name}.mp3"
                filepath = TEST_DOWNLOAD_DIR / filename
                
                print(f"Downloading to: {filepath}")
                print(f"Starting download...")
                
                start_time = time.time()
                
                # Stream download with progress
                with client.stream("GET", download_url) as r:
                    r.raise_for_status()
                    total_size = int(r.headers.get('content-length', 0))
                    
                    with open(filepath, 'wb') as f:
                        downloaded = 0
                        for chunk in r.iter_bytes(chunk_size=8192):
                            f.write(chunk)
                            downloaded += len(chunk)
                            if total_size > 0:
                                progress = (downloaded / total_size) * 100
                                print(f"\rProgress: {downloaded:,} / {total_size:,} bytes ({progress:.1f}%)", end='')
                
                elapsed = time.time() - start_time
                file_size_mb = filepath.stat().st_size / 1024 / 1024
                
                print(f"\n\n‚úì Download complete!")
                print(f"  File: {filepath.name}")
                print(f"  Size: {file_size_mb:.2f} MB")
                print(f"  Time: {elapsed:.1f} seconds")
                print(f"  Speed: {file_size_mb/elapsed:.2f} MB/s")
                print(f"\n‚úì Test download successful!")
                
            else:
                print("‚úó No download URL found in track data")
                print(f"Available fields: {list(track_info.keys())}")
        else:
            print("‚úó No track data returned from API")
            
    except httpx.HTTPError as e:
        print(f"‚úó HTTP error: {e}")
    except Exception as e:
        print(f"‚úó Error: {e}")
    finally:
        client.close()
else:
    print("\nSkipping test - no track data available.")

Loading track data...
‚úì Loaded track from checkpoint file

TEST TRACK INFO
Track ID: 168
Name: J'm'e FPM
Artist: TriFace
Album: Premiers Jets
Duration: 183s (3.0 minutes)

FETCHING DOWNLOAD URL
‚úì Download URL obtained: https://prod-1.storage.jamendo.com/download/track/168/mp32/

DOWNLOADING AUDIO FILE
Downloading to: /Users/cliftonwest/Documents/GitHub/Training/notebooks/jamendo/test_downloads/168_TriFace_Jme FPM.mp3
Starting download...
Progress: 4,536,619 / 4,536,619 bytes (100.0%)

‚úì Download complete!
  File: 168_TriFace_Jme FPM.mp3
  Size: 4.33 MB
  Time: 3.8 seconds
  Speed: 1.13 MB/s

‚úì Test download successful!


In [None]:
%pip install -q httpx pandas pyarrow tqdm tenacity

In [None]:
import httpx
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type
)
from typing import Optional, Dict, List, Any
import time

In [None]:
def canonicalize_license(license_ccurl: Optional[str]) -> Optional[str]:
    """
    Parse Creative Commons license URL and return canonical form.
    
    Returns: 'cc-by', 'cc-by-sa', or None (for rejected licenses)
    
    Accepts:
    - CC-BY 3.0/4.0: http://creativecommons.org/licenses/by/3.0/
    - CC-BY-SA 3.0/4.0: http://creativecommons.org/licenses/by-sa/3.0/
    
    Rejects (returns None):
    - CC-BY-NC (NonCommercial)
    - CC-BY-ND (NoDerivatives)
    - CC-BY-NC-SA
    - CC-BY-NC-ND
    - Any other license
    """
    if not license_ccurl or not isinstance(license_ccurl, str):
        return None
    
    # Normalize URL
    url_lower = license_ccurl.lower().strip().rstrip('/')
    
    # Extract license type from URL pattern
    # Expected: http(s)://creativecommons.org/licenses/{type}/{version}/
    if 'creativecommons.org/licenses/' not in url_lower:
        return None
    
    # Extract type component
    parts = url_lower.split('creativecommons.org/licenses/')
    if len(parts) != 2:
        return None
    
    license_part = parts[1].split('/')[0]  # Get type before version
    
    # Strict allowlist matching
    if license_part == 'by':
        return 'cc-by'
    elif license_part == 'by-sa':
        return 'cc-by-sa'
    else:
        # Reject: by-nc, by-nd, by-nc-sa, by-nc-nd, etc.
        return None

# Unit tests
assert canonicalize_license("http://creativecommons.org/licenses/by/3.0/") == "cc-by"
assert canonicalize_license("http://creativecommons.org/licenses/by-sa/4.0/") == "cc-by-sa"
assert canonicalize_license("http://creativecommons.org/licenses/by-nc/3.0/") is None
assert canonicalize_license("http://creativecommons.org/licenses/by-nc-sa/3.0/") is None
assert canonicalize_license("http://creativecommons.org/licenses/by-nd/3.0/") is None
assert canonicalize_license(None) is None
assert canonicalize_license("") is None
print("‚úì License canonicalization tests passed")

In [None]:
class JamendoAPIClient:
    """Jamendo API client with retry logic and rate limiting."""
    
    def __init__(self, client_id: str, timeout: int = 30):
        self.client_id = client_id
        self.timeout = timeout
        self.client = httpx.Client(timeout=timeout)
        self.last_request_time = 0
        self.min_request_interval = 0.1  # 100ms between requests
    
    def _rate_limit(self):
        """Simple rate limiting: ensure minimum interval between requests."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_request_interval:
            time.sleep(self.min_request_interval - elapsed)
        self.last_request_time = time.time()
    
    @retry(
        stop=stop_after_attempt(RETRY_MAX_ATTEMPTS),
        wait=wait_exponential(multiplier=RETRY_BACKOFF_FACTOR, min=1, max=60),
        retry=retry_if_exception_type((httpx.TimeoutException, httpx.HTTPStatusError))
    )
    def fetch_tracks(
        self, 
        offset: int = 0, 
        limit: int = 200,
        include_fullcount: bool = False
    ) -> Dict[str, Any]:
        """
        Fetch tracks from Jamendo API with pagination.
        
        Returns: {"headers": {...}, "results": [...]}
        """
        self._rate_limit()
        
        params = {
            "client_id": self.client_id,
            "format": "json",
            "limit": limit,
            "offset": offset,
            "audiodownload": "true",  # Only downloadable tracks
        }
        
        if include_fullcount:
            params["fullcount"] = "true"
        
        url = f"{JAMENDO_API_BASE}/tracks/"
        
        try:
            response = self.client.get(url, params=params)
            response.raise_for_status()
            return response.json()
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 429:
                # Rate limit - tenacity will retry with backoff
                print(f"‚ö† Rate limited (429), will retry...")
                raise
            elif e.response.status_code >= 500:
                # Server error - tenacity will retry
                print(f"‚ö† Server error ({e.response.status_code}), will retry...")
                raise
            else:
                # Client error - don't retry
                print(f"‚úó Client error ({e.response.status_code}): {e}")
                raise RuntimeError(f"API error: {e.response.status_code}") from e
    
    def close(self):
        self.client.close()

# Test client initialization
api_client = JamendoAPIClient(JAMENDO_CLIENT_ID, timeout=REQUEST_TIMEOUT)
print("‚úì API client initialized")

In [None]:
def load_state() -> Dict[str, Any]:
    """Load checkpoint state or return initial state."""
    if STATE_FILE.exists():
        with open(STATE_FILE, 'r') as f:
            state = json.load(f)
        print(f"‚úì Loaded checkpoint: offset={state['last_offset']}, fetched={state['total_fetched']}")
        return state
    else:
        return {
            "last_offset": 0,
            "total_fetched": 0,
            "total_passed_filter": 0,
            "total_rejected": 0,
            "total_catalog_size": None,
            "start_time": datetime.utcnow().isoformat(),
            "last_update_time": None
        }

def save_state(state: Dict[str, Any]):
    """Save checkpoint state."""
    state["last_update_time"] = datetime.utcnow().isoformat()
    with open(STATE_FILE, 'w') as f:
        json.dump(state, f, indent=2)

# Load or initialize state
state = load_state()

In [None]:
def process_track(track: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Filter and extract metadata from a track.
    Returns None if track doesn't pass license filter.
    """
    license_url = track.get("license_ccurl")
    canonical_license = canonicalize_license(license_url)
    
    if canonical_license is None:
        return None
    
    # Extract relevant metadata
    return {
        "id": track.get("id"),
        "name": track.get("name"),
        "duration": track.get("duration"),  # in seconds
        "artist_id": track.get("artist_id"),
        "artist_name": track.get("artist_name"),
        "album_id": track.get("album_id"),
        "album_name": track.get("album_name"),
        "license": canonical_license,
        "license_ccurl": license_url,
        "releasedate": track.get("releasedate"),
        "audiodownload_allowed": track.get("audiodownload_allowed"),
    }

# Main crawl loop
offset = state["last_offset"]
page_count = 0
audit_passed = []
audit_rejected = []

# Open JSONL in append mode
jsonl_mode = 'a' if offset > 0 else 'w'
jsonl_file = open(JSONL_FILE, jsonl_mode)

try:
    # Get total catalog size on first request
    if state["total_catalog_size"] is None:
        print("Fetching catalog size...")
        first_response = api_client.fetch_tracks(offset=0, limit=1, include_fullcount=True)
        total_size = first_response["headers"].get("results_fullcount", 0)
        state["total_catalog_size"] = total_size
        print(f"Total catalog size: {total_size:,} tracks")
    
    total_size = state["total_catalog_size"]
    total_pages = (total_size // PAGE_SIZE) + (1 if total_size % PAGE_SIZE else 0)
    
    if MAX_PAGES > 0:
        total_pages = min(total_pages, MAX_PAGES)
        print(f"DRY RUN: Processing {total_pages} pages only")
    
    # Progress bar
    with tqdm(total=total_pages, initial=offset // PAGE_SIZE, desc="Crawling pages", unit="page") as pbar:
        while True:
            # Check if we've reached max pages (dry run mode)
            if MAX_PAGES > 0 and page_count >= MAX_PAGES:
                print(f"Reached MAX_PAGES limit ({MAX_PAGES})")
                break
            
            # Check if we've exhausted the catalog
            if offset >= total_size:
                print(f"Completed: processed all {total_size:,} tracks")
                break
            
            # Fetch page
            response = api_client.fetch_tracks(offset=offset, limit=PAGE_SIZE)
            results = response.get("results", [])
            
            if not results:
                print(f"No more results at offset {offset}")
                break
            
            # Process tracks
            for track in results:
                state["total_fetched"] += 1
                
                processed = process_track(track)
                if processed:
                    # Write to JSONL immediately (streaming)
                    jsonl_file.write(json.dumps(processed) + '\n')
                    state["total_passed_filter"] += 1
                    
                    # Collect audit samples
                    if len(audit_passed) < 10:
                        audit_passed.append({
                            "id": track.get("id"),
                            "name": track.get("name"),
                            "license_ccurl": track.get("license_ccurl"),
                            "canonical": processed["license"]
                        })
                else:
                    state["total_rejected"] += 1
                    
                    # Collect audit samples
                    if len(audit_rejected) < 10:
                        audit_rejected.append({
                            "id": track.get("id"),
                            "name": track.get("name"),
                            "license_ccurl": track.get("license_ccurl"),
                            "reason": "license_not_allowed"
                        })
            
            # Update state
            offset += len(results)
            state["last_offset"] = offset
            page_count += 1
            
            # Checkpoint periodically
            if page_count % CHECKPOINT_INTERVAL == 0:
                jsonl_file.flush()
                save_state(state)
            
            pbar.update(1)
            pbar.set_postfix({
                "passed": state["total_passed_filter"],
                "rejected": state["total_rejected"]
            })
    
    # Final checkpoint
    jsonl_file.flush()
    save_state(state)
    
finally:
    jsonl_file.close()
    api_client.close()

print("\n" + "="*60)
print("CRAWL COMPLETE")
print("="*60)
print(f"Total tracks fetched: {state['total_fetched']:,}")
print(f"Passed filter (CC-BY/CC-BY-SA): {state['total_passed_filter']:,}")
print(f"Rejected (NC/ND/other): {state['total_rejected']:,}")
print(f"Pass rate: {100 * state['total_passed_filter'] / state['total_fetched']:.1f}%")

In [None]:
print("\n" + "="*60)
print("AUDIT TRAIL - Sample Tracks")
print("="*60)

print("\n‚úì PASSED FILTER (CC-BY/CC-BY-SA):")
for i, track in enumerate(audit_passed, 1):
    print(f"{i}. [{track['id']}] {track['name']}")
    print(f"   License URL: {track['license_ccurl']}")
    print(f"   Canonical: {track['canonical']}")
    print()

print("\n‚úó REJECTED (NC/ND/other):")
for i, track in enumerate(audit_rejected, 1):
    print(f"{i}. [{track['id']}] {track['name']}")
    print(f"   License URL: {track['license_ccurl']}")
    print(f"   Reason: {track['reason']}")
    print()

In [None]:
# Load JSONL into DataFrame
print("Loading metadata for aggregation...")
df = pd.read_json(JSONL_FILE, lines=True)
print(f"Loaded {len(df):,} tracks")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
df.head()

In [None]:
# Summary statistics
total_tracks = len(df)
total_duration_seconds = df['duration'].sum()
total_hours = total_duration_seconds / 3600

summary = {
    "generated_at": datetime.utcnow().isoformat(),
    "source": "jamendo_api",
    "license_filter": list(ALLOWED_LICENSES),
    "total_tracks": int(total_tracks),
    "total_duration_seconds": float(total_duration_seconds),
    "total_duration_hours": float(total_hours),
    "total_catalog_tracks_fetched": state["total_fetched"],
    "pass_rate_percent": float(100 * state["total_passed_filter"] / state['total_fetched']),
    "crawl_start_time": state["start_time"],
    "crawl_end_time": state["last_update_time"],
}

# Save summary JSON
with open(SUMMARY_FILE, 'w') as f:
    json.dump(summary, f, indent=2)

print("="*60)
print("SUMMARY STATISTICS")
print("="*60)
print(f"Total tracks: {summary['total_tracks']:,}")
print(f"Total duration: {summary['total_duration_hours']:,.1f} hours")
print(f"Average track duration: {total_duration_seconds / total_tracks / 60:.1f} minutes")
print(f"Pass rate: {summary['pass_rate_percent']:.1f}%")
print(f"\n‚úì Saved: {SUMMARY_FILE}")

In [None]:
# Group by license type
license_stats = df.groupby('license').agg({
    'duration': ['count', 'sum']
}).reset_index()

license_stats.columns = ['license', 'track_count', 'total_duration_seconds']
license_stats['total_duration_hours'] = license_stats['total_duration_seconds'] / 3600
license_stats['percentage_of_tracks'] = 100 * license_stats['track_count'] / total_tracks

# Save CSV
license_stats.to_csv(LICENSE_CSV, index=False)

print("="*60)
print("BREAKDOWN BY LICENSE TYPE")
print("="*60)
print(license_stats.to_string(index=False))
print(f"\n‚úì Saved: {LICENSE_CSV}")

In [None]:
# Compute duration percentiles
duration_minutes = df['duration'] / 60

duration_stats = pd.DataFrame({
    'metric': ['mean', 'median (p50)', 'p90', 'p95', 'p99', 'min', 'max'],
    'duration_minutes': [
        duration_minutes.mean(),
        duration_minutes.quantile(0.50),
        duration_minutes.quantile(0.90),
        duration_minutes.quantile(0.95),
        duration_minutes.quantile(0.99),
        duration_minutes.min(),
        duration_minutes.max(),
    ]
})

# Save CSV
duration_stats.to_csv(DURATION_CSV, index=False)

print("="*60)
print("DURATION STATISTICS")
print("="*60)
print(duration_stats.to_string(index=False))
print(f"\n‚úì Saved: {DURATION_CSV}")

In [None]:
# Save as Parquet (more efficient for large datasets)
try:
    df.to_parquet(PARQUET_FILE, index=False, compression='snappy')
    print(f"‚úì Saved Parquet: {PARQUET_FILE}")
    print(f"  JSONL size: {JSONL_FILE.stat().st_size / 1024 / 1024:.1f} MB")
    print(f"  Parquet size: {PARQUET_FILE.stat().st_size / 1024 / 1024:.1f} MB")
except Exception as e:
    print(f"‚ö† Failed to save Parquet: {e}")

In [None]:
print("\n" + "="*60)
print("ALL OUTPUTS GENERATED")
print("="*60)
print(f"\nOutput directory: {OUTPUT_DIR}")
print(f"\nFiles created:")
print(f"  1. {JSONL_FILE.name} - Raw filtered metadata (append-only)")
print(f"  2. {SUMMARY_FILE.name} - Topline metrics")
print(f"  3. {LICENSE_CSV.name} - License breakdown")
print(f"  4. {DURATION_CSV.name} - Duration statistics")
print(f"  5. {STATE_FILE.name} - Checkpoint state (for resume)")
if PARQUET_FILE.exists():
    print(f"  6. {PARQUET_FILE.name} - Parquet archive (optional)")

print(f"\n‚úì Pipeline complete!")
print(f"\nTo resume crawl if interrupted:")
print(f"  - Re-run this notebook (it will resume from offset {state['last_offset']})")
print(f"\nTo start fresh:")
print(f"  - Delete {STATE_FILE} and {JSONL_FILE}")