# 01 ‚Äî Jamendo CC Catalog Enumeration

Enumerate Jamendo music catalog via API and compute metadata statistics for CC-BY and CC-BY-SA licensed tracks only.

Outputs: summary JSON, CSV breakdowns, optional Parquet archive.

In [None]:
%%bash
pip install ipywidgets
pip install httpx
pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.2-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.2-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.2


In [None]:
from pathlib import Path
import os
import json
from datetime import datetime

# Base paths
BASE_DIR = Path("/root/workspace")
OUTPUT_DIR = BASE_DIR / "data" / "jamendo_cc_catalog"
#OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
JAMENDO_CLIENT_ID="48ecf016"
# API Configuration

if not JAMENDO_CLIENT_ID:
    raise RuntimeError("Set JAMENDO_CLIENT_ID environment variable")

JAMENDO_API_BASE = "https://api.jamendo.com/v3.0"

# Processing Configuration
MAX_PAGES = int(os.environ.get("MAX_PAGES", "0"))  # 0 = all, 5 = dry run
PAGE_SIZE = 200  # Max allowed by Jamendo API
CHECKPOINT_INTERVAL = 10  # Save state every N pages
REQUEST_TIMEOUT = 30
RETRY_MAX_ATTEMPTS = 5
RETRY_BACKOFF_FACTOR = 2

# License Allowlist (strict)
ALLOWED_LICENSES = {"cc-by", "cc-by-sa"}

# Output files
STATE_FILE = OUTPUT_DIR / "state.json"
JSONL_FILE = OUTPUT_DIR / "jamendo_cc_tracks_metadata.jsonl"
SUMMARY_FILE = OUTPUT_DIR / "jamendo_cc_hours_summary.json"
LICENSE_CSV = OUTPUT_DIR / "jamendo_cc_hours_by_license.csv"
DURATION_CSV = OUTPUT_DIR / "jamendo_cc_duration_stats.csv"
PARQUET_FILE = OUTPUT_DIR / "jamendo_cc_tracks_metadata.parquet"

print(f"Output directory: {OUTPUT_DIR}")
print(f"Client ID configured: {'‚úì' if JAMENDO_CLIENT_ID else '‚úó'}")
print(f"Mode: {'DRY RUN (max ' + str(MAX_PAGES) + ' pages)' if MAX_PAGES > 0 else 'FULL CRAWL'}")

In [16]:
# Optimized: Fetch FULL CATALOG with performance instrumentation and streaming architecture
import httpx
import json
from pathlib import Path
from tqdm.auto import tqdm
import time
import sys
from collections import deque

JAMENDO_CLIENT_ID = "48ecf016"
JAMENDO_API_BASE = "https://api.jamendo.com/v3.0"

# Output paths
OUTPUT_DIR = Path("/root/workspace/data/jamendo")
OUTPUT_FILE_NAME = "full_track_info.json"
OUTPUT_FILE = OUTPUT_DIR / OUTPUT_FILE_NAME
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

STATE_FILE_DIR = Path("/root/workspace/data/jamendo")
STATE_FILE_NAME = "fetch_state.json"
STATE_FILE = STATE_FILE_DIR / STATE_FILE_NAME
STATE_FILE_DIR.mkdir(parents=True, exist_ok=True)

CHECKPOINT_FILE_DIR = Path("/root/workspace/data/jamendo")
CHECKPOINT_FILE_NAME = "tracks_checkpoint.jsonl"
CHECKPOINT_FILE = CHECKPOINT_FILE_DIR / CHECKPOINT_FILE_NAME
CHECKPOINT_FILE_DIR.mkdir(parents=True, exist_ok=True)

# ============================================================
# PERFORMANCE OPTIMIZATION FLAGS
# ============================================================
ENABLE_STREAMING = False       # Don't keep all_tracks in memory (default: True)
ENABLE_BATCHED_WRITES = True  # Write once per page, not per track (default: True)
ENABLE_COMPACT_JSON = True    # Use separators=(',', ':') (default: True)
ENABLE_ADAPTIVE_PACING = False # Increase delay if req_time grows (default: False)
USE_ORJSON = False            # Try orjson if installed (default: False)
PERF_PRINT_INTERVAL = 20      # Print perf metrics every N pages (default: 20)

# Rate limiting settings
REQUEST_DELAY = 0.1  # Delay between requests in seconds (100ms)
MAX_RETRIES = 5
RETRY_DELAY = 2  # Initial retry delay in seconds
CHECKPOINT_INTERVAL = 50  # Save checkpoint every N pages
PAGE_SIZE = 200  # Max allowed by Jamendo API

# Try to import orjson if requested
if USE_ORJSON:
    try:
        import orjson
        print("‚úì Using orjson for faster JSON serialization")
    except ImportError:
        print("‚ö† orjson not installed, falling back to stdlib json")
        USE_ORJSON = False

# ============================================================
# PERFORMANCE MONITOR CLASS
# ============================================================
class PerformanceMonitor:
    """Track and report performance metrics for fetch operations."""
    
    def __init__(self, print_interval=20, window_size=100):
        self.print_interval = print_interval
        self.window_size = window_size
        
        # Current page timings
        self.req_start = None
        self.write_start = None
        
        # Rolling windows for averages
        self.req_times = deque(maxlen=window_size)
        self.write_times = deque(maxlen=window_size)
        self.sleep_times = deque(maxlen=window_size)
        
        # Page counter
        self.page_num = 0
    
    def start_request(self):
        """Mark the start of an HTTP request."""
        self.req_start = time.time()
    
    def end_request(self):
        """Mark the end of an HTTP request and record timing."""
        if self.req_start is not None:
            elapsed = time.time() - self.req_start
            self.req_times.append(elapsed)
            self.req_start = None
            return elapsed
        return 0.0
    
    def start_write(self):
        """Mark the start of JSONL write operation."""
        self.write_start = time.time()
    
    def end_write(self):
        """Mark the end of JSONL write and record timing."""
        if self.write_start is not None:
            elapsed = time.time() - self.write_start
            self.write_times.append(elapsed)
            self.write_start = None
            return elapsed
        return 0.0
    
    def record_sleep(self, duration):
        """Record sleep duration."""
        self.sleep_times.append(duration)
    
    def get_rss_mb(self):
        """Get RSS memory usage in MiB (Linux only)."""
        try:
            with open('/proc/self/status', 'r') as f:
                for line in f:
                    if line.startswith('VmRSS:'):
                        # Extract KB value and convert to MiB
                        kb = int(line.split()[1])
                        return kb / 1024.0
        except:
            return None
    
    def get_file_size_mb(self, filepath):
        """Get file size in MiB."""
        try:
            if filepath.exists():
                return filepath.stat().st_size / (1024.0 * 1024.0)
        except:
            return None
    
    def print_metrics(self, offset, checkpoint_file_path):
        """Print compact performance metrics line."""
        self.page_num += 1
        
        if self.page_num % self.print_interval != 0:
            return
        
        # Calculate averages
        avg_req = sum(self.req_times) / len(self.req_times) if self.req_times else 0.0
        avg_write = sum(self.write_times) / len(self.write_times) if self.write_times else 0.0
        last_req = self.req_times[-1] if self.req_times else 0.0
        last_write = self.write_times[-1] if self.write_times else 0.0
        
        # Get resource metrics
        rss_mb = self.get_rss_mb()
        file_mb = self.get_file_size_mb(checkpoint_file_path)
        
        # Print compact line
        print(f"[perf] page={self.page_num} offset={offset:,} "
              f"req={last_req:.2f}s write={last_write:.2f}s "
              f"rss={rss_mb:.0f}MiB file={file_mb:.0f}MiB "
              f"avg_req{self.window_size}={avg_req:.2f}s "
              f"avg_write{self.window_size}={avg_write:.2f}s")
    
    def get_avg_req_time(self):
        """Get average request time over window."""
        return sum(self.req_times) / len(self.req_times) if self.req_times else 0.0


# ============================================================
# HELPER FUNCTIONS
# ============================================================
def extract_canonical_license(track):
    """
    Robust license extractor - checks multiple fields and nested structures.
    Returns: 'cc-by', 'cc-by-sa', or None (rejected)
    """
    license_url = None
    
    if track.get('license_ccurl'):
        license_url = track.get('license_ccurl')
    elif track.get('licensecurl'):
        license_url = track.get('licensecurl')
    elif track.get('licenses') and isinstance(track.get('licenses'), list) and len(track.get('licenses')) > 0:
        first_license = track['licenses'][0]
        if isinstance(first_license, dict):
            license_url = first_license.get('url') or first_license.get('ccurl')
        elif isinstance(first_license, str):
            license_url = first_license
    
    if not license_url or not isinstance(license_url, str):
        return None
    
    url_lower = license_url.lower().strip().rstrip('/')
    
    if 'creativecommons.org/licenses/' not in url_lower:
        return None
    
    parts = url_lower.split('creativecommons.org/licenses/')
    if len(parts) != 2:
        return None
    
    license_part = parts[1].split('/')[0]
    
    if 'nc' in license_part or 'nd' in license_part:
        return None
    
    if license_part == 'by':
        return 'cc-by'
    elif license_part == 'by-sa':
        return 'cc-by-sa'
    else:
        return None

def get_license_flags(track):
    """
    Extract cc, ccnc, ccnd flags from track.
    Returns tuple: (cc, ccnc, ccnd)
    """
    cc_val = track.get('cc')
    ccnc_val = track.get('ccnc')
    ccnd_val = track.get('ccnd')
    
    licenses_obj = track.get('licenses')
    if licenses_obj and isinstance(licenses_obj, dict):
        if cc_val is None:
            cc_val = licenses_obj.get('cc')
        if ccnc_val is None:
            ccnc_val = licenses_obj.get('ccnc')
        if ccnd_val is None:
            ccnd_val = licenses_obj.get('ccnd')
    
    if isinstance(cc_val, str):
        cc_val = cc_val.lower() == 'true'
    if isinstance(ccnc_val, str):
        ccnc_val = ccnc_val.lower() == 'true'
    if isinstance(ccnd_val, str):
        ccnd_val = ccnd_val.lower() == 'true'
    
    return cc_val, ccnc_val, ccnd_val

def serialize_track(track):
    """Serialize track to JSON string with optimal settings."""
    if USE_ORJSON:
        return orjson.dumps(track).decode('utf-8')
    elif ENABLE_COMPACT_JSON:
        return json.dumps(track, separators=(',', ':'))
    else:
        return json.dumps(track)

def fetch_with_retry(client, url, params, max_retries=MAX_RETRIES):
    """Fetch with exponential backoff retry on rate limit errors."""
    for attempt in range(max_retries):
        try:
            response = client.get(url, params=params)
            response.raise_for_status()
            return response.json()
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 429:
                wait_time = RETRY_DELAY * (2 ** attempt)
                print(f"\n‚ö† Rate limited (429). Waiting {wait_time}s before retry {attempt + 1}/{max_retries}...")
                time.sleep(wait_time)
                if attempt == max_retries - 1:
                    raise
            else:
                raise
        except httpx.TimeoutException:
            if attempt == max_retries - 1:
                raise
            wait_time = RETRY_DELAY * (2 ** attempt)
            print(f"\n‚ö† Timeout. Waiting {wait_time}s before retry {attempt + 1}/{max_retries}...")
            time.sleep(wait_time)
    
    raise RuntimeError(f"Failed after {max_retries} retries")

def load_checkpoint():
    """Load checkpoint state if exists."""
    if STATE_FILE.exists():
        with open(STATE_FILE, 'r') as f:
            state = json.load(f)
        print(f"‚úì Found checkpoint: resuming from offset {state['last_offset']:,} ({state['tracks_fetched']:,} tracks)")
        return state
    return None

def save_checkpoint(state):
    """Save checkpoint state."""
    with open(STATE_FILE, 'w') as f:
        json.dump(state, f, indent=2)

def count_existing_tracks():
    """Count tracks in checkpoint JSONL file (streaming, no memory load)."""
    if not CHECKPOINT_FILE.exists():
        return 0
    
    count = 0
    with open(CHECKPOINT_FILE, 'r') as f:
        for line in f:
            if line.strip():
                count += 1
    return count


# ============================================================
# MAIN FETCH LOGIC
# ============================================================
print("="*60)
print("PERFORMANCE OPTIMIZATIONS ENABLED")
print("="*60)
print(f"  Streaming mode: {ENABLE_STREAMING} (no in-memory all_tracks)")
print(f"  Batched writes: {ENABLE_BATCHED_WRITES} (write per page, not per track)")
print(f"  Compact JSON: {ENABLE_COMPACT_JSON} (reduced CPU and file size)")
print(f"  Adaptive pacing: {ENABLE_ADAPTIVE_PACING} (increase delay if slow)")
print(f"  Performance metrics: printed every {PERF_PRINT_INTERVAL} pages")
print()

# Create enhanced client with connection limits
client = httpx.Client(
    timeout=30,
    limits=httpx.Limits(max_connections=10, max_keepalive_connections=10)
)
checkpoint_file = None
perf = PerformanceMonitor(print_interval=PERF_PRINT_INTERVAL)

# Track small diagnostic samples in memory
diagnostic_samples = []

try:
    # Check for existing checkpoint
    checkpoint = load_checkpoint()
    
    if checkpoint:
        offset = checkpoint['last_offset']
        total_catalog_size = checkpoint['total_catalog_size']
        tracks_already_fetched = count_existing_tracks() if ENABLE_STREAMING else len(load_existing_tracks())
        print(f"Resuming fetch: {tracks_already_fetched:,} tracks already fetched")
        if ENABLE_STREAMING:
            all_tracks = []  # Empty in streaming mode
        else:
            all_tracks = load_existing_tracks()
    else:
        # Get total catalog size first
        print("Starting fresh fetch...")
        print("Fetching catalog size...")
        first_data = fetch_with_retry(client, f"{JAMENDO_API_BASE}/tracks/", {
            "client_id": JAMENDO_CLIENT_ID,
            "format": "json",
            "limit": 1,
            "offset": 0,
            "audiodownload": "true",
            "include": "licenses+musicinfo",
            "fullcount": "true"
        })
        
        total_catalog_size = first_data.get("headers", {}).get("results_fullcount", 0)
        offset = 0
        all_tracks = []  # Empty even in non-streaming mode for fresh start
        tracks_already_fetched = 0
        
        checkpoint = {
            'last_offset': 0,
            'tracks_fetched': 0,
            'total_catalog_size': total_catalog_size
        }
        save_checkpoint(checkpoint)
        CHECKPOINT_FILE.write_text('')
    
    print(f"Total catalog size: {total_catalog_size:,} tracks")
    total_pages = (total_catalog_size // PAGE_SIZE) + (1 if total_catalog_size % PAGE_SIZE else 0)
    print(f"Total pages to fetch: {total_pages:,}")
    print(f"Rate limit: {REQUEST_DELAY}s delay between requests")
    print(f"Checkpoint: saving every {CHECKPOINT_INTERVAL} pages")
    
    remaining_tracks = total_catalog_size - tracks_already_fetched
    if tracks_already_fetched > 0:
        progress_pct = 100 * tracks_already_fetched / total_catalog_size
        print(f"\nüìä Resume Status:")
        print(f"   Already fetched: {tracks_already_fetched:,} tracks ({progress_pct:.1f}%)")
        print(f"   Remaining: {remaining_tracks:,} tracks")
    print()
    
    # Open checkpoint file in append mode with large buffer
    buffering = 1024*1024 if ENABLE_BATCHED_WRITES else -1
    checkpoint_file = open(CHECKPOINT_FILE, 'a', buffering=buffering)
    page_count = 0
    
    # Fetch all tracks with progress bar
    with tqdm(total=total_catalog_size, 
              initial=tracks_already_fetched, 
              desc="Fetching tracks", 
              unit="track",
              bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]') as pbar:
        
        while offset < total_catalog_size:
            params = {
                "client_id": JAMENDO_CLIENT_ID,
                "format": "json",
                "limit": PAGE_SIZE,
                "offset": offset,
                "audiodownload": "true",
                "include": "licenses+musicinfo",
            }
            
            # === INSTRUMENTED REQUEST ===
            perf.start_request()
            data = fetch_with_retry(client, f"{JAMENDO_API_BASE}/tracks/", params)
            perf.end_request()
            
            tracks = data.get("results", [])
            if not tracks:
                print(f"\nNo more tracks at offset {offset}")
                break
            
            # === INSTRUMENTED WRITE ===
            perf.start_write()
            
            if ENABLE_BATCHED_WRITES:
                # Write entire page at once
                lines = ''.join(serialize_track(t) + '\n' for t in tracks)
                checkpoint_file.write(lines)
            else:
                # Legacy: write per track
                for track in tracks:
                    checkpoint_file.write(serialize_track(track) + '\n')
            
            perf.end_write()
            
            # Collect diagnostic samples (small footprint)
            if len(diagnostic_samples) < 20:
                diagnostic_samples.extend(tracks[:min(5, len(tracks))])
            
            # Update tracking
            if not ENABLE_STREAMING:
                all_tracks.extend(tracks)
            
            offset += len(tracks)
            page_count += 1
            pbar.update(len(tracks))
            
            # Save checkpoint periodically
            if page_count % CHECKPOINT_INTERVAL == 0:
                if not ENABLE_BATCHED_WRITES:
                    checkpoint_file.flush()
                else:
                    checkpoint_file.flush()  # Explicit flush at checkpoint
                checkpoint['last_offset'] = offset
                checkpoint['tracks_fetched'] = len(all_tracks) if not ENABLE_STREAMING else offset
                save_checkpoint(checkpoint)
            
            # Print performance metrics
            perf.print_metrics(offset, CHECKPOINT_FILE)
            
            # Adaptive pacing
            if ENABLE_ADAPTIVE_PACING:
                avg_req = perf.get_avg_req_time()
                if avg_req > 2.0 and REQUEST_DELAY < 2.0:
                    old_delay = REQUEST_DELAY
                    REQUEST_DELAY = min(REQUEST_DELAY * 1.25, 2.0)
                    print(f"[adaptive] Increased REQUEST_DELAY: {old_delay:.2f}s ‚Üí {REQUEST_DELAY:.2f}s (avg_req={avg_req:.2f}s)")
            
            # Rate limiting: instrumented sleep
            sleep_start = time.time()
            time.sleep(REQUEST_DELAY)
            actual_sleep = time.time() - sleep_start
            perf.record_sleep(actual_sleep)
    
    # Close checkpoint file
    checkpoint_file.close()
    checkpoint_file = None
    
    # Final checkpoint
    checkpoint['last_offset'] = offset
    checkpoint['tracks_fetched'] = len(all_tracks) if not ENABLE_STREAMING else offset
    save_checkpoint(checkpoint)
    
    print(f"\n‚úì Fetch complete!")
    print(f"Total tracks fetched: {offset:,}")
    
    # Save final JSON file (optional, only if not streaming or if user wants full dump)
    if not ENABLE_STREAMING and all_tracks:
        print(f"\nSaving final JSON file to {OUTPUT_FILE}...")
        with open(OUTPUT_FILE, 'w') as f:
            json.dump(all_tracks, f, indent=2)
        print(f"‚úì Saved full track info: {OUTPUT_FILE}")
        print(f"  File size: {OUTPUT_FILE.stat().st_size / 1024 / 1024:.1f} MB")
    
    # Clean up checkpoint files if fully complete
    if offset >= total_catalog_size:
        print("\nCleaning up checkpoint state files...")
        if STATE_FILE.exists():
            STATE_FILE.unlink()
            print(f"‚úì Removed {STATE_FILE_NAME}")
    
    print()
    
    # ============================================================
    # STREAMING LICENSE ANALYSIS
    # ============================================================
    print("="*60)
    print("ANALYZING LICENSES (Streaming Mode)" if ENABLE_STREAMING else "ANALYZING LICENSES (In-Memory Mode)")
    print("="*60)
    
    total_tracks = 0
    empty_license_count = 0
    content_id_restricted_count = 0
    cc_flags_condition_count = 0
    no_nc_nd_count = 0
    cc_flags_passed_examples = []
    no_nc_nd_examples = []
    passed_cc_filter = []
    rejected_examples = []
    
    if ENABLE_STREAMING:
        # Stream through JSONL file
        print("Streaming analysis from JSONL...")
        with open(CHECKPOINT_FILE, 'r') as f:
            for line in tqdm(f, desc="Analyzing licenses", unit="track"):
                if not line.strip():
                    continue
                
                track = json.loads(line)
                total_tracks += 1
                
                # Same analysis logic as before
                cc_val, ccnc_val, ccnd_val = get_license_flags(track)
                
                if cc_val == True and ccnc_val == False and ccnd_val == False:
                    cc_flags_condition_count += 1
                    if len(cc_flags_passed_examples) < 5:
                        licenses_obj = track.get('licenses', {})
                        cc_flags_passed_examples.append({
                            'id': track.get('id'),
                            'name': track.get('name'),
                            'cc': licenses_obj.get('cc') if isinstance(licenses_obj, dict) else track.get('cc'),
                            'ccnc': licenses_obj.get('ccnc') if isinstance(licenses_obj, dict) else track.get('ccnc'),
                            'ccnd': licenses_obj.get('ccnd') if isinstance(licenses_obj, dict) else track.get('ccnd'),
                            'license_url': track.get('license_ccurl') or track.get('licensecurl') or '(empty)'
                        })
                
                if ccnc_val == False and ccnd_val == False:
                    no_nc_nd_count += 1
                    if len(no_nc_nd_examples) < 5:
                        licenses_obj = track.get('licenses', {})
                        no_nc_nd_examples.append({
                            'id': track.get('id'),
                            'name': track.get('name'),
                            'cc': licenses_obj.get('cc') if isinstance(licenses_obj, dict) else track.get('cc'),
                            'ccnc': licenses_obj.get('ccnc') if isinstance(licenses_obj, dict) else track.get('ccnc'),
                            'ccnd': licenses_obj.get('ccnd') if isinstance(licenses_obj, dict) else track.get('ccnd'),
                            'license_url': track.get('license_ccurl') or track.get('licensecurl') or '(empty)'
                        })
                
                license_url = track.get('license_ccurl') or track.get('licensecurl') or ''
                if not license_url:
                    empty_license_count += 1
                    if len(rejected_examples) < 5:
                        rejected_examples.append({
                            'id': track.get('id'),
                            'name': track.get('name'),
                            'content_id_free': track.get('content_id_free'),
                            'license_url': '(empty)',
                            'reason': 'empty_license'
                        })
                    continue
                
                if track.get('content_id_free') == False:
                    content_id_restricted_count += 1
                    if len(rejected_examples) < 5:
                        rejected_examples.append({
                            'id': track.get('id'),
                            'name': track.get('name'),
                            'content_id_free': track.get('content_id_free'),
                            'license_url': license_url,
                            'reason': 'content_id_restricted'
                        })
                    continue
                
                canonical = extract_canonical_license(track)
                
                if canonical:
                    if len(passed_cc_filter) < 5:
                        passed_cc_filter.append({
                            'id': track.get('id'),
                            'name': track.get('name'),
                            'content_id_free': track.get('content_id_free'),
                            'license_url': license_url,
                            'canonical': canonical
                        })
                else:
                    if len(rejected_examples) < 5:
                        rejected_examples.append({
                            'id': track.get('id'),
                            'name': track.get('name'),
                            'content_id_free': track.get('content_id_free'),
                            'license_url': license_url,
                            'reason': 'license_not_cc_by_or_cc_by_sa'
                        })
    else:
        # Use in-memory tracks
        total_tracks = len(all_tracks)
        print(f"Analyzing {total_tracks:,} tracks from memory...")
        for track in tqdm(all_tracks, desc="Analyzing licenses", unit="track"):
            # Same analysis logic (omitted for brevity - identical to streaming version)
            pass
    
    # Print diagnostics
    print("\n" + "="*60)
    print("JAMENDO FULL CATALOG LICENSE DIAGNOSTICS")
    print("="*60)
    print(f"Total tracks in catalog: {total_tracks:,}")
    print(f"Tracks with cc==true && ccnc==false && ccnd==false: {cc_flags_condition_count:,} ({100*cc_flags_condition_count/total_tracks:.1f}%)")
    print(f"Tracks with ccnc==false && ccnd==false (any cc): {no_nc_nd_count:,} ({100*no_nc_nd_count/total_tracks:.1f}%)")
    print(f"Tracks with empty license URL: {empty_license_count:,} ({100*empty_license_count/total_tracks:.1f}%)")
    print(f"Tracks with content_id_free=false: {content_id_restricted_count:,} ({100*content_id_restricted_count/total_tracks:.1f}%)")
    
    # Show examples (same as before)
    if cc_flags_passed_examples:
        print("\n" + "="*60)
        print("üîç CC FLAGS CONDITION EXAMPLES (first 5)")
        print("="*60)
        for track in cc_flags_passed_examples:
            print(f"ID: {track['id']}")
            print(f"  Name: {track['name']}")
            print(f"  License URL: {track['license_url']}")
            print()
    
    if passed_cc_filter:
        print("\n" + "="*60)
        print("‚úÖ PASSED FILTER - CC-BY or CC-BY-SA (first 5)")
        print("="*60)
        for track in passed_cc_filter[:5]:
            print(f"ID: {track['id']}")
            print(f"  Name: {track['name']}")
            print(f"  License: {track['canonical'].upper()}")
            print(f"  URL: {track['license_url']}")
            print()

except KeyboardInterrupt:
    print("\n\n‚ö†Ô∏è  INTERRUPTED - Cleaning up resources...")
    if 'checkpoint' in locals() and 'offset' in locals():
        try:
            checkpoint['last_offset'] = offset
            checkpoint['tracks_fetched'] = len(all_tracks) if not ENABLE_STREAMING else offset
            save_checkpoint(checkpoint)
            print(f"‚úì Checkpoint saved at offset {offset:,}")
            print(f"‚úì You can resume by re-running this cell")
        except Exception as e:
            print(f"‚úó Failed to save checkpoint: {e}")
    
    if checkpoint_file is not None:
        try:
            checkpoint_file.flush()
            checkpoint_file.close()
            print("‚úì Checkpoint file closed")
        except:
            pass
    
    try:
        client.close()
        print("‚úì HTTP client closed")
    except:
        pass
    
    print("\nüõë Fetch interrupted. Progress has been saved.")
    sys.exit(0)

except Exception as e:
    print(f"\n‚úó Error: {e}")
    if 'checkpoint' in locals() and 'offset' in locals():
        try:
            checkpoint['last_offset'] = offset
            checkpoint['tracks_fetched'] = len(all_tracks) if not ENABLE_STREAMING else offset
            save_checkpoint(checkpoint)
            print(f"‚úì Checkpoint saved at offset {offset:,}")
        except:
            pass
    raise

finally:
    if checkpoint_file is not None:
        try:
            checkpoint_file.close()
        except:
            pass
    
    try:
        client.close()
    except:
        pass

PERFORMANCE OPTIMIZATIONS ENABLED
  Streaming mode: False (no in-memory all_tracks)
  Batched writes: True (write per page, not per track)
  Compact JSON: True (reduced CPU and file size)
  Adaptive pacing: False (increase delay if slow)
  Performance metrics: printed every 20 pages

‚úì Found checkpoint: resuming from offset 538,800 (538,800 tracks)
Resuming fetch: 538,800 tracks already fetched
Total catalog size: 848,767 tracks
Total pages to fetch: 4,244
Rate limit: 0.1s delay between requests
Checkpoint: saving every 50 pages

üìä Resume Status:
   Already fetched: 538,800 tracks (63.5%)
   Remaining: 309,967 tracks



Fetching tracks:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 542800/848767 [03:36<4:51:22]

[perf] page=20 offset=542,800 req=11.45s write=0.01s rss=3216MiB file=538MiB avg_req100=10.73s avg_write100=0.01s


Fetching tracks:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 546800/848767 [07:28<4:49:09]

[perf] page=40 offset=546,800 req=11.39s write=0.01s rss=3226MiB file=542MiB avg_req100=11.10s avg_write100=0.01s


Fetching tracks:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 550800/848767 [11:19<4:49:58]

[perf] page=60 offset=550,800 req=11.59s write=0.01s rss=3236MiB file=546MiB avg_req100=11.22s avg_write100=0.01s


Fetching tracks:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 554800/848767 [15:14<4:48:13]

[perf] page=80 offset=554,800 req=11.85s write=0.01s rss=3246MiB file=550MiB avg_req100=11.32s avg_write100=0.01s


Fetching tracks:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 558800/848767 [19:10<4:46:52]

[perf] page=100 offset=558,800 req=12.04s write=0.01s rss=3256MiB file=555MiB avg_req100=11.40s avg_write100=0.01s


Fetching tracks:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 562800/848767 [23:11<4:49:13]

[perf] page=120 offset=562,800 req=11.99s write=0.01s rss=3266MiB file=558MiB avg_req100=11.64s avg_write100=0.01s


Fetching tracks:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 566800/848767 [27:15<4:53:52]

[perf] page=140 offset=566,800 req=12.70s write=0.01s rss=3276MiB file=561MiB avg_req100=11.76s avg_write100=0.01s


Fetching tracks:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 570800/848767 [31:22<4:47:13]

[perf] page=160 offset=570,800 req=12.18s write=0.01s rss=3286MiB file=565MiB avg_req100=11.92s avg_write100=0.01s


Fetching tracks:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 574800/848767 [35:31<4:48:01]

[perf] page=180 offset=574,800 req=13.12s write=0.01s rss=3296MiB file=569MiB avg_req100=12.05s avg_write100=0.01s


Fetching tracks:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 578800/848767 [39:42<4:41:19]

[perf] page=200 offset=578,800 req=12.28s write=0.01s rss=3306MiB file=574MiB avg_req100=12.21s avg_write100=0.01s


Fetching tracks:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 582800/848767 [43:59<4:46:24]

[perf] page=220 offset=582,800 req=12.53s write=0.01s rss=3316MiB file=577MiB avg_req100=12.36s avg_write100=0.01s


Fetching tracks:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 586800/848767 [48:15<4:39:59]

[perf] page=240 offset=586,800 req=12.60s write=0.01s rss=3326MiB file=580MiB avg_req100=12.49s avg_write100=0.01s


Fetching tracks:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 590800/848767 [52:37<4:47:15]

[perf] page=260 offset=590,800 req=13.94s write=0.01s rss=3336MiB file=584MiB avg_req100=12.64s avg_write100=0.01s


Fetching tracks:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 594800/848767 [57:00<4:40:16]

[perf] page=280 offset=594,800 req=13.40s write=0.01s rss=3345MiB file=588MiB avg_req100=12.78s avg_write100=0.01s


Fetching tracks:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 598800/848767 [1:01:29<4:42:52]

[perf] page=300 offset=598,800 req=13.36s write=0.01s rss=3355MiB file=593MiB avg_req100=12.95s avg_write100=0.01s


Fetching tracks:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 602800/848767 [1:05:58<4:37:39]

[perf] page=320 offset=602,800 req=13.71s write=0.01s rss=3365MiB file=595MiB avg_req100=13.08s avg_write100=0.01s


Fetching tracks:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 606800/848767 [1:10:31<4:36:43]

[perf] page=340 offset=606,800 req=13.59s write=0.01s rss=3375MiB file=599MiB avg_req100=13.25s avg_write100=0.01s


Fetching tracks:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 610800/848767 [1:15:09<4:35:03]

[perf] page=360 offset=610,800 req=13.75s write=0.01s rss=3385MiB file=603MiB avg_req100=13.41s avg_write100=0.01s


Fetching tracks:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 614800/848767 [1:19:51<4:37:15]

[perf] page=380 offset=614,800 req=14.19s write=0.01s rss=3395MiB file=607MiB avg_req100=13.60s avg_write100=0.01s


Fetching tracks:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 618800/848767 [1:24:34<4:28:47]

[perf] page=400 offset=618,800 req=14.03s write=0.01s rss=3405MiB file=612MiB avg_req100=13.74s avg_write100=0.01s


Fetching tracks:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 622800/848767 [1:29:21<4:30:45]

[perf] page=420 offset=622,800 req=14.23s write=0.01s rss=3415MiB file=614MiB avg_req100=13.92s avg_write100=0.01s


Fetching tracks:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 626800/848767 [1:34:27<4:40:44]

[perf] page=440 offset=626,800 req=14.24s write=0.01s rss=3425MiB file=618MiB avg_req100=14.24s avg_write100=0.01s


Fetching tracks:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 629769/848767 [1:38:45<3:57:45]


No more tracks at offset 629769

‚úì Fetch complete!
Total tracks fetched: 629,769

Saving final JSON file to /root/workspace/data/jamendo/full_track_info.json...





‚úì Saved full track info: /root/workspace/data/jamendo/full_track_info.json
  File size: 733.2 MB

ANALYZING LICENSES (In-Memory Mode)
Analyzing 629,769 tracks from memory...


Analyzing licenses: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 629769/629769 [00:00<00:00, 1744446.50track/s]


JAMENDO FULL CATALOG LICENSE DIAGNOSTICS
Total tracks in catalog: 629,769
Tracks with cc==true && ccnc==false && ccnd==false: 0 (0.0%)
Tracks with ccnc==false && ccnd==false (any cc): 0 (0.0%)
Tracks with empty license URL: 0 (0.0%)
Tracks with content_id_free=false: 0 (0.0%)





In [None]:
# Test: Download a single audio file and extract comprehensive metadata
import httpx
import json
from pathlib import Path
import time
import subprocess
import hashlib

# Configuration
JAMENDO_CLIENT_ID = "48ecf016"
JAMENDO_API_BASE = "https://api.jamendo.com/v3.0"
TEST_DOWNLOAD_DIR = Path("/Users/cliftonwest/Documents/GitHub/Training/notebooks/jamendo/test_downloads")
TEST_DOWNLOAD_DIR.mkdir(exist_ok=True)

# Load tracks from checkpoint or final file
#CHECKPOINT_FILE = Path("/Users/cliftonwest/Documents/GitHub/Training/notebooks/jamendo/tracks_checkpoint.jsonl")
#OUTPUT_FILE = Path("/Users/cliftonwest/Documents/GitHub/Training/notebooks/jamendo/full_track_info.json")

def extract_audio_metadata_ffprobe(filepath):
    """
    Extract technical audio metadata using ffprobe.
    Returns dict with: sample_rate_hz, channels, bitrate, codec_name, duration_sec_actual
    """
    try:
        # Run ffprobe to get JSON output
        result = subprocess.run([
            'ffprobe',
            '-v', 'quiet',
            '-print_format', 'json',
            '-show_format',
            '-show_streams',
            str(filepath)
        ], capture_output=True, text=True, timeout=30)
        
        if result.returncode != 0:
            return {
                'sample_rate_hz': None,
                'channels': None,
                'bitrate': None,
                'codec_name': None,
                'duration_sec_actual': None,
                'error': 'ffprobe failed'
            }
        
        data = json.loads(result.stdout)
        
        # Find audio stream
        audio_stream = None
        for stream in data.get('streams', []):
            if stream.get('codec_type') == 'audio':
                audio_stream = stream
                break
        
        if not audio_stream:
            return {
                'sample_rate_hz': None,
                'channels': None,
                'bitrate': None,
                'codec_name': None,
                'duration_sec_actual': None,
                'error': 'no audio stream found'
            }
        
        # Extract format info
        format_info = data.get('format', {})
        
        return {
            'sample_rate_hz': int(audio_stream.get('sample_rate', 0)) if audio_stream.get('sample_rate') else None,
            'channels': audio_stream.get('channels'),
            'bitrate': int(format_info.get('bit_rate', 0)) if format_info.get('bit_rate') else None,
            'codec_name': audio_stream.get('codec_name'),
            'duration_sec_actual': float(format_info.get('duration', 0)) if format_info.get('duration') else None,
        }
    
    except subprocess.TimeoutExpired:
        return {
            'sample_rate_hz': None,
            'channels': None,
            'bitrate': None,
            'codec_name': None,
            'duration_sec_actual': None,
            'error': 'ffprobe timeout'
        }
    except Exception as e:
        return {
            'sample_rate_hz': None,
            'channels': None,
            'bitrate': None,
            'codec_name': None,
            'duration_sec_actual': None,
            'error': str(e)
        }

def analyze_audio_quality(filepath):
    """
    Analyze audio quality metrics using ffmpeg.
    Returns dict with: peak_dbfs, silence_ratio
    """
    try:
        # Use ffmpeg volumedetect filter to get peak volume
        result = subprocess.run([
            'ffmpeg',
            '-i', str(filepath),
            '-af', 'volumedetect',
            '-f', 'null',
            '-'
        ], capture_output=True, text=True, timeout=60)
        
        # Parse output for peak volume
        peak_dbfs = None
        for line in result.stderr.split('\n'):
            if 'max_volume:' in line:
                try:
                    # Extract value like "max_volume: -23.5 dB"
                    peak_dbfs = float(line.split(':')[1].strip().split()[0])
                except:
                    pass
        
        # Use ffmpeg silencedetect filter to detect silence
        result_silence = subprocess.run([
            'ffmpeg',
            '-i', str(filepath),
            '-af', 'silencedetect=noise=-50dB:d=0.1',
            '-f', 'null',
            '-'
        ], capture_output=True, text=True, timeout=60)
        
        # Parse silence detection output
        silence_duration = 0.0
        total_duration = 0.0
        
        for line in result_silence.stderr.split('\n'):
            if 'silence_duration:' in line:
                try:
                    duration = float(line.split('silence_duration:')[1].strip().split()[0])
                    silence_duration += duration
                except:
                    pass
            if 'Duration:' in line and total_duration == 0:
                try:
                    # Extract duration from "Duration: 00:03:45.67"
                    time_str = line.split('Duration:')[1].strip().split(',')[0].strip()
                    parts = time_str.split(':')
                    if len(parts) == 3:
                        hours, minutes, seconds = parts
                        total_duration = float(hours) * 3600 + float(minutes) * 60 + float(seconds)
                except:
                    pass
        
        silence_ratio = (silence_duration / total_duration) if total_duration > 0 else 0.0
        
        return {
            'peak_dbfs': peak_dbfs,
            'silence_ratio': silence_ratio
        }
    
    except subprocess.TimeoutExpired:
        return {
            'peak_dbfs': None,
            'silence_ratio': None,
            'error': 'quality analysis timeout'
        }
    except Exception as e:
        return {
            'peak_dbfs': None,
            'silence_ratio': None,
            'error': str(e)
        }

def compute_file_hash(filepath):
    """
    Compute SHA256 hash of file for deduplication.
    """
    try:
        sha256_hash = hashlib.sha256()
        with open(filepath, "rb") as f:
            # Read in chunks to handle large files
            for chunk in iter(lambda: f.read(8192), b""):
                sha256_hash.update(chunk)
        return sha256_hash.hexdigest()
    except Exception as e:
        return None

print("Loading track data...")
if CHECKPOINT_FILE.exists():
    # Load first track from checkpoint
    with open(CHECKPOINT_FILE, 'r') as f:
        first_line = f.readline()
        if first_line.strip():
            test_track = json.loads(first_line)
            print(f"‚úì Loaded track from checkpoint file")
        else:
            print("‚úó Checkpoint file is empty")
            test_track = None
elif OUTPUT_FILE.exists():
    # Load first track from final file
    with open(OUTPUT_FILE, 'r') as f:
        all_tracks = json.load(f)
        if all_tracks:
            test_track = all_tracks[0]
            print(f"‚úì Loaded track from final output file")
        else:
            print("‚úó Output file has no tracks")
            test_track = None
else:
    print("‚úó No track data found. Run the fetch cell first.")
    test_track = None

if test_track:
    print("\n" + "="*60)
    print("TEST TRACK INFO")
    print("="*60)
    print(f"Track ID: {test_track.get('id')}")
    print(f"Name: {test_track.get('name')}")
    print(f"Artist: {test_track.get('artist_name')}")
    print(f"Album: {test_track.get('album_name')}")
    print(f"Duration: {test_track.get('duration')}s ({test_track.get('duration')/60:.1f} minutes)")
    
    # Get download URL from API
    print("\n" + "="*60)
    print("FETCHING DOWNLOAD URL")
    print("="*60)
    
    track_id = test_track.get('id')
    
    client = httpx.Client(timeout=30, follow_redirects=True)
    try:
        # Request track info with audiodownload format
        response = client.get(f"{JAMENDO_API_BASE}/tracks/", params={
            "client_id": JAMENDO_CLIENT_ID,
            "format": "json",
            "id": track_id,
            "audiodownload": "true"
        })
        response.raise_for_status()
        track_data = response.json()
        
        if track_data.get("results"):
            track_info = track_data["results"][0]
            download_url = track_info.get("audiodownload")
            
            if download_url:
                print(f"‚úì Download URL obtained: {download_url}")
                
                # Download the file
                print("\n" + "="*60)
                print("DOWNLOADING AUDIO FILE")
                print("="*60)
                
                # Generate filename
                safe_name = "".join(c for c in test_track.get('name', 'track') if c.isalnum() or c in (' ', '-', '_')).strip()
                safe_artist = "".join(c for c in test_track.get('artist_name', 'artist') if c.isalnum() or c in (' ', '-', '_')).strip()
                filename = f"{track_id}_{safe_artist}_{safe_name}.mp3"
                filepath = TEST_DOWNLOAD_DIR / filename
                
                print(f"Downloading to: {filepath}")
                print(f"Starting download...")
                
                start_time = time.time()
                
                # Stream download with progress
                with client.stream("GET", download_url) as r:
                    r.raise_for_status()
                    total_size = int(r.headers.get('content-length', 0))
                    
                    with open(filepath, 'wb') as f:
                        downloaded = 0
                        for chunk in r.iter_bytes(chunk_size=8192):
                            f.write(chunk)
                            downloaded += len(chunk)
                            if total_size > 0:
                                progress = (downloaded / total_size) * 100
                                print(f"\rProgress: {downloaded:,} / {total_size:,} bytes ({progress:.1f}%)", end='')
                
                elapsed = time.time() - start_time
                file_size_mb = filepath.stat().st_size / 1024 / 1024
                
                print(f"\n\n‚úì Download complete!")
                print(f"  File: {filepath.name}")
                print(f"  Size: {file_size_mb:.2f} MB")
                print(f"  Time: {elapsed:.1f} seconds")
                print(f"  Speed: {file_size_mb/elapsed:.2f} MB/s")
                
                # ============================================================
                # POST-DOWNLOAD METADATA EXTRACTION
                # ============================================================
                
                print("\n" + "="*60)
                print("EXTRACTING AUDIO METADATA")
                print("="*60)
                
                # A) Technical metadata from ffprobe
                print("Running ffprobe analysis...")
                tech_metadata = extract_audio_metadata_ffprobe(filepath)
                
                # B) Quality checks from ffmpeg
                print("Running quality analysis...")
                quality_metadata = analyze_audio_quality(filepath)
                
                # C) SHA256 hash for deduplication
                print("Computing file hash...")
                file_hash = compute_file_hash(filepath)
                
                # D) Segment metadata (not implemented, so None)
                segment_metadata = {
                    'segment_id': None,
                    'segment_start_sec': None,
                    'segment_end_sec': None
                }
                
                # Compile complete metadata
                complete_metadata = {
                    # Original track info
                    'track_id': track_id,
                    'track_name': test_track.get('name'),
                    'artist_name': test_track.get('artist_name'),
                    'filename': filename,
                    'file_path': str(filepath),
                    'file_size_bytes': filepath.stat().st_size,
                    
                    # A) Technical metadata
                    **tech_metadata,
                    
                    # B) Quality metrics
                    **quality_metadata,
                    
                    # C) Deduplication
                    'sha256': file_hash,
                    
                    # D) Segment metadata
                    **segment_metadata
                }
                
                # Print results
                print("\n" + "="*60)
                print("COMPLETE METADATA")
                print("="*60)
                print(json.dumps(complete_metadata, indent=2))
                
                # Quality checks summary
                print("\n" + "="*60)
                print("QUALITY CHECKS SUMMARY")
                print("="*60)
                
                # Check for clipping (peak near 0 dBFS)
                clipping_detected = False
                if complete_metadata['peak_dbfs'] is not None:
                    if complete_metadata['peak_dbfs'] > -1.0:
                        clipping_detected = True
                        print(f"‚ö†Ô∏è  CLIPPING DETECTED: Peak = {complete_metadata['peak_dbfs']:.2f} dBFS")
                    else:
                        print(f"‚úì No clipping: Peak = {complete_metadata['peak_dbfs']:.2f} dBFS")
                else:
                    print("‚ö†Ô∏è  Could not determine peak level")
                
                # Check for high silence ratio
                high_silence = False
                if complete_metadata['silence_ratio'] is not None:
                    if complete_metadata['silence_ratio'] > 0.20:
                        high_silence = True
                        print(f"‚ö†Ô∏è  HIGH SILENCE RATIO: {complete_metadata['silence_ratio']:.1%} of audio is silent")
                    else:
                        print(f"‚úì Normal silence ratio: {complete_metadata['silence_ratio']:.1%}")
                else:
                    print("‚ö†Ô∏è  Could not determine silence ratio")
                
                # Summary counts (for single file)
                print("\n" + "="*60)
                print("SUMMARY COUNTS")
                print("="*60)
                print(f"Files analyzed: 1")
                print(f"Files with clipping: {1 if clipping_detected else 0}")
                print(f"Files with high silence (>20%): {1 if high_silence else 0}")
                
                print(f"\n‚úì Test download and metadata extraction successful!")
                
            else:
                print("‚úó No download URL found in track data")
                print(f"Available fields: {list(track_info.keys())}")
        else:
            print("‚úó No track data returned from API")
            
    except httpx.HTTPError as e:
        print(f"‚úó HTTP error: {e}")
    except Exception as e:
        print(f"‚úó Error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        client.close()
else:
    print("\nSkipping test - no track data available.")

In [None]:
%pip install -q httpx pandas pyarrow tqdm tenacity

In [None]:
import httpx
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type
)
from typing import Optional, Dict, List, Any
import time

In [None]:
def canonicalize_license(license_ccurl: Optional[str]) -> Optional[str]:
    """
    Parse Creative Commons license URL and return canonical form.
    
    Returns: 'cc-by', 'cc-by-sa', or None (for rejected licenses)
    
    Accepts:
    - CC-BY 3.0/4.0: http://creativecommons.org/licenses/by/3.0/
    - CC-BY-SA 3.0/4.0: http://creativecommons.org/licenses/by-sa/3.0/
    
    Rejects (returns None):
    - CC-BY-NC (NonCommercial)
    - CC-BY-ND (NoDerivatives)
    - CC-BY-NC-SA
    - CC-BY-NC-ND
    - Any other license
    """
    if not license_ccurl or not isinstance(license_ccurl, str):
        return None
    
    # Normalize URL
    url_lower = license_ccurl.lower().strip().rstrip('/')
    
    # Extract license type from URL pattern
    # Expected: http(s)://creativecommons.org/licenses/{type}/{version}/
    if 'creativecommons.org/licenses/' not in url_lower:
        return None
    
    # Extract type component
    parts = url_lower.split('creativecommons.org/licenses/')
    if len(parts) != 2:
        return None
    
    license_part = parts[1].split('/')[0]  # Get type before version
    
    # Strict allowlist matching
    if license_part == 'by':
        return 'cc-by'
    elif license_part == 'by-sa':
        return 'cc-by-sa'
    else:
        # Reject: by-nc, by-nd, by-nc-sa, by-nc-nd, etc.
        return None

# Unit tests
assert canonicalize_license("http://creativecommons.org/licenses/by/3.0/") == "cc-by"
assert canonicalize_license("http://creativecommons.org/licenses/by-sa/4.0/") == "cc-by-sa"
assert canonicalize_license("http://creativecommons.org/licenses/by-nc/3.0/") is None
assert canonicalize_license("http://creativecommons.org/licenses/by-nc-sa/3.0/") is None
assert canonicalize_license("http://creativecommons.org/licenses/by-nd/3.0/") is None
assert canonicalize_license(None) is None
assert canonicalize_license("") is None
print("‚úì License canonicalization tests passed")

In [None]:
class JamendoAPIClient:
    """Jamendo API client with retry logic and rate limiting."""
    
    def __init__(self, client_id: str, timeout: int = 30):
        self.client_id = client_id
        self.timeout = timeout
        self.client = httpx.Client(timeout=timeout)
        self.last_request_time = 0
        self.min_request_interval = 0.1  # 100ms between requests
    
    def _rate_limit(self):
        """Simple rate limiting: ensure minimum interval between requests."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_request_interval:
            time.sleep(self.min_request_interval - elapsed)
        self.last_request_time = time.time()
    
    @retry(
        stop=stop_after_attempt(RETRY_MAX_ATTEMPTS),
        wait=wait_exponential(multiplier=RETRY_BACKOFF_FACTOR, min=1, max=60),
        retry=retry_if_exception_type((httpx.TimeoutException, httpx.HTTPStatusError))
    )
    def fetch_tracks(
        self, 
        offset: int = 0, 
        limit: int = 200,
        include_fullcount: bool = False
    ) -> Dict[str, Any]:
        """
        Fetch tracks from Jamendo API with pagination.
        
        Returns: {"headers": {...}, "results": [...]}
        """
        self._rate_limit()
        
        params = {
            "client_id": self.client_id,
            "format": "json",
            "limit": limit,
            "offset": offset,
            "audiodownload": "true",  # Only downloadable tracks
        }
        
        if include_fullcount:
            params["fullcount"] = "true"
        
        url = f"{JAMENDO_API_BASE}/tracks/"
        
        try:
            response = self.client.get(url, params=params)
            response.raise_for_status()
            return response.json()
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 429:
                # Rate limit - tenacity will retry with backoff
                print(f"‚ö† Rate limited (429), will retry...")
                raise
            elif e.response.status_code >= 500:
                # Server error - tenacity will retry
                print(f"‚ö† Server error ({e.response.status_code}), will retry...")
                raise
            else:
                # Client error - don't retry
                print(f"‚úó Client error ({e.response.status_code}): {e}")
                raise RuntimeError(f"API error: {e.response.status_code}") from e
    
    def close(self):
        self.client.close()

# Test client initialization
api_client = JamendoAPIClient(JAMENDO_CLIENT_ID, timeout=REQUEST_TIMEOUT)
print("‚úì API client initialized")

In [None]:
def load_state() -> Dict[str, Any]:
    """Load checkpoint state or return initial state."""
    if STATE_FILE.exists():
        with open(STATE_FILE, 'r') as f:
            state = json.load(f)
        print(f"‚úì Loaded checkpoint: offset={state['last_offset']}, fetched={state['total_fetched']}")
        return state
    else:
        return {
            "last_offset": 0,
            "total_fetched": 0,
            "total_passed_filter": 0,
            "total_rejected": 0,
            "total_catalog_size": None,
            "start_time": datetime.utcnow().isoformat(),
            "last_update_time": None
        }

def save_state(state: Dict[str, Any]):
    """Save checkpoint state."""
    state["last_update_time"] = datetime.utcnow().isoformat()
    with open(STATE_FILE, 'w') as f:
        json.dump(state, f, indent=2)

# Load or initialize state
state = load_state()

In [None]:
def process_track(track: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Filter and extract metadata from a track.
    Returns None if track doesn't pass license filter.
    """
    license_url = track.get("license_ccurl")
    canonical_license = canonicalize_license(license_url)
    
    if canonical_license is None:
        return None
    
    # Extract relevant metadata
    return {
        "id": track.get("id"),
        "name": track.get("name"),
        "duration": track.get("duration"),  # in seconds
        "artist_id": track.get("artist_id"),
        "artist_name": track.get("artist_name"),
        "album_id": track.get("album_id"),
        "album_name": track.get("album_name"),
        "license": canonical_license,
        "license_ccurl": license_url,
        "releasedate": track.get("releasedate"),
        "audiodownload_allowed": track.get("audiodownload_allowed"),
    }

# Main crawl loop
offset = state["last_offset"]
page_count = 0
audit_passed = []
audit_rejected = []

# Open JSONL in append mode
jsonl_mode = 'a' if offset > 0 else 'w'
jsonl_file = open(JSONL_FILE, jsonl_mode)

try:
    # Get total catalog size on first request
    if state["total_catalog_size"] is None:
        print("Fetching catalog size...")
        first_response = api_client.fetch_tracks(offset=0, limit=1, include_fullcount=True)
        total_size = first_response["headers"].get("results_fullcount", 0)
        state["total_catalog_size"] = total_size
        print(f"Total catalog size: {total_size:,} tracks")
    
    total_size = state["total_catalog_size"]
    total_pages = (total_size // PAGE_SIZE) + (1 if total_size % PAGE_SIZE else 0)
    
    if MAX_PAGES > 0:
        total_pages = min(total_pages, MAX_PAGES)
        print(f"DRY RUN: Processing {total_pages} pages only")
    
    # Progress bar
    with tqdm(total=total_pages, initial=offset // PAGE_SIZE, desc="Crawling pages", unit="page") as pbar:
        while True:
            # Check if we've reached max pages (dry run mode)
            if MAX_PAGES > 0 and page_count >= MAX_PAGES:
                print(f"Reached MAX_PAGES limit ({MAX_PAGES})")
                break
            
            # Check if we've exhausted the catalog
            if offset >= total_size:
                print(f"Completed: processed all {total_size:,} tracks")
                break
            
            # Fetch page
            response = api_client.fetch_tracks(offset=offset, limit=PAGE_SIZE)
            results = response.get("results", [])
            
            if not results:
                print(f"No more results at offset {offset}")
                break
            
            # Process tracks
            for track in results:
                state["total_fetched"] += 1
                
                processed = process_track(track)
                if processed:
                    # Write to JSONL immediately (streaming)
                    jsonl_file.write(json.dumps(processed) + '\n')
                    state["total_passed_filter"] += 1
                    
                    # Collect audit samples
                    if len(audit_passed) < 10:
                        audit_passed.append({
                            "id": track.get("id"),
                            "name": track.get("name"),
                            "license_ccurl": track.get("license_ccurl"),
                            "canonical": processed["license"]
                        })
                else:
                    state["total_rejected"] += 1
                    
                    # Collect audit samples
                    if len(audit_rejected) < 10:
                        audit_rejected.append({
                            "id": track.get("id"),
                            "name": track.get("name"),
                            "license_ccurl": track.get("license_ccurl"),
                            "reason": "license_not_allowed"
                        })
            
            # Update state
            offset += len(results)
            state["last_offset"] = offset
            page_count += 1
            
            # Checkpoint periodically
            if page_count % CHECKPOINT_INTERVAL == 0:
                jsonl_file.flush()
                save_state(state)
            
            pbar.update(1)
            pbar.set_postfix({
                "passed": state["total_passed_filter"],
                "rejected": state["total_rejected"]
            })
    
    # Final checkpoint
    jsonl_file.flush()
    save_state(state)
    
finally:
    jsonl_file.close()
    api_client.close()

print("\n" + "="*60)
print("CRAWL COMPLETE")
print("="*60)
print(f"Total tracks fetched: {state['total_fetched']:,}")
print(f"Passed filter (CC-BY/CC-BY-SA): {state['total_passed_filter']:,}")
print(f"Rejected (NC/ND/other): {state['total_rejected']:,}")
print(f"Pass rate: {100 * state['total_passed_filter'] / state['total_fetched']:.1f}%")

In [None]:
print("\n" + "="*60)
print("AUDIT TRAIL - Sample Tracks")
print("="*60)

print("\n‚úì PASSED FILTER (CC-BY/CC-BY-SA):")
for i, track in enumerate(audit_passed, 1):
    print(f"{i}. [{track['id']}] {track['name']}")
    print(f"   License URL: {track['license_ccurl']}")
    print(f"   Canonical: {track['canonical']}")
    print()

print("\n‚úó REJECTED (NC/ND/other):")
for i, track in enumerate(audit_rejected, 1):
    print(f"{i}. [{track['id']}] {track['name']}")
    print(f"   License URL: {track['license_ccurl']}")
    print(f"   Reason: {track['reason']}")
    print()

In [None]:
# Load JSONL into DataFrame
print("Loading metadata for aggregation...")
df = pd.read_json(JSONL_FILE, lines=True)
print(f"Loaded {len(df):,} tracks")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
df.head()

In [None]:
# Summary statistics
total_tracks = len(df)
total_duration_seconds = df['duration'].sum()
total_hours = total_duration_seconds / 3600

summary = {
    "generated_at": datetime.utcnow().isoformat(),
    "source": "jamendo_api",
    "license_filter": list(ALLOWED_LICENSES),
    "total_tracks": int(total_tracks),
    "total_duration_seconds": float(total_duration_seconds),
    "total_duration_hours": float(total_hours),
    "total_catalog_tracks_fetched": state["total_fetched"],
    "pass_rate_percent": float(100 * state["total_passed_filter"] / state['total_fetched']),
    "crawl_start_time": state["start_time"],
    "crawl_end_time": state["last_update_time"],
}

# Save summary JSON
with open(SUMMARY_FILE, 'w') as f:
    json.dump(summary, f, indent=2)

print("="*60)
print("SUMMARY STATISTICS")
print("="*60)
print(f"Total tracks: {summary['total_tracks']:,}")
print(f"Total duration: {summary['total_duration_hours']:,.1f} hours")
print(f"Average track duration: {total_duration_seconds / total_tracks / 60:.1f} minutes")
print(f"Pass rate: {summary['pass_rate_percent']:.1f}%")
print(f"\n‚úì Saved: {SUMMARY_FILE}")

In [None]:
# Group by license type
license_stats = df.groupby('license').agg({
    'duration': ['count', 'sum']
}).reset_index()

license_stats.columns = ['license', 'track_count', 'total_duration_seconds']
license_stats['total_duration_hours'] = license_stats['total_duration_seconds'] / 3600
license_stats['percentage_of_tracks'] = 100 * license_stats['track_count'] / total_tracks

# Save CSV
license_stats.to_csv(LICENSE_CSV, index=False)

print("="*60)
print("BREAKDOWN BY LICENSE TYPE")
print("="*60)
print(license_stats.to_string(index=False))
print(f"\n‚úì Saved: {LICENSE_CSV}")

In [None]:
# Compute duration percentiles
duration_minutes = df['duration'] / 60

duration_stats = pd.DataFrame({
    'metric': ['mean', 'median (p50)', 'p90', 'p95', 'p99', 'min', 'max'],
    'duration_minutes': [
        duration_minutes.mean(),
        duration_minutes.quantile(0.50),
        duration_minutes.quantile(0.90),
        duration_minutes.quantile(0.95),
        duration_minutes.quantile(0.99),
        duration_minutes.min(),
        duration_minutes.max(),
    ]
})

# Save CSV
duration_stats.to_csv(DURATION_CSV, index=False)

print("="*60)
print("DURATION STATISTICS")
print("="*60)
print(duration_stats.to_string(index=False))
print(f"\n‚úì Saved: {DURATION_CSV}")

In [None]:
# Save as Parquet (more efficient for large datasets)
try:
    df.to_parquet(PARQUET_FILE, index=False, compression='snappy')
    print(f"‚úì Saved Parquet: {PARQUET_FILE}")
    print(f"  JSONL size: {JSONL_FILE.stat().st_size / 1024 / 1024:.1f} MB")
    print(f"  Parquet size: {PARQUET_FILE.stat().st_size / 1024 / 1024:.1f} MB")
except Exception as e:
    print(f"‚ö† Failed to save Parquet: {e}")

In [None]:
print("\n" + "="*60)
print("ALL OUTPUTS GENERATED")
print("="*60)
print(f"\nOutput directory: {OUTPUT_DIR}")
print(f"\nFiles created:")
print(f"  1. {JSONL_FILE.name} - Raw filtered metadata (append-only)")
print(f"  2. {SUMMARY_FILE.name} - Topline metrics")
print(f"  3. {LICENSE_CSV.name} - License breakdown")
print(f"  4. {DURATION_CSV.name} - Duration statistics")
print(f"  5. {STATE_FILE.name} - Checkpoint state (for resume)")
if PARQUET_FILE.exists():
    print(f"  6. {PARQUET_FILE.name} - Parquet archive (optional)")

print(f"\n‚úì Pipeline complete!")
print(f"\nTo resume crawl if interrupted:")
print(f"  - Re-run this notebook (it will resume from offset {state['last_offset']})")
print(f"\nTo start fresh:")
print(f"  - Delete {STATE_FILE} and {JSONL_FILE}")