In [3]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import time
import json
import os
from collections import Counter

# ==========================================
# Environment Setup
# ==========================================
# Clear proxy settings to ensure direct connection
os.environ.pop('HTTP_PROXY', None)
os.environ.pop('HTTPS_PROXY', None)

# ==========================================
# Authentication
# ==========================================
CLIENT_ID = "8e3cc5b22a844460948dfb979072658d"
CLIENT_SECRET = "b3756a9b49a648bba3d8b24d58fe1dc5"
REDIRECT_URI = "http://127.0.0.1:9091/callback"

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    redirect_uri=REDIRECT_URI,
    scope="user-library-read playlist-read-private"
), requests_timeout=30)

# Configuration
TARGET_MAX_SONGS = 8000
all_tracks_dict = {}

def save_checkpoint(data, filename='spotify_data_checkpoint.json'):
    """Saves the current data to a JSON file."""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(list(data.values()), f, ensure_ascii=False, indent=2)
        print(f"[Checkpoint] Saved {len(data)} records.")
    except Exception as e:
        print(f"[Error] Failed to save checkpoint: {e}")

def process_track_item(track_obj):
    """
    Parses a Spotify track object into the required JSON structure.
    """
    # Validation
    if not track_obj or not track_obj.get('id') or not track_obj.get('name'):
        return None

    # Extract Artists
    artists_data = []
    for artist in track_obj.get('artists', []):
        artists_data.append({
            "name": artist.get('name'),
            "id": artist.get('id')
        })

    # Extract Album
    album_obj = track_obj.get('album', {})
    album_data = {
        "name": album_obj.get('name'),
        "id": album_obj.get('id'),
        "release_date": album_obj.get('release_date'),
        "images": album_obj.get('images', [])
    }

    # Construct Document
    return {
        "_id": track_obj.get('id'),
        "name": track_obj.get('name'),
        "duration_ms": track_obj.get('duration_ms'),
        "popularity": track_obj.get('popularity', 0),
        "spotify_url": track_obj.get('external_urls', {}).get('spotify'),
        "preview_url": track_obj.get('preview_url'),
        
        # New Field: Genres (initially empty, populated in Phase 3)
        "genres": [],

        "album": album_data,
        "artists": artists_data
    }

# ==========================================
# Phase 1: Fetch Liked Songs
# ==========================================
print("--- [Phase 1] Fetching Liked Songs ---")

try:
    results = sp.current_user_saved_tracks(limit=50)
    artist_counter = Counter()

    while results:
        for item in results['items']:
            track = item['track']
            clean_track = process_track_item(track)
            
            if clean_track:
                all_tracks_dict[clean_track['_id']] = clean_track
                
                # Count artists to determine favorites
                if track.get('artists'):
                    main_artist_id = track['artists'][0]['id']
                    artist_counter[main_artist_id] += 1
        
        if results['next']:
            results = sp.next(results)
            time.sleep(0.1) 
        else:
            results = None

except Exception as e:
    print(f"[Error] Phase 1 failed: {e}")

print(f"-> Phase 1 Complete. Collected {len(all_tracks_dict)} liked songs.")

# ==========================================
# Phase 2: Extend Data (Artist Albums)
# ==========================================
print(f"--- [Phase 2] Extending Dataset (Target: {TARGET_MAX_SONGS}) ---")

# Get list of artists sorted by popularity (frequency in library)
sorted_artists = [aid for aid, count in artist_counter.most_common()]

for i, artist_id in enumerate(sorted_artists):
    if len(all_tracks_dict) >= TARGET_MAX_SONGS:
        print("-> Target limit reached. Stopping extension.")
        break

    try:
        # Get albums
        albums = sp.artist_albums(artist_id, album_type='album,single', limit=20)
        
        for alb in albums['items']:
            if len(all_tracks_dict) >= TARGET_MAX_SONGS: break

            # Get tracks
            tracks = sp.album_tracks(alb['id'], limit=50)
            
            for t in tracks['items']:
                if t['id'] in all_tracks_dict: continue

                t['album'] = alb
                t['popularity'] = 0 
                
                clean_track = process_track_item(t)
                if clean_track:
                    all_tracks_dict[clean_track['_id']] = clean_track

            time.sleep(0.1)

        if i % 10 == 0:
            print(f"   Processed {i} artists. Total songs: {len(all_tracks_dict)}")

    except Exception as e:
        print(f"[Warning] Error processing artist {artist_id}: {e}")
        time.sleep(1)

print(f"-> Phase 2 Complete. Total tracks: {len(all_tracks_dict)}")

# ==========================================
# Phase 3: Fetch Genres (via Artist Details)
# ==========================================
print("--- [Phase 3] Fetching Genres from Artist Profiles ---")

# 1. Collect all unique Artist IDs from the collected tracks
all_artist_ids = set()
for track in all_tracks_dict.values():
    for artist in track['artists']:
        if artist.get('id'):
            all_artist_ids.add(artist['id'])

unique_artists_list = list(all_artist_ids)
print(f"   Found {len(unique_artists_list)} unique artists. Fetching their genres...")

# 2. Batch fetch artist details (Spotify allows 50 artists per request)
artist_genre_map = {}
batch_size = 50

for i in range(0, len(unique_artists_list), batch_size):
    batch_ids = unique_artists_list[i : i + batch_size]
    
    try:
        artists_info = sp.artists(batch_ids)
        
        for artist_obj in artists_info['artists']:
            if artist_obj:
                # Map Artist ID to their Genres list
                artist_genre_map[artist_obj['id']] = artist_obj.get('genres', [])
        
        print(f"   Fetched artists {i} to {i+len(batch_ids)}...")
        
    except Exception as e:
        print(f"   [Error] Failed to fetch artist batch {i}: {e}")
        time.sleep(2)
    
    time.sleep(0.5) # Rate limit protection

# 3. Populate genres back into the tracks
print("   Mapping genres to tracks...")
for track_id, track_data in all_tracks_dict.items():
    track_genres = set()
    
    # Check every artist on the track
    for artist in track_data['artists']:
        a_id = artist['id']
        # If we found genres for this artist, add them
        if a_id in artist_genre_map:
            for g in artist_genre_map[a_id]:
                track_genres.add(g)
    
    # Convert set back to list
    track_data['genres'] = list(track_genres)

# ==========================================
# Final Save
# ==========================================
final_filename = 'final_spotify_data_genres.json'
with open(final_filename, 'w', encoding='utf-8') as f:
    json.dump(list(all_tracks_dict.values()), f, ensure_ascii=False, indent=2)

print(f"--- Success! Saved {len(all_tracks_dict)} tracks to {final_filename} ---")

--- [Phase 1] Fetching Liked Songs ---
-> Phase 1 Complete. Collected 1277 liked songs.
--- [Phase 2] Extending Dataset (Target: 8000) ---
   Processed 0 artists. Total songs: 1628
   Processed 10 artists. Total songs: 3617
   Processed 20 artists. Total songs: 4972
   Processed 30 artists. Total songs: 6524
   Processed 40 artists. Total songs: 8000
-> Target limit reached. Stopping extension.
-> Phase 2 Complete. Total tracks: 8000
--- [Phase 3] Fetching Genres from Artist Profiles ---
   Found 944 unique artists. Fetching their genres...
   Fetched artists 0 to 50...
   Fetched artists 50 to 100...
   Fetched artists 100 to 150...
   Fetched artists 150 to 200...
   Fetched artists 200 to 250...
   Fetched artists 250 to 300...
   Fetched artists 300 to 350...
   Fetched artists 350 to 400...
   Fetched artists 400 to 450...
   Fetched artists 450 to 500...
   Fetched artists 500 to 550...
   Fetched artists 550 to 600...
   Fetched artists 600 to 650...
   Fetched artists 650 to 70