In [None]:
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time

# Spotify API credentials; both must be filled in before the script can run.
client_id = ""  # Enter your client ID here if you want to try this script
client_secret = ""  # Enter your client Secret here if you want to try this script

# Build the shared Spotify client using the client-credentials manager.
credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=credentials_manager)

# Table of artists to crawl; the processing loop reads its 'artist_ids' column.
artist_df = pd.read_csv('../data/sorted_artist_ids.csv')

def get_batch_track_features(track_ids, artist_id):
    """Fetch track metadata and audio features and merge them into flat rows.

    The IDs are processed in chunks of ``track_batch_size``. For every chunk
    one ``sp.audio_features`` request and one ``sp.tracks`` request are made,
    and the two responses are paired positionally.

    Args:
        track_ids: Sequence of Spotify track IDs.
        artist_id: Not used by this function; kept so existing call sites
            remain valid.

    Returns:
        A list of dicts, one per track for which both a track object and an
        audio-features object were returned. Each dict holds ``song_name``,
        ``artist``, ``artist_id``, ``popularity``, the audio-feature values
        and ``release_year`` (``None`` when no release date is available).

    Side effects:
        Updates the module-level ``audio_features_requests`` and
        ``total_api_requests`` counters, and sleeps for ``cooldown_time``
        seconds once ``audio_features_requests`` reaches
        ``cooldown_threshold``.
    """
    global audio_features_requests, total_api_requests

    # Audio-feature fields copied verbatim into each output row, in this order.
    audio_feature_keys = (
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'time_signature',
    )

    output_data = []

    for start in range(0, len(track_ids), track_batch_size):
        tracks_batch = track_ids[start:start + track_batch_size]

        # ">=" rather than "==" so the pause still triggers if the counter
        # ever overshoots the threshold.
        if audio_features_requests >= cooldown_threshold:
            print(f"Cooldown for {cooldown_time} seconds.")
            time.sleep(cooldown_time)
            audio_features_requests = 0

        audio_features_list = sp.audio_features(tracks_batch)
        audio_features_requests += 1
        total_api_requests += 1

        track_infos = sp.tracks(tracks_batch)['tracks']
        # The track lookup is an API request as well, so it belongs in the
        # overall total (the per-endpoint counter is kept by the caller).
        total_api_requests += 1

        for track_info, audio_features in zip(track_infos, audio_features_list):
            # Spotify returns null in place of any ID it cannot resolve, for
            # both endpoints; skip rows where either half is missing.
            if track_info is None or audio_features is None:
                continue

            track_feature = {
                'song_name': track_info['name'],
                'artist': track_info['artists'][0]['name'],
                'artist_id': track_info['artists'][0]['id'],
                'popularity': track_info.get('popularity', None),
            }
            for feature_key in audio_feature_keys:
                track_feature[feature_key] = audio_features[feature_key]

            # "album" may be absent or null; release dates look like
            # "YYYY", "YYYY-MM" or "YYYY-MM-DD", so the year is the first part.
            album = track_info.get('album') or {}
            release_date = album.get('release_date')
            if release_date:
                track_feature['release_year'] = int(release_date.split('-')[0])
            else:
                track_feature['release_year'] = None

            output_data.append(track_feature)

    return output_data

# --- Run configuration -------------------------------------------------------
artist_limit = 150   # maximum number of artists processed in one run
start_index = 370    # row of artist_df at which this run starts

artist_batch_size = 50
album_batch_size = 10   # albums/singles requested (and processed) per artist
track_batch_size = 50   # track IDs per batch; Spotify's several-tracks endpoint accepts at most 50
output_data = []        # accumulated track rows, consumed when saving to CSV

# --- Request counters --------------------------------------------------------
# total_api_requests only ever grows; the per-endpoint counters are reset to 0
# after each cooldown, so at the end they show requests since the last pause.
total_api_requests = 0
tracks_requests = 0
artists_requests = 0
audio_features_requests = 0
artist_albums_requests = 0
album_requests = 0

cooldown_threshold = 150   # requests per endpoint before pausing
cooldown_time = 30         # pause length in seconds


def _cooldown_if_needed(request_count):
    """Pause when `request_count` has reached the threshold; return the new count.

    Returns 0 after sleeping for `cooldown_time` seconds, otherwise returns
    `request_count` unchanged.
    """
    if request_count >= cooldown_threshold:
        print(f"Cooldown for {cooldown_time} seconds.")
        time.sleep(cooldown_time)
        return 0
    return request_count


print(f"Total Artists in database: {len(artist_df)}")
processed_artists = 0
for i in range(start_index, len(artist_df), artist_batch_size):

    if processed_artists >= artist_limit:
        break

    batch_artist_df = artist_df.iloc[i:i + artist_batch_size]
    print(f"Processing artist batch {i // artist_batch_size + 1}/{len(artist_df) // artist_batch_size}")

    for _, row in batch_artist_df.iterrows():
        if processed_artists >= artist_limit:
            break

        processed_artists += 1
        artist_id = row['artist_ids']

        artist_albums_requests = _cooldown_if_needed(artist_albums_requests)

        print(f"Processing artist ID: {artist_id}")
        top_albums = sp.artist_albums(artist_id, album_type='album,single', limit=album_batch_size)['items']

        total_api_requests += 1
        artist_albums_requests += 1

        time.sleep(0.5)

        if not top_albums:
            # Nothing to collect for this artist, so skip the genre lookup too.
            continue

        # The artist's genres are the same for every album, so fetch them once
        # per artist instead of once per album.
        artists_requests = _cooldown_if_needed(artists_requests)
        artist_info = sp.artist(artist_id)
        artists_requests += 1
        total_api_requests += 1

        # Keep at most three genres and pad with None so all three columns exist.
        artist_genres = (artist_info.get('genres') or [])[:3]
        genre_1, genre_2, genre_3 = artist_genres + [None] * (3 - len(artist_genres))

        for album in top_albums[:album_batch_size]:
            album_requests = _cooldown_if_needed(album_requests)

            print(f"Processing album ID: {album['id']}")
            album_info = sp.album(album['id'])
            album_requests += 1
            total_api_requests += 1

            album_track_ids = [track['id'] for track in album_info['tracks']['items']]

            for start in range(0, len(album_track_ids), track_batch_size):
                track_ids_batch = album_track_ids[start:start + track_batch_size]

                tracks_requests = _cooldown_if_needed(tracks_requests)

                track_features_batch = get_batch_track_features(track_ids_batch, artist_id)

                tracks_requests += 1

                # Attach the artist-level genres to every track row.
                for track_feature in track_features_batch:
                    track_feature['genre_1'] = genre_1
                    track_feature['genre_2'] = genre_2
                    track_feature['genre_3'] = genre_3

                output_data.extend(track_features_batch)

                # Short fixed pause between track batches.
                time.sleep(0.5)

print(f"Total API Requests: {total_api_requests}")
print(f"Artist Albums API Requests: {artist_albums_requests}")
print(f"Albums API Requests: {album_requests}")
print(f"Audio Features API Requests: {audio_features_requests}")
print(f"Tracks Requests: {tracks_requests}")
print(f"Artists Requests: {artists_requests}")



In [5]:

# Append the rows collected in this run to the main CSV database, creating
# the file from scratch when it does not exist yet.
main_db_path = '../data/mainDB.csv'

output_df = pd.DataFrame(output_data)

try:
    previous_rows = pd.read_csv(main_db_path)
except FileNotFoundError:
    # First run: there is nothing to append to.
    existing_df = output_df
else:
    existing_df = pd.concat([previous_rows, output_df], ignore_index=True)

existing_df.to_csv(main_db_path, index=False)
