In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os
from dotenv import load_dotenv
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
import json
import numpy as np
import pandas as pd



In [2]:
def get_client():
    load_dotenv(dotenv_path=".env")
    client_id = os.getenv("SPOTIFY_CLIENT_ID")
    client_secret = os.getenv("SPOTIFY_CLIENT_SECRET")

    auth_manager = SpotifyClientCredentials(
        client_id=client_id, client_secret=client_secret, requests_timeout=5
    )
    sp = spotipy.Spotify(auth_manager=auth_manager)
    return sp
sp = get_client()

In [3]:
with open('components/id_to_ind.json', 'r') as f:
    artist_map = json.load(f)

In [4]:
artists = list(artist_map.keys())

In [5]:
artists

['4q3ewBCX7sLwd24euuV69X',
 '3TVXtAsR1Inumwj472S9r4',
 '06HL4z0CvFAxyc27GXpf02',
 '1Xyo4u8uXC1ZmMpatF05PJ',
 '5K4W6rqBFWDnAN6FQUkS6x',
 '3Nrfpe0tUJi4K4DXYWgMUX',
 '1uNFoZAHBGtllmzznpCI3s',
 '6eUKZXaKkcviH0Ku9w2n3V',
 '7dGJo4pcD2V6oG8kP0tJRR',
 '246dkjvS1zLTtiykXe5h60',
 '1mcTU81TzQhprhouKaTkpq',
 '5f7VJjfbwm532GiveGC0ZK',
 '4MCBfE4596Uoi2O4DtmEMz',
 '66CXWjxzNUsdJxJ2JdwvnR',
 '6M2wZ9GZgrQXHCFfjv46we',
 '53XhwfbYqKCa1cC15pYq2q',
 '4gzpq5DPGxSnKTe4SA8HAU',
 '2YZyLoL8N0Wb9xBt1NhZWg',
 '6vWDO969PvNqNYHIOW5v0m',
 '1vyhD5VmyZ7KMfW5gqLgo5',
 '2LRoIwlKmHjgvigdNGBHNo',
 '1Cs0zKBU1kc0i8ypK3B9ai',
 '5cj0lLjcoR7YOSnhnX0Po5',
 '1RyvyyTE3xzB2ZywiAwp0i',
 '6qqNVTkY8uBg9cP3Jd7DAH',
 '4VMYDCV2IEDYJArk749S6m',
 '15UsOTVnJzReFVN1VCnxy4',
 '0du5cEVh5yTK9QJze8zA0C',
 '41MozSoPIsD1dJM0CLPjZF',
 '5pKCCKE2ajJHZ9KAiaK11H',
 '790FomKkXshlbRYZFtlgla',
 '716NhGYqD1jl2wI1Qkgq36',
 '00FQb4jTyendYWaN8pK0wa',
 '7bXgB6jMjp9ATFy66eO08Z',
 '0Y5tJX1MQlPlqiwlOH1tJY',
 '0hCNtLu0JehylgoiP8L4Gh',
 '1i8SpTcr7yvPOmcqrbnVXY',
 

In [6]:
# normalize: loudness, tempo, duration
# 4, 10, 11
# remove: type, id, uri, track_href, analysis_url
def get_median_audio_features_for_artists(artists):
    # map from id -> features
    id_to_features = defaultdict(dict)
    tracks = set()
    artists_to_ids = defaultdict(list)

    def get_top_track_ids(aid):
        nonlocal tracks, artists_to_ids
        resp = sp.artist_top_tracks(aid)
        ids = []
        for track in resp["tracks"]:
            id_ = track["id"]
            tracks.add(id_)
            artists_to_ids[aid].append(id_)
    
        return ids

    def get_audio_features(ids):
        nonlocal id_to_features
        resp = sp.audio_features(ids)
        for i in range(0, len(ids)):
            id_to_features[ids[i]] = resp[i]

    i = 0
    for a in artists:
        if i % 50 == 0: print("Artist #: ", i)
        get_top_track_ids(a)
        i += 1

    tracks = list(tracks)
    print(len(tracks)) # should be in the 29000 range
    chunk_size = 100
    for i in range(0, len(tracks), chunk_size):
        if (i % 1000) == 0: print("Feature analysis: ", i)
        track_chunk = tracks[i:i + chunk_size]
        get_audio_features(track_chunk)

    artist_to_features = defaultdict(list)
    for a in artists:
        features = []
        for id_ in artists_to_ids[a]:
            features_dic = id_to_features[id_]
            if features_dic is None: continue
            pop_keys = ["type", "id", "uri", "track_href", "analysis_url"]
            for p in pop_keys:
                if features_dic is None or p not in features_dic.keys(): continue
                features_dic.pop(p)
            features.append(list(features_dic.values()))
        arr = np.array(features)
        meds = np.median(arr, axis=0)
        artist_to_features[a] = meds.tolist()

    rows = [[key] + value for key, value in artist_to_features.items()]
    df = pd.DataFrame(rows)
    df = df.iloc[:, 1:].reset_index(drop=True)
    scaling_columns = [4, 11, 12]
    scaler = MinMaxScaler()
    df[scaling_columns] = scaler.fit_transform(df[scaling_columns])
    features_mat = df.to_numpy()

    np.save("components/Xaudiofeatures.npy", features_mat)
    return features_mat

Xaudiofeatures = get_median_audio_features_for_artists(artists)

Artist #:  0


KeyboardInterrupt: 

In [None]:
Xartist = np.load("components/Xartist.npy")
X_artist_2_cols = Xartist[:, :2]

In [None]:
Xfeatures = np.concatenate((X_artist_2_cols, Xaudiofeatures), axis=1)

In [None]:
np.save("components/Xfeatures.npy", Xfeatures)