## Get all artist info

Using Spotify API: https://developer.spotify.com/documentation/web-api/reference/get-an-artist

In [1]:
import requests
import json
import os
from dotenv import load_dotenv
from tqdm import tqdm
import re
import html

load_dotenv()

True

In [4]:
# Obtain an app-level access token using Spotify's client-credentials flow.
# https://developer.spotify.com/documentation/web-api/tutorials/client-credentials-flow

# Read the credentials from the environment; a missing variable raises
# KeyError right here instead of producing a confusing auth failure later.
SPOTIFY_CLIENT_ID = os.environ["SPOTIFY_CLIENT_ID"]
SPOTIFY_CLIENT_SECRET = os.environ["SPOTIFY_CLIENT_SECRET"]

endpoint = "https://accounts.spotify.com/api/token"

headers = {
    "Content-Type": "application/x-www-form-urlencoded"
}
data = {
    "grant_type": "client_credentials",
    "client_id": SPOTIFY_CLIENT_ID,
    "client_secret": SPOTIFY_CLIENT_SECRET
}
# A timeout keeps the cell from hanging forever if the token endpoint is unreachable.
res = requests.post(endpoint, data=data, headers=headers, timeout=10)
# Surface bad credentials / HTTP errors explicitly rather than as a bare
# KeyError on "access_token" in the next line.
res.raise_for_status()
API_TOKEN = res.json()["access_token"]

## Make 2.5k requests

In [36]:
SPOTIFY_ARTISTS_ENDPOINT = "https://api.spotify.com/v1/artists"
# Kept so that anything reading `endpoint` after this cell still works. The
# Last.fm section rebinds `endpoint`, which is why the function below uses the
# dedicated constant instead of this shared name.
endpoint = SPOTIFY_ARTISTS_ENDPOINT

def get_artist_data(id, timeout=10):
    """Fetch one artist's metadata from the Spotify Web API.

    Args:
        id: Spotify artist ID; appended to the artists endpoint URL.
        timeout: Seconds to wait for Spotify before giving up on the request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If Spotify answers with a 4xx/5xx status, so an
            error payload is never returned as if it were artist data.
        requests.RequestException: On connection failures or timeouts.
    """
    headers = {
        "Authorization": f"Bearer {API_TOKEN}"
    }
    res = requests.get(
        f"{SPOTIFY_ARTISTS_ENDPOINT}/{id}",
        headers=headers,
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of handing the caller an error body.
    res.raise_for_status()
    return res.json()

In [2]:
from utils import name2id, id2name, artist_freq_dict, top_artists

In [5]:
# Fetch Spotify metadata for every top artist, reusing what is already cached on disk.
# Hopefully Spotify doesn't ratelimit too harshly
# https://developer.spotify.com/documentation/web-api/concepts/rate-limits
# Should take ~5 min

try:
    with open("./data/2.5k_artist_data.json", encoding="utf8") as fin:
        artist_data = json.load(fin)
except FileNotFoundError:
    # First run: no cache file yet, so start from an empty cache.
    artist_data = {}

failed_artist_ids = []
for artist_id in tqdm(top_artists):
    if artist_id in artist_data:
        continue
    fetched = get_artist_data(artist_id)
    # Spotify's documented error responses are JSON objects with a top-level
    # "error" key. Caching one would make the `in artist_data` check above
    # skip this artist forever, so leave it out and retry on the next run.
    if isinstance(fetched, dict) and "error" in fetched:
        failed_artist_ids.append(artist_id)
        continue
    artist_data[artist_id] = fetched

if failed_artist_ids:
    print(f"{len(failed_artist_ids)} artists could not be fetched; re-run this cell to retry them.")

100%|██████████| 2500/2500 [00:00<00:00, 1668644.18it/s]


In [62]:
# Persist the artist cache to disk as indented JSON.
with open("./data/2.5k_artist_data.json", "w", encoding="utf8") as cache_file:
    serialized = json.dumps(artist_data, indent=2)
    cache_file.write(serialized)

## Retrieve artist info from Last.fm

Specifically, artist bios.

In [6]:
LASTFM_ENDPOINT = "https://ws.audioscrobbler.com/2.0"
# Kept so that anything reading `endpoint` after this cell still works.
endpoint = LASTFM_ENDPOINT

# Raw string avoids the invalid "\/" escape; DOTALL lets the summary span
# multiple lines; the non-greedy group stops at the first closing tag.
_SUMMARY_RE = re.compile(r"<summary>(.*?)</summary>", re.DOTALL)

def get_artist_bio(id, timeout=10):
    """Fetch an artist's bio summary from Last.fm.

    Args:
        id: Artist ID; looked up in `id2name` to get the name sent to Last.fm.
        timeout: Seconds to wait for Last.fm before giving up on the request.

    Returns:
        The text inside the response's <summary> element with HTML entities
        unescaped, or "" when the response contains no <summary> element.

    Raises:
        KeyError: If `id` is not in `id2name`, or LASTFM_API_KEY is not set.
        requests.RequestException: On connection failures or timeouts.
    """
    artist_name = id2name[id]
    params = {
        "method": "artist.getinfo",
        "artist": artist_name,
        "api_key": os.environ["LASTFM_API_KEY"]
    }
    res = requests.get(LASTFM_ENDPOINT, params=params, timeout=timeout)

    # The response body is searched as text rather than parsed; any response
    # without a <summary> element is treated as "no bio available".
    match = _SUMMARY_RE.search(res.content.decode())
    if match is None:
        return ""

    return html.unescape(match[1])

In [7]:
# Attach a Last.fm bio to each artist record. Bios are stored on the record,
# so re-running this cell only requests artists without a non-empty bio
# (empty bios are retried, since "" is also what a failed lookup produces).
for artist_id in tqdm(top_artists):
    record = artist_data.get(artist_id)
    if record is None:
        # No record for this artist, so there is nothing to attach a bio to.
        continue
    if record.get("bio"):
        continue
    record["bio"] = get_artist_bio(artist_id)

100%|██████████| 2500/2500 [08:39<00:00,  4.81it/s]


In [8]:
# Write the artist records (now including bios) back to the JSON cache file.
with open("./data/2.5k_artist_data.json", mode="w", encoding="utf8") as out_file:
    out_file.write(json.dumps(artist_data, indent=2))