In [11]:
import json
import sys
from collections import defaultdict
from tqdm import tqdm

In [108]:
# Location of the challenge-set JSON file (a file path, despite the variable name).
data_dir = "./data/spotify_million_playlist_dataset_challenge/challenge_set.json"

# Parse the whole file into memory; later cells read data["playlists"].
with open(data_dir, mode="r", encoding="utf8") as fin:
    data = json.loads(fin.read())

## Data preprocessing

Goal: get a list of the top 1,000 artists (ranked by how many tracks they have across all playlists) and store it in an external file.

In [41]:
def uri2id(uri):
    """Return the part of `uri` after its last ":" (the whole string if it has none)."""
    _, _, tail = uri.rpartition(":")
    return tail

# Lookup tables between artist IDs and artist names; both start out empty here.
id2name, name2id = {}, {}

In [116]:
# Tally track appearances per artist ID over every playlist, recording the
# ID <-> name lookups in the same pass.
artist_freq_dict = defaultdict(int)
for playlist in tqdm(data["playlists"]):
    for track in playlist["tracks"]:
        artist_id, artist_name = uri2id(track["artist_uri"]), track["artist_name"]

        # If an ID or a name shows up more than once, the latest pairing wins.
        id2name[artist_id] = artist_name
        name2id[artist_name] = artist_id
        artist_freq_dict[artist_id] += 1

print(f"{len(artist_freq_dict)} unique artists")

100%|██████████| 10000/10000 [00:01<00:00, 8725.26it/s]

14098 unique artists





In [117]:
# Persist both lookup tables in one JSON document, under the keys
# "id2name" and "name2id".
with open("./data/name_id_maps.json", mode="w", encoding="utf8") as fout:
    json.dump(dict(id2name=id2name, name2id=name2id), fout, indent=2)

In [120]:
# Number of most-frequent artists to keep.
N = 1000

# (artist_id, count) pairs in descending count order, cut off after the first N.
artist_freq = sorted(artist_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:N]
# IDs of those artists, held in a set for fast membership tests.
top_artists = {artist_id for artist_id, _count in artist_freq}

## Cross-artist frequencies

In [121]:
# Build a symmetric artist-by-artist co-occurrence matrix over the top artists.
# Within each playlist, count the tracks per top artist; every unordered pair of
# distinct artists in that playlist then adds (count_a * count_b) to both
# matrix[a][b] and matrix[b][a].
# ~10 seconds

matrix = defaultdict(lambda: defaultdict(int))

for playlist in tqdm(data["playlists"]):
    # Tracks per top artist in this playlist, keyed by artist ID.
    freqs_dict = defaultdict(int)
    for track in playlist["tracks"]:
        # uri2id strips the URI prefix, leaving the bare artist ID that
        # top_artists is keyed by.
        artist_id = uri2id(track["artist_uri"])
        if artist_id not in top_artists:
            continue
        freqs_dict[artist_id] += 1

    freqs = list(freqs_dict.items())
    for i, (a, count_a) in enumerate(freqs):
        # Pair each artist only with the ones after it, so each pair is scored once.
        for b, count_b in freqs[i + 1:]:
            score = count_a * count_b
            matrix[a][b] += score
            matrix[b][a] += score

100%|██████████| 10000/10000 [00:08<00:00, 1209.96it/s]


Uh oh, this might take a while for a million playlists...\
At roughly 10 seconds per 10,000 playlists, that would be about 1,000 seconds &approx; 17 min.

In [123]:
# Convert the nested defaultdicts into plain dicts (one inner dict per artist key).
matrix_obj = {artist: dict(neighbors) for artist, neighbors in matrix.items()}

In [125]:
# Write the co-occurrence matrix as indented JSON. The encoding is given
# explicitly (as in the other open() calls in this notebook) so the file does
# not depend on the platform's default text encoding.
with open("./data/1k_matrix.json", "w", encoding="utf8") as fout:
    json.dump(matrix_obj, fout, indent=2)