# Spotify MPD full export

Create the artist interaction matrix for the 2,500 most frequent artists, based on all 1M playlists. Three JSON files will be created:
1. Name to ID and ID to name lookup
2. Artist interaction matrix
3. "Artist popularity" (how many times they appear in total)

In [3]:
import json
import os
from collections import defaultdict
from itertools import combinations

from tqdm import tqdm

In [11]:
# Every entry of the dataset's data directory is treated as one slice file.
all_slices = os.listdir("./data/spotify_million_playlist_dataset/data")


def all_playlists(progress=True):
    """Yield every playlist from every slice file listed in ``all_slices``.

    Slice files are opened and parsed as JSON one after another, and the
    entries of each file's top-level ``"playlists"`` list are yielded in order.

    Args:
        progress: When true, wrap the slice list in a ``tqdm`` progress bar
            (it advances once per slice file, not once per playlist).

    Yields:
        The playlist objects exactly as they appear in the slice files.
    """
    slice_names = all_slices
    if progress:
        slice_names = tqdm(all_slices)

    for slice_name in slice_names:
        slice_path = f"./data/spotify_million_playlist_dataset/data/{slice_name}"
        with open(slice_path, encoding="utf8") as slice_file:
            yield from json.load(slice_file)["playlists"]

## Generate artist frequencies and lookups

In [9]:
def uri2id(uri):
    """Return the part of ``uri`` that follows its last ``:``.

    A string containing no colon is returned unchanged.
    """
    _, _, tail = uri.rpartition(":")
    return tail


# Lookup tables between artist IDs and artist names. They start out empty and
# are filled in while the playlists are scanned.
id2name, name2id = {}, {}

In [15]:
# Count how many tracks each artist has across all playlists, and record the
# ID <-> name lookups along the way.
artist_freq_dict = defaultdict(int)

for playlist in all_playlists():
    for track in playlist["tracks"]:
        artist_id, artist_name = uri2id(track["artist_uri"]), track["artist_name"]

        # One count per track, so an artist that appears several times in the
        # same playlist is counted every time.
        artist_freq_dict[artist_id] += 1

        # Plain assignment: when an ID or a name is seen more than once, the
        # most recently seen pairing overwrites the earlier one.
        id2name[artist_id], name2id[artist_name] = artist_name, artist_id

print(f"{len(artist_freq_dict)} unique artists")

100%|██████████| 1000/1000 [07:51<00:00,  2.12it/s]

295860 unique artists





In [37]:
# Keep only the N artists with the highest track counts.
# NOTE(review): an earlier note here called this cell redundant, but the
# cross-artist frequency cell still skips every artist that is not in
# `top_artists`, so this cell has to run before it.
N = 2500
artist_freq = sorted(
    artist_freq_dict.items(),
    key=lambda entry: entry[1],
    reverse=True,  # stable, so ties keep their first-seen order
)[:N]
top_artists = {artist_id for artist_id, _ in artist_freq}

In [16]:
# Persist both lookup directions together in a single JSON document.
name_id_maps = {"id2name": id2name, "name2id": name2id}

with open("./data/name_id_maps.json", "w", encoding="utf8") as fout:
    json.dump(name_id_maps, fout, indent=2)

In [29]:
# Save the per-artist track counts, keyed by artist ID.
with open("./data/1m_artist_freq.json", mode="w", encoding="utf8") as fout:
    json.dump(artist_freq_dict, fp=fout, indent=2)

## Cross-artist frequencies

In [38]:
# For each playlist, count how many tracks each top artist contributes, then
# score every unordered pair of distinct artists in that playlist by the
# product of their two counts. Scores are summed over all playlists and stored
# in both directions, so matrix[a][b] == matrix[b][a].

matrix = defaultdict(lambda: defaultdict(int))

for playlist in all_playlists():
    # Per-playlist track counts, keyed by artist ID and limited to top_artists.
    freqs_dict = defaultdict(int)
    for track in playlist["tracks"]:
        artist_id = uri2id(track["artist_uri"])
        if artist_id not in top_artists:
            continue
        freqs_dict[artist_id] += 1

    # combinations() yields each pair of distinct artists exactly once, in the
    # same order as a nested i < j index loop; an artist is never paired with
    # itself, so no diagonal entries are written.
    for (a, count_a), (b, count_b) in combinations(freqs_dict.items(), 2):
        score = count_a * count_b
        matrix[a][b] += score
        matrix[b][a] += score

100%|██████████| 1000/1000 [18:56<00:00,  1.14s/it]


In [39]:
# Copy the nested defaultdicts into plain dicts before serialising.
matrix_obj = {artist: dict(neighbours) for artist, neighbours in matrix.items()}

In [40]:
# Write the cross-artist matrix to disk. The encoding is given explicitly, as
# in the other JSON exports in this notebook, instead of relying on the
# platform's default text encoding.
with open("./data/2.5k_matrix.json", "w", encoding="utf8") as fout:
    json.dump(matrix_obj, fout, indent=2)