# Fetch data from Spotify and build datasets

In [None]:
import json
from functools import reduce
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from tqdm import tqdm_notebook as tqdm

from powerspot.operations import (
    get_album,
    get_artist,
    get_followed_artists,
    get_artist_albums,
    get_audio_analysis,
    get_audio_features,
    get_tracks,
)
from powerspot.cli import get_username

In [None]:
# Directory where the fetched data and the built datasets are written
DATA_DIR = Path("data/")
# set random state for reproducible results
CONSTANT_RANDOM_STATE = 4

In [None]:
def save_json(name: str, data: object) -> None:
    """Serialize ``data`` as JSON into ``DATA_DIR/<name>.json``.

    Args:
        name: File name without the ``.json`` extension.
        data: Any JSON-serializable object (this notebook passes the
            ``albums`` list, not only dicts).

    Raises:
        TypeError: If ``data`` cannot be serialized by ``json.dumps``.
    """
    target_dir = Path(DATA_DIR)
    # Create the output directory on first use so that a fresh checkout does
    # not fail with FileNotFoundError.
    target_dir.mkdir(parents=True, exist_ok=True)
    (target_dir / (name + ".json")).write_text(json.dumps(data))

## Get data

I use my library [powerspot](https://github.com/theodcr/powerspot) to easily fetch data from my Spotify account.

### User's followed artists

See the [documentation](https://developer.spotify.com/documentation/web-api/reference/follow/get-followed/)

In [None]:
# Username passed to the powerspot calls below
username = get_username()

In [None]:
# Artists followed by `username` (see the endpoint documentation linked above)
artists = get_followed_artists(username)

In [None]:
# Persist the raw artists payload to DATA_DIR/artists.json
save_json("artists", artists)

### Albums released by each artist

See the [documentation](https://developer.spotify.com/documentation/web-api/reference/artists/get-artists-albums/)

In [None]:
# Map each followed artist's URI to the albums returned for that artist
# (the call is made with limit=50).
artist_albums = {}
for followed_artist in tqdm(artists):
    artist_uri = followed_artist["uri"]
    artist_albums[artist_uri] = get_artist_albums(artist_uri, limit=50)

How many artists and albums have we fetched:

In [None]:
# Flatten the per-artist album lists into a single list. A comprehension is
# linear in the number of albums and, unlike `reduce` without an initial
# value, also works when no artist was fetched.
all_albums = [
    album
    for albums_of_artist in artist_albums.values()
    for album in albums_of_artist
]
print("Number of artists:", len(artists))
print("Total number of albums:", len(all_albums))

2052 albums from 284 artists.

### More details on albums

The `albums` endpoint gives a bit more data about albums than the `artist_albums` endpoint. See the [documentation](https://developer.spotify.com/documentation/web-api/reference/albums/get-album/)

In [None]:
# One get_album call per album; `albums` is a list in the same order as `all_albums`
albums = [get_album(album["uri"]) for album in tqdm(all_albums)]

In [None]:
# Persist the detailed albums list to DATA_DIR/albums.json
save_json("albums", albums)

In [None]:
# Sum the per-album track counts reported by the albums endpoint
total_tracks = sum(int(album["total_tracks"]) for album in albums)
print("Total number of tracks:", total_tracks)

A total of 30123 tracks.

### Album tracks audio features

Audio features provide a lot of interesting musical information about the tracks of our albums. See the [documentation](https://developer.spotify.com/documentation/web-api/reference/tracks/get-several-audio-features/)

In [None]:
# `albums` is a list of album objects, so iterate it directly (a list has no
# `.items()`) and key the audio features by each album's own URI.
# NOTE(review): albums sharing a URI collapse into a single entry — confirm
# that `albums` holds no duplicates before pairing it positionally with the
# values of this dict.
audio_features = {
    album["uri"]: get_audio_features(
        [track["uri"] for track in album["tracks"]["items"]]
    )
    for album in tqdm(albums)
}

## Build datasets

### Artists genres representation

- Binarization to get a matrix with 1 column per existing genre.
- Tf-Idf to get a more representative genres representation for each artist.
- t-SNE with 2 dimensions applied on the Tf-Idf matrix to get (x, y) coordinates for each artist in a 2D-genres space.
- K-Means on the Tf-Idf matrix to find clusters among artists using their genres.

In [None]:
# One row per artist, one 0/1 column per genre seen among the artists
mlb = MultiLabelBinarizer()
genres_per_artist = [artist["genres"] for artist in artists]
binary_matrix = mlb.fit_transform(genres_per_artist)
genres_binarized = pd.DataFrame(binary_matrix, columns=mlb.classes_)

In [None]:
# (number of artists, number of distinct genres)
genres_binarized.shape

300 genres among the 284 artists.

We can look at the most common genres.

In [None]:
# Number of artists per genre, ten most frequent genres first
genres_binarized.sum(axis=0).sort_values(ascending=False).head(10)

Almost half the artists are in the modern and/or indie rock genres.

In [None]:
# Re-weight the binary genre matrix with Tf-Idf and keep the genre names as columns
tfidf = TfidfTransformer()
tfidf_matrix = tfidf.fit_transform(genres_binarized)
genres_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=mlb.classes_)

Using the Tf-Idf genres representation, we can pick for each artist the one genre that is its most distinctive (the one with the highest value).

It should mainly be subgenres.

In [None]:
# Highest-weighted genre of each artist, counted over all artists
genres_tfidf.idxmax(axis=1).value_counts()

In [None]:
# 2-component t-SNE, seeded with the notebook-wide random state
tsne = TSNE(n_components=2, random_state=CONSTANT_RANDOM_STATE)

In [None]:
# Fit on the Tf-Idf matrix; the coordinates are read from `tsne.embedding_` later on
tsne.fit(genres_tfidf)

In [None]:
# K-Means on the Tf-Idf matrix; the number of clusters is left at the library default
km = KMeans(random_state=CONSTANT_RANDOM_STATE)
km.fit(genres_tfidf)

### Artists features

- artist name
- image
- genres representation
- popularity
- number of followers

In [None]:
def get_complex_centers_labels(mlb, km, nb_labels=2):
    """Return, for each sample, a text label describing its cluster center.

    Each cluster is described by the ``nb_labels`` classes that have the
    highest weights in its center, joined with ``", "``.

    Args:
        mlb: Fitted binarizer; only ``mlb.classes_`` is read.
        km: Fitted clustering model; ``cluster_centers_``, ``n_clusters`` and
            ``labels_`` are read.
        nb_labels: Number of classes used to describe each cluster.

    Returns:
        Array holding one label string per entry of ``km.labels_``.
    """
    # Rank the classes of every center once (descending weight) instead of
    # re-sorting the whole center matrix for each cluster.
    top_classes = km.cluster_centers_.argsort()[:, ::-1][:, :nb_labels]
    center_labels = np.array([
        ", ".join(mlb.classes_[top_classes[i]])
        for i in range(km.n_clusters)
    ])
    return center_labels[km.labels_]

In [None]:
# One cluster description per artist, built from the 2 top genres of its cluster center
genre_clusters = get_complex_centers_labels(mlb, km, nb_labels=2)

In [None]:
# For each artist, the genre with the highest Tf-Idf weight
genre_specifics = genres_tfidf.idxmax(1)

In [None]:
# Walk the artists in lockstep with their cluster label, most specific genre
# and 2D t-SNE coordinates, producing one record (row) per artist.
artist_rows = zip(artists, genre_clusters, genre_specifics, tsne.embedding_)
artists_features = pd.DataFrame(
    {
        "uri": artist_info["uri"],
        "name": artist_info["name"],
        "popularity": artist_info["popularity"],
        "followers": artist_info["followers"]["total"],
        # last entry of the images list (the smallest image url)
        "image": artist_info["images"][-1]["url"],
        "genre_cluster": cluster_label,
        "genre_specific": specific_genre,
        "genre_x": x_coord,
        "genre_y": y_coord,
    }
    for artist_info, cluster_label, specific_genre, (x_coord, y_coord) in artist_rows
)

In [None]:
# Preview the first rows of the artists dataset
artists_features.head(5)

In [None]:
# Write the artists dataset to DATA_DIR/artists_features.json
artists_features.to_json(DATA_DIR / "artists_features.json")

### Albums features

- album name
- image
- popularity
- release date
- total tracks
- total duration
- mean audio features from tracks

In [None]:
# Reload the albums written earlier by save_json("albums", ...). The notebook
# defines no `read_json` helper, so the file is read directly.
albums = json.loads((DATA_DIR / "albums.json").read_text())

In [None]:
# Audio features averaged over the tracks of each album
AUDIO_FEATURE_KEYS = [
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "time_signature",
]


def _mean_audio_feature(tracks_features, key):
    """Return the mean of ``key`` over the tracks that have audio features.

    ``None`` entries are ignored in both the sum and the divisor, so tracks
    without features no longer pull the mean towards zero.

    Args:
        tracks_features: Per-track audio feature dicts; entries may be None.
        key: Name of the audio feature to average.

    Returns:
        The mean value, or ``0.0`` when no track has features (this keeps
        such albums detectable through a mean tempo of zero).
    """
    values = [track[key] for track in tracks_features if track is not None]
    return sum(values) / len(values) if values else 0.0


# One row per album, pairing each album with its tracks' audio features by position
albums_features = pd.DataFrame({
    "artist_uri": album["artists"][0]["uri"],
    "artist_name": album["artists"][0]["name"],
    "uri": album["uri"],
    "name": album["name"],
    "popularity": album["popularity"],
    "image": album["images"][-1]["url"],  # smallest image url
    "release_date": album["release_date"],
    "total_tracks": album["total_tracks"],
    "duration_ms": sum(track["duration_ms"] for track in album["tracks"]["items"]),
    # mean of audio_features
    **{
        key: _mean_audio_feature(album_audio_feat, key)
        for key in AUDIO_FEATURE_KEYS
    },
} for album, album_audio_feat in zip(albums, audio_features.values()))

*Note:* I found 1 album whose audio features are all `None`. I drop it, as it is most likely a bug; such albums can be detected by their mean tempo of zero in the data.

In [None]:
# Keep only the albums whose mean tempo is strictly positive
has_tempo = albums_features["tempo"] > 0
albums_features = albums_features[has_tempo]

In [None]:
# Number of albums left after the tempo filter
len(albums_features)

2047 albums in the final dataset.

In [None]:
# Preview the first rows of the albums dataset
albums_features.head(5)

In [None]:
# Write the albums dataset to DATA_DIR/albums_features.json
albums_features.to_json(DATA_DIR / "albums_features.json")