In [None]:
import sung

# Visualize

Let's delve directly into the visualizations of the prepared data. 

In a subsequent section, we can have a look at how the data was prepared. 

I'll say this though:
* You can find some definitions of the audio features in this [Spotify's API documentation page](https://developer.spotify.com/documentation/web-api/reference/get-audio-features)
* the tsne_* and umap_* are planar projections of the numerical features (audio features, duration_ms etc.) so that we can squish the multi dimensional numerical features into a 2d screen.

In [None]:
import pandas as pd
from cosmograph import cosmo

In [216]:
# get prepared data
url = 'https://www.dropbox.com/scl/fi/blchigtklrn49cp9v7aga/holiday_songs_spotify_with_embeddings.parquet?rlkey=wvr58wnj1rrx2zblsp73ufpdy&dl=1'
df = pd.read_parquet(url)
print(f"{df.shape=}")
df.iloc[0]

df.shape=(167, 27)


column_a                                                            1
track_uri                                      00IqwkT0PZhJ86PJajRCqk
danceability                                                    0.195
energy                                                          0.348
key                                                                A#
loudness                                                      -10.106
mode                                                            major
speechiness                                                    0.0332
acousticness                                                     0.82
instrumentalness                                                  0.0
liveness                                                        0.126
valence                                                         0.262
tempo                                                         166.824
duration_ms                                                    213107
time_signature      

In [None]:
from cosmograph import cosmo

# Danceability vs. Energy: The Holiday Party Sweet Spot
cosmo(
    df,
    point_x_by='danceability',
    point_y_by='energy',
    point_size_by='key_frequency',  # If unavailable, consider 'tempo'
    point_color_by='mode',
    point_label_by='track_name',
    point_size_scale=0.005,
    background_color='black',
)


Cosmograph(background_color='black', focused_point_ring_color=None, hovered_point_ring_color=None, link_color=…

In [None]:
from cosmograph import cosmo

# Danceability vs. Energy: The Holiday Party Sweet Spot
cosmo(
    df,
    point_x_by='danceability',
    point_y_by='valence',
    point_size_by='loudness', 
    point_color_by='key',
    point_label_by='track_name',
    point_size_scale=0.01,
    background_color='black',
)


Cosmograph(background_color='black', focused_point_ring_color=None, hovered_point_ring_color=None, link_color=…

## Add planar embeddings

In [None]:
cosmo(
    df,
    point_x_by='tsne_x',
    point_y_by='tsne_y',
    point_size_by='energy',
    point_color_by='key_frequency',
    point_label_by='track_name',
    point_size_scale=0.1,
    background_color='black',
)

Cosmograph(background_color='black', focused_point_ring_color=None, hovered_point_ring_color=None, link_color=…

In [None]:
cosmo(
    df,
    point_x_by='umap_x',
    point_y_by='umap_y',
    point_size_by='energy',
    point_color_by='key_frequency',
    point_label_by='track_name',
    point_size_scale=0.03,
    background_color='black',
)

Cosmograph(background_color='black', focused_point_ring_color=None, hovered_point_ring_color=None, link_color=…

# Get and prepare data

In [104]:
import pandas as pd 

some_holidays_playlist = 'https://www.data-action-lab.com/wp-content/uploads/2019/12/holiday_songs_spotify.csv'

df = pd.read_csv(some_holidays_playlist)
print(f"{df.shape=}")
df.iloc[0]

df.shape=(167, 22)


column_a                                                            1
track_uri                                      00IqwkT0PZhJ86PJajRCqk
danceability                                                    0.195
energy                                                          0.348
key                                                                A#
loudness                                                      -10.106
mode                                                            major
speechiness                                                    0.0332
acousticness                                                     0.82
instrumentalness                                                  0.0
liveness                                                        0.126
valence                                                         0.262
tempo                                                         166.824
duration_ms                                                    213107
time_signature      

## Map keys to frequencies (to get numericals)

In [206]:

frequencies = {
    "A": 440,
    "A#": 466.16,
    "B": 493.88,
    "C": 523.25,
    "C#": 554.37,
    "D": 587.33,
    "D#": 622.25,
    "E": 659.25,
    "F": 698.46,
    "F#": 739.99,
    "G": 783.99,
    "G#": 830.61,
}

df['key_frequency'] = df['key'].map(frequencies)

## Make planar projections of the multidimensional numerical data 

In [207]:
numerical_columns = [
    'column_a',
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'key_frequency',
]

In [208]:
from imbed import planar_embeddings, planar_embeddings_dict_to_df

tsne_xy = planar_embeddings(df[numerical_columns].values, embeddings_func='tsne')
tsne_xy = planar_embeddings_dict_to_df(tsne_xy, x_col='tsne_x', y_col='tsne_y')
umap_xy = planar_embeddings(df[numerical_columns].values, embeddings_func='umap')
umap_xy = planar_embeddings_dict_to_df(umap_xy, x_col='umap_x', y_col='umap_y')

df = pd.concat([df, tsne_xy, umap_xy], axis=1)
df.iloc[0]


column_a                                                            1
track_uri                                      00IqwkT0PZhJ86PJajRCqk
danceability                                                    0.195
energy                                                          0.348
key                                                                A#
loudness                                                      -10.106
mode                                                            major
speechiness                                                    0.0332
acousticness                                                     0.82
instrumentalness                                                  0.0
liveness                                                        0.126
valence                                                         0.262
tempo                                                         166.824
duration_ms                                                    213107
time_signature      

In [209]:
# save the prepared data
df.to_csv('holiday_songs_spotify_with_embeddings.csv', index=False)
df.to_parquet('holiday_songs_spotify_with_embeddings.parquet', index=False)

In [215]:
pwd

'/Users/thorwhalen/Dropbox/py/proj/t/sung/misc'

# Appendix: Anathor dataset: christmas_billboard_data

## Get raw source dataset

In [None]:
# first, let's download the dataset of top 100 christmas carols through the ages. 
# I found this on kaggle here: https://www.kaggle.com/datasets/sharkbait1223/billboard-top-100-christmas-carol-dataset
# I use the `haggle` package to find and manage my kaggle datasets.

from haggle import KaggleDatasets

kaggle_datasets = KaggleDatasets()
s = kaggle_datasets['sharkbait1223/billboard-top-100-christmas-carol-dataset']
list(s)

['christmas_billboard_data.csv']

In [None]:
import pandas as pd
import io

top_christmas = pd.read_csv(io.BytesIO(s['christmas_billboard_data.csv']))
print(f"{top_christmas.shape=}")
top_christmas.iloc[0]

billboard.shape=(387, 13)


url                       http://www.billboard.com/charts/hot-100/1958-1...
weekid                                                           12/13/1958
week_position                                                            83
song                                                        RUN RUDOLPH RUN
performer                                                       Chuck Berry
songid                                           Run Rudolph RunChuck Berry
instance                                                                  1
previous_week_position                                                  NaN
peak_position                                                            69
weeks_on_chart                                                            3
year                                                                   1958
month                                                                    12
day                                                                      13
Name: 0, dty

## Use the spotify API to get more song information

Now let's find a recording (the first of a search) of each song on spotify and make a playlist out of it.

In [None]:
from sung import Tracks

def row_to_query(row):
    """
    Generate a Spotify search query from a DataFrame row.
    """
    track, artist = row['song'], row['performer']
    query_str = f'track:"{track}" artist:"{artist}"'
    return query_str

def row_to_track_id(row):
    """
    Get the track ID of a song from a DataFrame row using the Spotify API.
    """
    query = row_to_query(row)
    return next(iter(Tracks.search(query)), None)

def track_ids_of_dataframe(df):
    """
    Yield track IDs for all rows in a DataFrame.
    """
    for _, row in df.iterrows():
        yield row_to_track_id(row)

In [None]:
christmas_track_ids = list(track_ids_of_dataframe(top_christmas))
print(f"{len(christmas_track_ids)=}")

In [None]:
top_christmas['spotify_track_id'] = christmas_track_ids

In [None]:
# Some songs were not found. Let's see how many
failed_finds = top_christmas['spotify_track_id'].isna()
print(f"{failed_finds.sum()=}")

## Make a playlist with these

In [84]:
track_ids = top_christmas['spotify_track_id'].dropna().unique()
len(track_ids)

70

In [85]:
from sung import Playlist

playlist = Playlist.create_from_track_list(
    track_list=track_ids,
    playlist_name='top_christmas_carols',
)
print(f"\nPlaylist '{playlist.playlist_id}' created successfully.")


Playlist '6u9WxulfSgXyT24IGyNH8m' created successfully.


## Get the playlist, and metadata about the tracks

In [86]:
playlist = Playlist('6u9WxulfSgXyT24IGyNH8m')
len(playlist)

70

In [101]:
playlist.playlist_url

'https://open.spotify.com/playlist/6u9WxulfSgXyT24IGyNH8m'

In [None]:
# WARNING: This is a bit fidly -- API conditions and credentials are not clear
numerical_features_df = playlist.numerical_features_df()
print(f"{numerical_features_df.shape=}")
numerical_features_df.iloc[0]

# Appendix: Scrap

In [67]:
from dataclasses import dataclass
from sung import Tracks
import pandas as pd


@dataclass
class TrackQuery:
    """
    :param dataframe: The pandas DataFrame containing song data.
    :param include_artist: Whether to include the artist in the search query.
    :param include_year: Whether to include the year in the search query.
    :param year_tolerance: The tolerance range for the year in the search query.
    """

    dataframe: pd.DataFrame
    include_artist: bool = True
    include_year: bool = False
    year_tolerance: int = 0
    title_col: str = 'song'
    artist_col: str = 'performer'
    year_col: str = 'year'

    def row_to_query(self, row):
        """
        Generate a Spotify search query from a DataFrame row.
        """
        track, artist, year = (
            row[self.title_col],
            row[self.artist_col],
            row[self.year_col],
        )
        query_str = f'track:"{track}"'
        if self.include_artist:
            query_str += f' artist:"{artist}"'
        if self.include_year:
            year = int(year)
            query_str += (
                f' year:{year - self.year_tolerance}-{year + self.year_tolerance}'
            )
        return query_str

    def row_to_track_id(self, row):
        """
        Get the track ID of a song from a DataFrame row using the Spotify API.
        """
        query = self.row_to_query(row)
        return next(iter(Tracks.search(query)), None)

    def track_ids(self):
        """
        Yield track IDs for all rows in the DataFrame.
        """
        for _, row in self.dataframe.iterrows():
            yield self.row_to_track_id(row)

In [68]:
christmas_tracks = TrackQuery(top_christmas)

In [51]:
christmas_track_ids = list(christmas_tracks.track_ids())
print(f"{len(christmas_track_ids)=}")

len(christmas_track_ids)=387


In [52]:
top_christmas['spotify_track_id'] = christmas_track_ids

In [54]:
# Some songs were not found. Let's see how many
failed_finds = top_christmas['spotify_track_id'].isna()
print(f"{failed_finds.sum()=}")

failed_finds.sum()=28


In [None]:
failed_subset = top_christmas[failed_finds]
christmas_tracks_with_year = TrackQuery(failed_subset, include_artist=False, include_year=True, year_tolerance=2)
tracks_found_with_title_and_year = list(christmas_tracks_with_year.track_ids())

In [82]:
import numpy as np
tracks_found_with_title_and_year = np.array(_tracks_found_with_title_and_year)[failed_finds.values]

In [75]:
import numpy as np
sum(f is not None for f in np.array(tracks_found_with_title_and_year)[failed_subset.values])

IndexError: arrays used as indices must be of integer (or boolean) type

In [78]:
np.array(tracks_found_with_title_and_year)[failed_finds.values]

array([None, None, None, None, None, None, None, None,
       '5bh3LJU6Jd0L5qhgSM1sTx', '0ypPVwQg9Z3wNavQqYSued',
       '3fLAkdIr3hLjb9Ft5nvOiH', '3fLAkdIr3hLjb9Ft5nvOiH',
       '3fLAkdIr3hLjb9Ft5nvOiH', '0ypPVwQg9Z3wNavQqYSued',
       '3fLAkdIr3hLjb9Ft5nvOiH', '3fLAkdIr3hLjb9Ft5nvOiH',
       '3fLAkdIr3hLjb9Ft5nvOiH', None, None, None, None, None, None, None,
       None, None, None, '7KgbyNNixHnh1cNtZ4qK1r'], dtype=object)

In [71]:
len(tracks_found_with_title_and_year)

387

In [None]:
top_christmas['spotify_track_id'].iloc[failed_finds] = tracks_found_with_title_and_year

In [None]:
still_missing = top_christmas['spotify_track_id'].isna()
print(f"{still_missing.sum()=}")

In [20]:
df = pd.DataFrame(t.values())
df

Unnamed: 0,album,artists,available_markets,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,is_local,is_playable,name,popularity,preview_url,track_number,type,uri
0,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,130973,False,{'isrc': 'USMC15746480'},{'spotify': 'https://open.spotify.com/track/7v...,https://api.spotify.com/v1/tracks/7vQbuQcyTflf...,7vQbuQcyTflfCIOu3Uzzya,False,True,Jingle Bell Rock,92,,1,track,spotify:track:7vQbuQcyTflfCIOu3Uzzya
1,"{'album_type': 'compilation', 'artists': [{'ex...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,131733,False,{'isrc': 'USMC15746480'},{'spotify': 'https://open.spotify.com/track/6x...,https://api.spotify.com/v1/tracks/6xE98wKYt4vZ...,6xE98wKYt4vZk8j7cctjw8,False,True,Jingle Bell Rock,73,,4,track,spotify:track:6xE98wKYt4vZk8j7cctjw8
2,"{'album_type': 'compilation', 'artists': [{'ex...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,130973,False,{'isrc': 'USMC15746480'},{'spotify': 'https://open.spotify.com/track/6Z...,https://api.spotify.com/v1/tracks/6Z924AupOiJL...,6Z924AupOiJLdnAKH6UgCu,False,True,Jingle Bell Rock,40,,12,track,spotify:track:6Z924AupOiJLdnAKH6UgCu
3,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,141818,False,{'isrc': 'USWL12403972'},{'spotify': 'https://open.spotify.com/track/5i...,https://api.spotify.com/v1/tracks/5ieSoxnino7N...,5ieSoxnino7NkPZJegAiGz,False,True,Jingle Bell Rock (with Maria Becerra),56,,1,track,spotify:track:5ieSoxnino7NkPZJegAiGz
4,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,138226,False,{'isrc': 'USASN1900767'},{'spotify': 'https://open.spotify.com/track/3K...,https://api.spotify.com/v1/tracks/3Ka8XHwkF9kA...,3Ka8XHwkF9kAtUWoeyuoXX,False,True,Jingle Bell Rock (Special Nashville Edition),44,,1,track,spotify:track:3Ka8XHwkF9kAtUWoeyuoXX
5,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,139521,False,{'isrc': 'USUM72218855'},{'spotify': 'https://open.spotify.com/track/5u...,https://api.spotify.com/v1/tracks/5uT93SXmRonY...,5uT93SXmRonY8np9f90Ruj,False,True,Jingle Bell Rock - Ryan Riback Remix,20,,5,track,spotify:track:5uT93SXmRonY8np9f90Ruj
6,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,138986,False,{'isrc': 'USUM72218855'},{'spotify': 'https://open.spotify.com/track/1O...,https://api.spotify.com/v1/tracks/1OCOjQBJa3V6...,1OCOjQBJa3V69lzoqIUsYE,False,True,Jingle Bell Rock - Ryan Riback Remix,31,,1,track,spotify:track:1OCOjQBJa3V69lzoqIUsYE
7,"{'album_type': 'compilation', 'artists': [{'ex...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,246133,False,{'isrc': 'US4CR0400028'},{'spotify': 'https://open.spotify.com/track/1p...,https://api.spotify.com/v1/tracks/1pkq80ebVU6i...,1pkq80ebVU6i05sZI7myRn,False,True,Jingle Bell Rock,24,,9,track,spotify:track:1pkq80ebVU6i05sZI7myRn
8,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,108880,False,{'isrc': 'ushm91004850'},{'spotify': 'https://open.spotify.com/track/4a...,https://api.spotify.com/v1/tracks/4a4wUbJpB0JH...,4a4wUbJpB0JHCH3AEJZZyC,False,True,Jingle Bell Rock,24,,1,track,spotify:track:4a4wUbJpB0JHCH3AEJZZyC
9,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,118853,False,{'isrc': 'USACU0511730'},{'spotify': 'https://open.spotify.com/track/5L...,https://api.spotify.com/v1/tracks/5LiotnFytHaH...,5LiotnFytHaHPWy4j6errh,False,True,Jingle Bell Rock,23,,1,track,spotify:track:5LiotnFytHaHPWy4j6errh


In [24]:
import sung
dir(sung)
sung.Tracks

['Playlist',
 'PlaylistReader',
 'SpotifyDacc',
 'Tracks',
 'TracksAnalysis',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'base',
 'cast_track_key',
 'delete_playlist',
 'df_extract_extra_metadata',
 'ensure_playlist_id',
 'ensure_track_id',
 'extract_extra_metadata',
 'extractor',
 'get_spotify_client',
 'search_tracks',
 'tools',
 'util']

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth

# Initialize Spotipy client
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id="your_client_id",
                                                client_secret="your_client_secret",
                                                redirect_uri="your_redirect_uri"))

# Search for the specific track
query = 'track:Imagine artist:"John Lennon" year:1971'
results = sp.search(q=query, type='track', limit=10)

# Print the results
for item in results['tracks']['items']:
    print(f"Track Name: {item['name']}, Artist: {item['artists'][0]['name']}, Album: {item['album']['name']}, Release Date: {item['album']['release_date']}")