## Config

In [6]:
# Import dependencies
import tekore as tk
import os
import time
import pandas as pd
import numpy as np
from datetime import datetime, date
from dateutil.relativedelta import relativedelta

# Environment variables
CLIENTID = os.environ.get('CLIENTID')
CLIENTSECRET = os.environ.get('CLIENTSECRET')
REDIRECT_URI = 'https://example.com/callback'
# Set global variables
MINDATE = datetime((date.today() - relativedelta(years=3)).year, 6, 1)

In [33]:
### Authenticate Tekore and Authorize App
# Get client token
app_token = tk.request_client_token(CLIENTID, CLIENTSECRET)
# Get a user token; Note the need to ask for private read scope in order to read private playlists
# This will open a browser window; the user will need to copy the URL from the browser and paste it into the VSCode Command Palette
user_token = tk.prompt_for_user_token(CLIENTID, CLIENTSECRET, REDIRECT_URI, scope=tk.scope.playlist_read_private)
# Create spotify instance
spotify = tk.Spotify(app_token)
# Get the user's spotifyID; The spotify instance will have to reference the user_token to be authorized for this data
with spotify.token_as(user_token):
    userID = spotify.current_user().id

Opening browser for Spotify login...


## Get user playlists

In [8]:
# Create a playlist for the user
with spotify.token_as(user_token):
    playlists = spotify.playlists(userID, limit=40)
    allPlaylists = [{'id':x.id, 'name':x.name} for x in playlists.items]

In [17]:
# I know which playlists I want to pull from
priorSummerNames = [
    'Summer 2018',
    'Summer 2019',
    'Summer 2020',
    'Summer 2021',
    'Summer 2022',
    '2023',
    'anti pop',
    'big on the internet',
    'Lorem',
    'Alternative Hip-Hop',
    'Et-Alt'
]
# Get the IDs of those playlists
priorSummerIDs = [{"playlistID":x.id, "artistSource":x.name} for x in playlists.items if x.name in priorSummerNames]
priorSummerIDs = pd.DataFrame(priorSummerIDs)

## Get artists in those playlists

In [20]:
# Get all the artistIDs in those playlists
# Start with an empty list
artists = []
# Loop through the playlists
for playlistID in priorSummerIDs['playlistID']:
    # Get the artistIDs
    nestedList = [[{"playlistID":playlistID, "artistID":y.id, "artistName":y.name} for y in x.track.artists] for x in spotify.playlist_items(playlistID).items]
    # Unnest the list
    flatList = [item for sublist in nestedList for item in sublist]
    # Append to the empty list
    artists += flatList
# Get unique artists
artists = pd.DataFrame(artists)
artists = artists.merge(priorSummerIDs, how='left', left_on='playlistID', right_on='playlistID').drop(columns='playlistID')
# Create a categorical sort order
artists['artistSource'] = pd.Categorical(artists['artistSource'], categories=priorSummerNames)
artists = artists.sort_values(by='artistSource', ignore_index=True).drop_duplicates(subset='artistID', ignore_index=True)

## Get artist genres

In [21]:
# Cut the df into chunks
n = 0
artistInfo = []
while n < len(artists):
    toGet = artists.iloc[n:(n+50)]
    # Add songs to the playlist
    with spotify.token_as(user_token):
        additions = [{'artistID':x.id, 'artistPopularity':x.popularity, 'artistGenre':x.genres} for x in spotify.artists(list(toGet['artistID']))]
    n += 50
    # Wait 2 seconds so we do not exceed our API call allowance
    time.sleep(2)
    artistInfo += additions
artistInfo = pd.DataFrame(artistInfo)
# Select only the first genre in the list
artistInfo['artistGenreUnique'] = [x[0] if len(x)>0 else 0 for x in artistInfo['artistGenre']]
# Merge song IDs in with artists and albums
artists = artists.merge(artistInfo, how='left', left_on='artistID', right_on='artistID')

## Get albums for each artist

In [24]:
# Define function that will get the top tracks for each artist in the list
def fetchRecentAlbums(artistID):
    try:
        # Query the spotify API
        query = spotify.artist_albums(artistID, limit=50, include_groups=['album', 'single']).items
        result = [{'artistID':artistID, 'albumID':x.id, 'albumURI':x.uri, 'albumName':x.name, 'albumReleaseDate':x.release_date, 'albumType':x.album_type} for x in query if datetime.strptime(x.release_date[:4], '%Y')>=MINDATE]
    except: 
        # If exception, fill in blank data
        pass
    finally:
        # Wait 2 seconds so we do not exceed our API call allowance
        time.sleep(2)
    return result

# Create empty list
albums = []
# Loop through list of artists
for artistID in artists['artistID']:
    # Get all recent albums by that artist
    additions = fetchRecentAlbums(artistID)
    # Append the recent albums to the list
    albums += additions
# Convert to new df
albums = pd.DataFrame(albums)

# Merge the albums into the artists
artists = artists.merge(albums, how='left', left_on='artistID', right_on='artistID')
# Drop any duplicates
artists = artists.drop_duplicates(subset=['artistID', 'albumName'])
artists = artists.dropna(subset=['albumID'])

## Get track URIs

In [31]:
artists

Unnamed: 0,artistID,artistName,artistSource,artistPopularity,artistGenre,artistGenreUnique,albumID,albumURI,albumName,albumReleaseDate,albumType
0,2OK16hAFRHoJiFZKeZe8A8,courtship.,Summer 2018,41,"[hopebeat, indie poptimism, la pop]",hopebeat,4jJpB6oqm4IiyyUYqetD7i,spotify:album:4jJpB6oqm4IiyyUYqetD7i,I hope you're well in these crazy times,2022-08-11,album
1,2OK16hAFRHoJiFZKeZe8A8,courtship.,Summer 2018,41,"[hopebeat, indie poptimism, la pop]",hopebeat,2l8ZSXbZTA2x6rnlkhcCng,spotify:album:2l8ZSXbZTA2x6rnlkhcCng,Better Than Real Life,2022-06-02,single
2,2OK16hAFRHoJiFZKeZe8A8,courtship.,Summer 2018,41,"[hopebeat, indie poptimism, la pop]",hopebeat,23iE5xCgge6qsC0Ixi4tkp,spotify:album:23iE5xCgge6qsC0Ixi4tkp,Million Dollar Smoothies (From “American Song ...,2022-03-28,single
3,2OK16hAFRHoJiFZKeZe8A8,courtship.,Summer 2018,41,"[hopebeat, indie poptimism, la pop]",hopebeat,3nVuCSIm6kZWawzUUkFWkl,spotify:album:3nVuCSIm6kZWawzUUkFWkl,80 in the friend zone,2021-10-20,single
4,2OK16hAFRHoJiFZKeZe8A8,courtship.,Summer 2018,41,"[hopebeat, indie poptimism, la pop]",hopebeat,5ZwWwcH6Q2xWIJzC1n5WXI,spotify:album:5ZwWwcH6Q2xWIJzC1n5WXI,I Try,2021-08-25,single
...,...,...,...,...,...,...,...,...,...,...,...
10955,5dMrRJRJsvS36Mp8u2i7kv,KingTrey,Alternative Hip-Hop,40,[],0,6U7GCtO8tr6pbtqSJYqxaC,spotify:album:6U7GCtO8tr6pbtqSJYqxaC,Let You Drown (with Kota the Friend),2022-04-22,single
10956,5dMrRJRJsvS36Mp8u2i7kv,KingTrey,Alternative Hip-Hop,40,[],0,3cmhexJ8j81x3jdW5bdQx1,spotify:album:3cmhexJ8j81x3jdW5bdQx1,Rolling (Remix),2021-11-17,single
10957,5dMrRJRJsvS36Mp8u2i7kv,KingTrey,Alternative Hip-Hop,40,[],0,4guVQ9AgEoxEZPqV8mNx74,spotify:album:4guVQ9AgEoxEZPqV8mNx74,BLUE CHECK EP,2021-05-21,single
10958,5dMrRJRJsvS36Mp8u2i7kv,KingTrey,Alternative Hip-Hop,40,[],0,6j9QvQHfK9WijB7t44CgiI,spotify:album:6j9QvQHfK9WijB7t44CgiI,GUOMF,2021-05-07,single


In [36]:
spotify.album('4jJpB6oqm4IiyyUYqetD7i', market='from_token')

Response contains unknown attribute: `album_group`
  return try_post_func(request, response, *params)


FullAlbum with fields:
  album_type = 'album'
  artists = [1 x SimpleArtist(external_urls, href, id, name, type, uri)]
  available_markets = None
  copyrights = [2 x Copyright(text, type)]
  external_ids = {'upc'}
  external_urls = {'spotify'}
  genres = [0 x str]
  href = 'https://api.spotify.com/v1/albums/4jJpB6oqm4IiyyUYqetD7i'
  id = '4jJpB6oqm4IiyyUYqetD7i'
  images = [3 x Image(height, url, width)]
  is_playable = True
  label = 'T∆G Music'
  name = 'I hope you're well in these crazy times'
  popularity = 40
  release_date = '2022-08-11'
  release_date_precision = 'day'
  total_tracks = 10
  tracks = SimpleTrackPaging(href, items, limit, next, offset, ...)
  type = 'album'
  uri = 'spotify:album:4jJpB6oqm4IiyyUYqetD7i'

In [37]:
# Cut the df into chunks
albumsUnique = artists.drop_duplicates(subset='albumID', ignore_index=True)
n = 0
albums = []
while n < len(artists):
    albumsToAdd = albumsUnique.iloc[n:(n+20)]
    # Add songs to the playlist
    try:
        with spotify.token_as(user_token):
            additions = [{'albumID':x.id, 'albumPopularity':x.popularity, 'genre':x.genres, 'songURI':[y.uri for y in x.tracks.items]} for x in spotify.albums(list(albumsToAdd['albumID']), market='from_token')]
            albums += additions
            # Wait 2 seconds so we do not exceed our API call allowance
            time.sleep(2)
    except:
        with spotify.token_as(user_token):
            for albumID in albumsToAdd['albumID']:
                try:
                    addition = [{'albumID':x.id, 'albumPopularity':x.popularity, 'genre':x.genres, 'songURI':[y.uri for y in x.tracks.items]} for x in spotify.album(albumID, market='from_token')]
                    albums += addition
                    # Wait 2 seconds so we do not exceed our API call allowance
                    time.sleep(2)
                except: 
                    pass
    n += 20
albums = pd.DataFrame(albums).explode('songURI')

# Merge song IDs in with artists and albums
songs = artists.merge(albums, how='left', left_on='albumID', right_on='albumID')
# Drop any duplicates
songs = songs.drop_duplicates(subset='songURI', ignore_index=True)

Response contains unknown attribute: `album_group`
  return post_func(*args, **kwargs)
Response contains unknown attribute: `album_group`
  return try_post_func(request, response, *params)


## Clean up columns

In [46]:
songs['artistGenre'] = [[y for y in x] for x in songs['artistGenre']]

TypeError: 'float' object is not iterable

In [45]:
songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26256 entries, 0 to 26255
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   artistID           26256 non-null  object  
 1   artistName         26256 non-null  object  
 2   artistSource       26256 non-null  category
 3   artistPopularity   26256 non-null  int64   
 4   artistGenre        26256 non-null  object  
 5   artistGenreUnique  26256 non-null  object  
 6   albumID            26256 non-null  object  
 7   albumURI           26256 non-null  object  
 8   albumName          26256 non-null  object  
 9   albumReleaseDate   26256 non-null  object  
 10  albumType          26256 non-null  object  
 11  albumPopularity    26255 non-null  float64 
 12  genre              26255 non-null  object  
 13  songURI            26255 non-null  object  
dtypes: category(1), float64(1), int64(1), object(11)
memory usage: 2.6+ MB


## Write to csv

In [47]:
songs.to_csv('songs2.csv', index=False)