In [39]:
import numpy as np
import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import keyring
import time

## 0. Setup Spotipy credentials and query wrapper

In [40]:
client_credentials_manager = SpotifyClientCredentials(client_id=keyring.get_password('spotify', 'cid'),
                                                      client_secret=keyring.get_password('spotify', 'secret') )
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)


Set keyword

In [41]:
KEYWORD = 'Britney Spears'

## 1. Search for the top N playlists for keyword

##### View the structure of a search query

In [42]:
results = sp.search(q='Britney Spears', type='playlist', market='PH')

In [43]:
[r['name'] for r in results['playlists']['items']]

['Britney Spears Greatest Hits My Perogative',
 'This Is Britney Spears',
 'Britney Spears: Complete Collection',
 'Britney Spears - All songs',
 'Britney Spears Exitos',
 'Top Hits of 2000',
 'Britney Spears Top 40 Hits',
 'Pop Nostalgia | Pop Internacional Antigo | Free Britney Spears',
 'Britney Spears - Girl In The Mirror : The Ballads',
 'Brittany Spears Greatest Hits']

In [44]:
results['playlists'].keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [45]:
results['playlists']['items'][6]

{'collaborative': False,
 'description': '',
 'external_urls': {'spotify': 'https://open.spotify.com/playlist/5gacB9jv4yEj55BAN9Yenj'},
 'href': 'https://api.spotify.com/v1/playlists/5gacB9jv4yEj55BAN9Yenj',
 'id': '5gacB9jv4yEj55BAN9Yenj',
 'images': [{'height': 640,
   'url': 'https://mosaic.scdn.co/640/ab67616d0000b2732aa20611c7fb964a74ab01a6ab67616d0000b2738e49866860c25afffe2f1a02ab67616d0000b273e1a4e01cb7a1ecff468bbeadab67616d0000b273efc6988972cb04105f002cd4',
   'width': 640},
  {'height': 300,
   'url': 'https://mosaic.scdn.co/300/ab67616d0000b2732aa20611c7fb964a74ab01a6ab67616d0000b2738e49866860c25afffe2f1a02ab67616d0000b273e1a4e01cb7a1ecff468bbeadab67616d0000b273efc6988972cb04105f002cd4',
   'width': 300},
  {'height': 60,
   'url': 'https://mosaic.scdn.co/60/ab67616d0000b2732aa20611c7fb964a74ab01a6ab67616d0000b2738e49866860c25afffe2f1a02ab67616d0000b273e1a4e01cb7a1ecff468bbeadab67616d0000b273efc6988972cb04105f002cd4',
   'width': 60}],
 'name': 'Britney Spears Top 40 Hits',
 

***

In [46]:
playlist_ids = []
playlist_names = []
playlist_numtracks = []

N = 100
#get playlist in batches of 50
for n in np.arange(N//50): #specified number of loops which is 2
    offset= 50*n
    print("Getting batch %d of search results for keyword: %s ..." % (n,KEYWORD), end='' )
    results = sp.search(q=KEYWORD, type='playlist' , market='PH', offset = offset, limit=50)
    playlist_ids.extend([p['href'].split('/')[5] for p in results['playlists']['items']])
    playlist_names.extend([p['name'] for p in results['playlists']['items']])
    playlist_numtracks.extend([p['tracks']['total'] for p in results['playlists']['items']])
    print("  DONE!")

Getting batch 0 of search results for keyword: Britney Spears ...  DONE!
Getting batch 1 of search results for keyword: Britney Spears ...  DONE!


## 2. Get Playlist Data

##### View the structure of a playlist query

In [47]:
playlist = sp.playlist('5gacB9jv4yEj55BAN9Yenj')

In [48]:
playlist

{'collaborative': False,
 'description': '',
 'external_urls': {'spotify': 'https://open.spotify.com/playlist/5gacB9jv4yEj55BAN9Yenj'},
 'followers': {'href': None, 'total': 175},
 'href': 'https://api.spotify.com/v1/playlists/5gacB9jv4yEj55BAN9Yenj?additional_types=track',
 'id': '5gacB9jv4yEj55BAN9Yenj',
 'images': [{'height': 640,
   'url': 'https://mosaic.scdn.co/640/ab67616d0000b2732aa20611c7fb964a74ab01a6ab67616d0000b2738e49866860c25afffe2f1a02ab67616d0000b273e1a4e01cb7a1ecff468bbeadab67616d0000b273efc6988972cb04105f002cd4',
   'width': 640},
  {'height': 300,
   'url': 'https://mosaic.scdn.co/300/ab67616d0000b2732aa20611c7fb964a74ab01a6ab67616d0000b2738e49866860c25afffe2f1a02ab67616d0000b273e1a4e01cb7a1ecff468bbeadab67616d0000b273efc6988972cb04105f002cd4',
   'width': 300},
  {'height': 60,
   'url': 'https://mosaic.scdn.co/60/ab67616d0000b2732aa20611c7fb964a74ab01a6ab67616d0000b2738e49866860c25afffe2f1a02ab67616d0000b273e1a4e01cb7a1ecff468bbeadab67616d0000b273efc6988972cb04105f

In [49]:
playlist.keys()

dict_keys(['collaborative', 'description', 'external_urls', 'followers', 'href', 'id', 'images', 'name', 'owner', 'primary_color', 'public', 'snapshot_id', 'tracks', 'type', 'uri'])

***

In [50]:
playlist_lookup = []
for n,p_id in enumerate(playlist_ids):
    #if p_id != '5gacB9jv4yEj55BAN9Yenj':
    #    continue
    print("Getting playlist data for playlist %s :..." % (playlist_names[n]), end='' )
    playlist = sp.playlist(p_id)
    try:
        relevant_playlist_data = { key: playlist[key] for key in ['followers','owner']}
        relevant_playlist_data['playlist_id'] = p_id
        relevant_playlist_data['playlist_name'] = playlist_names[n]
        relevant_playlist_data['playlist_total_tracks'] = playlist_numtracks[n]
        relevant_playlist_data['owner_id'] = playlist['owner']['id']
        relevant_playlist_data['owner_name'] = playlist['owner']['display_name']
        relevant_playlist_data['total_followers'] = playlist['followers']['total']
        relevant_playlist_data.pop('owner', None)
        relevant_playlist_data.pop('followers', None)
        playlist_lookup.append(relevant_playlist_data)
        print("   DONE")
    except:
        print("   Aborted")
        continue

Getting playlist data for playlist Britney Spears Greatest Hits My Perogative :...   DONE
Getting playlist data for playlist This Is Britney Spears :...   DONE
Getting playlist data for playlist Britney Spears: Complete Collection :...   DONE
Getting playlist data for playlist Britney Spears - All songs :...   DONE
Getting playlist data for playlist Britney Spears Exitos :...   DONE
Getting playlist data for playlist Top Hits of 2000 :...   DONE
Getting playlist data for playlist Britney Spears Top 40 Hits :...   DONE
Getting playlist data for playlist Pop Nostalgia | Pop Internacional Antigo | Free Britney Spears :...   DONE
Getting playlist data for playlist Britney Spears - Girl In The Mirror : The Ballads :...   DONE
Getting playlist data for playlist Brittany Spears Greatest Hits :...   DONE
Getting playlist data for playlist Britney Spears Playlist :...   DONE
Getting playlist data for playlist Backstreet Boys, Britney Spears, *NSYNC, ... All Bangers :...   DONE
Getting playlist 

In [51]:
playlist_df = pd.DataFrame(playlist_lookup)
playlist_df =playlist_df.sort_values('total_followers',ascending=False)
playlist_df 

Unnamed: 0,playlist_id,playlist_name,playlist_total_tracks,owner_id,owner_name,total_followers
1,37i9dQZF1DWXcA2XXbXQ3d,This Is Britney Spears,46,spotify,Spotify,610994
57,6VdvufagCnB6BS52MxwPRw,Party Songs,143,myplay.com,Filtr US,229825
7,6jkzh0DCmoaFrJY9TJ21GO,Pop Nostalgia | Pop Internacional Antigo | Fre...,135,filtr.br,Filtr Brasil\t,216922
5,37i9dQZF1DWUZv12GM5cFk,Top Hits of 2000,100,spotify,Spotify,191741
2,6gDqhVd2qySRUQVWnyJRkh,Britney Spears: Complete Collection,149,britneyspears,Britney Spears,144547
...,...,...,...,...,...,...
66,0qSmfZFXziN0b4lvPyZTms,BRITNEY SPEARS,21,chadoth,chadoth,59
68,3mz9Y9ZMfsTcU2zYPknBg5,Toxic – Britney Spears,47,kimeemongs,kimeemongs,54
51,4BrFfmD8U7HO0PtyM7lIC0,It's Britney Bitch! (Best of Britney Spears),13,31yukip3mx7sb62wh7jfxhae63nq,Kristin,39
76,3c0vyG4lt1dng94s33o8m1,Circus – Britney Spears,4,1296456056,Bradley Sain,12


In [52]:
#playlist name must contain the keyword
playlist_df = playlist_df[playlist_df['playlist_name'].str.lower().str.contains(KEYWORD.lower())]

In [53]:
playlist_df.to_csv("./data/"+KEYWORD+"_playlist_data.csv",encoding='utf=8',index=False)

## 3. Get Tracks from a Playlist

##### View the structure of a playlist_tracks query

In [54]:
track = sp.playlist_tracks('37i9dQZF1DX4olOMiqFeqU')

In [55]:
track

{'href': 'https://api.spotify.com/v1/playlists/37i9dQZF1DX4olOMiqFeqU/tracks?offset=0&limit=100&additional_types=track',
 'items': [{'added_at': '2021-04-05T06:43:19Z',
   'added_by': {'external_urls': {'spotify': 'https://open.spotify.com/user/'},
    'href': 'https://api.spotify.com/v1/users/',
    'id': '',
    'type': 'user',
    'uri': 'spotify:user:'},
   'is_local': False,
   'primary_color': None,
   'track': {'album': {'album_type': 'album',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/2XHTklRsNMOOQT56Zm3WS4'},
       'href': 'https://api.spotify.com/v1/artists/2XHTklRsNMOOQT56Zm3WS4',
       'id': '2XHTklRsNMOOQT56Zm3WS4',
       'name': 'Parokya Ni Edgar',
       'type': 'artist',
       'uri': 'spotify:artist:2XHTklRsNMOOQT56Zm3WS4'}],
     'available_markets': ['AD',
      'AE',
      'AG',
      'AL',
      'AM',
      'AO',
      'AR',
      'AT',
      'AU',
      'AZ',
      'BA',
      'BB',
      'BD',
      'BE',
      'BF',
      

In [56]:
track.keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [57]:
track['items'][0].keys()

dict_keys(['added_at', 'added_by', 'is_local', 'primary_color', 'track', 'video_thumbnail'])

***

In [58]:
#get only top 20 followed playlists
playlist_df = playlist_df.head(20)

In [59]:
def get_relevant_track_data(tracks_data, playlist_id, playlist_name):
    try:
        relevant_track_data = { key: tracks_data['track'][key] for key in ['id','artists','name','popularity','duration_ms'] }
        relevant_track_data['artist_id']=[artist['id'] for artist in relevant_track_data['artists'] ]
        relevant_track_data['artist_name']=[artist['name']for artist in relevant_track_data['artists'] ]
        relevant_track_data['num_artists']=len([artist['id'] for artist in relevant_track_data['artists']]) 
        relevant_track_data['playlist_id']=playlist_id
        relevant_track_data['playlist_name']=playlist_name
        
        relevant_track_data.pop('artists', None)
        return relevant_track_data
    except:
        return 

In [60]:
#playlist_tracks
all_track_data = []

for _,p_id,p_name, p_numtracks in playlist_df[['playlist_id','playlist_name','playlist_total_tracks']].to_records():
    print("Fetching data for playlist = %s, with total tracks: %d" % (p_name,p_numtracks))
    n_fetches = p_numtracks // 100
    
    playlist_track_data = []
    #get tracks in batches of 100
    for n in np.arange(n_fetches+1):
        track_data = sp.playlist_tracks(p_id, offset=n*100)
        playlist_track_data.extend([get_relevant_track_data(item, p_id,p_name) for item in track_data['items']])
        
    all_track_data.extend(playlist_track_data)

Fetching data for playlist = This Is Britney Spears, with total tracks: 46
Fetching data for playlist = Pop Nostalgia | Pop Internacional Antigo | Free Britney Spears, with total tracks: 135
Fetching data for playlist = Britney Spears: Complete Collection, with total tracks: 149
Fetching data for playlist = Anos 2000 - Internacional | Backstreet Boys | Britney Spears | Christina Aguilera, with total tracks: 120
Fetching data for playlist = Divas Pop 💋 Free Britney Spears, with total tracks: 134
Fetching data for playlist = Britney Spears Playlist, with total tracks: 150
Fetching data for playlist = Britney Spears: Workout Mix, with total tracks: 19
Fetching data for playlist = Britney Spears: Remixes, with total tracks: 20
Fetching data for playlist = Best of Britney Spears, with total tracks: 13
Fetching data for playlist = Britney Spears: My Only Wish (This Year) - Holiday Mix, with total tracks: 22
Fetching data for playlist = oh to be britney spears in 2009, with total tracks: 316


In [61]:
for n,a in enumerate(all_track_data):
    try:
        len(a)
    except:
        print(n)

In [62]:
tracks_df = pd.DataFrame([data for data in all_track_data if data is not None])
tracks_df = tracks_df.rename(columns={'id':'track_id'})
tracks_df['artist_id'] = tracks_df.apply(lambda x: x['artist_id'][0] if x['num_artists']==1 else x['artist_id'], axis=1)
tracks_df['artist_name'] = tracks_df.apply(lambda x: x['artist_name'][0] if x['num_artists']==1 else x['artist_name'], axis=1)
tracks_df.head()

Unnamed: 0,track_id,name,popularity,duration_ms,artist_id,artist_name,num_artists,playlist_id,playlist_name
0,3MjUtNVVq3C8Fn0MP3zhXa,...Baby One More Time,79,211066,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,37i9dQZF1DWXcA2XXbXQ3d,This Is Britney Spears
1,6I9VzXrHxO9rA9A5euc8Ak,Toxic,82,198800,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,37i9dQZF1DWXcA2XXbXQ3d,This Is Britney Spears
2,6naxalmIoLFWR0siv8dnQQ,Oops!...I Did It Again,79,211160,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,37i9dQZF1DWXcA2XXbXQ3d,This Is Britney Spears
3,4fixebDZAVToLbUCuEloa2,Womanizer,78,224400,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,37i9dQZF1DWXcA2XXbXQ3d,This Is Britney Spears
4,6ic8OlLUNEATToEFU3xmaH,Gimme More,83,251240,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,37i9dQZF1DWXcA2XXbXQ3d,This Is Britney Spears


In [63]:
len(tracks_df)

1711

In [64]:
len(tracks_df['track_id'].unique())

1193

In [65]:
tracks_df.to_csv("./data/"+KEYWORD+"_playlist_tracks.csv",encoding='utf=8',index=False)

## 4. Get Tracks from a Playlist

In [67]:
tracks_df = pd.read_csv("./data/"+KEYWORD+"_playlist_tracks.csv")
tracks_df.head()

Unnamed: 0,track_id,name,popularity,duration_ms,artist_id,artist_name,num_artists,playlist_id,playlist_name
0,3MjUtNVVq3C8Fn0MP3zhXa,...Baby One More Time,79,211066,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,37i9dQZF1DWXcA2XXbXQ3d,This Is Britney Spears
1,6I9VzXrHxO9rA9A5euc8Ak,Toxic,82,198800,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,37i9dQZF1DWXcA2XXbXQ3d,This Is Britney Spears
2,6naxalmIoLFWR0siv8dnQQ,Oops!...I Did It Again,79,211160,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,37i9dQZF1DWXcA2XXbXQ3d,This Is Britney Spears
3,4fixebDZAVToLbUCuEloa2,Womanizer,78,224400,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,37i9dQZF1DWXcA2XXbXQ3d,This Is Britney Spears
4,6ic8OlLUNEATToEFU3xmaH,Gimme More,83,251240,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,37i9dQZF1DWXcA2XXbXQ3d,This Is Britney Spears


In [68]:
tracks_df.shape

(1711, 9)

In [69]:
#remove track duplicates
tracks_df = tracks_df.drop_duplicates(subset='track_id')
tracks_df.shape

(1193, 9)

In [70]:
def get_track_data(t_id, playlist_id,playlist_name):                    
    track_data = sp.track(t_id)
    track_features = sp.audio_features(t_id)
    
    #get only main(first) artist
    td_list = [t_id,\
               track_data['name'],\
               track_data['artists'][0]['id'],\
               track_data['artists'][0]['name'],\
               track_data['album']['uri'].split(":")[2],\
               track_data['duration_ms'],\
               track_data['album']['release_date'],\
               track_data['popularity']]
    data = pd.DataFrame([td_list], columns = ['track_id','track_name','artist_id','artist_name','album_id','duration','release_date','popularity'])

    relevant_cols = ['danceability', 'energy', 'key', 'loudness', 'mode',\
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']  
    
    tf_data = pd.DataFrame(track_features)
    tf_data = tf_data[relevant_cols]
    #tag with source playlist
    tf_data['playlist_id'] = playlist_id
    tf_data['playlist_name'] = playlist_name
    
    data = pd.concat([data, tf_data], axis=1)
    return data


In [71]:
downloaded_track_data = []

In [72]:
track_list = tracks_df['track_id'].values
playlist_name_list = tracks_df['track_id'].values
playlist_id_list = tracks_df['track_id'].values
df_list=[]

for i,track_id in enumerate(track_list):
    try:
        if track_id not in downloaded_track_data:
            print('[%d/%d] Fetching track data for %s... ' % 
                  (i+1,len(track_list),tracks_df[tracks_df['track_id']==track_id]['name'].values[0]), end = " ") 
            track_data = get_track_data(track_id, playlist_id_list[i],playlist_name_list[i]) 
            df_list.append(track_data)
            downloaded_track_data.append(track_id)
            print('done!')
    except:
        continue
    else:
        continue
    
    #sleep for 60 secs per 100 requests to avoid being blocked
    if (i % 100 == 0)&(i > 0):
        time.sleep(20)    

[1/1193] Fetching track data for ...Baby One More Time...  done!
[2/1193] Fetching track data for Toxic...  done!
[3/1193] Fetching track data for Oops!...I Did It Again...  done!
[4/1193] Fetching track data for Womanizer...  done!
[5/1193] Fetching track data for Gimme More...  done!
[6/1193] Fetching track data for Stronger...  done!
[7/1193] Fetching track data for Circus...  done!
[8/1193] Fetching track data for I'm a Slave 4 U...  done!
[9/1193] Fetching track data for Piece of Me...  done!
[10/1193] Fetching track data for (You Drive Me) Crazy - The Stop Remix!...  done!
[11/1193] Fetching track data for Sometimes...  done!
[12/1193] Fetching track data for Lucky...  done!
[13/1193] Fetching track data for Scream & Shout...  done!
[14/1193] Fetching track data for 3...  done!
[15/1193] Fetching track data for Work Bitch...  done!
[16/1193] Fetching track data for If U Seek Amy...  done!
[17/1193] Fetching track data for Do Somethin'...  done!
[18/1193] Fetching track data for O

In [73]:
tracks_data_df = pd.concat(df_list)
tracks_data_df.head()

Unnamed: 0,track_id,track_name,artist_id,artist_name,album_id,duration,release_date,popularity,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,playlist_id,playlist_name
0,3MjUtNVVq3C8Fn0MP3zhXa,...Baby One More Time,26dSoYclwsYLMAKD3tpOr4,Britney Spears,3WNxdumkSMGMJRhEgK80qx,211066,1999-01-12,79,0.759,0.699,...,-5.745,0,0.0307,0.202,0.000131,0.443,0.907,92.96,3MjUtNVVq3C8Fn0MP3zhXa,3MjUtNVVq3C8Fn0MP3zhXa
0,6I9VzXrHxO9rA9A5euc8Ak,Toxic,26dSoYclwsYLMAKD3tpOr4,Britney Spears,0z7pVBGOD7HCIB7S8eLkLI,198800,2003-11-13,82,0.774,0.838,...,-3.914,0,0.114,0.0249,0.025,0.242,0.924,143.04,6I9VzXrHxO9rA9A5euc8Ak,6I9VzXrHxO9rA9A5euc8Ak
0,6naxalmIoLFWR0siv8dnQQ,Oops!...I Did It Again,26dSoYclwsYLMAKD3tpOr4,Britney Spears,5PmgtkodFl2Om3hMXONDll,211160,2000-05-16,79,0.751,0.834,...,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053,6naxalmIoLFWR0siv8dnQQ,6naxalmIoLFWR0siv8dnQQ
0,4fixebDZAVToLbUCuEloa2,Womanizer,26dSoYclwsYLMAKD3tpOr4,Britney Spears,2tve5DGwub1TtbX1khPX5j,224400,2008-12-02,78,0.724,0.695,...,-5.226,1,0.0622,0.073,0.0,0.0889,0.235,139.0,4fixebDZAVToLbUCuEloa2,4fixebDZAVToLbUCuEloa2
0,6ic8OlLUNEATToEFU3xmaH,Gimme More,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1ePkYcH5ZQCb1b4tQeiEDj,251240,2007-10-25,83,0.788,0.844,...,-3.131,1,0.0334,0.25,0.000678,0.0723,0.382,113.324,6ic8OlLUNEATToEFU3xmaH,6ic8OlLUNEATToEFU3xmaH


In [74]:
tracks_data_df.to_csv("./data/"+KEYWORD+"_playlist_tracks_data.csv", index=False, encoding='utf-8')