In [1]:
import numpy as np
import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import keyring
import time

## 0. Setup Spotipy credentials and query wrapper

In [2]:
client_credentials_manager = SpotifyClientCredentials(client_id=keyring.get_password('spotify', 'cid'),
                                                      client_secret=keyring.get_password('spotify', 'secret') )
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)


Set keyword

In [3]:
KEYWORD = 'Britney Spears'

## 1. Search for the top N playlists for keyword

##### View the structure of a search query

In [4]:
results = sp.search(q='Britney Spears', type='playlist', market='PH')

In [5]:
[r['name'] for r in results['playlists']['items']]

['Britney Spears Greatest Hits My Perogative',
 'This Is Britney Spears',
 'Britney Spears: Complete Collection',
 'Britney Spears - All songs',
 'Britney Spears Exitos',
 'Top Hits of 2000',
 'Britney Spears Top 40 Hits',
 'Pop Nostalgia | Pop Internacional Antigo | Free Britney Spears',
 'Britney Spears - Girl In The Mirror : The Ballads',
 'Brittany Spears Greatest Hits']

In [6]:
results['playlists'].keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [9]:
results['playlists']['items'][6]

{'collaborative': False,
 'description': '',
 'external_urls': {'spotify': 'https://open.spotify.com/playlist/5gacB9jv4yEj55BAN9Yenj'},
 'href': 'https://api.spotify.com/v1/playlists/5gacB9jv4yEj55BAN9Yenj',
 'id': '5gacB9jv4yEj55BAN9Yenj',
 'images': [{'height': 640,
   'url': 'https://mosaic.scdn.co/640/ab67616d0000b2732aa20611c7fb964a74ab01a6ab67616d0000b2738e49866860c25afffe2f1a02ab67616d0000b273e1a4e01cb7a1ecff468bbeadab67616d0000b273efc6988972cb04105f002cd4',
   'width': 640},
  {'height': 300,
   'url': 'https://mosaic.scdn.co/300/ab67616d0000b2732aa20611c7fb964a74ab01a6ab67616d0000b2738e49866860c25afffe2f1a02ab67616d0000b273e1a4e01cb7a1ecff468bbeadab67616d0000b273efc6988972cb04105f002cd4',
   'width': 300},
  {'height': 60,
   'url': 'https://mosaic.scdn.co/60/ab67616d0000b2732aa20611c7fb964a74ab01a6ab67616d0000b2738e49866860c25afffe2f1a02ab67616d0000b273e1a4e01cb7a1ecff468bbeadab67616d0000b273efc6988972cb04105f002cd4',
   'width': 60}],
 'name': 'Britney Spears Top 40 Hits',
 

***

In [10]:
playlist_ids = []
playlist_names = []
playlist_numtracks = []

N = 100
#get playlist in batches of 50
for n in np.arange(N//50): #specified number of loops which is 2
    offset= 50*n
    print("Getting batch %d of search results for keyword: %s ..." % (n,KEYWORD), end='' )
    results = sp.search(q=KEYWORD, type='playlist' , market='PH', offset = offset, limit=50)
    playlist_ids.extend([p['href'].split('/')[5] for p in results['playlists']['items']])
    playlist_names.extend([p['name'] for p in results['playlists']['items']])
    playlist_numtracks.extend([p['tracks']['total'] for p in results['playlists']['items']])
    print("  DONE!")

Getting batch 0 of search results for keyword: Britney Spears ...  DONE!
Getting batch 1 of search results for keyword: Britney Spears ...  DONE!


## 2. Get Playlist Data

##### View the structure of a playlist query

In [11]:
playlist = sp.playlist('5gacB9jv4yEj55BAN9Yenj')

In [12]:
playlist

{'collaborative': False,
 'description': '',
 'external_urls': {'spotify': 'https://open.spotify.com/playlist/5gacB9jv4yEj55BAN9Yenj'},
 'followers': {'href': None, 'total': 174},
 'href': 'https://api.spotify.com/v1/playlists/5gacB9jv4yEj55BAN9Yenj?additional_types=track',
 'id': '5gacB9jv4yEj55BAN9Yenj',
 'images': [{'height': 640,
   'url': 'https://mosaic.scdn.co/640/ab67616d0000b2732aa20611c7fb964a74ab01a6ab67616d0000b2738e49866860c25afffe2f1a02ab67616d0000b273e1a4e01cb7a1ecff468bbeadab67616d0000b273efc6988972cb04105f002cd4',
   'width': 640},
  {'height': 300,
   'url': 'https://mosaic.scdn.co/300/ab67616d0000b2732aa20611c7fb964a74ab01a6ab67616d0000b2738e49866860c25afffe2f1a02ab67616d0000b273e1a4e01cb7a1ecff468bbeadab67616d0000b273efc6988972cb04105f002cd4',
   'width': 300},
  {'height': 60,
   'url': 'https://mosaic.scdn.co/60/ab67616d0000b2732aa20611c7fb964a74ab01a6ab67616d0000b2738e49866860c25afffe2f1a02ab67616d0000b273e1a4e01cb7a1ecff468bbeadab67616d0000b273efc6988972cb04105f

In [13]:
playlist.keys()

dict_keys(['collaborative', 'description', 'external_urls', 'followers', 'href', 'id', 'images', 'name', 'owner', 'primary_color', 'public', 'snapshot_id', 'tracks', 'type', 'uri'])

***

In [14]:
playlist_lookup = []
for n,p_id in enumerate(playlist_ids):
    if p_id != '5gacB9jv4yEj55BAN9Yenj':
        continue
    print("Getting playlist data for playlist %s :..." % (playlist_names[n]), end='' )
    playlist = sp.playlist(p_id)
    try:
        relevant_playlist_data = { key: playlist[key] for key in ['followers','owner']}
        relevant_playlist_data['playlist_id'] = p_id
        relevant_playlist_data['playlist_name'] = playlist_names[n]
        relevant_playlist_data['playlist_total_tracks'] = playlist_numtracks[n]
        relevant_playlist_data['owner_id'] = playlist['owner']['id']
        relevant_playlist_data['owner_name'] = playlist['owner']['display_name']
        relevant_playlist_data['total_followers'] = playlist['followers']['total']
        relevant_playlist_data.pop('owner', None)
        relevant_playlist_data.pop('followers', None)
        playlist_lookup.append(relevant_playlist_data)
        print("   DONE")
    except:
        print("   Aborted")
        continue

Getting playlist data for playlist Britney Spears Top 40 Hits :...   DONE


In [15]:
playlist_df = pd.DataFrame(playlist_lookup)
playlist_df =playlist_df.sort_values('total_followers',ascending=False)
playlist_df 

Unnamed: 0,playlist_id,playlist_name,playlist_total_tracks,owner_id,owner_name,total_followers
0,5gacB9jv4yEj55BAN9Yenj,Britney Spears Top 40 Hits,22,iluvmusic99two0,iluvmusic99two0,174


In [16]:
#playlist name must contain the keyword
playlist_df = playlist_df[playlist_df['playlist_name'].str.lower().str.contains(KEYWORD.lower())]

In [17]:
playlist_df.to_csv("./data/"+KEYWORD+"_playlist_data.csv",encoding='utf=8',index=False)

## 3. Get Tracks from a Playlist

##### View the structure of a playlist_tracks query

In [18]:
track = sp.playlist_tracks('37i9dQZF1DX4olOMiqFeqU')

In [19]:
track

{'href': 'https://api.spotify.com/v1/playlists/37i9dQZF1DX4olOMiqFeqU/tracks?offset=0&limit=100&additional_types=track',
 'items': [{'added_at': '2021-04-05T06:43:19Z',
   'added_by': {'external_urls': {'spotify': 'https://open.spotify.com/user/'},
    'href': 'https://api.spotify.com/v1/users/',
    'id': '',
    'type': 'user',
    'uri': 'spotify:user:'},
   'is_local': False,
   'primary_color': None,
   'track': {'album': {'album_type': 'album',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/2XHTklRsNMOOQT56Zm3WS4'},
       'href': 'https://api.spotify.com/v1/artists/2XHTklRsNMOOQT56Zm3WS4',
       'id': '2XHTklRsNMOOQT56Zm3WS4',
       'name': 'Parokya Ni Edgar',
       'type': 'artist',
       'uri': 'spotify:artist:2XHTklRsNMOOQT56Zm3WS4'}],
     'available_markets': ['AD',
      'AE',
      'AG',
      'AL',
      'AM',
      'AO',
      'AR',
      'AT',
      'AU',
      'AZ',
      'BA',
      'BB',
      'BD',
      'BE',
      'BF',
      

In [20]:
track.keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [21]:
track['items'][0].keys()

dict_keys(['added_at', 'added_by', 'is_local', 'primary_color', 'track', 'video_thumbnail'])

***

In [22]:
#get only top 20 followed playlists
playlist_df = playlist_df.head(20)

In [23]:
def get_relevant_track_data(tracks_data, playlist_id, playlist_name):
    try:
        relevant_track_data = { key: tracks_data['track'][key] for key in ['id','artists','name','popularity','duration_ms'] }
        relevant_track_data['artist_id']=[artist['id'] for artist in relevant_track_data['artists'] ]
        relevant_track_data['artist_name']=[artist['name']for artist in relevant_track_data['artists'] ]
        relevant_track_data['num_artists']=len([artist['id'] for artist in relevant_track_data['artists']]) 
        relevant_track_data['playlist_id']=playlist_id
        relevant_track_data['playlist_name']=playlist_name
        
        relevant_track_data.pop('artists', None)
        return relevant_track_data
    except:
        return 

In [24]:
#playlist_tracks
all_track_data = []

for _,p_id,p_name, p_numtracks in playlist_df[['playlist_id','playlist_name','playlist_total_tracks']].to_records():
    print("Fetching data for playlist = %s, with total tracks: %d" % (p_name,p_numtracks))
    n_fetches = p_numtracks // 100
    
    playlist_track_data = []
    #get tracks in batches of 100
    for n in np.arange(n_fetches+1):
        track_data = sp.playlist_tracks(p_id, offset=n*100)
        playlist_track_data.extend([get_relevant_track_data(item, p_id,p_name) for item in track_data['items']])
        
    all_track_data.extend(playlist_track_data)

Fetching data for playlist = Britney Spears Top 40 Hits, with total tracks: 22


In [25]:
for n,a in enumerate(all_track_data):
    try:
        len(a)
    except:
        print(n)

In [26]:
tracks_df = pd.DataFrame([data for data in all_track_data if data is not None])
tracks_df = tracks_df.rename(columns={'id':'track_id'})
tracks_df['artist_id'] = tracks_df.apply(lambda x: x['artist_id'][0] if x['num_artists']==1 else x['artist_id'], axis=1)
tracks_df['artist_name'] = tracks_df.apply(lambda x: x['artist_name'][0] if x['num_artists']==1 else x['artist_name'], axis=1)
tracks_df.head()

Unnamed: 0,track_id,name,popularity,duration_ms,artist_id,artist_name,num_artists,playlist_id,playlist_name
0,3MjUtNVVq3C8Fn0MP3zhXa,...Baby One More Time,79,211066,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,5gacB9jv4yEj55BAN9Yenj,Britney Spears Top 40 Hits
1,1UI0l2L66HJ9AtoEOlHzv4,Sometimes,68,245066,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,5gacB9jv4yEj55BAN9Yenj,Britney Spears Top 40 Hits
2,1DSJNBNhGZCigg9ll5VeZv,(You Drive Me) Crazy,63,198066,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,5gacB9jv4yEj55BAN9Yenj,Britney Spears Top 40 Hits
3,70XtWbcVZcpaOddJftMcVi,From the Bottom of My Broken Heart,54,312533,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,5gacB9jv4yEj55BAN9Yenj,Britney Spears Top 40 Hits
4,6naxalmIoLFWR0siv8dnQQ,Oops!...I Did It Again,79,211160,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,5gacB9jv4yEj55BAN9Yenj,Britney Spears Top 40 Hits


In [27]:
len(tracks_df)

22

In [28]:
len(tracks_df['track_id'].unique())

22

In [29]:
tracks_df.to_csv("./data/"+KEYWORD+"_playlist_tracks.csv",encoding='utf=8',index=False)

## 4. Get Tracks from a Playlist

In [30]:
tracks_df = pd.read_csv("./"+KEYWORD+"_playlist_tracks.csv")
tracks_df.head()

Unnamed: 0,track_id,name,popularity,duration_ms,artist_id,artist_name,num_artists,playlist_id,playlist_name
0,3MjUtNVVq3C8Fn0MP3zhXa,...Baby One More Time,79,211066,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,5gacB9jv4yEj55BAN9Yenj,Britney Spears Top 40 Hits
1,1UI0l2L66HJ9AtoEOlHzv4,Sometimes,68,245066,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,5gacB9jv4yEj55BAN9Yenj,Britney Spears Top 40 Hits
2,1DSJNBNhGZCigg9ll5VeZv,(You Drive Me) Crazy,63,198066,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,5gacB9jv4yEj55BAN9Yenj,Britney Spears Top 40 Hits
3,70XtWbcVZcpaOddJftMcVi,From the Bottom of My Broken Heart,54,312533,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,5gacB9jv4yEj55BAN9Yenj,Britney Spears Top 40 Hits
4,6naxalmIoLFWR0siv8dnQQ,Oops!...I Did It Again,79,211160,26dSoYclwsYLMAKD3tpOr4,Britney Spears,1,5gacB9jv4yEj55BAN9Yenj,Britney Spears Top 40 Hits


In [31]:
tracks_df.shape

(22, 9)

In [32]:
#remove track duplicates
tracks_df = tracks_df.drop_duplicates(subset='track_id')
tracks_df.shape

(22, 9)

In [33]:
def get_track_data(t_id, playlist_id,playlist_name):                    
    track_data = sp.track(t_id)
    track_features = sp.audio_features(t_id)
    
    #get only main(first) artist
    td_list = [t_id,\
               track_data['name'],\
               track_data['artists'][0]['id'],\
               track_data['artists'][0]['name'],\
               track_data['album']['uri'].split(":")[2],\
               track_data['duration_ms'],\
               track_data['album']['release_date'],\
               track_data['popularity']]
    data = pd.DataFrame([td_list], columns = ['track_id','track_name','artist_id','artist_name','album_id','duration','release_date','popularity'])

    relevant_cols = ['danceability', 'energy', 'key', 'loudness', 'mode',\
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']  
    
    tf_data = pd.DataFrame(track_features)
    tf_data = tf_data[relevant_cols]
    #tag with source playlist
    tf_data['playlist_id'] = playlist_id
    tf_data['playlist_name'] = playlist_name
    
    data = pd.concat([data, tf_data], axis=1)
    return data


In [34]:
downloaded_track_data = []

In [35]:
track_list = tracks_df['track_id'].values
playlist_name_list = tracks_df['track_id'].values
playlist_id_list = tracks_df['track_id'].values
df_list=[]

for i,track_id in enumerate(track_list):
    try:
        if track_id not in downloaded_track_data:
            print('[%d/%d] Fetching track data for %s... ' % 
                  (i+1,len(track_list),tracks_df[tracks_df['track_id']==track_id]['name'].values[0]), end = " ") 
            track_data = get_track_data(track_id, playlist_id_list[i],playlist_name_list[i]) 
            df_list.append(track_data)
            downloaded_track_data.append(track_id)
            print('done!')
    except:
        continue
    else:
        continue
    
    #sleep for 60 secs per 100 requests to avoid being blocked
    if (i % 100 == 0)&(i > 0):
        time.sleep(20)    

[1/22] Fetching track data for ...Baby One More Time...  done!
[2/22] Fetching track data for Sometimes...  done!
[3/22] Fetching track data for (You Drive Me) Crazy...  done!
[4/22] Fetching track data for From the Bottom of My Broken Heart...  done!
[5/22] Fetching track data for Oops!...I Did It Again...  done!
[6/22] Fetching track data for Lucky...  done!
[7/22] Fetching track data for Stronger...  done!
[8/22] Fetching track data for I'm a Slave 4 U...  done!
[9/22] Fetching track data for Me Against the Music (feat. Madonna) - LP Version / Video Mix...  done!
[10/22] Fetching track data for Toxic...  done!
[11/22] Fetching track data for Everytime...  done!
[12/22] Fetching track data for Gimme More...  done!
[13/22] Fetching track data for Piece of Me...  done!
[14/22] Fetching track data for Womanizer...  done!
[15/22] Fetching track data for Circus...  done!
[16/22] Fetching track data for If U Seek Amy...  done!
[17/22] Fetching track data for 3...  done!
[18/22] Fetching tr

In [36]:
tracks_data_df = pd.concat(df_list)
tracks_data_df.head()

Unnamed: 0,track_id,track_name,artist_id,artist_name,album_id,duration,release_date,popularity,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,playlist_id,playlist_name
0,3MjUtNVVq3C8Fn0MP3zhXa,...Baby One More Time,26dSoYclwsYLMAKD3tpOr4,Britney Spears,3WNxdumkSMGMJRhEgK80qx,211066,1999-01-12,79,0.759,0.699,...,-5.745,0,0.0307,0.202,0.000131,0.443,0.907,92.96,3MjUtNVVq3C8Fn0MP3zhXa,3MjUtNVVq3C8Fn0MP3zhXa
0,1UI0l2L66HJ9AtoEOlHzv4,Sometimes,26dSoYclwsYLMAKD3tpOr4,Britney Spears,3WNxdumkSMGMJRhEgK80qx,245066,1999-01-12,68,0.745,0.742,...,-5.693,1,0.0259,0.42,0.0,0.102,0.806,95.996,1UI0l2L66HJ9AtoEOlHzv4,1UI0l2L66HJ9AtoEOlHzv4
0,1DSJNBNhGZCigg9ll5VeZv,(You Drive Me) Crazy,26dSoYclwsYLMAKD3tpOr4,Britney Spears,3WNxdumkSMGMJRhEgK80qx,198066,1999-01-12,63,0.748,0.939,...,-4.288,0,0.0341,0.0534,0.0,0.32,0.96,104.001,1DSJNBNhGZCigg9ll5VeZv,1DSJNBNhGZCigg9ll5VeZv
0,70XtWbcVZcpaOddJftMcVi,From the Bottom of My Broken Heart,26dSoYclwsYLMAKD3tpOr4,Britney Spears,3WNxdumkSMGMJRhEgK80qx,312533,1999-01-12,54,0.677,0.665,...,-5.171,1,0.0305,0.56,1e-06,0.338,0.706,74.981,70XtWbcVZcpaOddJftMcVi,70XtWbcVZcpaOddJftMcVi
0,6naxalmIoLFWR0siv8dnQQ,Oops!...I Did It Again,26dSoYclwsYLMAKD3tpOr4,Britney Spears,5PmgtkodFl2Om3hMXONDll,211160,2000-05-16,79,0.751,0.834,...,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053,6naxalmIoLFWR0siv8dnQQ,6naxalmIoLFWR0siv8dnQQ


In [37]:
tracks_data_df.to_csv("./data/"+KEYWORD+"_playlist_tracks_data.csv", index=False, encoding='utf-8')