In [1]:
import numpy as np
import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import keyring
import time

## Setup Spotipy credentials and query wrapper

In [2]:
client_credentials_manager = SpotifyClientCredentials(client_id=keyring.get_password('spotify', 'cid'),
                                                      client_secret=keyring.get_password('spotify', 'secret') )
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

## Get sample artists data

Google:
    
open spotify artist 3Nrfpe0tUJi4K4DXYWgMUX

In [3]:
artist_id = '3Nrfpe0tUJi4K4DXYWgMUX'

In [4]:
# View sp.artist output
sp.artist(artist_id)

{'external_urls': {'spotify': 'https://open.spotify.com/artist/3Nrfpe0tUJi4K4DXYWgMUX'},
 'followers': {'href': None, 'total': 33073199},
 'genres': ['k-pop', 'k-pop boy group'],
 'href': 'https://api.spotify.com/v1/artists/3Nrfpe0tUJi4K4DXYWgMUX',
 'id': '3Nrfpe0tUJi4K4DXYWgMUX',
 'images': [{'height': 640,
   'url': 'https://i.scdn.co/image/ab6761610000e5eb0dd1c51e1a96d5300f730db4',
   'width': 640},
  {'height': 320,
   'url': 'https://i.scdn.co/image/ab676161000051740dd1c51e1a96d5300f730db4',
   'width': 320},
  {'height': 160,
   'url': 'https://i.scdn.co/image/ab6761610000f1780dd1c51e1a96d5300f730db4',
   'width': 160}],
 'name': 'BTS',
 'popularity': 97,
 'type': 'artist',
 'uri': 'spotify:artist:3Nrfpe0tUJi4K4DXYWgMUX'}

## Get sample track data

In [5]:
track_id = "3VqeTFIvhxu3DIe4eZVzGq"

In [6]:
# View sp.track output
sp.track(track_id)

{'album': {'album_type': 'single',
  'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/3Nrfpe0tUJi4K4DXYWgMUX'},
    'href': 'https://api.spotify.com/v1/artists/3Nrfpe0tUJi4K4DXYWgMUX',
    'id': '3Nrfpe0tUJi4K4DXYWgMUX',
    'name': 'BTS',
    'type': 'artist',
    'uri': 'spotify:artist:3Nrfpe0tUJi4K4DXYWgMUX'}],
  'available_markets': ['AD',
   'AE',
   'AG',
   'AL',
   'AM',
   'AO',
   'AR',
   'AT',
   'AU',
   'AZ',
   'BA',
   'BB',
   'BD',
   'BE',
   'BF',
   'BG',
   'BH',
   'BI',
   'BJ',
   'BN',
   'BO',
   'BR',
   'BS',
   'BT',
   'BW',
   'BY',
   'BZ',
   'CA',
   'CH',
   'CI',
   'CL',
   'CM',
   'CO',
   'CR',
   'CV',
   'CY',
   'CZ',
   'DE',
   'DJ',
   'DK',
   'DM',
   'DO',
   'DZ',
   'EC',
   'EE',
   'EG',
   'ES',
   'FI',
   'FJ',
   'FM',
   'FR',
   'GA',
   'GB',
   'GD',
   'GE',
   'GH',
   'GM',
   'GN',
   'GQ',
   'GR',
   'GT',
   'GW',
   'GY',
   'HK',
   'HN',
   'HR',
   'HT',
   'HU',
   'ID',
   'IE',
   'IL'

### Description

https://developer.spotify.com/documentation/web-api/reference/#object-audiofeaturesobject

In [7]:
# View sp.audio_features output
sp.audio_features(track_id)

[{'danceability': 0.759,
  'energy': 0.459,
  'key': 8,
  'loudness': -5.187,
  'mode': 1,
  'speechiness': 0.0948,
  'acousticness': 0.00323,
  'instrumentalness': 0,
  'liveness': 0.0906,
  'valence': 0.695,
  'tempo': 109.997,
  'type': 'audio_features',
  'id': '3VqeTFIvhxu3DIe4eZVzGq',
  'uri': 'spotify:track:3VqeTFIvhxu3DIe4eZVzGq',
  'track_href': 'https://api.spotify.com/v1/tracks/3VqeTFIvhxu3DIe4eZVzGq',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/3VqeTFIvhxu3DIe4eZVzGq',
  'duration_ms': 164442,
  'time_signature': 4}]

## Read consolidated spotify daily charts

In [9]:
df = pd.read_csv('data/spotify_daily_charts.csv')
df.head()

Unnamed: 0,date,position,track_id,track_name,artist,streams
0,2017-01-01,1,0kN8xEmgMW9mh7UmDYHlJP,Versace on the Floor,Bruno Mars,185236
1,2017-01-01,2,5uCax9HTNlzGybIStD3vDh,Say You Won't Let Go,James Arthur,180552
2,2017-01-01,3,7BKLCZ1jbUBVqRi2FVlTVw,Closer,The Chainsmokers,158720
3,2017-01-01,4,2rizacJSyD9S1IQUxUxnsK,All We Know,The Chainsmokers,130874
4,2017-01-01,5,5MFzQMkrl1FOOng9tq6R9r,Don't Wanna Know,Maroon 5,129656


In [10]:
df['date'].min(),df['date'].max()

('2017-01-01', '2021-05-20')

## Get data of unique tracks in charts 

In [11]:
def get_track_data(t_id):                    
    track_data = sp.track(t_id)
    track_features = sp.audio_features(t_id)
    
    #get only main(first) artist
    td_list = [t_id,\
               track_data['name'],\
               track_data['artists'][0]['id'],\
               track_data['artists'][0]['name'],\
               track_data['album']['uri'].split(":")[2],\
               track_data['duration_ms'],\
               track_data['album']['release_date'],\
               track_data['popularity']]
    data = pd.DataFrame([td_list], columns = ['track_id','track_name','artist_id','artist_name','album_id','duration','release_date','popularity'])

    relevant_cols = ['danceability', 'energy', 'key', 'loudness', 'mode',\
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']  
    
    #if audio features are not available
    if track_features==[None]:
        tf_data = pd.DataFrame([{x:None for x in relevant_cols}])
    else:
        tf_data = pd.DataFrame(track_features)
        tf_data = tf_data[relevant_cols]
    
    data = pd.concat([data, tf_data], axis=1)
    return data


In [12]:
get_track_data(track_id)

Unnamed: 0,track_id,track_name,artist_id,artist_name,album_id,duration,release_date,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,3VqeTFIvhxu3DIe4eZVzGq,Butter,3Nrfpe0tUJi4K4DXYWgMUX,BTS,2BDhPi2XCYujYxU6VM0QaD,164441,2021-05-21,87,0.759,0.459,8,-5.187,1,0.0948,0.00323,0,0.0906,0.695,109.997


In [14]:
track_df = df[['track_id','track_name']].drop_duplicates()
track_df

Unnamed: 0,track_id,track_name
0,0kN8xEmgMW9mh7UmDYHlJP,Versace on the Floor
1,5uCax9HTNlzGybIStD3vDh,Say You Won't Let Go
2,7BKLCZ1jbUBVqRi2FVlTVw,Closer
3,2rizacJSyD9S1IQUxUxnsK,All We Know
4,5MFzQMkrl1FOOng9tq6R9r,Don't Wanna Know
...,...,...
318647,6Fz2TpxUD0YvAPsuG8nDMJ,MAPA
318853,2zrhoHlFKxFTRF5aMyxMoQ,Next Level
319182,2UbVnbE5FH6008mAm6Mmgw,Run
319399,7I9RoRcJ0N9rvN4p4KWPsH,Paraluman


In [15]:
len(pd.unique(track_df['track_id'].values)),len(pd.unique(track_df['track_name'].values))

(3495, 2742)

> Q: Why is it that we have fewer unique track names than unique track ids? Is this expected or does it indicate a data processing error?

In [16]:
downloaded_track_ids=[]
df_list=[]

In [17]:
track_list = track_df['track_id'].values
for i,track_id in enumerate(track_list):    
    if track_id in downloaded_track_ids:
        continue
    else:
        print('[%d/%d] Fetching track data for %s... ' % 
              (i+1,len(track_list),track_df[track_df['track_id']==track_id]['track_name'].values[0]), end = " ") 
        
        try:
            track_data = get_track_data(track_id) 
            downloaded_track_ids.append(track_id)
            df_list.append(track_data)
            print('done!')
        except Exception as e:
            print(e)

        # sleep for 100 secs per 100 requests to avoid being blocked
        if (i % 100 == 0)&(i > 0):
            #time.sleep(10)    
            time.sleep(30)

[1/3495] Fetching track data for Versace on the Floor...  done!
[2/3495] Fetching track data for Say You Won't Let Go...  done!
[3/3495] Fetching track data for Closer...  done!
[4/3495] Fetching track data for All We Know...  done!
[5/3495] Fetching track data for Don't Wanna Know...  done!
[6/3495] Fetching track data for 24K Magic...  done!
[7/3495] Fetching track data for Starving...  done!
[8/3495] Fetching track data for How Far I'll Go - From "Moana"...  done!
[9/3495] Fetching track data for Let Me Love You...  done!
[10/3495] Fetching track data for Starboy...  done!
[11/3495] Fetching track data for Jumpshot...  done!
[12/3495] Fetching track data for Cold Water (feat. Justin Bieber & MØ)...  done!
[13/3495] Fetching track data for Perfect Strangers...  done!
[14/3495] Fetching track data for Bad Things (with Camila Cabello)...  done!
[15/3495] Fetching track data for I Don’t Wanna Live Forever (Fifty Shades Darker) - From "Fifty Shades Darker (Original Motion Picture Soundtr

In [18]:
tracks_data_df = pd.concat(df_list)
tracks_data_df.head()

Unnamed: 0,track_id,track_name,artist_id,artist_name,album_id,duration,release_date,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0kN8xEmgMW9mh7UmDYHlJP,Versace on the Floor,0du5cEVh5yTK9QJze8zA0C,Bruno Mars,4PgleR09JVnm3zY1fW3XBA,261240,2016-11-17,76,0.578,0.574,2,-6.209,1,0.0454,0.196,0.0,0.083,0.301,174.152
0,5uCax9HTNlzGybIStD3vDh,Say You Won't Let Go,4IWBUUAFIplrNtaOHcJPRM,James Arthur,7oiJYvEJHsmYtrgviAVIBD,211466,2016-10-28,85,0.358,0.557,10,-7.398,1,0.059,0.695,0.0,0.0902,0.494,85.043
0,7BKLCZ1jbUBVqRi2FVlTVw,Closer,69GGBxA162lTqCwzJG5jLp,The Chainsmokers,0rSLgV8p5FzfnqlEk4GzxE,244960,2016-07-29,85,0.748,0.524,8,-5.599,1,0.0338,0.414,0.0,0.111,0.661,95.01
0,2rizacJSyD9S1IQUxUxnsK,All We Know,69GGBxA162lTqCwzJG5jLp,The Chainsmokers,0xmaV6EtJ4M3ebZUPRnhyb,194080,2016-09-29,69,0.662,0.586,0,-8.821,1,0.0307,0.097,0.00272,0.115,0.296,90.0
0,5MFzQMkrl1FOOng9tq6R9r,Don't Wanna Know,04gDigrS5kc9YWfZHwBETP,Maroon 5,0fvTn3WXF39kQs9i3bnNpP,214480,2016-10-11,0,0.783,0.623,7,-6.126,1,0.08,0.338,0.0,0.0975,0.447,100.048


In [19]:
tracks_data_df.to_csv('data/spotify_daily_charts_tracks.csv', index=False, encoding='utf-8')

In [20]:
tracks_data_df.describe()

Unnamed: 0,duration,popularity
count,3495.0,3495.0
mean,215404.976824,48.157368
std,45666.616715,29.252437
min,37640.0,0.0
25%,189289.5,23.0
50%,209831.0,59.0
75%,237530.0,71.0
max,536217.0,100.0


## Get data of unique artists in charts 

In [21]:
#Get unique artists id
artist_df = tracks_data_df[['artist_id','artist_name']].drop_duplicates()
artist_df

Unnamed: 0,artist_id,artist_name
0,0du5cEVh5yTK9QJze8zA0C,Bruno Mars
0,4IWBUUAFIplrNtaOHcJPRM,James Arthur
0,69GGBxA162lTqCwzJG5jLp,The Chainsmokers
0,04gDigrS5kc9YWfZHwBETP,Maroon 5
0,5p7f24Rk5HkUZsaS3BLG5F,Hailee Steinfeld
...,...,...
0,1DlYnIiliftt6R21Y5NOW2,Adie
0,26cMerAxjx9GedFt0lMDjm,Bella Poarch
0,2WgfkM8S11vg4kxLgDY3F5,StarBoi3
0,3Dr5ezvDdYsycy1gfaZWSL,raven


In [22]:
len(artist_df)

801

> Q: What does the ratio of unique artists to unique tracks tell you about the nature of the Spotify top-streamed market?

In [23]:
def get_artist_data(a_id):
       
    artist_data = sp.artist(a_id)

    ad_list = [a_id,\
               artist_data['name'],\
               artist_data['followers']['total'],\
               artist_data['genres'],\
               artist_data['popularity']]
    data = pd.DataFrame([ad_list], columns = ['artist_id','artist_name','total_followers','genres','popularity'])

    return data


In [24]:
get_artist_data(artist_id)

Unnamed: 0,artist_id,artist_name,total_followers,genres,popularity
0,3Nrfpe0tUJi4K4DXYWgMUX,BTS,33073199,"[k-pop, k-pop boy group]",97


In [25]:
artist_list = artist_df['artist_id'].values
df_list=[]

for i,artist_id in enumerate(artist_list):
    print('[%d/%d] Fetching artist data for %s... ' % 
          (i+1,len(artist_list),artist_df[artist_df['artist_id']==artist_id]['artist_name'].values[0]), end = " ") 
    artist_data = get_artist_data(artist_id) 
    df_list.append(artist_data)
    print('done!')
    
    #sleep for 100 secs per 100 requests to avoid being blocked
    if (i % 100 == 0)& (i > 0):
        #time.sleep(10)   
        time.sleep(25)   

[1/801] Fetching artist data for Bruno Mars...  done!
[2/801] Fetching artist data for James Arthur...  done!
[3/801] Fetching artist data for The Chainsmokers...  done!
[4/801] Fetching artist data for Maroon 5...  done!
[5/801] Fetching artist data for Hailee Steinfeld...  done!
[6/801] Fetching artist data for Alessia Cara...  done!
[7/801] Fetching artist data for DJ Snake...  done!
[8/801] Fetching artist data for The Weeknd...  done!
[9/801] Fetching artist data for Dawin...  done!
[10/801] Fetching artist data for Major Lazer...  done!
[11/801] Fetching artist data for Jonas Blue...  done!
[12/801] Fetching artist data for Machine Gun Kelly...  done!
[13/801] Fetching artist data for ZAYN...  done!
[14/801] Fetching artist data for The Vamps...  done!
[15/801] Fetching artist data for Martin Garrix...  done!
[16/801] Fetching artist data for Ariana Grande...  done!
[17/801] Fetching artist data for Starley...  done!
[18/801] Fetching artist data for Andy Grammer...  done!
[19/80

In [26]:
# artist_data_df = pd.concat(df_list).reset_index(drop=True) not used since we drop on saving
artist_data_df = pd.concat(df_list)
artist_data_df 

Unnamed: 0,artist_id,artist_name,total_followers,genres,popularity
0,0du5cEVh5yTK9QJze8zA0C,Bruno Mars,30647606,"[dance pop, pop]",95
0,4IWBUUAFIplrNtaOHcJPRM,James Arthur,8048670,"[pop, post-teen pop, talent show, uk pop]",87
0,69GGBxA162lTqCwzJG5jLp,The Chainsmokers,17876933,"[dance pop, edm, electropop, pop, pop dance, t...",86
0,04gDigrS5kc9YWfZHwBETP,Maroon 5,30804160,"[pop, pop rock]",92
0,5p7f24Rk5HkUZsaS3BLG5F,Hailee Steinfeld,6766803,"[dance pop, pop, post-teen pop]",79
...,...,...,...,...,...
0,1DlYnIiliftt6R21Y5NOW2,Adie,7382,"[opm, pinoy indie]",52
0,26cMerAxjx9GedFt0lMDjm,Bella Poarch,128338,[],73
0,2WgfkM8S11vg4kxLgDY3F5,StarBoi3,8072,[],73
0,3Dr5ezvDdYsycy1gfaZWSL,raven,7385,[],50


In [27]:
artist_data_df.to_csv('data/spotify_daily_charts_artists.csv', index=False, encoding='utf-8')

## Resources
- Spotify API reference manual https://developer.spotify.com/documentation/web-api/reference/search/search/