In [None]:
import numpy as np
import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import keyring
import time

## Set up Spotipy credentials and query wrapper

In [None]:
# Client id and secret are read from the system keyring (service 'spotify')
# so they never appear in the notebook itself.
client_credentials_manager = SpotifyClientCredentials(
    client_id=keyring.get_password('spotify', 'cid'),
    client_secret=keyring.get_password('spotify', 'secret'),
)

# Shared Spotify client used by every query in this notebook.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Get sample artists data

In [None]:
# Sample Spotify artist id used by the single-artist demo calls in this notebook
artist_id = '3Nrfpe0tUJi4K4DXYWgMUX'

In [None]:
# Display the raw response returned by sp.artist for the sample artist
sp.artist(artist_id)

## Get sample track data

In [None]:
# Sample Spotify track id used by the single-track demo calls in this notebook
track_id = "3VqeTFIvhxu3DIe4eZVzGq"

In [None]:
# Display the raw response returned by sp.track for the sample track
sp.track(track_id)

In [None]:
# Display the raw response returned by sp.audio_features for the sample track
sp.audio_features(track_id)

## Read consolidated spotify daily charts

In [None]:
# Load the consolidated daily charts (path is relative to the working directory)
df = pd.read_csv('data/spotify_daily_charts.csv')
# Preview the first rows
df.head()

In [None]:
# Smallest and largest values of the 'date' column.
# NOTE(review): read_csv above is called without date parsing, so this presumably
# compares strings rather than dates — confirm the column dtype/format.
df['date'].min(),df['date'].max()

## Get data of unique tracks in charts 

In [None]:
def get_track_data(t_id):
    """Fetch metadata and audio features for one track as a single-row DataFrame.

    Calls ``sp.track`` and ``sp.audio_features`` for ``t_id`` and flattens the
    results into one row. Only the first artist listed on the track is kept.

    Parameters
    ----------
    t_id : str
        Spotify track id.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``track_id, track_name, artist_id,
        artist_name, album_id, duration, release_date, popularity`` followed by
        the audio-feature columns ``danceability, energy, key, loudness, mode,
        speechiness, acousticness, instrumentalness, liveness, valence, tempo``.
        Audio-feature values are ``None`` when no features are available for
        the track (empty/``None`` response, a ``[None]`` response, or a missing
        key in the features payload).

    Raises
    ------
    KeyError, IndexError
        If the track response lacks one of the metadata fields read below.
    """
    track_data = sp.track(t_id)
    track_features = sp.audio_features(t_id)

    # Keep only the main (first-listed) artist.
    main_artist = track_data['artists'][0]
    album = track_data['album']
    record = {
        'track_id': t_id,
        'track_name': track_data['name'],
        'artist_id': main_artist['id'],
        'artist_name': main_artist['name'],
        # The album id is the third component of the album URI.
        'album_id': album['uri'].split(":")[2],
        'duration': track_data['duration_ms'],
        'release_date': album['release_date'],
        'popularity': track_data['popularity'],
    }

    relevant_cols = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo']

    # The features call is expected to return a one-element list; treat None,
    # an empty list and [None] all as "no audio features available".
    features = track_features[0] if track_features else None
    if features is None:
        features = {}
    for col in relevant_cols:
        record[col] = features.get(col)

    return pd.DataFrame([record], columns=list(record))


In [None]:
# Try get_track_data on the sample track
get_track_data(track_id)

In [None]:
# Distinct (track_id, track_name) pairs that appear in the charts
track_df = df[['track_id','track_name']].drop_duplicates()
track_df

In [None]:
# Count of distinct track ids vs. distinct track names
# (dropna=False keeps a missing value, if present, counted as its own value).
track_df['track_id'].nunique(dropna=False), track_df['track_name'].nunique(dropna=False)

> Q: Why is it that we have fewer unique track names than unique track ids? Is this expected or does it indicate a data processing error?

In [None]:
# Accumulators for the track download loop. They live in their own cell so that
# re-running the loop cell does not reset what has already been fetched.
downloaded_track_ids=[]  # ids of tracks whose data was fetched successfully
df_list=[]  # one DataFrame per fetched track

In [None]:
track_list = track_df['track_id'].values
track_names = track_df['track_name'].values

# Set copy of the ids already fetched, for O(1) membership tests. The list
# itself is still appended to, so re-running this cell resumes a partial run.
already_downloaded = set(downloaded_track_ids)
fetch_attempts = 0

# Distinct loop names keep the sample `track_id` defined earlier intact.
for i, (chart_track_id, chart_track_name) in enumerate(zip(track_list, track_names)):
    if chart_track_id in already_downloaded:
        continue

    print('[%d/%d] Fetching track data for %s... ' %
          (i+1, len(track_list), chart_track_name), end = " ")

    try:
        fetched_track = get_track_data(chart_track_id)
    except Exception as e:
        # Best effort: report the failure and move on. The id is not marked as
        # downloaded, so re-running this cell retries it.
        print(e)
    else:
        downloaded_track_ids.append(chart_track_id)
        already_downloaded.add(chart_track_id)
        df_list.append(fetched_track)
        print('done!')

    # Pause for 10 seconds after every 100 fetch attempts to avoid being blocked.
    fetch_attempts += 1
    if fetch_attempts % 100 == 0:
        time.sleep(10)

In [None]:
# Stack the per-track frames into one table. ignore_index=True gives every row
# its own label instead of repeating the labels carried by the input frames.
tracks_data_df = pd.concat(df_list, ignore_index=True)
tracks_data_df.head()

In [None]:
# Save the track table as UTF-8 CSV; index=False leaves the row index out of the file
tracks_data_df.to_csv('data/spotify_daily_charts_tracks.csv', index=False, encoding='utf-8')

In [None]:
# Summary statistics of the track table (pandas describe defaults)
tracks_data_df.describe()

## Get data of unique artists in charts 

In [None]:
# Distinct (artist_id, artist_name) pairs among the artists recorded in the track table
artist_df = tracks_data_df[['artist_id','artist_name']].drop_duplicates()
artist_df

In [None]:
# Number of distinct (artist_id, artist_name) pairs
len(artist_df)

> Q: What does the ratio of unique artists to unique tracks tell you about the nature of the Spotify top-streamed market?

In [None]:
def get_artist_data(a_id):
    """Look up one artist via ``sp.artist`` and return it as a single-row DataFrame.

    Parameters
    ----------
    a_id : str
        Spotify artist id.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``artist_id, artist_name, total_followers,
        genres, popularity``; ``genres`` holds the value of the response's
        ``genres`` field unchanged.
    """
    profile = sp.artist(a_id)

    # Flatten the fields of interest into one record, in output-column order.
    row = {
        'artist_id': a_id,
        'artist_name': profile['name'],
        'total_followers': profile['followers']['total'],
        'genres': profile['genres'],
        'popularity': profile['popularity'],
    }
    return pd.DataFrame([row], columns=list(row))


In [None]:
# Try get_artist_data on the sample artist
get_artist_data(artist_id)

In [None]:
artist_list = artist_df['artist_id'].values
artist_names = artist_df['artist_name'].values
df_list=[]  # reset: from here on this collects one DataFrame per fetched artist
failed_artist_ids = []  # ids whose lookup raised, kept for inspection/retry

# Distinct loop names keep the sample `artist_id` defined earlier intact.
for i, (chart_artist_id, chart_artist_name) in enumerate(zip(artist_list, artist_names)):
    print('[%d/%d] Fetching artist data for %s... ' %
          (i+1, len(artist_list), chart_artist_name), end = " ")

    try:
        fetched_artist = get_artist_data(chart_artist_id)
    except Exception as e:
        # Same best-effort policy as the track loop: report the failure and
        # continue instead of aborting the whole download.
        failed_artist_ids.append(chart_artist_id)
        print(e)
    else:
        df_list.append(fetched_artist)
        print('done!')

    # Pause for 10 seconds after every 100 requests to avoid being blocked.
    if (i + 1) % 100 == 0:
        time.sleep(10)

In [None]:
# Stack the per-artist frames into one table. ignore_index=True gives every row
# its own label instead of repeating the labels carried by the input frames.
artist_data_df = pd.concat(df_list, ignore_index=True)
artist_data_df

In [None]:
# Save the artist table as UTF-8 CSV; index=False leaves the row index out of the file
artist_data_df.to_csv('data/spotify_daily_charts_artists.csv', index=False, encoding='utf-8')

## Resources
- Spotify Web API reference manual: https://developer.spotify.com/documentation/web-api/reference/search/search/