In [1]:
import os
import time
import json

import pandas as pd
import numpy as np

import requests

from threading import Thread
from queue import Queue

In [2]:
song_analytics = pd.read_csv('../song-ranked-analytics.csv')

In [3]:
song_analytics_not_na_spotify_id = song_analytics[song_analytics.spotify_id.notnull()]
song_analytics_not_na_analytics_url = song_analytics[song_analytics.analysis_url.notnull()]

In [4]:
song_analytics.head()

Unnamed: 0.1,Unnamed: 0,date,song_id,song_name,artist_id,display_artist,spotify_id,rank,acousticness,analysis_url,...,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0,2016-10-15,27182,Closer,5794072.0,The Chainsmokers Featuring Halsey,7BKLCZ1jbUBVqRi2FVlTVw,1,0.414,https://api.spotify.com/v1/audio-analysis/7BKL...,...,0.111,-5.599,1.0,0.0338,95.01,4.0,https://api.spotify.com/v1/tracks/7BKLCZ1jbUBV...,audio_features,spotify:track:7BKLCZ1jbUBVqRi2FVlTVw,0.661
1,1,2016-10-08,27182,Closer,5794072.0,The Chainsmokers Featuring Halsey,7BKLCZ1jbUBVqRi2FVlTVw,1,0.414,https://api.spotify.com/v1/audio-analysis/7BKL...,...,0.111,-5.599,1.0,0.0338,95.01,4.0,https://api.spotify.com/v1/tracks/7BKLCZ1jbUBV...,audio_features,spotify:track:7BKLCZ1jbUBVqRi2FVlTVw,0.661
2,2,2016-10-01,27182,Closer,5794072.0,The Chainsmokers Featuring Halsey,7BKLCZ1jbUBVqRi2FVlTVw,1,0.414,https://api.spotify.com/v1/audio-analysis/7BKL...,...,0.111,-5.599,1.0,0.0338,95.01,4.0,https://api.spotify.com/v1/tracks/7BKLCZ1jbUBV...,audio_features,spotify:track:7BKLCZ1jbUBVqRi2FVlTVw,0.661
3,3,2016-09-24,27182,Closer,5794072.0,The Chainsmokers Featuring Halsey,7BKLCZ1jbUBVqRi2FVlTVw,1,0.414,https://api.spotify.com/v1/audio-analysis/7BKL...,...,0.111,-5.599,1.0,0.0338,95.01,4.0,https://api.spotify.com/v1/tracks/7BKLCZ1jbUBV...,audio_features,spotify:track:7BKLCZ1jbUBVqRi2FVlTVw,0.661
4,4,2016-09-17,27182,Closer,5794072.0,The Chainsmokers Featuring Halsey,7BKLCZ1jbUBVqRi2FVlTVw,1,0.414,https://api.spotify.com/v1/audio-analysis/7BKL...,...,0.111,-5.599,1.0,0.0338,95.01,4.0,https://api.spotify.com/v1/tracks/7BKLCZ1jbUBV...,audio_features,spotify:track:7BKLCZ1jbUBVqRi2FVlTVw,0.661


In [5]:
unique_spotify_ids = sorted(song_analytics_not_na_spotify_id.spotify_id.unique())

In [6]:
acquired_analysis = [file.split("-")[0] for file in os.listdir('../audio_analysis/')]
unique_spotify_ids = list(set(unique_spotify_ids).difference(set(acquired_analysis)))

In [7]:
len(unique_spotify_ids)

1

In [8]:
_AA_SPOTIFY_URI = 'https://api.spotify.com/v1/audio-analysis/{}'
# You need to obtain a spotify api token for this.
_HEADERS = {
    "Authorization": "Bearer "
}

aa_file_path = '../audio_analysis/{}-analysis.json'

In [9]:
def write_analysis_to_file(song_id, analysis):
    f = open(aa_file_path.format(song_id), 'w')
    f.write(analysis)
    f.close()
    
    return 

In [10]:
ids_not_found = list()

In [11]:
def crawl(q, result):
    while not q.empty():
        work = q.get()                      #fetch new work from the Queue
        s_id = work[1]
        
        url = _AA_SPOTIFY_URI.format(s_id)
        
        try:
            req = requests.get(url, headers=_HEADERS)
            
            if req.status_code != 200:
                if req.status_code == 429:
                    print('Oops, hit your limit! [Status Code {}]'.format(req.status_code))

                    time_to_wait = req.headers.get('Retry-After')
                    if time_to_wait is not None:                
                        time.sleep(int(time_to_wait))

                        req = requests.get(url, headers=_HEADERS)

                        if req.status_code == 200:
                            print('Success!! Adding analysis to file.')
                            write_analysis_to_file(s_id, req.text)

                        else:
                            ids_not_found.append(s_id)

                    else:
                        print('No retry after found.. Adding id to not found list. {}'.format(req.headers))
                        ids_not_found.append(s_id)

                else:
                    print('Un-accounted for status code. Adding id to list. [Status Code {}]'.format(req.status_code))
                    ids_not_found.append(s_id)
            else:
                print('Success!! Adding analysis to file.')
                write_analysis_to_file(s_id, req.text)
            
        except Exception as e:
            print('Error with URL check!')
            print(e)
            
        #signal to the queue that task has been processed
        q.task_done()
    return True


In [12]:
#set up the queue to hold all the urls
q = Queue(maxsize=0)
# Use many threads (50 max, or one for each url)
num_theads = min(1, len(unique_spotify_ids))

In [13]:
results = [{} for x in unique_spotify_ids];
#load up the queue with the urls to fetch and the index for each job (as a tuple):
for i in range(len(unique_spotify_ids)):
    #need the index and the url in each queue item.
    q.put((i, unique_spotify_ids[i]))

In [14]:
for i in range(num_theads):
    print('Starting thread ', i)
    worker = Thread(target=crawl, args=(q, results))
    worker.setDaemon(True)    #setting threads as "daemon" allows main program to 
                              #exit eventually even if these dont finish 
                              #correctly.
    worker.start()
    
print('Processing finished!!')

Starting thread  0
Processing finished!!
