## Import libraries

In [1]:
import json
import os
from random import randint
from time import time, sleep

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from requests import get, post

## Obtain token and prepare headers to access Spotify's APIs.

In [2]:
# post Client Credentials to Spotify's Token API to obtain Token
# the "token" entry of token.json is sent verbatim as the Authorization header value
# NOTE(review): presumably this is the full "Basic <base64(client_id:client_secret)>" string - confirm
with open("./token/token.json", "r") as token_fh:
    # distinct name for the file handle so the parsed credentials do not shadow it
    token_file = json.load(token_fh)
res = post('https://accounts.spotify.com/api/token', headers = {'Authorization': '{}'.format(token_file["token"])}, data= {'grant_type': 'client_credentials'})
# fail fast with the HTTP error rather than an opaque KeyError on 'access_token' below
res.raise_for_status()
token = 'Bearer {}'.format(res.json()['access_token'])

# define headers to include Token; every API helper below reads this module-level dict
headers = {'Authorization': token, "Accept": 'application/json', 'Content-Type': "application/json"}

## Function that extracts basic info of songs from specified year (name, id, popularity, release date) and stores them in dataframe songs_info.

approx. 5secs per 50 songs

In [3]:
def extract_songs(year, num):
    """
    Function to extract specified number of songs from a specified year.
    Queries Spotify's search API in batches of 50 tracks, pausing 1-3 secs between batches,
    and uses the module-level `headers` dict for authorization.
    Args:
        year: year to extract songs from
        num: number of songs to extract; requests are made in whole batches of 50, so more
             than num songs can be returned when num is not a multiple of 50
    Returns:
        dataframe with columns name, id, popularity, release_date, one row per track returned
    """
    columns = ["name", "id", "popularity", "release_date"]

    # collect plain records and build the dataframe once at the end
    # (row-by-row DataFrame.append is quadratic and no longer exists in pandas 2.x)
    records = []

    # set start_time to track progress
    start_time = time()

    # from Spotify API, get songs and basic info released in specified year, in batches of 50, with rate limit of 1-3 secs between batches
    # maximum songs per year is 10k
    for offset in range(0, num, 50):
        url = "https://api.spotify.com/v1/search?q=%20year:{}&limit=50&offset={}&type=track".format(year, offset)
        r = get(url, headers=headers)

        for item in r.json()["tracks"]["items"]:
            records.append({"name": item["name"], "id": item["id"], "popularity": item["popularity"],
                            "release_date": item["album"]["release_date"]})

        # track progress on a 20-character bar
        requested = offset + 50
        bar = "#" * round(requested / (num / 20)) + "-" * round(num / (num / 20) - requested / (num / 20))
        if offset:
            minutes_left = round(((time() - start_time) / offset) * (num - offset) / 60, 2)
            print("Extracting songs: [{}] {}/{} songs completed. Approx time left: {} minutes.".format(bar, requested, num, minutes_left), end="\r", flush=True)
        else:
            # first batch: nothing completed yet to extrapolate a time estimate from
            print("Extracting songs: [{}] {}/{} songs completed. Approx time left: {}".format(bar, requested, num, "-"), end="\r", flush=True)

        sleep(randint(1, 3))

    # report the rows actually collected, which can differ from the number requested
    songs_extracted = len(records)

    print("")
    print("EXTRACTION COMPLETE. Songs extracted: {}; Elapsed Time: {} minutes.".format(songs_extracted, round((time()-start_time)/60, 2)))
    return pd.DataFrame(records, columns=columns)
    

## On songs extracted, conduct audio analysis (e.g. duration, loudness, tempo etc.), stored in dataframe songs_analysis.

approx. 3mins per 10 songs

In [4]:
def audio_analysis(songs_info, num, max_retries=10):
    """
    Function to conduct audio analysis on extracted songs.
    Calls Spotify's audio-analysis endpoint once per song, pausing 1-2 secs between songs,
    and uses the module-level `headers` dict for authorization.
    Args:
        songs_info: dataframe containing songs' basic info (must have an "id" column)
        num: number of songs; only the first num rows are analyzed, and fewer when
             songs_info holds fewer than num rows
        max_retries: how many times to re-request a song whose response has no "track" entry
    Returns:
        dataframe with one row per analyzed song, in the same order as songs_info
    Raises:
        RuntimeError: if a song's response still has no "track" entry after max_retries retries
    """
    columns = ["duration", "loudness", "tempo", "tempo_confidence", "time_signature",
               "time_signature_confidence", "key", "key_confidence", "mode", "mode_confidence"]

    # positional slice: independent of the dataframe's index labels and never runs past the last row
    track_ids = songs_info["id"].iloc[:num]
    total = len(track_ids)

    # collect plain records and build the dataframe once at the end
    # (row-by-row DataFrame.append is quadratic and no longer exists in pandas 2.x)
    records = []

    # set start_time to track progress
    start_time = time()

    # from Spotify API, get songs' audio analysis with rate limit of 1-2 secs between songs
    # maximum songs per year is 10k
    for i, track_id in enumerate(track_ids):
        url = "https://api.spotify.com/v1/audio-analysis/{}".format(track_id)
        payload = get(url, headers=headers).json()

        # to ensure response contains track details, which is not the case sometimes;
        # retry with a pause and give up eventually instead of hammering the API forever
        attempts = 0
        while "track" not in payload:
            if attempts >= max_retries:
                raise RuntimeError("No track details returned for song id {} after {} retries.".format(track_id, max_retries))
            attempts += 1
            sleep(randint(1, 2))
            payload = get(url, headers=headers).json()

        track = payload["track"]
        records.append({column: track[column] for column in columns})

        # track progress on a 20-character bar
        done = i + 1
        bar = "#" * round(done / (total / 20)) + "-" * round(total / (total / 20) - done / (total / 20))
        if i:
            minutes_left = round(((time() - start_time) / i) * (total - i) / 60, 2)
            print("Conducting audio analysis: [{}] {}/{} songs completed. Approx time left: {} minutes.".format(bar, done, total, minutes_left), end="\r", flush=True)
        else:
            # first song: nothing completed yet to extrapolate a time estimate from
            print("Conducting audio analysis: [{}] {}/{} songs completed. Approx time left: {}".format(bar, done, total, "-"), end="\r", flush=True)

        sleep(randint(1, 2))

    songs_analyzed = len(records)

    print("")
    print("AUDIO ANALYSIS COMPLETE. Songs analyzed: {}; Elapsed Time: {} minutes.".format(songs_analyzed, round((time()-start_time)/60, 2)))
    return pd.DataFrame(records, columns=columns)

## On songs extracted, conduct audio feature analysis (e.g. acousticness, danceability, energy etc.), stored in dataframe songs_features.

approx. 15secs per 10 songs

In [5]:
def features_analysis(songs_info, num):
    """
    Function to conduct features analysis on extracted songs.
    Calls Spotify's audio-features endpoint once per song, pausing 1-2 secs between songs,
    and uses the module-level `headers` dict for authorization.
    Args:
        songs_info: dataframe containing songs' basic info (must have an "id" column)
        num: number of songs; only the first num rows are analyzed, and fewer when
             songs_info holds fewer than num rows
    Returns:
        dataframe with one row per analyzed song, in the same order as songs_info
    Raises:
        KeyError: if a response lacks one of the expected feature fields
    """
    columns = ["acousticness", "danceability", "energy", "instrumentalness", "liveness",
               "speechiness", "valence"]

    # positional slice: independent of the dataframe's index labels and never runs past the last row
    track_ids = songs_info["id"].iloc[:num]
    total = len(track_ids)

    # collect plain records and build the dataframe of audio features once at the end
    # (row-by-row DataFrame.append is quadratic and no longer exists in pandas 2.x)
    records = []

    # set start_time to track progress
    start_time = time()

    # get songs' audio features with rate limit of 1-2 secs between songs
    # maximum songs per year is 10k
    for i, track_id in enumerate(track_ids):
        url = "https://api.spotify.com/v1/audio-features/{}".format(track_id)
        features = get(url, headers=headers).json()
        records.append({column: features[column] for column in columns})

        # track progress on a 20-character bar
        done = i + 1
        bar = "#" * round(done / (total / 20)) + "-" * round(total / (total / 20) - done / (total / 20))
        if i:
            minutes_left = round(((time() - start_time) / i) * (total - i) / 60, 2)
            print("Conducting features analysis: [{}] {}/{} songs completed. Approx time left: {} minutes.".format(bar, done, total, minutes_left), end="\r", flush=True)
        else:
            # first song: nothing completed yet to extrapolate a time estimate from
            print("Conducting features analysis: [{}] {}/{} songs completed. Approx time left: {}".format(bar, done, total, "-"), end="\r", flush=True)

        sleep(randint(1, 2))

    songs_analyzed = len(records)

    print("")
    print("FEATURES ANALYSIS COMPLETE. Songs analyzed: {}; Elapsed Time: {} minutes.".format(songs_analyzed, round((time()-start_time)/60, 2)))
    return pd.DataFrame(records, columns=columns)

## Label each song as a hit (popularity of at least 70) or a miss, then, if hits are the minority, randomly drop surplus misses so that proportions of hit and miss are approx. 50/50 in dataframe songs.

In [6]:
def label_songs(songs, cutoff=70, random_state=None):
    """
    Function to label songs with hit/miss.
    A song is a "hit" when its popularity is at least `cutoff`, otherwise a "miss". When hits
    make up less than half of the songs, randomly chosen misses are dropped until both classes
    have the same size. The input dataframe is left unmodified; a labelled copy is returned.
    Args:
        songs: dataframe containing all songs' info (must have a "popularity" column)
        cutoff: minimum popularity for a song to count as a hit
        random_state: optional seed passed to DataFrame.sample to make the down-sampling reproducible
    Returns:
        dataframe with an added "hit_miss" column; surviving rows keep their original index labels
    """
    # vectorized labelling works for any index, unlike positional lookups by label
    is_hit = songs["popularity"] >= cutoff
    labelled = songs.assign(hit_miss=np.where(is_hit, "hit", "miss"))

    total = len(labelled)
    hit_count = int(is_hit.sum())

    if hit_count < total / 2:
        # dropping (misses - hits) rows leaves exactly as many misses as hits
        drop_count = total - 2 * hit_count
        surplus = labelled[labelled["hit_miss"] == "miss"].sample(n=drop_count, random_state=random_state)
        labelled = labelled.drop(surplus.index)

    return labelled

## Link up above processes through a single function for easier management.

In [7]:
def get_data(year, num):
    """
    Function that initiates entire process of extracting specified number of song data from specified year.
    Extracts the songs, runs audio analysis and features analysis on them, labels them hit/miss
    and saves the combined dataframe to ./song_features/<year>_<num>_song_features.csv.
    Any exception is caught and printed so that a failure for one year does not stop a loop over years.
    Args:
        year: year to extract songs from
        num: number of songs to extract
    """
    output_dir = "./song_features"
    try:
        # create the output folder up front so the final save cannot fail on a missing
        # directory after the long-running scrape has already finished
        os.makedirs(output_dir, exist_ok=True)
        # obtain data; keep at most num songs (extraction works in whole batches of 50)
        songs_info = extract_songs(year, num).head(num)
        # analyze exactly the songs that were extracted, which may be fewer than requested,
        # so that all three dataframes line up row for row
        songs_available = len(songs_info)
        songs_analysis = audio_analysis(songs_info, songs_available)
        songs_features = features_analysis(songs_info, songs_available)
        # combine all 3 data frames to obtain songs, a dataframe which contains metadata for songs released in specified year.
        songs = pd.concat([songs_info, songs_analysis, songs_features], axis = 1)
        # label songs
        songs = label_songs(songs)
        # save results to csv file
        songs.to_csv("{}/{}_{}_song_features.csv".format(output_dir, year, num), index=False)
    except Exception as ex:
        print("Error for scraping from year {} with num {} and exception: {}".format(year, num, ex))

## Where it all begins

In [None]:
# scrape 2000 songs for every release year from 2000 up to and including 2018
# (the range end, 2019, is exclusive); one csv file is written per year
for year in range(2000, 2019):
    get_data(year=year, num=2000)

Extracting songs: [####################] 2000/2000 songs completed. Approx time left: 0.04 minutes.
EXTRACTION COMPLETE. Songs extracted: 2000; Elapsed Time: 1.48 minutes.
Conducting audio analysis: [--------------------] 4/2000 songs completed. Approx time left: 297.38 minutes.