In [None]:
import pandas as pd
import numpy as np
import os, json, logging
from requests import get, post
from time import time, sleep
from random import randint
from bs4 import BeautifulSoup
from datetime import date, timedelta

# prepares to call Spotify APIs
#
# Credentials come from the environment so the client secret never lives in the notebook.
# Set SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET before starting the kernel; a missing
# variable raises KeyError naming the variable.
# NOTE(review): the client id/secret pair that used to be embedded in this cell has been
# exposed to anyone with access to the notebook history and should be rotated.
client_id = os.environ["SPOTIFY_CLIENT_ID"]
client_secret = os.environ["SPOTIFY_CLIENT_SECRET"]

# requests turns the (id, secret) tuple into the `Authorization: Basic ...` header
res = post('https://accounts.spotify.com/api/token', auth=(client_id, client_secret),
           data={'grant_type': 'client_credentials'})
res.raise_for_status()  # surface the HTTP error instead of a KeyError on 'access_token' below
token = 'Bearer {}'.format(res.json()['access_token'])

headers = {'Authorization': token, "Accept": 'application/json', 'Content-Type': "application/json"}

# "songs' info and features" refers to duration, loudness, tempo etc.
# "billboard records" refers to weeks_on_chart, peak_position etc.

# spotify_only .csv file contains songs' info and features, without billboard records
# billboard_only .csv file contains billboard unique hits, without their info and features
#
# NOTE(review): later cells read `billboard_raw` and `spotify_raw`, which are not assigned in
# any cell visible here — confirm whether they are meant to be these two frames.

spotify_only = pd.read_csv(".csv")
billboard_only = pd.read_csv(".csv")

In [None]:
# code in this cell gets billboard hits' info and features
#
# For every row of `billboard_raw` it records id/popularity/release_date, labels the song as a
# Spotify hit (1) or miss (0) by popularity, and queries the audio-analysis and audio-features
# endpoints. Four frames with one row per song are produced: songs_info, songs_label,
# songs_analysis and songs_features. Songs whose analysis/features cannot be fetched get a row
# of real NaN values (np.nan, not the string "NaN") so the numeric columns stay numeric.
#
# NOTE(review): `billboard_raw` is not assigned in any cell visible here (the loading cell
# creates `billboard_only`) — confirm which frame this loop should read.

HIT_POPULARITY_THRESHOLD = 70  # popularity at or above this gets hitmiss_spotify = 1
MAX_ANALYSIS_ATTEMPTS = 5      # cap on audio-analysis requests per song

INFO_COLUMNS = ["id", "popularity", "release_date"]
ANALYSIS_COLUMNS = ["duration", "loudness", "tempo", "tempo_confidence", "time_signature",
                    "time_signature_confidence", "key", "key_confidence", "mode", "mode_confidence"]
FEATURE_COLUMNS = ["acousticness", "danceability", "energy", "instrumentalness", "liveness",
                   "speechiness", "valence"]

# Rows are collected in plain lists and converted to frames once after the loop:
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
info_rows, label_rows, analysis_rows, feature_rows = [], [], [], []

start_time = time()
songs_analyzed = 0

for i in range(len(billboard_raw["id"])):
    song_id = billboard_raw["id"][i]
    popularity = billboard_raw["popularity"][i]

    info_rows.append({"id": song_id, "popularity": popularity,
                      "release_date": billboard_raw["release_date"][i]})
    label_rows.append({"hitmiss_spotify": 1 if popularity >= HIT_POPULARITY_THRESHOLD else 0})

    #------------------------#
    url = "https://api.spotify.com/v1/audio-analysis/{}".format(song_id)

    # The response sometimes lacks the "track" section, so the request is retried — but only a
    # bounded number of times and with a growing pause, so one bad id cannot hang the cell.
    track = None
    for attempt in range(MAX_ANALYSIS_ATTEMPTS):
        if attempt:
            sleep(attempt)  # linear back-off before each retry
        r = get(url, headers=headers)
        try:
            payload = r.json()
        except ValueError:  # body was not valid JSON
            payload = None
        if isinstance(payload, dict) and "track" in payload:
            track = payload["track"]
            break
        if r.status_code == 404:
            break  # retrying a "not found" answer cannot succeed

    try:
        analysis_rows.append({col: track[col] for col in ANALYSIS_COLUMNS})
    except (KeyError, TypeError):  # a field is missing, or no track section was obtained
        analysis_rows.append(dict.fromkeys(ANALYSIS_COLUMNS, np.nan))

    #-------------------------#
    url = "https://api.spotify.com/v1/audio-features/{}".format(song_id)
    r = get(url, headers=headers)

    try:
        features = r.json()
        feature_rows.append({col: features[col] for col in FEATURE_COLUMNS})
    except (KeyError, TypeError, ValueError):  # error payload, null body, or invalid JSON
        feature_rows.append(dict.fromkeys(FEATURE_COLUMNS, np.nan))

    sleep(randint(1,2))  # pause between songs to stay polite to the API
    songs_analyzed = i + 1
    elapsed_time = time() - start_time
    print("Songs analyzed: {}; Elapsed Time: {}".format(songs_analyzed, elapsed_time), end = "\r", flush = True)

# `columns=` keeps the column order (and the columns themselves when no song was processed)
songs_info = pd.DataFrame(info_rows, columns=INFO_COLUMNS)
songs_label = pd.DataFrame(label_rows, columns=["hitmiss_spotify"])
songs_analysis = pd.DataFrame(analysis_rows, columns=ANALYSIS_COLUMNS)
songs_features = pd.DataFrame(feature_rows, columns=FEATURE_COLUMNS)

print("ANALYSIS COMPLETE. Songs analyzed: {}; Elapsed Time: {}.".format(songs_analyzed, time()-start_time))

In [None]:
# Combines the info and features of 1) the spotify database and 2) the billboard hits.
# No billboard records (weeks_on_chart, peak_position etc.) are included at this stage.
#
# NOTE(review): `spotify_raw` is not assigned in any cell visible here — confirm its origin.

# the four per-song frames share the same row order, so they are joined column-wise
billboard_frames = [songs_info, songs_label, songs_analysis, songs_features]
song_info_for_billboard = pd.concat(billboard_frames, axis = 1, sort = False)

# stack spotify rows first so that, on a duplicated id, the spotify row is the one kept
no_billboard_records = (
    pd.concat([spotify_raw, song_info_for_billboard], axis = 0, sort = False)
    .drop_duplicates("id", keep = "first")
    .reset_index(drop = True)
)

In [None]:
# for each song in no_billboard_records, check if it has any billboard history i.e. records in billboard_only
# if yes, per its index in no_billboard_records, put its billboard records in a new dataframe
#
# Result: `billboard_records`, one row per row of no_billboard_records (index 0..n-1), with
# columns chart_scraped, peak_position, weeks_on_chart and hitmiss_billboard (1 = found in
# billboard_only, 0 = not found). Songs without billboard history get real NaN in the three
# record columns rather than the string "NaN"; pandas may upcast integer record columns to
# float to hold those NaN values.
#
# The lookup is a single index alignment instead of comparing every song against every
# billboard row, and it also yields hitmiss_billboard = 0 when billboard_only is empty.

record_columns = ["chart_scraped", "peak_position", "weeks_on_chart"]

# one row per id, keeping the first occurrence (the row a top-to-bottom scan would hit first);
# rows with a missing id are dropped because a missing id can never equal a song's id
billboard_lookup = (
    billboard_only
    .dropna(subset = ["id"])
    .drop_duplicates("id", keep = "first")
    .set_index("id")[record_columns]
)

song_ids = no_billboard_records["id"]

# reindex pulls the matching billboard row for each song id and leaves NaN where none exists
billboard_records = billboard_lookup.reindex(song_ids).reset_index(drop = True)
billboard_records["hitmiss_billboard"] = song_ids.isin(billboard_lookup.index).astype(int).to_numpy()

In [None]:
# Joins every song (info and features) with its billboard records, column-wise, and saves it.
# Both frames are expected to share the same 0..n-1 row index so the rows line up.
#
# NOTE(review): the output path below is the same literal as the input paths used when
# loading the source CSVs — confirm the intended file names so no input gets overwritten.

frames_to_join = [no_billboard_records, billboard_records]
final = pd.concat(frames_to_join, axis = 1, sort = False)

final.to_csv(".csv", index = False)