In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
import pandas as pd
import requests
import numpy as np
import os
from collections import OrderedDict
import main
sp = main.sp

Code Source: https://github.com/RosebudAnwuri/TheArtandScienceofData/blob/master/The%20Making%20of%20Great%20Music/scripts/music_sentiment.py

Spotify endpoint whose response is returned (audio features):
https://developer.spotify.com/documentation/web-api/reference/#endpoint-get-audio-features

In [2]:
# Number of fields expected in one audio-features record; also the length of
# the all-zero placeholder returned when no usable record is available.
AUDIO_FEATURE_COUNT = 18


def get_spotify_features(track, artist):
    """Look up a song on Spotify and return its audio-feature values.

    The search uses the track title plus only the FIRST word of the artist
    name followed by a ``*`` wildcard (presumably so that multi-artist or
    "featuring" credits still match -- confirm with the chart data).

    Parameters
    ----------
    track : str
        Song title used in the ``track:`` part of the search query.
    artist : str
        Artist credit; only the text before the first space is used.

    Returns
    -------
    list
        The values of the first matching track's audio-features record, in
        the order the client returned them. A list of ``AUDIO_FEATURE_COUNT``
        zeros is returned when the search finds nothing, when no record comes
        back for the track, or when the record has fewer than
        ``AUDIO_FEATURE_COUNT`` fields.

    Notes
    -----
    Relies on the notebook-level client ``sp`` for both API calls.
    """
    placeholder = [0] * AUDIO_FEATURE_COUNT

    # Search for the Spotify song ID
    artist_prefix = artist.split(' ', 1)[0]
    query = 'track:' + track + ' ' + 'artist:' + artist_prefix + '*'
    items = sp.search(q=query, type='track')['tracks']['items']
    if not items:
        return placeholder

    # Use the ID of the best (first) hit to fetch the song's features
    song_id = str(items[0]["id"])
    track_features = sp.audio_features(song_id)

    # Guard against an empty response or a missing (None) first entry before
    # measuring it; len(None) would otherwise raise TypeError.
    record = track_features[0] if track_features else None
    if not record or len(record) < AUDIO_FEATURE_COUNT:
        return placeholder

    # Dicts preserve insertion order, so the values come out in response order.
    return list(record.values())

In [3]:
# Smoke test: show the first feature value returned for a known song.
[*get_spotify_features('peaches', 'justin')][0]

0.677

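Helper that turns one parsed Billboard Hot 100 page into a list of rows (chart fields plus Spotify audio features). The scraping loop below calls it for the chart on the first of every month since the year 2000.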

In [4]:
def append_song_stats(year, soup=None):
    """Build one row per chart entry from a parsed Billboard chart page.

    Each row is ``[year + '-01', rank, artist, song_name, peak, woc]``
    followed by the values returned by ``get_spotify_features``. When that
    lookup raises, the row is padded with 18 ``'N/A'`` cells instead so that
    every row keeps the same width.

    Parameters
    ----------
    year : str
        Chart period label; ``'-01'`` is appended to it for the first column
        (the scraping loop passes a ``'YYYY-MM'`` string).
    soup : optional
        Parsed chart page. When omitted, the notebook-level ``song_soup`` is
        used, which keeps existing ``append_song_stats(year)`` calls working.

    Returns
    -------
    list of list
        At most 100 rows, one per chart entry found on the page.
    """
    max_entries = 100        # the chart lists 100 songs
    n_feature_columns = 18   # width of the audio-feature part of a row

    if soup is None:
        soup = song_soup  # fall back to the global set by the scraping loop

    # The chart markup is read from the third top-level node of the document.
    chart = list(soup)[2]

    def _texts(css_class):
        # Text of every <span> carrying exactly this class attribute.
        return [tag.text for tag in chart.find_all("span", {'class': css_class})]

    # Scan the page once per column instead of once per column per song.
    ranks = _texts('chart-element__rank__number')
    artists = _texts('chart-element__information__artist text--truncate color--secondary')
    song_names = _texts('chart-element__information__song text--truncate color--primary')
    peaks = _texts('chart-element__meta text--center color--secondary text--peak')
    weeks_on_chart = _texts('chart-element__meta text--center color--secondary text--week')

    # zip() stops at the shortest column, so a short or partially parsed page
    # yields fewer rows rather than an IndexError.
    entries = list(zip(ranks, artists, song_names, peaks, weeks_on_chart))[:max_entries]

    song_list = []
    for rank, artist, song_name, peak, woc in entries:
        song = [year + '-01', rank, artist, song_name, peak, woc]
        # Some songs are listed differently on Billboard vs Spotify; it's hard
        # to account for those, so a failed lookup is replaced by placeholders.
        try:
            song_features = list(get_spotify_features(song_name, artist))
        except Exception:
            song_features = ['N/A'] * n_feature_columns
        song.extend(song_features)
        song_list.append(song)
    return song_list

In [5]:
# Column names for every worksheet: six chart columns followed by eighteen
# audio-feature columns (assumed to match the order in which
# get_spotify_features returns its values -- confirm against the API response).
# NOTE(review): 'instrumentalnes' is misspelled; it is kept unchanged because it
# names a column in the output workbook that later analysis may rely on.
headers = ['year', 'rank',
           'artist',
           'song_name',
           'peak',
           'woc',
           'danceability',
           'energy',
           'key',
           'loudness',
           'mode',
           'speechiness',
           'acousticness',
           'instrumentalnes',
           'liveness',
           'valence',
           'tempo',
           'type',
           'id',
           'uri',
           'track_href',
           'analysis_url',
           'duration_ms',
           'time_signature'
           ]

# Write the data to a single Excel workbook, one sheet per monthly chart.
# The context manager saves and closes the workbook on exit (also when a
# request fails part-way through); ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter('billboard_charts.xlsx', engine='xlsxwriter') as writer:
    # Only using data as far back as 2000
    for year in range(2000, 2020):
        for month in range(1, 13):
            year_month = str(year) + '-' + str(month).zfill(2)
            url = "https://www.billboard.com/charts/hot-100/{}-01".format(year_month)
            # A timeout keeps one stalled request from hanging the whole run;
            # raise_for_status surfaces HTTP errors instead of parsing an error page.
            page = requests.get(url, timeout=30)
            page.raise_for_status()
            # Must stay a notebook-level name: append_song_stats reads song_soup.
            song_soup = BeautifulSoup(page.content, 'html.parser')

            song_list = append_song_stats(year_month)
            print(year_month + '-01' + ' successfully processed')
            stats = pd.DataFrame(song_list, columns=headers)
            stats.to_excel(writer, sheet_name='chart_data_{}-01'.format(year_month), index=False)

# Reported only after the workbook has actually been written to disk.
print('Scraping Completed!')

2000-01-01 successfully processed
2000-02-01 successfully processed
2000-03-01 successfully processed
2000-04-01 successfully processed
2000-05-01 successfully processed
2000-06-01 successfully processed
2000-07-01 successfully processed
2000-08-01 successfully processed
2000-09-01 successfully processed
2000-10-01 successfully processed
2000-11-01 successfully processed
2000-12-01 successfully processed
2001-01-01 successfully processed
2001-02-01 successfully processed
2001-03-01 successfully processed
2001-04-01 successfully processed
2001-05-01 successfully processed
2001-06-01 successfully processed
2001-07-01 successfully processed
2001-08-01 successfully processed
2001-09-01 successfully processed
2001-10-01 successfully processed
2001-11-01 successfully processed
2001-12-01 successfully processed
2002-01-01 successfully processed
2002-02-01 successfully processed
2002-03-01 successfully processed
2002-04-01 successfully processed
2002-05-01 successfully processed
2002-06-01 suc