# BILLBOARD HOT 100 SCRAPER

## - Input & Output
__Input__  = __Latest Date and Earliest Date__, each as a list of [YYYY, MM, DD] <br>
(note: enter single-digit months and days without a leading zero, e.g. 4 not 04; to scrape a single chart, set Latest Date = Earliest Date)

__Output__ = __Dataframe of unique songs' details__, scraped from Billboard Hot 100 charts, for Latest Date to Earliest Date inclusive

e.g. <br> 
Input = Latest Date [2018, 12, 29] and Earliest Date [2018, 1, 1] <br>
Output = Dataframe of unique songs' details from 52 x Billboard Hot 100 charts, for 06 Jan 2018 to 29 Dec 2018 inclusive <br>

## - How to Use
1) Run all cells except the last cell <br>
2) In the last cell, set only the YYYY, MM and DD values of `latest` (Latest Date) and `earliest` (Earliest Date) <br>
3) Run the last cell

## - Flaws (further development?)

(Input) no validation of the date input, e.g. entering Apr instead of 4 is not caught <br>
("Back-from-the-Dead") does not account for songs that had a Billboard streak, dropped off the chart and later reappeared

In [None]:
# rally library army
from datetime import date, timedelta
from bs4 import BeautifulSoup
from time import time, sleep
from random import randint
import requests
import pandas as pd


# Billboard Hot 100 charts are weekly and dated on Saturdays; Sunday through Friday count towards the UPCOMING Saturday's chart
# which_saturday() takes a date and returns the Saturday of the chart week it belongs to (per this Billboard logic)
# it is called by scrape_hot100()

def which_saturday(date):
    """Return the Saturday of the Billboard chart week that ``date`` falls in.

    Sunday through Friday are moved forward to the upcoming Saturday; a date
    that is already a Saturday maps to that same Saturday.
    """
    # isoweekday(): Monday = 1, ..., Saturday = 6, Sunday = 7.
    # Taking (6 - weekday) modulo 7 gives 5..1 days ahead for Monday..Friday,
    # 0 for Saturday and 6 for Sunday.
    days_ahead = (6 - date.isoweekday()) % 7
    return date + timedelta(days=days_ahead)

In [None]:
# scrape_hot100() takes Latest Date and Earliest Date as parameters, and returns dataframe of unique songs' details

def scrape_hot100(latest_date, earliest_date):
    """Scrape weekly Billboard Hot 100 charts and return one row per unique song.

    Both dates are first converted to their chart Saturday with
    which_saturday(). Charts are then requested one week at a time, starting
    at the latest chart and stepping back 7 days until the earliest chart has
    been scraped (inclusive).

    Parameters
    ----------
    latest_date : datetime.date
        Any date inside the most recent chart week to scrape.
    earliest_date : datetime.date
        Any date inside the oldest chart week to scrape.

    Returns
    -------
    pandas.DataFrame
        Columns "name", "chart_scraped", "artist", "peak_position" and
        "weeks_on_chart". "name" has spaces replaced by "%20". Rows are
        de-duplicated on "name", keeping the row from the latest chart the
        song appeared on. Empty if no chart items were found.

    Raises
    ------
    requests.exceptions.RequestException
        If a chart page cannot be fetched (including a request timeout).
    """
    columns = ["name", "chart_scraped", "artist", "peak_position", "weeks_on_chart"]

    # ----- convert Latest Date and Earliest Date to UPCOMING Saturdays ------ #
    latest_date = which_saturday(latest_date)
    earliest_date = which_saturday(earliest_date)

    # total number of weekly charts in the range; constant for the whole run
    charts_total = int((latest_date - earliest_date).days / 7 + 1)

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end: DataFrame.append copied the whole frame on every call and no
    # longer exists in pandas >= 2.0.
    rows = []
    scrape_date = latest_date           # start scraping from Latest Date
    start_time = time()                 # to report elapsed scrape time

    while scrape_date >= earliest_date:        # walk backwards, one chart per week

        # chart urls use zero-padded month and day, i.e. YYYY-MM-DD
        url = "https://www.billboard.com/charts/hot-100/{}-{:02d}-{:02d}".format(
            scrape_date.year, scrape_date.month, scrape_date.day)

        # a timeout stops one stalled connection from hanging the whole scrape
        r = requests.get(url, timeout=30)
        r_soup = BeautifulSoup(r.text, "html.parser")
        song_containers = r_soup.find_all("div", attrs={"class": "chart-list-item"})

        for container in song_containers:

            name = container.find("span", class_="chart-list-item__title-text").text.strip()
            hex_name = name.replace(" ", "%20")  # prepare Spotify API-friendly name
            artist = container.find("div", class_="chart-list-item__artist").text.strip()

            # find() returns None when an element is missing, so .text raises
            # AttributeError; such songs get "NaN" for both fields.
            # NOTE(review): peak position is read from the "weeks-at-one"
            # element, as in the original code -- confirm against the page markup.
            try:
                peak_position = container.find("div", class_="chart-list-item__weeks-at-one").text.strip()
                weeks_on_chart = container.find("div", class_="chart-list-item__weeks-on-chart").text.strip()
            except AttributeError:
                peak_position = "NaN"
                weeks_on_chart = "NaN"

            rows.append({"name": hex_name, "chart_scraped": scrape_date, "artist": artist,
                         "peak_position": peak_position, "weeks_on_chart": weeks_on_chart})

        # print progress once per chart rather than once per song
        charts_scrapped = int((latest_date - scrape_date).days / 7 + 1)
        print("Busy scraping. Scraped/Total: {}/{}. Elapsed Time: {} mins"
              .format(charts_scrapped, charts_total, round((time() - start_time) / 60, 2)),
              end="\r", flush=True)

        sleep(randint(1, 2))                          # sleep between charts
        scrape_date = scrape_date - timedelta(days=7) # prepare to scrape the previous Saturday

    # -------- latest chart's items on top, then keep only the first (latest)
    # row per song name, since the latest data should be the most relevant
    # e.g. peak position and weeks on chart --------- #
    bb_scrape = pd.DataFrame(rows, columns=columns)
    # mergesort is stable, so songs keep their chart order within each chart date
    bb_scrape = bb_scrape.sort_values("chart_scraped", ascending=False, kind="mergesort")
    bb_scrape = bb_scrape.drop_duplicates("name", keep="first")
    bb_scrape = bb_scrape.reset_index(drop=True)      # index labels were shuffled by the sort

    return bb_scrape

In [None]:
# ---- INPUT: edit only the two lists below, each as [YYYY, MM, DD] ---- #
# (a single-digit month or day is entered as a single digit)
latest = [2018, 12, 29]
earliest = [2018, 1, 1]

# ---- nothing below needs changing: run this last cell after setting the input ---- #
scrape_hot100(
    date(year=latest[0], month=latest[1], day=latest[2]),
    date(year=earliest[0], month=earliest[1], day=earliest[2]),
)