# Billboard Functions

In [1]:
## Basic stuff
%load_ext autoreload
%autoreload

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))
#IPython.Cell.options_default.cm_config.lineNumbers = true;

################################################################################
## Python Version
################################################################################
import sys


################################################################################
## General Stuff
################################################################################
from multiprocessing import Pool
from tqdm import tqdm


################################################################################
## Util Stuff
################################################################################
from timeUtils import clock, elapsed
from ioUtils import saveFile, getFile


################################################################################
## Music DB
################################################################################
from mainDB import mainDB
from musicDBMap import musicDBMap
from masterDBMatchClass import masterDBMatchClass
from matchDBArtist import matchDBArtist


################################################################################
## Music Names
################################################################################
from masterArtistNameDB import masterArtistNameDB


################################################################################
## Chart Stuff
################################################################################
from artistIgnores import getArtistIgnores
from billboardData import billboardData
from top40Data import top40Data


################################################################################
## Pandas Stuff
################################################################################
import pandas as pd
# Raise pandas' display limits (rows, columns, total width).
for optionName, optionValue in (('display.max_rows', 500),
                                ('display.max_columns', 500),
                                ('display.width', 1000)):
    pd.set_option(optionName, optionValue)

print("Python: {0}".format(sys.version))
import datetime as dt
# Remember when this run started and echo it into the cell output.
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))



Python: 3.7.7 (default, Mar 26 2020, 10:32:53) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
Notebook Last Run Initiated: 2020-12-10 21:16:40.562526


# Final Aggregation

In [4]:
%load_ext autoreload
%autoreload
from billboardData import billboardData
bd = billboardData(minYear=1, maxYear=2021)
bd.setChartUsage(rank=[0,1,2])
bd.setFullChartData()
bd.setArtistAlbumData()

bd.saveArtistAlbumData()
bd.saveFullChartData()

_, _ = clock("Last Run")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Found 62 files.
  Getting Chart For hot
  Using 16 Charts
  Getting Chart For adult
  Using 4 Charts
  Getting Chart For top
  Using 3 Charts
  Using 23 Charts
  Getting Chart For alternative
  Using 4 Charts
  Getting Chart For countryMusic
  Using 10 Charts
  Getting Chart For rock
  Using 14 Charts
  Getting Chart For rnb
  Using 16 Charts
  Using 44 Charts
  Getting Chart For christian
  Using 21 Charts
  Getting Chart For canadian
  Using 4 Charts
  Getting Chart For comedy
  Using 2 Charts
  Getting Chart For general
  Using 2 Charts
  Getting Chart For twitter
  Using 5 Charts
  Using 34 Charts
  Using Charts (None): ['hot-100', 'pop-songs', 'radio-songs', 'streaming-songs', 'rhythmic-40', 'heatseekers-albums', 'billboard-200', 'artist-100', 'top-album-sales', 'TLN', 'HSB', 'HSI', 'TLP', 'TSL', 'TFM', 'ATS', 'adult-contemporary', 'adult-pop-songs', 'ASI', 'ATF', 'billboard-200', 'artist-100',

****
****
****
****

# Starter Class Section

In [2]:
basedir = "/Volumes/Piggy/Charts/"
# Locate the saved starter page.  Fail immediately with a clear error instead of
# printing a message and then crashing on an undefined (or stale) `filename`.
starterFiles = glob(join(basedir, "data", "billboard", "starter.html"))
if not starterFiles:
    raise FileNotFoundError("Could not find starter HTML file!")
filename = starterFiles[0]
fdata = getHTML(filename)

#### Extra

In [None]:
# Collect every chart link found on the starter page:
#   chartData[link text] = [base URL, sub-directory, basename of the link]
chartData  = {}
dirname = None
baseURL = "https://www.billboard.com"
for ul in fdata.findAll("ul"):
    for li in ul.findAll("li", {"class": "chart-group__item"}):
        a = li.find('a')
        if a is None:
            # List item without a link: nothing to record.
            continue
        href = a['href']
        text = a.text
        # Drop the first character of the directory part
        # (presumably the leading "/" of the link — TODO confirm).
        subdir = getDirname(href)[1:]
        chartData[text] = [baseURL, subdir, getBasename(href)]

In [None]:
#chartData

# Download Yearly Information

In [None]:
from time import sleep
import urllib.request

# Years whose archive index page is fetched.  The full range is kept for
# reference; the second assignment narrows the run to a single year.
years = [str(x) for x in range(1958,2014)]
years = ['2019']

# Request settings and output directory do not change between years.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers={'User-Agent':user_agent,}
savedir = join(basedir, "data", "billboard", "yearly")

for year in years:
    url="https://www.billboard.com/archive/charts/{0}".format(year)
    savename = join(savedir, "{0}.p".format(year))
    if isFile(savename):
        # Already saved by an earlier run.
        continue

    request=urllib.request.Request(url,None,headers)
    # The context manager closes the HTTP response as soon as the body is read.
    with urllib.request.urlopen(request) as response:
        data = response.read()

    print("Saving {0}".format(savename))
    saveJoblib(data=data, filename=savename, compress=True)
    # Pause one second between downloads.
    sleep(1)

# Parse Yearly Information

In [None]:
# Rebuild the per-year link table from the saved yearly index pages:
#   downloads[year][href] = [base URL, sub-directory, basename of the link]
downloads = {}
baseURL = "https://www.billboard.com"
yearlyFiles = sorted(glob(join(basedir, "data", "billboard", "yearly", "*.p")))
for ifile in yearlyFiles:
    # The yearly files are named after their year, so the base name is the key.
    year = getBaseFilename(ifile)
    yearDownloads = downloads.setdefault(year, {})
    fdata = getHTML(ifile)
    for ul in fdata.findAll("ul"):
        for li in ul.findAll("li", {"class": "chart-group__item"}):
            a = li.find('a')
            if a is None:
                continue
            href = a['href']
            text = a.text
            subdir = getDirname(href)[1:]
            yearDownloads[href] = [baseURL, subdir, getBasename(href)]

In [None]:
# Display the keys of `downloads` (one entry per parsed yearly file).
downloads.keys()

# Download Category Data

In [None]:
from time import sleep
import urllib.request

baseURL = "https://www.billboard.com"

# Request settings and output directory do not change between downloads.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers={'User-Agent':user_agent,}
savedir  = join(basedir, "data", "billboard", "categories")

for year, yearData in downloads.items():
    for href, hrefData in yearData.items():
        # Strip any leading "/" from the link so the URL has exactly one separator.
        url  = "{0}/{1}".format(baseURL, href.lstrip("/"))
        # Year as encoded in the link's directory; kept in its own name so the
        # outer loop variable `year` is not overwritten.
        chartYear = getBasename(hrefData[1])
        category = hrefData[2]

        savename = join(savedir, "{0}-{1}.p".format(chartYear, category))

        if isFile(savename):
            # Already saved by an earlier run.
            continue

        print("  Trying to download and save {0}".format(savename))
        try:
            request=urllib.request.Request(url,None,headers)
            # The context manager closes the HTTP response once the body is read.
            with urllib.request.urlopen(request) as response:
                data = response.read()
        except Exception as err:
            # Best-effort download: report the failure and move on.  Unlike a bare
            # `except`, this does not swallow KeyboardInterrupt.
            print("  Could not download {0}: {1}".format(url, err))
            continue

        print("Saving {0}".format(savename))
        saveJoblib(data=data, filename=savename, compress=True)
        # Pause one second between downloads.
        sleep(1)

# Parse Category Data

In [None]:
data = {}

baseURL = "https://www.billboard.com"
# List the saved category files once and derive both the category set and the
# year list from that single snapshot (the listing used to be computed twice).
names = [getBaseFilename(x) for x in sorted(glob(join(basedir, "data", "billboard", "categories", "*.p")))]
# Base names are sliced as <4 characters><1 separator><category>.
categories = set([x[5:] for x in names])
years = sorted(set([x[:4] for x in names]))

# Categories whose name ends in "albums" are album charts; everything else is a song chart.
albumCategories  = [x for x in categories if x.endswith("albums")]
songCategories   = [x for x in categories if not x.endswith("albums")]

print("There are {0} years".format(len(years)))
print("There are {0} charts".format(len(categories)))
print("There are {0} album charts".format(len(albumCategories)))
print("There are {0} song charts".format(len(songCategories)))

In [None]:
#categories

In [3]:
# Disabled diagnostic: for every category file matching '2019', print the file
# path and its <h2 class="simple-page__title"> element.  The guard keeps it off.
if False:
    catdir = join(basedir, "data", "billboard", "categories")
    for ifile in findPatternExt(catdir, pattern='2019', ext='.p'):
        bsdata = getHTML(ifile)
        name   = bsdata.find("h2", {"class": "simple-page__title"})
        print(ifile,'\t',name)

In [None]:
def parseBillboardFile(ifile, year=None):
    """
    Parse one saved Billboard category page into a date-keyed dictionary.

    Every row after the first of every <table> in the page is read.  A row with
    three cells is taken as (date, name, artist).  A row with a single cell is
    taken as a date only and reuses the name/artist of the most recent
    three-cell row.

    Args:
        ifile: Path of the saved page; handed to getHTML().
        year: Year text appended to each row's date before parsing.  When
            omitted it is taken from the file's base name, whose first
            dash-separated token is expected to be the year
            ("<year>-<category>").

    Returns:
        dict mapping printDateTime(date) -> {"Artist": artist, "Name": name}.

    Raises:
        ValueError: if a row has an unexpected number of cells, a date-only row
            appears before any full row, a date cannot be parsed, or the same
            date occurs twice.
    """
    if year is None:
        # Previously this relied on a notebook-level `year` variable being set
        # by whichever cell happened to run last.
        year = getBaseFilename(ifile).split("-")[0]
    year = str(year)

    fdata = getHTML(ifile)
    data  = {}
    # Carried across rows (and tables) so date-only rows can reuse them.
    name   = None
    artist = None
    for table in fdata.findAll('table'):
        # The first row of each table is skipped (treated as the header row).
        for tr in table.findAll("tr")[1:]:
            vals = [td.text for td in tr.findAll('td')]

            if len(vals) == 3:
                rawDate, name, artist = vals
            elif len(vals) == 1:
                rawDate = vals[0]
                if name is None or artist is None:
                    raise ValueError("Date-only row {0} in {1} has no preceding name/artist".format(vals, ifile))
            else:
                # Previously such rows fell through with stale (or undefined) values.
                raise ValueError("Unexpected row with {0} cells in {1}: {2}".format(len(vals), ifile, vals))

            dateText = ", ".join([rawDate, year])
            try:
                date = getDateTime(dateText)
            except Exception as err:
                # Surface the offending row instead of forcing a ZeroDivisionError.
                raise ValueError("Could not parse date '{0}' from row {1}".format(dateText, vals)) from err

            if not isDate(date):
                raise ValueError("Could not form date for {0}".format(date))

            date = printDateTime(date)
            if data.get(date) is not None:
                raise ValueError("Already seen this date!!!")
            data[date] = {"Artist": artist, "Name": name}

    return data

In [None]:
# Parse every category file, year by year, and save one results file per year.
# NOTE: keep the loop variable named `year`; helper code in this notebook may
# read it as a global.
catdir = join(basedir, "data", "billboard", "categories")
for year in years:
    data  = {}
    files = findPatternExt(catdir, pattern=year, ext='.p')
    for ifile in files:
        # Keep everything after the first dash of the base name as the chart name.
        chart   = getBaseFilename(ifile).partition('-')[2]
        results = parseBillboardFile(ifile)
        data[chart] = results
        print("{0: <10}{1: <30}{2}".format(year, chart, len(results)))

    savename = join(join(basedir, "data", "billboard", "results"), "{0}.p".format(year))
    savedir  = join(basedir, "data", "billboard", "results")
    print("Saving {0}".format(savename))
    saveJoblib(data=data, filename=savename, compress=True)

# Aggregate Charts

Found 62 files.
  Getting All Charts
  Using 200 Charts
  Using Charts (None): ['adult-contemporary', 'adult-pop-songs', 'ASI', 'ATF', 'alternative-albums', 'alternative-songs', 'MRT', 'ALT', 'BLU', 'BGR', 'FLK', 'christian-airplay', 'hot-christian-songs', 'christian-albums', 'christian-digital-song-sales', 'christian-songs', 'christian-streaming-songs', 'gospel-airplay', 'gospel-albums', 'gospel-digital-song-sales', 'gospel-songs', 'gospel-streaming-songs', 'CRI', 'CRT', 'ICO', 'ILL', 'SLL', 'GOS', 'GSI', 'GSS', 'CHS', 'GDT', 'country-airplay', 'country-albums', 'country-digital-song-sales', 'country-songs', 'country-streaming-songs', 'CSA', 'CSI', 'CST', 'CLP', 'CDT', 'dance-club-play-songs', 'dance-electronic-albums', 'dance-electronic-digital-song-sales', 'dance-electronic-songs', 'dance-electronic-streaming-songs', 'DAN', 'DAS', 'DDT', 'BSI', 'BST', 'DSI', 'ELP', 'hot-100', 'pop-songs', 'radio-songs', 'streaming-songs', 'rhythmic-40', 'heatseekers-albums', 'billboard-200', 'artist

NameError: name 'Counter' is not defined

****

# Chart Analysis

In [6]:
# Load the saved chart counter and display its keys in sorted order.
chartCounter = getFile("chartCounter.p")
keys = chartCounter.keys()
sorted(keys)
#chartCounter.most_common()

['ATS',
 'HSB',
 'HSI',
 'TFM',
 'TLN',
 'TLP',
 'TSL',
 'artist-100',
 'billboard-200',
 'heatseekers-albums',
 'hot-100',
 'pop-songs',
 'radio-songs',
 'rhythmic-40',
 'streaming-songs',
 'top-album-sales']

In [78]:

# Imported here because no earlier cell brings Counter into scope; without it
# this cell fails with a NameError on a fresh kernel.
from collections import Counter

# Per-artist total: for every artist -> key -> album entry, add up the sizes of
# all values stored under that album.
slimResults = Counter()
for artist, artistData in results.items():
    for keyData in artistData.values():
        for albumData in keyData.values():
            slimResults[artist] += sum(len(v) for v in albumData.values())

In [None]:
# Persist the per-artist counter and the full results so later cells can reload them.
saveFile(idata = slimResults, ifile="billboardCounter.p", debug=True)
saveFile(idata = results,     ifile="billboardResults.p", debug=True)

# Get Billboard Results

In [None]:
# Reload the per-artist counter and the full results from their saved files.
slimResults = getFile(ifile="billboardCounter.p", debug=True)
fullResults = getFile(ifile="billboardResults.p", debug=True)

# Get Discogs

In [None]:
# Load the Discogs artist table and build the ID -> name lookup from its
# "DiscArtist" column.
disc = discogs()
discdf = disc.getMasterSlimArtistDiscogsDB()
artistIDToName = discdf["DiscArtist"].to_dict()
from masterdb import getArtistAlbumsDB, discConv
artistAlbumsDB = getArtistAlbumsDB(disc)

print("Found {0} ID -> Name entries".format(len(artistIDToName)))
# Invert the lookup.  Several IDs may share a name, so each name maps to a list of IDs.
artistNameToID = {}
for artistID,artistName in artistIDToName.items():
    artistNameToID.setdefault(artistName, []).append(artistID)
print("Found {0} Name -> ID entries".format(len(artistNameToID)))
mulArts  = multiArtist(cutoff=0.9, discdata=artistNameToID, exact=False)

# Check Renames

In [None]:
# Load the saved rename tables and the known-artist map used by the matching cells.
singleRenames = getFile(ifile="singleRenames.p", debug=True)
multiRenames  = getFile(ifile="multiRenames.p", debug=True)
knownArtists  = getFile(ifile="artistMap.p", debug=True)

# Near Renames Artists

In [None]:
from searchUtils import findNearest
cutoff = 0.9
# Artists at positions 0..startIndex (in most_common order) are skipped.
startIndex = 4500
artistsToGet = {}
for i, (artist, cnt) in enumerate(slimResults.most_common()):
#for i, (artist, cnt) in enumerate(sorted(slimResults.items(), key=lambda pair: pair[1], reverse=False)):
    if i <= startIndex:
        continue
    if i % 250 == 0:
        print("==>",i,len(slimResults))
    # Apply the rename tables first, then skip anything already resolved.
    if multiRenames.get(artist) is not None:
        artist = multiRenames[artist]
    if singleRenames.get(artist) is not None:
        idx    = str(singleRenames[artist])
        artist = artistIDToName[idx]
    if knownArtists.get(artist) is not None:
        continue
    if manualRenames.get(artist) is not None:
        continue

    matches = mulArts.getArtistNames(artist)
    if len(matches) > 1:
        continue

    mdata = getMusicData("DiscArtist", artist)
    if mdata is not None:
        continue

    # Stored under its own name so the notebook-level `results` (the Billboard
    # results saved by another cell) is not overwritten.
    nearest = findNearest(artist, artistNameToID.keys(), num=1, cutoff=cutoff)
    if len(nearest) > 0:
        artistsToGet[artist] = nearest
        print("manualRenames[\"{0}\"] = \"{1}\"".format(artist, nearest[0]))

In [None]:
# Leftover inspection cell: evaluates to the findNearest function object.
findNearest

In [None]:
# Print one "keep[...]" line per candidate.  The candidate lists were requested
# with num=1, so a second entry may not exist; print an empty trailer then
# instead of raising IndexError.
for k,v in artistsToGet.items():
    runnerUp = v[1] if len(v) > 1 else ""
    print("keep[\"{0}\"] = {1}  #{2}".format(k, v[0], runnerUp))



# Get Multi Results

In [None]:
from collections import Counter
cutoff = 0.8
multiMatchResult = {}
for i, (artist, cnt) in enumerate(slimResults.most_common()):
    if i > 0 and i % 1000 == 0:
        print("Passed",i,'/',len(slimResults))
    # `prevArtist` keeps the name from before the multi-rename (when one applied).
    prevArtist = None
    if multiRenames.get(artist) is not None:
        prevArtist = artist
        artist = multiRenames[artist]
    if singleRenames.get(artist) is not None:
        idx    = str(singleRenames[artist])
        artist = artistIDToName[idx]
    if prevArtist is None:
        prevArtist = artist
    if knownArtists.get(prevArtist) is not None:
        continue
    if manualRenames.get(prevArtist) is not None:
        prevArtist = manualRenames[prevArtist]
    mdata = getMusicData("DiscArtist", prevArtist)
    # Only the alias `pd` is imported by this notebook, so the class is
    # referenced through it (a bare `DataFrame` raises NameError).
    if not isinstance(mdata, pd.DataFrame):
        continue
    if mdata.shape[0] <= 1:
        # Zero or one candidate row: nothing to disambiguate.
        continue
    matches = mdata["Name"].index
    artistAlbums = set(list(fullResults[prevArtist]["Songs"].keys()) + list(fullResults[prevArtist]["Albums"].keys()))
    # Kept out of the notebook-level name `results`, which other cells save to disk.
    bestMatch = getBestArtistIDMatch(artist, artistAlbums, matches, N=3, cutoff=cutoff)
    matchValue = bestMatch[2]
    if matchValue is not None and matchValue >= cutoff:
        print(i,"/",len(slimResults),'  \t',artist,bestMatch)
        # Everything except the last element of the match is stored.
        multiMatchResult[artist] = list(bestMatch[:-1])

In [None]:
# Number of artists for which a multi-match was accepted.
len(multiMatchResult)

In [None]:
# Merge the accepted multi-matches into the known-artist map (sizes printed
# before and after) and write the map back to artistMap.p.
# NOTE(review): the values merged here are lists, whereas other cells in this
# notebook merge only the first element (v[0]) — confirm the intended shape.
print(len(knownArtists))
knownArtists.update(multiMatchResult)
print(len(knownArtists))
saveFile(idata=knownArtists, ifile="artistMap.p", debug=True)

# Single Results

In [None]:
# Spot check of a single entry; raises KeyError when the artist is not in the map.
knownArtists["The Kingston Trio"]

In [None]:
cutoff = 0.60
singleResult = {}
#for i, (artist, cnt) in enumerate(slimResults.most_common()):
# Walk the artists from the smallest count upwards.
for i, (artist, cnt) in enumerate(sorted(slimResults.items(), key=lambda pair: pair[1], reverse=False)):
    # Progress line every 100 artists (the former `or i == 100` was already covered).
    if i > 0 and i % 100 == 0:
        print("Passed",i,'/',len(slimResults))
    if cnt < 1:
        continue
    # `prevArtist` keeps the name from before the multi-rename (when one applied).
    prevArtist = None
    if multiRenames.get(artist) is not None:
        prevArtist = artist
        artist = multiRenames[artist]
    if singleRenames.get(artist) is not None:
        idx    = str(singleRenames[artist])
        artist = artistIDToName[idx]
    if prevArtist is None:
        prevArtist = artist
    if manualRenames.get(prevArtist) is not None:
        prevArtist = manualRenames[prevArtist]
    if knownArtists.get(prevArtist) is not None or knownArtists.get(artist) is not None:
        continue
    artistAlbums = set(list(fullResults[artist]["Songs"].keys()) + list(fullResults[artist]["Albums"].keys()))
    match = mulArts.getArtistNames(artist)
    if len(match) != 1:
        continue
    mdata = getMusicData("DiscArtist", list(match.keys())[0])
    # Only the alias `pd` is imported by this notebook, so the class is
    # referenced through it (a bare `DataFrame` raises NameError).
    if isinstance(mdata, pd.DataFrame):
        continue
    # Kept out of the notebook-level name `results`, which other cells save to disk.
    bestMatch = getBestArtistMatch(artist, artistAlbums, N=3, cutoff=cutoff)
    matchValue = bestMatch[2]
    if matchValue is not None and matchValue >= cutoff:
        print(i,"/",len(slimResults),'  \t',artist,'\t',bestMatch)
        singleResult[artist] = bestMatch
        # Stop once more than 1000 matches have been collected.
        if len(singleResult) > 1000:
            break

In [None]:
# Hand-picked entries to drop from the single-match results.  pop() with a
# default keeps the cell re-runnable: a key that is absent (already removed, or
# never matched in this run) no longer raises KeyError.
for rejectedArtist in (
    "Murk vs. Kristine W",
    "Chris Cox Vs. Happy Clappers",
    "T.M.Revolution × Nana Mizuki",
    "Bob Marley Vs. Funkstar De Luxe",
):
    singleResult.pop(rejectedArtist, None)

In [None]:
# Keep only the first element of each single match, merge those into the
# known-artist map (size reported before and after) and write the map to disk.
singleResults = dict((k, v[0]) for k,v in singleResult.items())
print("Found {0} single results".format(len(singleResults)))
knownBefore = len(knownArtists)
print("Found {0} all results".format(knownBefore))
knownArtists.update(singleResults)
knownAfter = len(knownArtists)
print("Found {0} all results".format(knownAfter))
saveFile(idata=knownArtists, ifile="artistMap.p", debug=True)

## Known Results

In [None]:
from collections import Counter
cutoff = 0.8
multiMatchResult = {}
for i, (artist, cnt) in enumerate(slimResults.most_common()):
    if i > 0 and i % 1000 == 0:
        print("Passed",i,'/',len(slimResults))
    # `prevArtist` keeps the name from before the multi-rename (when one applied).
    prevArtist = None
    if multiRenames.get(artist) is not None:
        prevArtist = artist
        artist = multiRenames[artist]
    if singleRenames.get(artist) is not None:
        idx    = str(singleRenames[artist])
        artist = artistIDToName[idx]
    if prevArtist is None:
        prevArtist = artist
    if knownArtists.get(prevArtist) is not None:
        continue
    mdata = getMusicData("DiscArtist", prevArtist)
    # Only the alias `pd` is imported by this notebook, so the class is
    # referenced through it (a bare `DataFrame` raises NameError).
    if not isinstance(mdata, pd.DataFrame):
        continue
    if mdata.shape[0] <= 1:
        # Zero or one candidate row: nothing to disambiguate.
        continue
    matches = mdata["Name"].index
    artistAlbums = set(list(fullResults[prevArtist]["Songs"].keys()) + list(fullResults[prevArtist]["Albums"].keys()))
    # Kept out of the notebook-level name `results`, which other cells save to disk.
    bestMatch = getBestArtistIDMatch(artist, artistAlbums, matches, N=3, cutoff=cutoff)
    matchValue = bestMatch[2]
    if matchValue is not None and matchValue >= cutoff:
        print(i,"/",len(slimResults),'  \t',artist,bestMatch)
        # Everything except the last element of the match is stored.
        multiMatchResult[artist] = list(bestMatch[:-1])

In [None]:
# Reduce each multi-match entry to its first element and merge the result into
# the known-artist map, reporting the map size before and after.
singleResults = dict((k, v[0]) for k,v in multiMatchResult.items())
print("Found {0} single results".format(len(singleResults)))
knownBefore = len(knownArtists)
print("Found {0} all results".format(knownBefore))
knownArtists.update(singleResults)
knownAfter = len(knownArtists)
print("Found {0} all results".format(knownAfter))

In [None]:
# NOTE(review): this writes the known-artist map into "singleRenames.p", while the
# other save sites in this notebook write `knownArtists` to "artistMap.p" and
# "singleRenames.p" is loaded as the single-rename table — confirm this target
# file is intended before running.
saveFile(idata=knownArtists, ifile="singleRenames.p")

# Check For Misnames

In [None]:
# Build the artist-name splitter used by the checks below.
# NOTE(review): spelled `multiartist` here but `multiArtist` in the Discogs cell,
# and `discdata` is not defined in any visible cell — confirm both names.
ma = multiartist(cutoff=0.9, discdata=discdata)

In [None]:
# Spot checks for getArtistNames: each entry is (artist text, extra keyword
# arguments, expected name -> ID-list mapping).  One True/False is printed per entry.
nameChecks = [
    ('Lipps, Inc.', {},
     {'Lipps, Inc.': ['159617', '26641', '101850']}),
    ('Bob Wills & His Texas Playboys', {},
     {'Bob Wills & His Texas Playboys': ['670000','786114','1134146','804668','804679','1004309','935875','935907']}),
    ('Brad Paisley Duet With Carrie Underwood', {'debug': False},
     {'Carrie Underwood': ['1011680'], 'Brad Paisley': ['313755']}),
    ('Ray Charles With Willie Nelson', {'debug': False},
     {'Willie Nelson': ['249449'], 'Ray Charles': ['521963', '30552']}),
]
for artistText, extraArgs, expectedNames in nameChecks:
    print(ma.getArtistNames(artistText, **extraArgs) == expectedNames)

# Album Data

In [None]:
# Load the saved album data.
# NOTE(review): `savedir` is not set in this cell; its value comes from whichever
# other cell assigned it last — confirm it points at the intended directory.
savename = join(savedir, "billboard.p")
albumData = getFile(join(savedir, "billboard-album.p"))
#chartdata = getFile(savename, debug=True)

In [None]:
# Print every artist followed by a tab-indented line per album key.
for artist,artistData in albumData.items():
    print(artist)
    for album in artistData:
        print("\t",album)

# Show Data

In [None]:
import json
from random import shuffle

togets = {}
combos = []

savedir  = join(basedir, "results")
savename = join(savedir, "billboard.p")
chartdata = getFile(savename, debug=True)
chartArtists = sorted(chartdata.keys(), reverse=True)

# Merge the Discogs "previously searched" table into the local one and write it
# back.  Context managers close every handle, so the rewritten prevs.json is
# fully flushed before the cell continues.
with open("prevs.json", "r") as prevsFile:
    prevs = json.load(prevsFile)
with open("../discogs/prevs.json", "r") as artPrevFile:
    artPrev = json.load(artPrevFile)
prevs.update(artPrev)
with open("prevs.json", "w") as prevsFile:
    json.dump(prevs, prevsFile)
print("There are {0} artists previously searched.".format(len(prevs)))

ma = multiartist(cutoff=0.9, discdata=discdata, exact=False)

print("There are {0} chart artists".format(len(chartArtists)))
# Visit the chart artists in random order.
shuffle(chartArtists)
for i,artist in enumerate(chartArtists):
    if prevs.get(artist) is not None:
        # Already searched.
        continue
    retval = ma.getArtistNames(artist)
    if retval is None:
        togets[artist] = artist
    else:
        for name,value in retval.items():
            if prevs.get(name) is not None:
                continue
            if value is None:
                # No value for this name: queue it (upper-cased) for a search.
                togets[name.upper()] = artist
                print("{0: <3}{1: <40}{2: <60}{3}".format(len(togets),name,artist,retval))
                print('\t---->',i,'/',len(chartArtists))

    # Once more than 50 names are queued, print the search calls and stop.
    if len(togets) > 50:
        print("\n\n\n\n")
        for name,artist in togets.items():
            print("art.searchDiscogForArtist(\"{0}\")  ## {1}".format(name, artist))
        break

###### artists = [x.strip() for x in vals.split("\n")]
# NOTE(review): the assignment above is commented out, so `artists` must already
# exist in the kernel for the next line to run — confirm where it is defined.
# Drops empty entries from `artists`.
artists = [x for x in artists if len(x) > 0]

In [None]:
Pitbull Featuring Ne-Yo, Afrojack & Nayer
Drake Featuring Kanye West, Lil Wayne & Eminem

In [None]:
def testName(name, discdata):
    if discdata.get(name) is not None:
        return discdata[name]
    return name

def findName(name, discArtists, threshold):
    """
    Ask findNearest for a single candidate for `name` among `discArtists`.

    Returns whatever findNearest produced when it is non-empty; otherwise
    returns `name` itself.
    """
    nearest = findNearest(name, discArtists, 1, threshold)
    return nearest if len(nearest) != 0 else name
    

def splitArtist(name):
    names = [name]
    names = [[x.strip() for x in y.split(" Featuring ")]
results = {}
    
for name in artists:
    if " & " in name:
        retvals = [testName(x, discdata) for x in 
        retvals = [findName(x, discArtists, 0.95) if not isinstance(x, list) else x for x in retvals]
        results[name] = retvals        

In [None]:
# Build one search call per entry that is not a list, then print each distinct
# call once.
togets = [
    "art.searchDiscogForArtist(\"{0}\")".format(val)
    for v in results.values()
    for val in v
    if not isinstance(val, list)
]

for toget in set(togets):
    print(toget)

In [None]:
# Hand-written sample: a flat list of ID strings next to a [name, [ID]] pair.
tmp = [['5070865', '90037', '4135543', '1507065'], ['Katy XXX Perry', ['1201210']]]

In [None]:
# Display the sample value.
tmp

In [None]:
def _stripNumericSuffix(name):
    """
    Return `name` without a trailing "(<int>)" marker when one is present.

    Only markers whose opening parenthesis is 3, 4 or 5 characters from the end
    are recognised (i.e. one to three characters between the parentheses), and
    the enclosed text must be accepted by int().  Any other name is returned
    unchanged.
    """
    if not name.endswith(")"):
        return name
    for offset in (-3, -4, -5):
        if abs(offset) > len(name) or name[offset] != "(":
            continue
        try:
            int(name[(offset + 1):-1])
        except ValueError:
            # Parenthesised text is not a number: try the next position.
            continue
        return name[:offset].strip()
    return name


# Group IDs and references by artist name with any numeric suffix removed.
# Inner dicts are used as insertion-ordered sets.
nameids = {}
namerefs = {}
for name,nameid in iddata.items():
    artist = _stripNumericSuffix(name)
    ref = refdata[name]
    nameids.setdefault(artist, {})[nameid] = 1
    namerefs.setdefault(artist, {})[ref] = 1

# Collapse the inner dicts to lists of their unique keys.
nameids = {k: list(v.keys()) for k,v in nameids.items()}
namerefs = {k: list(v.keys()) for k,v in namerefs.items()}