# Billboard Functions

In [134]:
## Basic stuff
%load_ext autoreload
%autoreload

# Widen the notebook container to the full page and cap output areas at
# 10000px with scrolling overflow.
# `IPython.display` is the public import location for these names; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))
#IPython.Cell.options_default.cm_config.lineNumbers = true;

################################################################################
## Python Version
################################################################################
import sys


################################################################################
## General Stuff
################################################################################
from multiprocessing import Pool
from tqdm import tqdm


################################################################################
## Util Stuff
################################################################################
from timeUtils import clock, elapsed
from ioUtils import saveFile, getFile


################################################################################
## Music DB
################################################################################
from mainDB import mainDB
from musicDBMap import musicDBMap
from masterDBMatchClass import masterDBMatchClass
from matchDBArtist import matchDBArtist


################################################################################
## Music Names
################################################################################
from masterArtistNameDB import masterArtistNameDB


################################################################################
## Chart Stuff
################################################################################
from artistIgnores import getArtistIgnores
from billboardData import billboardData
from top40Data import top40Data


################################################################################
## Pandas Stuff
################################################################################
import pandas as pd
# Raise the pandas display limits (rows, columns, line width) for this notebook.
for displayOption, displayLimit in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(displayOption, displayLimit)

# Report the interpreter version, then record and report when this run started.
import datetime as dt
print("Python: {0}".format(sys.version))
start = dt.datetime.now()
print("Notebook Last Run Initiated: " + str(start))

Python: 3.7.7 (default, Mar 26 2020, 10:32:53) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
Notebook Last Run Initiated: 2021-01-10 20:10:56.722631


# Final Aggregation

In [4]:
%load_ext autoreload
%autoreload
from billboardData import billboardData
bd = billboardData(minYear=1, maxYear=2021)
bd.setChartUsage(rank=[0,1,2])
bd.setFullChartData()
bd.setArtistAlbumData()

bd.saveArtistAlbumData()
bd.saveFullChartData()

_, _ = clock("Last Run")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Found 62 files.
  Getting Chart For hot
  Using 16 Charts
  Getting Chart For adult
  Using 4 Charts
  Getting Chart For top
  Using 3 Charts
  Using 23 Charts
  Getting Chart For alternative
  Using 4 Charts
  Getting Chart For countryMusic
  Using 10 Charts
  Getting Chart For rock
  Using 14 Charts
  Getting Chart For rnb
  Using 16 Charts
  Using 44 Charts
  Getting Chart For christian
  Using 21 Charts
  Getting Chart For canadian
  Using 4 Charts
  Getting Chart For comedy
  Using 2 Charts
  Getting Chart For general
  Using 2 Charts
  Getting Chart For twitter
  Using 5 Charts
  Using 34 Charts
  Using Charts (None): ['hot-100', 'pop-songs', 'radio-songs', 'streaming-songs', 'rhythmic-40', 'heatseekers-albums', 'billboard-200', 'artist-100', 'top-album-sales', 'TLN', 'HSB', 'HSI', 'TLP', 'TSL', 'TFM', 'ATS', 'adult-contemporary', 'adult-pop-songs', 'ASI', 'ATF', 'billboard-200', 'artist-100',

****
****
****
****

# Starter Class Section

In [2]:
# Locate the saved Billboard "starter" HTML page under `basedir` and parse it.
from glob import glob
from os.path import join
from webUtils import getHTML

basedir = "/Volumes/Piggy/Charts/"
starterMatches = glob(join(basedir, "data", "billboard", "starter.html"))
if not starterMatches:
    # Stop here. Previously the problem was only printed and the next line then
    # failed with a NameError because `filename` had never been assigned.
    raise FileNotFoundError("Could not find starter HTML file!")
filename = starterMatches[0]
fdata = getHTML(filename)

#### Extra

In [None]:
# Map the link text of every chart entry on the starter page to
# [site root, directory part of the link, basename of the link].
chartData  = {}
dirname = None  # not read in this cell; kept in case another cell relies on it
baseURL = "https://www.billboard.com"
for ul in fdata.findAll("ul"):
    for li in ul.findAll("li", {"class": "chart-group__item"}):
        a = li.find('a')
        if a is None:
            continue
        href = a.get('href')
        if not href:
            # Anchor without a target: a['href'] would raise KeyError here.
            continue
        # [1:] drops the first character of the directory part
        # (presumably the leading "/" -- TODO confirm against getDirname).
        subdir = getDirname(href)[1:]
        chartData[a.text] = [baseURL, subdir, getBasename(href)]

In [None]:
#chartData

# Download Yearly Information

In [None]:
# Download the Billboard archive page for each requested year and cache the
# raw response bytes on disk. Years that already have a cached file are skipped.
import urllib.request  # `import urllib` alone does not load this submodule
from os.path import join
from time import sleep
from fsUtils import isFile

# Only 2019 is requested; the earlier full range was
# [str(x) for x in range(1958,2014)] and was immediately overwritten.
years = ['2019']

# Request settings and output directory do not depend on the year.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers={'User-Agent':user_agent,}
savedir = join(basedir, "data", "billboard", "yearly")

# NOTE(review): `saveJoblib` is not imported in any cell visible here -- confirm
# where it comes from before running on a fresh kernel.
for year in years:
    savename = join(savedir, "{0}.p".format(year))
    if isFile(savename):
        continue

    url="https://www.billboard.com/archive/charts/{0}".format(year)
    request=urllib.request.Request(url,None,headers)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        data = response.read()

    print("Saving {0}".format(savename))
    saveJoblib(data=data, filename=savename, compress=True)
    sleep(1)  # pause between consecutive requests

# Parse Yearly Information

In [4]:
# Fetch the Billboard charts index page and save the raw bytes to "tmpWeekly.p".
# `import urllib` alone does not load the `urllib.request` submodule, so it is
# imported explicitly (this still binds the name `urllib`).
import urllib.request
baseURL = "https://www.billboard.com"

url="{0}/charts".format(baseURL)
savename="tmpWeekly.p"

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers={'User-Agent':user_agent,}

request=urllib.request.Request(url,None,headers)
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(request) as response:
    data = response.read()

saveFile(ifile=savename, idata=data)
_,_ = clock("Last Run")

  --> This file is 31.8kB.
Current Time is Wed Dec 30, 2020 23:41:13 for Last Run


In [5]:
from webUtils import getHTML
# Parse the saved charts index page ("tmpWeekly.p"); the result is queried with
# find/findAll in the following cells.
bsdata = getHTML("tmpWeekly.p")

In [7]:
# Collect the chart categories from the index page:
# data-category-id -> data-target (the id of the panel listing that category's charts).
categoryIDs = {}
for div in bsdata.findAll("div", {"class": ["chart-panel", "chart-panel--main"]}):
    selectors = div.findAll("div", {"class": ["chart-panel__item", "chart-panel__item--selector", "chart-panel__item--selector-active"]})
    for selector in selectors:
        categoryID    = selector.attrs.get('data-category-id')
        categoryPanel = selector.attrs.get('data-target')
        if categoryID is None or categoryPanel is None:
            # Not a category selector, or it names no panel; indexing
            # attrs['data-target'] directly would raise KeyError.
            continue
        categoryIDs[categoryID] = categoryPanel
categoryIDs

{'top-charts': 'topchartsChartPanel',
 'global': 'globalChartPanel',
 'greatest-of-all-time': 'greatestofalltimeChartPanel',
 'pop': 'popChartPanel',
 'country': 'countryChartPanel',
 'rock': 'rockChartPanel',
 'rbhip-hop': 'rbhip-hopChartPanel',
 'latin': 'latinChartPanel',
 'danceelectronic': 'danceelectronicChartPanel',
 'christiangospel': 'christiangospelChartPanel',
 'classical': 'classicalChartPanel',
 'jazz': 'jazzChartPanel',
 'breaking-and-entering': 'breakingandenteringChartPanel',
 'web': 'webChartPanel',
 'holiday': 'holidayChartPanel',
 'songs-of-the-summer': 'songsofthesummerChartPanel',
 'international': 'internationalChartPanel',
 'additional-charts': 'additionalchartsChartPanel'}

In [8]:
# For each category, collect the hrefs of the chart links inside its panel.
categoryRefs = {}
for categoryID,categoryPanel in categoryIDs.items():
    div = bsdata.find("div", {"class": ["chart-panel", "chart-panel--hidden", "chart-panel__charts"], "id": categoryPanel})
    if div is None:
        # Panel not present on the page: keep an empty list rather than
        # failing with AttributeError on None.
        categoryRefs[categoryID] = []
        continue
    categoryRefs[categoryID] = [
        ref.attrs['href']
        for ref in div.findAll("a", {"class": ["chart-panel__link"]})
        if 'href' in ref.attrs
    ]
categoryRefs

{'top-charts': ['/charts/hot-100',
  '/charts/billboard-200',
  '/charts/artist-100',
  '/charts/social-50',
  '/charts/streaming-songs',
  '/charts/radio-songs',
  '/charts/digital-song-sales',
  '/charts/top-album-sales',
  '/charts/current-albums',
  '/charts/catalog-albums',
  '/charts/independent-albums',
  '/charts/soundtracks',
  '/charts/vinyl-albums'],
 'global': ['/charts/billboard-global-200',
  '/charts/billboard-global-excl-us'],
 'greatest-of-all-time': ['/charts/greatest-billboard-200-albums',
  '/charts/greatest-billboard-200-artists',
  '/charts/greatest-hot-100-singles',
  '/charts/greatest-hot-100-artists',
  '/charts/greatest-hot-100-songs-by-women',
  '/charts/greatest-hot-100-women-artists',
  '/charts/greatest-billboard-200-albums-by-women',
  '/charts/greatest-billboard-200-women-artists',
  '/charts/greatest-billboards-top-songs-80s',
  '/charts/greatest-billboards-top-songs-90s',
  '/charts/greatest-of-all-time-pop-songs',
  '/charts/greatest-of-all-time-pop-s

In [53]:
from time import sleep
from fsUtils import isFile
def getData(url, savename, delay=5):
    """
    Download `url` and store the raw response bytes under `savename`.

    Args:
        url: Address to fetch.
        savename: Output path, passed to `saveFile` as `ifile`.
        delay: Seconds to sleep after saving. Defaults to 5, the pause that
            was previously hard-coded, so existing callers behave the same.

    Raises:
        urllib.error.URLError: Propagated from `urlopen` when the request fails.
    """
    # Imported here so the function does not depend on some other cell having
    # loaded the `urllib.request` submodule (`import urllib` alone does not).
    import urllib.request

    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers = {'User-Agent': user_agent}

    request = urllib.request.Request(url, None, headers)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        data = response.read()

    saveFile(ifile=savename, idata=data)
    sleep(delay)

In [56]:
# Download every chart page referenced on the index that is not saved yet,
# skipping imprint / label / producer / distributor listings.
skippedKinds = ("imprints", "labels", "producers", "distributors")
for categoryID, categoryIDRefs in categoryRefs.items():
    fetched = 0
    for href in categoryIDRefs:
        hrefParts = [part for part in href.split("/") if len(part) > 0]
        savename = "chartData/{0}.p".format("__".join([categoryID] + hrefParts))
        if any(kind in savename for kind in skippedKinds):
            continue
        if isFile(savename):
            continue
        url = "{0}{1}".format(baseURL, href)
        print(savename)
        getData(url, savename)
        fetched += 1
    # Extra pause after any category that triggered at least one download.
    if fetched > 0:
        sleep(10)

chartData/rock__charts__year-end__2020__alternative-album-distributors.p
  --> This file is 16.7kB.
chartData/rock__charts__year-end__2020__alternative-songs-artists.p
  --> This file is 18.1kB.
chartData/rock__charts__year-end__2020__alternative-songs.p
  --> This file is 23.7kB.
chartData/rock__charts__year-end__2020__alternative-digital-song-sales-yearend.p
  --> This file is 20.2kB.
chartData/rock__charts__year-end__2020__adult-alternative-songs-artists.p
  --> This file is 17.7kB.
chartData/rock__charts__year-end__2020__adult-alternative-songs.p
  --> This file is 23.1kB.
chartData/rock__charts__year-end__2020__hard-rock-albums-artists.p
  --> This file is 18.1kB.
chartData/rock__charts__year-end__2020__hard-rock-albums.p
  --> This file is 23.8kB.
chartData/rock__charts__year-end__2020__hard-rock-album-distributors.p
  --> This file is 16.7kB.
chartData/rock__charts__year-end__2020__hard-rock-digital-song-sales-yearend.p
  --> This file is 20.2kB.
chartData/rock__charts__year-end

  --> This file is 22.8kB.
chartData/dance--electronic__charts__year-end__2020__top-dance-electronic-artists.p
  --> This file is 22.6kB.
chartData/dance--electronic__charts__year-end__2020__top-dance-electronic-new-artists.p
  --> This file is 17.5kB.
chartData/dance--electronic__charts__year-end__2020__hot-dance-electronic-songs-artists.p
  --> This file is 20.3kB.
chartData/dance--electronic__charts__year-end__2020__hot-dance-electronic-songs.p
  --> This file is 28.5kB.
chartData/dance--electronic__charts__year-end__2020__dance-electronic-digital-songs-artists.p
  --> This file is 18.2kB.
chartData/dance--electronic__charts__year-end__2020__dance-electronic-digital-songs.p
  --> This file is 23.2kB.
chartData/dance--electronic__charts__year-end__2020__dance-electronic-streaming-songs-artists.p
  --> This file is 18.1kB.
chartData/dance--electronic__charts__year-end__2020__dance-electronic-streaming-songs.p
  --> This file is 23.5kB.
chartData/dance--electronic__charts__year-end__20

  --> This file is 20.6kB.
chartData/kid__charts__year-end__2020__kid-distributors.p
  --> This file is 16.7kB.
chartData/comedy__charts__year-end__2020__comedy-albums-artists.p
  --> This file is 18.0kB.
chartData/comedy__charts__year-end__2020__comedy-albums.p
  --> This file is 19.7kB.
chartData/comedy__charts__year-end__2020__comedy-distributors.p
  --> This file is 16.7kB.
chartData/video__charts__year-end__2020__music-video-sales.p
  --> This file is 20.3kB.
chartData/soundtracks__charts__year-end__2020__top-soundtracks-albums.p
  --> This file is 20.7kB.
chartData/international__charts__year-end__2020__candaian-hot-100-artists.p
  --> This file is 20.3kB.
chartData/international__charts__year-end__2020__canadian-hot-100.p
  --> This file is 28.9kB.
chartData/international__charts__year-end__2020__top-canadian-albums.p
  --> This file is 23.8kB.
chartData/international__charts__year-end__2020__hot-canadian-digital-songs.p
  --> This file is 23.0kB.
chartData/songwriters--publishe

In [None]:
from searchUtils import findExt
from fileUtils import getBaseFilename
# For every saved 2020 chart page, follow its year drop-down and download the
# same chart for each listed year that is not saved yet.
seedToken = "__{0}__".format(2020)
files = sorted(findExt("chartData", ".p"))

for ifile in files:
    if seedToken not in ifile:
        # Without the "__2020__" token the replace below is a no-op, so the
        # target name would be this already-found file and nothing would be
        # downloaded; skip before parsing. This also removes the old five-way
        # unpack of the file name, which raised ValueError for any other .p
        # name and whose values were never used.
        continue
    # Use a dedicated name so the notebook-level `bsdata` (the charts index
    # page) is not overwritten by this loop.
    chartPage = getHTML(ifile)
    ulYears = chartPage.find("ul", {"class": "dropdown__year-select-options"})
    if ulYears is None:
        continue
    n = 0
    for urlref in ulYears.findAll("a", {"class": "year-link"}):
        suburl   = urlref.attrs['href']
        year     = urlref.text
        url      = "{0}{1}".format(baseURL,suburl)
        savename = ifile.replace(seedToken, "__{0}__".format(year))
        if not isFile(savename):
            print(savename)
            getData(url, savename)
            n += 1
    # Extra pause after any chart that triggered at least one download.
    if n > 0:
        sleep(5)

chartData/adult__charts__year-end__2006__hot-adult-40-labels.p
  --> This file is 17.1kB.
chartData/adult__charts__year-end__2019__hot-adult-contemporary-labels.p
  --> This file is 17.1kB.
chartData/adult__charts__year-end__2018__hot-adult-contemporary-labels.p
  --> This file is 17.1kB.
chartData/adult__charts__year-end__2017__hot-adult-contemporary-labels.p
  --> This file is 17.1kB.
chartData/adult__charts__year-end__2016__hot-adult-contemporary-labels.p
  --> This file is 17.1kB.
chartData/adult__charts__year-end__2015__hot-adult-contemporary-labels.p
  --> This file is 17.1kB.
chartData/adult__charts__year-end__2014__hot-adult-contemporary-labels.p
  --> This file is 17.1kB.
chartData/adult__charts__year-end__2013__hot-adult-contemporary-labels.p
  --> This file is 17.2kB.
chartData/adult__charts__year-end__2012__hot-adult-contemporary-labels.p
  --> This file is 17.2kB.
chartData/adult__charts__year-end__2011__hot-adult-contemporary-labels.p
  --> This file is 17.1kB.
chartData/

  --> This file is 18.2kB.
chartData/billboard-200__charts__year-end__2009__the-billboard-200-artists-male.p
  --> This file is 18.1kB.
chartData/billboard-200__charts__year-end__2008__the-billboard-200-artists-male.p
  --> This file is 18.1kB.
chartData/billboard-200__charts__year-end__2007__the-billboard-200-artists-male.p
  --> This file is 18.1kB.
chartData/billboard-200__charts__year-end__2006__the-billboard-200-artists-male.p
  --> This file is 18.1kB.
chartData/billboard-200__charts__year-end__2016__the-billboard-200-distributors.p
  --> This file is 16.7kB.
chartData/billboard-200__charts__year-end__2015__the-billboard-200-distributors.p
  --> This file is 16.7kB.
chartData/billboard-200__charts__year-end__2014__the-billboard-200-distributors.p
  --> This file is 16.7kB.
chartData/billboard-200__charts__year-end__2013__the-billboard-200-distributors.p
  --> This file is 16.7kB.
chartData/billboard-200__charts__year-end__2012__the-billboard-200-distributors.p
  --> This file is 

  --> This file is 17.6kB.
chartData/bluegrass__charts__year-end__2013__bluegrass-artists.p
  --> This file is 17.6kB.
chartData/bluegrass__charts__year-end__2012__bluegrass-artists.p
  --> This file is 17.6kB.
chartData/bluegrass__charts__year-end__2011__bluegrass-artists.p
  --> This file is 17.6kB.
chartData/bluegrass__charts__year-end__2010__bluegrass-artists.p
  --> This file is 17.6kB.
chartData/bluegrass__charts__year-end__2009__bluegrass-artists.p
  --> This file is 17.6kB.
chartData/bluegrass__charts__year-end__2008__bluegrass-artists.p
  --> This file is 17.5kB.
chartData/bluegrass__charts__year-end__2007__bluegrass-artists.p
  --> This file is 17.6kB.
chartData/bluegrass__charts__year-end__2006__bluegrass-artists.p
  --> This file is 17.6kB.
chartData/bluegrass__charts__year-end__2016__bluegrass-distributors.p
  --> This file is 16.7kB.
chartData/bluegrass__charts__year-end__2015__bluegrass-distributors.p
  --> This file is 16.7kB.
chartData/bluegrass__charts__year-end__2014

  --> This file is 23.6kB.
chartData/catalog__charts__year-end__2009__catalog-albums.p
  --> This file is 23.4kB.
chartData/catalog__charts__year-end__2008__catalog-albums.p
  --> This file is 23.6kB.
chartData/catalog__charts__year-end__2007__catalog-albums.p
  --> This file is 23.6kB.
chartData/catalog__charts__year-end__2006__catalog-albums.p
  --> This file is 23.7kB.
chartData/catalog__charts__year-end__2019__catalog-artists.p
  --> This file is 18.1kB.
chartData/catalog__charts__year-end__2018__catalog-artists.p
  --> This file is 18.1kB.
chartData/catalog__charts__year-end__2017__catalog-artists.p
  --> This file is 18.1kB.
chartData/catalog__charts__year-end__2016__catalog-artists.p
  --> This file is 18.0kB.
chartData/catalog__charts__year-end__2015__catalog-artists.p
  --> This file is 18.0kB.
chartData/catalog__charts__year-end__2014__catalog-artists.p
  --> This file is 18.1kB.
chartData/catalog__charts__year-end__2013__catalog-artists.p
  --> This file is 18.0kB.
chartData

chartData/christian__charts__year-end__2012__christian-albums.p
  --> This file is 23.7kB.
chartData/christian__charts__year-end__2011__christian-albums.p
  --> This file is 23.7kB.
chartData/christian__charts__year-end__2010__christian-albums.p
  --> This file is 23.7kB.
chartData/christian__charts__year-end__2009__christian-albums.p
  --> This file is 23.6kB.
chartData/christian__charts__year-end__2008__christian-albums.p
  --> This file is 23.7kB.
chartData/christian__charts__year-end__2007__christian-albums.p
  --> This file is 23.4kB.
chartData/christian__charts__year-end__2006__christian-albums.p
  --> This file is 23.9kB.
chartData/christian__charts__year-end__2019__christian-digital-songs-artists.p
  --> This file is 18.2kB.
chartData/christian__charts__year-end__2018__christian-digital-songs-artists.p
  --> This file is 18.2kB.
chartData/christian__charts__year-end__2017__christian-digital-songs-artists.p
  --> This file is 18.2kB.
chartData/christian__charts__year-end__2016__

  --> This file is 18.2kB.
chartData/christian__charts__year-end__2007__hot-christian-songs-artists.p
  --> This file is 18.1kB.
chartData/christian__charts__year-end__2006__hot-christian-songs-artists.p
  --> This file is 18.1kB.
chartData/christian__charts__year-end__2019__hot-christian-songs.p
  --> This file is 28.5kB.
chartData/christian__charts__year-end__2018__hot-christian-songs.p
  --> This file is 28.4kB.
chartData/christian__charts__year-end__2017__hot-christian-songs.p
  --> This file is 28.1kB.
chartData/christian__charts__year-end__2016__hot-christian-songs.p
  --> This file is 23.0kB.
chartData/christian__charts__year-end__2015__hot-christian-songs.p
  --> This file is 23.0kB.
chartData/christian__charts__year-end__2014__hot-christian-songs.p
  --> This file is 28.0kB.
chartData/christian__charts__year-end__2013__hot-christian-songs.p
  --> This file is 23.1kB.
chartData/christian__charts__year-end__2012__hot-christian-songs.p
  --> This file is 22.9kB.
chartData/christi

chartData/classical__charts__year-end__2019__classical-crossover-albums.p
  --> This file is 19.1kB.
chartData/classical__charts__year-end__2018__classical-crossover-albums.p
  --> This file is 19.1kB.
chartData/classical__charts__year-end__2017__classical-crossover-albums.p
  --> This file is 19.2kB.
chartData/classical__charts__year-end__2016__classical-crossover-albums.p
  --> This file is 19.3kB.
chartData/classical__charts__year-end__2015__classical-crossover-albums.p
  --> This file is 19.2kB.
chartData/classical__charts__year-end__2014__classical-crossover-albums.p
  --> This file is 19.3kB.
chartData/classical__charts__year-end__2013__classical-crossover-albums.p
  --> This file is 19.1kB.
chartData/classical__charts__year-end__2012__classical-crossover-albums.p
  --> This file is 19.3kB.
chartData/classical__charts__year-end__2011__classical-crossover-albums.p
  --> This file is 19.3kB.
chartData/classical__charts__year-end__2010__classical-crossover-albums.p
  --> This file i

  --> This file is 16.7kB.
chartData/comedy__charts__year-end__2013__comedy-distributors.p
  --> This file is 16.7kB.
chartData/comedy__charts__year-end__2012__comedy-distributors.p
  --> This file is 16.8kB.
chartData/comedy__charts__year-end__2011__comedy-distributors.p
  --> This file is 16.8kB.
chartData/comedy__charts__year-end__2010__comedy-distributors.p
  --> This file is 16.8kB.
chartData/comedy__charts__year-end__2009__comedy-distributors.p
  --> This file is 16.8kB.
chartData/comedy__charts__year-end__2008__comedy-distributors.p
  --> This file is 16.7kB.
chartData/comedy__charts__year-end__2007__comedy-distributors.p
  --> This file is 16.7kB.
chartData/comedy__charts__year-end__2006__comedy-distributors.p
  --> This file is 16.7kB.
chartData/country__charts__year-end__2019__country-airplay-artists.p
  --> This file is 18.1kB.
chartData/country__charts__year-end__2018__country-airplay-artists.p
  --> This file is 18.1kB.
chartData/country__charts__year-end__2017__country-ai

chartData/country__charts__year-end__2011__country-artists-male.p
  --> This file is 17.6kB.
chartData/country__charts__year-end__2010__country-artists-male.p
  --> This file is 17.6kB.
chartData/country__charts__year-end__2009__country-artists-male.p
  --> This file is 17.6kB.
chartData/country__charts__year-end__2008__country-artists-male.p
  --> This file is 17.6kB.
chartData/country__charts__year-end__2007__country-artists-male.p
  --> This file is 17.6kB.
chartData/country__charts__year-end__2006__country-artists-male.p
  --> This file is 17.6kB.
chartData/country__charts__year-end__2019__country-digital-songs-artists.p
  --> This file is 18.1kB.
chartData/country__charts__year-end__2018__country-digital-songs-artists.p
  --> This file is 18.2kB.
chartData/country__charts__year-end__2017__country-digital-songs-artists.p
  --> This file is 18.2kB.
chartData/country__charts__year-end__2016__country-digital-songs-artists.p
  --> This file is 18.2kB.
chartData/country__charts__year-en

  --> This file is 18.7kB.
chartData/country__charts__year-end__2012__hot-country-songs-producers.p
  --> This file is 18.7kB.
chartData/country__charts__year-end__2011__hot-country-songs-producers.p
  --> This file is 18.7kB.
chartData/country__charts__year-end__2019__hot-country-songs.p
  --> This file is 29.2kB.
chartData/country__charts__year-end__2018__hot-country-songs.p
  --> This file is 28.3kB.
chartData/country__charts__year-end__2017__hot-country-songs.p
  --> This file is 28.4kB.
chartData/country__charts__year-end__2016__hot-country-songs.p
  --> This file is 28.4kB.
chartData/country__charts__year-end__2015__hot-country-songs.p
  --> This file is 28.0kB.
chartData/country__charts__year-end__2014__hot-country-songs.p
  --> This file is 28.1kB.
chartData/country__charts__year-end__2013__hot-country-songs.p
  --> This file is 28.2kB.
chartData/country__charts__year-end__2012__hot-country-songs.p
  --> This file is 28.0kB.
chartData/country__charts__year-end__2011__hot-countr

  --> This file is 20.8kB.
chartData/dance--electronic__charts__year-end__2011__dance-electronic-albums.p
  --> This file is 20.7kB.
chartData/dance--electronic__charts__year-end__2010__dance-electronic-albums.p
  --> This file is 20.8kB.
chartData/dance--electronic__charts__year-end__2009__dance-electronic-albums.p
  --> This file is 20.8kB.
chartData/dance--electronic__charts__year-end__2008__dance-electronic-albums.p
  --> This file is 20.7kB.
chartData/dance--electronic__charts__year-end__2007__dance-electronic-albums.p
  --> This file is 20.8kB.
chartData/dance--electronic__charts__year-end__2006__dance-electronic-albums.p
  --> This file is 20.9kB.
chartData/dance--electronic__charts__year-end__2019__dance-electronic-digital-songs-artists.p
  --> This file is 18.2kB.
chartData/dance--electronic__charts__year-end__2018__dance-electronic-digital-songs-artists.p
  --> This file is 18.2kB.
chartData/dance--electronic__charts__year-end__2017__dance-electronic-digital-songs-artists.p
 

  --> This file is 23.4kB.
chartData/dance--electronic__charts__year-end__2012__dance-mix-show-airplay-songs.p
  --> This file is 23.3kB.
chartData/dance--electronic__charts__year-end__2011__dance-mix-show-airplay-songs.p
  --> This file is 23.2kB.
chartData/dance--electronic__charts__year-end__2010__dance-mix-show-airplay-songs.p
  --> This file is 20.5kB.
chartData/dance--electronic__charts__year-end__2009__dance-mix-show-airplay-songs.p
  --> This file is 20.5kB.
chartData/dance--electronic__charts__year-end__2008__dance-mix-show-airplay-songs.p
  --> This file is 20.6kB.
chartData/dance--electronic__charts__year-end__2007__dance-mix-show-airplay-songs.p
  --> This file is 20.3kB.
chartData/dance--electronic__charts__year-end__2006__dance-mix-show-airplay-songs.p
  --> This file is 20.6kB.
chartData/dance--electronic__charts__year-end__2019__hot-dance-electronic-songs-artists.p
  --> This file is 20.4kB.
chartData/dance--electronic__charts__year-end__2018__hot-dance-electronic-songs

  --> This file is 25.7kB.
chartData/digital-song-sales__charts__year-end__2019__hot-digital-songs-labels.p
  --> This file is 16.8kB.
chartData/digital-song-sales__charts__year-end__2018__hot-digital-songs-labels.p
  --> This file is 16.9kB.
chartData/digital-song-sales__charts__year-end__2017__hot-digital-songs-labels.p
  --> This file is 16.9kB.
chartData/digital-song-sales__charts__year-end__2016__hot-digital-songs-labels.p
  --> This file is 16.9kB.
chartData/digital-song-sales__charts__year-end__2015__hot-digital-songs-labels.p
  --> This file is 16.9kB.
chartData/digital-song-sales__charts__year-end__2014__hot-digital-songs-labels.p
  --> This file is 16.8kB.
chartData/digital-song-sales__charts__year-end__2013__hot-digital-songs-labels.p
  --> This file is 16.9kB.
chartData/digital-song-sales__charts__year-end__2012__hot-digital-songs-labels.p
  --> This file is 16.9kB.
chartData/digital-song-sales__charts__year-end__2011__hot-digital-songs-labels.p
  --> This file is 16.9kB.
c

  --> This file is 17.6kB.
chartData/gospel__charts__year-end__2018__gospel-streaming-songs-artists.p
  --> This file is 17.7kB.
chartData/gospel__charts__year-end__2017__gospel-streaming-songs-artists.p
  --> This file is 17.6kB.
chartData/gospel__charts__year-end__2016__gospel-streaming-songs-artists.p
  --> This file is 17.7kB.
chartData/gospel__charts__year-end__2015__gospel-streaming-songs-artists.p
  --> This file is 17.7kB.
chartData/gospel__charts__year-end__2014__gospel-streaming-songs-artists.p
  --> This file is 17.6kB.
chartData/gospel__charts__year-end__2019__gospel-streaming-songs.p
  --> This file is 22.5kB.
chartData/gospel__charts__year-end__2018__gospel-streaming-songs.p
  --> This file is 22.8kB.
chartData/gospel__charts__year-end__2017__gospel-streaming-songs.p
  --> This file is 22.7kB.
chartData/gospel__charts__year-end__2016__gospel-streaming-songs.p
  --> This file is 20.1kB.
chartData/gospel__charts__year-end__2015__gospel-streaming-songs.p
  --> This file is 2

  --> This file is 18.1kB.
chartData/hot-100__charts__year-end__2015__hot-100-artists-female.p
  --> This file is 18.1kB.
chartData/hot-100__charts__year-end__2014__hot-100-artists-female.p
  --> This file is 18.1kB.
chartData/hot-100__charts__year-end__2013__hot-100-artists-female.p
  --> This file is 18.0kB.
chartData/hot-100__charts__year-end__2012__hot-100-artists-female.p
  --> This file is 18.1kB.
chartData/hot-100__charts__year-end__2011__hot-100-artists-female.p
  --> This file is 18.1kB.
chartData/hot-100__charts__year-end__2010__hot-100-artists-female.p
  --> This file is 18.1kB.
chartData/hot-100__charts__year-end__2009__hot-100-artists-female.p
  --> This file is 18.1kB.
chartData/hot-100__charts__year-end__2008__hot-100-artists-female.p
  --> This file is 18.1kB.
chartData/hot-100__charts__year-end__2007__hot-100-artists-female.p
  --> This file is 18.1kB.
chartData/hot-100__charts__year-end__2006__hot-100-artists-female.p
  --> This file is 18.1kB.
chartData/hot-100__char

  --> This file is 23.6kB.
chartData/independent__charts__year-end__2015__independent-albums.p
  --> This file is 23.9kB.
chartData/independent__charts__year-end__2014__independent-albums.p
  --> This file is 24.0kB.
chartData/independent__charts__year-end__2013__independent-albums.p
  --> This file is 24.1kB.
chartData/independent__charts__year-end__2012__independent-albums.p
  --> This file is 23.5kB.
chartData/independent__charts__year-end__2011__independent-albums.p
  --> This file is 24.0kB.
chartData/independent__charts__year-end__2010__independent-albums.p
  --> This file is 23.6kB.
chartData/independent__charts__year-end__2009__independent-albums.p
  --> This file is 24.0kB.
chartData/independent__charts__year-end__2008__independent-albums.p
  --> This file is 23.8kB.
chartData/independent__charts__year-end__2007__independent-albums.p
  --> This file is 23.9kB.
chartData/independent__charts__year-end__2006__independent-albums.p
  --> This file is 24.4kB.
chartData/independent__

  --> This file is 20.5kB.
chartData/jazz__charts__year-end__2007__contemporary-jazz-albums.p
  --> This file is 20.4kB.
chartData/jazz__charts__year-end__2006__contemporary-jazz-albums.p
  --> This file is 20.5kB.
chartData/jazz__charts__year-end__2019__contemporary-jazz-artists.p
  --> This file is 17.7kB.
chartData/jazz__charts__year-end__2018__contemporary-jazz-artists.p
  --> This file is 17.7kB.
chartData/jazz__charts__year-end__2017__contemporary-jazz-artists.p
  --> This file is 17.7kB.
chartData/jazz__charts__year-end__2016__contemporary-jazz-artists.p
  --> This file is 17.6kB.
chartData/jazz__charts__year-end__2015__contemporary-jazz-artists.p
  --> This file is 17.6kB.
chartData/jazz__charts__year-end__2014__contemporary-jazz-artists.p
  --> This file is 17.7kB.
chartData/jazz__charts__year-end__2013__contemporary-jazz-artists.p


In [124]:
# Instantiate billboardYECharts. The class is imported in a later cell
# (`from billboardYECharts import billboardYECharts`), so on a fresh kernel
# that import has to run first.
bYECharts = billboardYECharts()

['gospel', 'rhythmic', 'soundtracks', 'blues', 'new_age', 'pop', 'streaming_songs', 'reggae', 'independent', 'social', 'hot_100', 'americana__folk', 'songwriters__publishers', 'rap', 'international', 'rb', 'digital_song_sales', 'other_albums', 'billboard_200', 'bluegrass', 'overall', 'dance__electronic', 'rb__hip_hop', 'catalog', 'radio_songs', 'ratio', 'chartNames', 'chartRanks']


In [102]:
# Build {category: {chartName: True}} from the summary files, where the
# category is the base filename and chart names are gathered across all years.
categories = {}
for summaryFile in bYE.getSummaryFiles():
    category = getBaseFilename(summaryFile)
    chartNames = categories[category] = {}
    for yearCharts in getFile(summaryFile).values():
        # Only the chart names (keys) matter; the per-chart payload is ignored.
        chartNames.update(dict.fromkeys(yearCharts, True))

Found 36 summary files


In [108]:
# Print one `self.<category> = [...]` source line per category, ready to be
# pasted into a class body ("-" in category names becomes "_").
attributeTemplate = "        self.{0} = {1}\n"
for categoryName, chartNames in categories.items():
    attributeName = categoryName.replace("-", "_")
    print(attributeTemplate.format(attributeName, list(chartNames)))
    

        self.country = ['country-digital-songs', 'country-artists-duo-group', 'hot-country-songs-artists', 'top-country-albums', 'country-streaming-songs-artists', 'country-airplay-artists', 'country-digital-songs-artists', 'country-streaming-songs', 'top-country-artists', 'country-artists-male', 'hot-country-songs', 'country-artists-female', 'top-country-albums-artists', 'new-country-artists', 'country-airplay-songs']

        self.gospel = ['hot-gospel-songs', 'gospel-albums', 'hot-gospel-songs-artists', 'gospel-albums-artists', 'gospel-digital-songs', 'gospel-digital-songs-artists', 'gospel-streaming-songs', 'gospel-airplay-songs', 'top-gospel-new-artists', 'top-gospel-artists-female', 'gospel-airplay-artists', 'top-gospel-artists-duo-group', 'gospel-streaming-songs-artists', 'top-gospel-artists', 'top-gospel-artists-male']

        self.adult = ['adult-contemporary-artists', 'adult-pop-songs-artists', 'adult-pop-songs', 'adult-contemporary-songs']

        self.cast = ['cast-albums

In [77]:
from copy import deepcopy
results = deepcopy(bYE.chartData)

In [96]:
# Create a fresh billboardYE object and, instead of re-parsing, attach the
# chart data held in `results` before writing it out with saveChartData().
bYE = billboardYE()
#bYE.parse()
bYE.chartData = results
bYE.saveChartData()

Saving data to chartData/summary/jazz.p
  --> This file is 39.5kB.
Saved data to chartData/summary/jazz.p
  --> This file is 39.5kB.
Saving data to chartData/summary/rb--hip-hop.p
  --> This file is 72.6kB.
Saved data to chartData/summary/rb--hip-hop.p
  --> This file is 72.6kB.
Saving data to chartData/summary/latin.p
  --> This file is 159.0kB.
Saved data to chartData/summary/latin.p
  --> This file is 159.0kB.
Saving data to chartData/summary/christian.p
  --> This file is 60.8kB.
Saved data to chartData/summary/christian.p
  --> This file is 60.8kB.
Saving data to chartData/summary/rb.p
  --> This file is 47.7kB.
Saved data to chartData/summary/rb.p
  --> This file is 47.7kB.
Saving data to chartData/summary/rock.p
  --> This file is 121.9kB.
Saved data to chartData/summary/rock.p
  --> This file is 121.9kB.
Saving data to chartData/summary/other-albums.p
  --> This file is 37.7kB.
Saved data to chartData/summary/other-albums.p
  --> This file is 37.7kB.
Saving data to chartData/su

In [150]:
manDB   = masterArtistNameDB("main", init=False)

  Loading data from /Users/tgadfort/opt/anaconda3/envs/py37/musicnames/mainArtistNameDB.p
  There are currently 6664 artist keys.
  There are currently 7714 renamed artist keys.


In [184]:
# Reload the year-end classes, then build and save the full chart data and
# the artist/album data using the rename database in `manDB`.
%load_ext autoreload
%autoreload
from billboardYECharts import billboardYECharts
from billboardYE import billboardYE
# NOTE(review): minYear=1019 looks like a typo (1919?); as written it presumably
# acts as "no lower bound" — confirm the intended value.
bYE = billboardYE(minYear=1019,maxYear=2021)
bYE.setChartUsage(rank=[0])
bYE.setDBRenames(manDB)
bYE.setFullChartData()
bYE.setArtistAlbumData()
bYE.saveFullChartData()
bYE.saveArtistAlbumData()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
  Getting Chart For hot
  Using 5 Charts
  Using 5 Charts
  Using Charts (None): ['hot-100-songs', 'hot-100-artists-male', 'hot-100-artists', 'hot-100-artists-female', 'hot-100-artists-duo-group']
Found 36 summary files
Renamed 72 single artists
Saving 339 Full Artist Data
Saving data to currentBillboardYEFullChartArtistAlbumData.p
  --> This file is 13.8kB.
Saved data to currentBillboardYEFullChartArtistAlbumData.p
  --> This file is 13.8kB.
Saving 339 Artist Album Data to currentBillboardYEArtistAlbumData.p
Saving data to currentBillboardYEArtistAlbumData.p
  --> This file is 8.9kB.
Saved data to currentBillboardYEArtistAlbumData.p
  --> This file is 8.9kB.


Found 36 summary files
Saving 256 Full Artist Data
Saving data to currentBillboardYEFullChartArtistAlbumData.p
  --> This file is 5.1kB.
Saved data to currentBillboardYEFullChartArtistAlbumData.p
  --> This file is 5.1kB.
Saving 256 Artist Album Data to currentBillboardYEArtistAlbumData.p
Saving data to currentBillboardYEArtistAlbumData.p
  --> This file is 3.6kB.
Saved data to currentBillboardYEArtistAlbumData.p
  --> This file is 3.6kB.


In [73]:
fullChartData

{'Michael Buble': {'Songs': {"It's A Beautiful Day": {'adult-contemporary-songs': {'2013': 0}},
   "Haven't Met You Yet": {'adult-pop-songs': {'2010': 0},
    'adult-contemporary-songs': {'2010': 0}},
   'Save The Last Dance For Me': {'adult-contemporary-songs': {'2006': 0}},
   'Home': {'adult-contemporary-songs': {'2006': 0}},
   'Lost': {'adult-contemporary-songs': {'2008': 0}},
   'Everything': {'adult-contemporary-songs': {'2007': 0}},
   'Hold On': {'adult-contemporary-songs': {'2011': 0}}},
  'Albums': {'To Be Loved': {'jazz-albums': {'2013': 0, '2020': 0, '2014': 0},
    'top-canadian-albums': {'2013': 0}},
   'Love': {'jazz-albums': {'2019': 0, '2020': 0},
    'top-album-title-sales': {'2019': 0},
    'current-albums': {'2019': 0},
    'top-canadian-albums': {'2019': 0}},
   "It's Time": {'jazz-albums': {'2020': 0},
    'catalog-albums': {'2007': 0, '2010': 0, '2008': 0}},
   'Crazy Love': {'jazz-albums': {'2020': 0, '2010': 0, '2009': 0, '2011': 0},
    'current-albums': {'20

In [None]:

        
        
    
    def setFullChartData(self):
        """
        Fill self.chartData and self.fullChartData from the files in self.files.

        Every file is loaded with getFile() and is expected to map
        chartName -> {date -> {"Artist": ..., "Name": ...}}. Charts not listed
        in self.charts are skipped, as are dates outside [minYear, maxYear]
        (each bound is only applied when it is not None).

        Artist names are passed through self.dbRenames and then
        self.multirenameDB (each only when not None). Artists for which
        getArtistIgnores() returns False are skipped.

        Results:
            self.chartData[artist]      -> Counter of {name: number of entries}
            self.fullChartData[artist]  -> {"Songs"|"Albums": {name: {chartName: {date: 0}}}}

        Raises:
            ValueError: if self.files is empty.
        """
        # Local import so the method does not depend on Counter being imported elsewhere.
        from collections import Counter

        dbRenameStats     = 0
        multiRenameStats  = 0

        if len(self.files) == 0:
            raise ValueError("There are no files. Something is wrong...")

        minYear = None if self.minYear is None else int(self.minYear)
        maxYear = None if self.maxYear is None else int(self.maxYear)

        for ifile in self.files:
            fdata = getFile(ifile)
            for chartName, cnameResults in fdata.items():
                if chartName not in self.charts:
                    continue

                # Chart names elsewhere in this notebook are lower case
                # (e.g. 'jazz-albums'), so compare case-insensitively; a plain
                # endswith("Albums") would file every album chart under "Songs".
                key = "Albums" if chartName.lower().endswith("albums") else "Songs"

                for date, dResults in cnameResults.items():
                    # Parse the date once per row instead of once per bound.
                    year = getDateTime(date).year
                    if minYear is not None and year < minYear:
                        continue
                    if maxYear is not None and year > maxYear:
                        continue

                    artistName = dResults["Artist"]

                    ## Test for rename
                    renamedArtistName = artistName
                    if self.dbRenames is not None:
                        tmpName = self.dbRenames.renamed(renamedArtistName)
                        if tmpName != renamedArtistName:
                            dbRenameStats += 1
                        renamedArtistName = tmpName

                    ## Test for multi rename (applied on top of the single rename)
                    if self.multirenameDB is not None:
                        tmpName = self.multirenameDB.renamed(renamedArtistName)
                        if tmpName != renamedArtistName:
                            multiRenameStats += 1
                        renamedArtistName = tmpName

                    artist = renamedArtistName

                    # A status of exactly False marks an artist to skip.
                    if getArtistIgnores(artist) is False:
                        continue

                    album = dResults["Name"]

                    if self.chartData.get(artist) is None:
                        self.chartData[artist] = Counter()
                    self.chartData[artist][album] += 1

                    if self.fullChartData.get(artist) is None:
                        self.fullChartData[artist] = {"Songs": {}, "Albums": {}}
                    byName = self.fullChartData[artist][key]
                    if byName.get(album) is None:
                        byName[album] = {}
                    if byName[album].get(chartName) is None:
                        byName[album][chartName] = {}
                    byName[album][chartName][date] = 0

        print("Renamed {0} single artists".format(dbRenameStats))
        print("Renamed {0} multi artists".format(multiRenameStats))

In [80]:
bsdata = getHTML("chartData/adult__charts__year-end__2019__adult-contemporary-artists.p")

In [73]:
bsdat = getHTML("chartData/adult__charts__year-end__2020__adult-contemporary-artists.p")

In [18]:
# Collect every sub-navigation anchor whose href mentions "year-end".
yearends = [
    ref
    for ul in bsdata.findAll("ul", {"class": "site-header__subnav-items"})
    for li in ul.findAll("li", {"class": "site-header__subnav-item"})
    for ref in li.findAll("a")
    if "year-end" in ref.attrs['href']
]

In [None]:
# For each saved yearly index page, record the chart links it contains:
#   downloads[year][href] = [baseURL, dirname(href) minus its first character, basename(href)]
downloads = {}
baseURL = "https://www.billboard.com"
yearlyFiles = sorted(glob(join(basedir, "data", "billboard", "yearly", "*.p")))
for ifile in yearlyFiles:
    year = getBaseFilename(ifile)
    yearLinks = downloads.setdefault(year, {})
    fdata = getHTML(ifile)
    for ul in fdata.findAll("ul"):
        for li in ul.findAll("li", {"class": "chart-group__item"}):
            a = li.find('a')
            if a is None:
                # List item without a link: nothing to record.
                continue
            href = a['href']
            subdir = getDirname(href)[1:]
            yearLinks[href] = [baseURL, subdir, getBasename(href)]

In [None]:
downloads.keys()

# Download Category Data

In [None]:
# Download every chart page listed in `downloads` that has not been saved yet.
baseURL = "https://www.billboard.com"
savedir = join(basedir, "data", "billboard", "categories")

# Request headers are the same for every download, so build them once.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers = {'User-Agent': user_agent}

for indexYear, yearData in downloads.items():
    for href, hrefData in yearData.items():
        # Strip any leading "/" from href so the URL does not contain "//" after the host.
        url  = "{0}/{1}".format(baseURL, href.lstrip("/"))
        # The chart year is the last component of the href's directory part.
        year = getBasename(hrefData[1])
        category = hrefData[2]

        savename = join(savedir, "{0}-{1}.p".format(year, category))

        # Already downloaded on a previous run.
        if isFile(savename):
            continue

        print("  Trying to download and save {0}".format(savename))
        try:
            request = urllib.request.Request(url, None, headers)
            # Context manager guarantees the connection is closed.
            with urllib.request.urlopen(request) as response:
                data = response.read()
        except Exception as err:
            # Best effort: report the failure and move on to the next chart.
            # (Exception, not a bare except, so Ctrl-C still stops the loop.)
            print("  Could not download {0}: {1}".format(url, err))
            continue

        print("Saving {0}".format(savename))
        saveJoblib(data=data, filename=savename, compress=True)
        # Pause between requests.
        sleep(1)

# Parse Category Data

In [None]:
data = {}

baseURL = "https://www.billboard.com"

# Saved category files are named "<year>-<category>.p", so the first four
# characters of the base name are the year and everything after the dash
# (index 5 onwards) is the chart name. The directory is listed once and both
# the chart set and the year list are derived from that single listing.
names = [getBaseFilename(x) for x in sorted(glob(join(basedir, "data", "billboard", "categories", "*.p")))]
categories = set(x[5:] for x in names)
years = sorted(set(x[:4] for x in names))

# A chart is an album chart when its name ends in "albums"; everything else is a song chart.
albumCategories  = [x for x in categories if x.endswith("albums")]
songCategories   = [x for x in categories if not x.endswith("albums")]

print("There are {0} years".format(len(years)))
print("There are {0} charts".format(len(categories)))
print("There are {0} album charts".format(len(albumCategories)))
print("There are {0} song charts".format(len(songCategories)))

In [None]:
#categories

In [3]:
# Disabled helper: prints the page title of every 2019 category file.
if False:
    catdir = join(basedir, "data", "billboard", "categories")
    for ifile in findPatternExt(catdir, pattern='2019', ext='.p'):
        bsdata = getHTML(ifile)
        name   = bsdata.find("h2", {"class": "simple-page__title"})
        print(ifile,'\t',name)

In [None]:
def parseBillboardFile(ifile, year=None):
    """
    Parse one saved chart page into {date: {"Artist": ..., "Name": ...}}.

    Every <table> in the page is scanned; the first <tr> of each table is
    skipped. A row with three cells is read as (date, name, artist). A row
    with a single cell is read as a date only and reuses the name/artist of
    the most recent three-cell row.

    Args:
        ifile: path of the saved page, loaded with getHTML().
        year:  year appended to each row's date text before parsing. When
               omitted it is taken from the first four characters of the
               file's base name (category files are saved as
               "<year>-<category>.p").

    Returns:
        dict keyed by printDateTime(date) with {"Artist", "Name"} values.

    Raises:
        ValueError: if a row has an unexpected number of cells, a date-only
            row appears before any full row, a date cannot be parsed, or the
            same date appears twice.
    """
    if year is None:
        year = getBaseFilename(ifile)[:4]
    year = str(year)

    fdata = getHTML(ifile)
    data  = {}
    name = artist = None  # carried over to date-only rows
    for table in fdata.findAll('table'):
        for tr in table.findAll("tr")[1:]:
            vals = [x.text for x in tr.findAll('td')]

            if len(vals) == 3:
                rawDate, name, artist = vals
            elif len(vals) == 1:
                rawDate = vals[0]
                if artist is None:
                    raise ValueError("Date-only row {0} has no preceding entry".format(vals))
            else:
                raise ValueError("Unexpected row with {0} cells: {1}".format(len(vals), vals))

            dateText = ", ".join([rawDate, year])
            try:
                date = getDateTime(dateText)
            except Exception as err:
                # Report the offending row instead of aborting with a ZeroDivisionError.
                raise ValueError("Could not parse date '{0}' from row {1}".format(dateText, vals)) from err

            if not isDate(date):
                raise ValueError("Could not form date for {0}".format(date))

            date = printDateTime(date)
            if data.get(date) is not None:
                raise ValueError("Already seen this date!!!")
            data[date] = {"Artist": artist, "Name": name}

    return data

In [None]:
# Parse every category file of each year and save one result file per year,
# keyed by chart name (the file's base name with the leading "<year>-" removed).
catdir = join(basedir, "data", "billboard", "categories")
for year in years:
    data  = {}
    files = findPatternExt(catdir, pattern=year, ext='.p')
    for ifile in files:
        # Everything after the first dash; empty string if there is no dash.
        chart   = getBaseFilename(ifile).partition('-')[2]
        results = parseBillboardFile(ifile)
        data[chart] = results
        summary = "{0: <10}{1: <30}{2}".format(year, chart, len(results))
        print(summary)

    savedir = join(basedir, "data", "billboard", "results")
    savename = join(savedir, "{0}.p".format(year))
    print("Saving {0}".format(savename))
    saveJoblib(data=data, filename=savename, compress=True)

# Aggregate Charts

Found 62 files.
  Getting All Charts
  Using 200 Charts
  Using Charts (None): ['adult-contemporary', 'adult-pop-songs', 'ASI', 'ATF', 'alternative-albums', 'alternative-songs', 'MRT', 'ALT', 'BLU', 'BGR', 'FLK', 'christian-airplay', 'hot-christian-songs', 'christian-albums', 'christian-digital-song-sales', 'christian-songs', 'christian-streaming-songs', 'gospel-airplay', 'gospel-albums', 'gospel-digital-song-sales', 'gospel-songs', 'gospel-streaming-songs', 'CRI', 'CRT', 'ICO', 'ILL', 'SLL', 'GOS', 'GSI', 'GSS', 'CHS', 'GDT', 'country-airplay', 'country-albums', 'country-digital-song-sales', 'country-songs', 'country-streaming-songs', 'CSA', 'CSI', 'CST', 'CLP', 'CDT', 'dance-club-play-songs', 'dance-electronic-albums', 'dance-electronic-digital-song-sales', 'dance-electronic-songs', 'dance-electronic-streaming-songs', 'DAN', 'DAS', 'DDT', 'BSI', 'BST', 'DSI', 'ELP', 'hot-100', 'pop-songs', 'radio-songs', 'streaming-songs', 'rhythmic-40', 'heatseekers-albums', 'billboard-200', 'artist

NameError: name 'Counter' is not defined

****

# Chart Analysis

In [6]:
# Load the saved chart counter and display its keys in sorted order.
chartCounter = getFile("chartCounter.p")
keys = chartCounter.keys()
sorted(keys)
#chartCounter.most_common()

['ATS',
 'HSB',
 'HSI',
 'TFM',
 'TLN',
 'TLP',
 'TSL',
 'artist-100',
 'billboard-200',
 'heatseekers-albums',
 'hot-100',
 'pop-songs',
 'radio-songs',
 'rhythmic-40',
 'streaming-songs',
 'top-album-sales']

In [78]:

# Imported here so the cell runs on its own; the notebook's other visible
# import of Counter sits in a later cell.
from collections import Counter

# Per artist, count the total number of (chart, date) entries over all of the
# artist's songs and albums: each title maps chart -> {date: ...}, so the
# length of each chart's dict is the number of dated entries on that chart.
slimResults = Counter()
for artist, artistData in results.items():
    for key, keyData in artistData.items():
        for album, albumData in keyData.items():
            slimResults[artist] += sum(len(dates) for dates in albumData.values())

In [None]:
# Persist the per-artist counter and the full chart results.
saveFile(idata = slimResults, ifile="billboardCounter.p", debug=True)
saveFile(idata = results,     ifile="billboardResults.p", debug=True)

# Get Billboard Results

In [None]:
# Reload the files written by the save cell above.
slimResults = getFile(ifile="billboardCounter.p", debug=True)
fullResults = getFile(ifile="billboardResults.p", debug=True)

# Get Discogs

In [None]:
# Load the discogs artist table and the artist -> albums database.
disc = discogs()
discdf = disc.getMasterSlimArtistDiscogsDB()
artistIDToName = discdf["DiscArtist"].to_dict()
from masterdb import getArtistAlbumsDB, discConv
artistAlbumsDB = getArtistAlbumsDB(disc)

# Invert ID -> name into name -> [IDs]; several IDs can share one name.
artistNameToID = {}
print("Found {0} ID -> Name entries".format(len(artistIDToName)))
for artistID, artistName in artistIDToName.items():
    artistNameToID.setdefault(artistName, []).append(artistID)
print("Found {0} Name -> ID entries".format(len(artistNameToID)))
mulArts  = multiArtist(cutoff=0.9, discdata=artistNameToID, exact=False)

# Check Renames

In [None]:
# Load the saved rename tables and the artist map used by the matching cells below.
singleRenames = getFile(ifile="singleRenames.p", debug=True)
multiRenames  = getFile(ifile="multiRenames.p", debug=True)
knownArtists  = getFile(ifile="artistMap.p", debug=True)

# Near Renames Artists

In [None]:
from searchUtils import findNearest
cutoff = 0.9
artistsToGet = {}
# Walk the artists from most to least common, starting after position 4500, and
# suggest a manualRenames entry for every artist that has no DiscArtist data
# but has a near match (>= cutoff) among the known discogs names.
for i, (artist, cnt) in enumerate(slimResults.most_common()):
    if i <= 4500:
        continue
    if i % 250 == 0:
        print("==>",i,len(slimResults))

    # Apply the multi rename first, then the single rename (which maps to a discogs ID).
    multiName = multiRenames.get(artist)
    if multiName is not None:
        artist = multiName
    singleID = singleRenames.get(artist)
    if singleID is not None:
        idx    = str(singleID)
        artist = artistIDToName[idx]

    # Nothing to do for artists that are already mapped or manually renamed.
    if knownArtists.get(artist) is not None or manualRenames.get(artist) is not None:
        continue

    # Names that resolve to more than one artist are skipped here.
    matches = mulArts.getArtistNames(artist)
    if len(matches) > 1:
        continue

    mdata = getMusicData("DiscArtist", artist)
    if mdata is not None:
        continue

    results = findNearest(artist, artistNameToID.keys(), num=1, cutoff=cutoff)
    if len(results) > 0:
        artistsToGet[artist] = results
        print("manualRenames[\"{0}\"] = \"{1}\"".format(artist, results[0]))

In [None]:
findNearest

In [None]:
# Print one "keep[...]" line per artist collected in artistsToGet.
# NOTE(review): artistsToGet is filled from findNearest(..., num=1); if that call
# returns at most one element, nearest[1] below would raise IndexError — confirm.
for artistName, nearest in artistsToGet.items():
    line = "keep[\"{0}\"] = {1}  #{2}".format(artistName, nearest[0], nearest[1])
    print(line)



# Get Multi Results

In [None]:
from collections import Counter
cutoff = 0.8
multiMatchResult = {}
# For every artist whose DiscArtist lookup returns more than one row, pick the
# best discogs ID by comparing the artist's charted titles against the candidates.
for i, (artist, cnt) in enumerate(slimResults.most_common()):
    if i > 0 and i % 1000 == 0:
        print("Passed",i,'/',len(slimResults))

    # prevArtist keeps the chart spelling when a multi rename applies;
    # otherwise it is the (possibly single-renamed) artist name.
    multiName = multiRenames.get(artist)
    prevArtist = None
    if multiName is not None:
        prevArtist = artist
        artist = multiName
    singleID = singleRenames.get(artist)
    if singleID is not None:
        idx    = str(singleID)
        artist = artistIDToName[idx]
    if prevArtist is None:
        prevArtist = artist

    if knownArtists.get(prevArtist) is not None:
        continue
    manualName = manualRenames.get(prevArtist)
    if manualName is not None:
        prevArtist = manualName

    mdata = getMusicData("DiscArtist", prevArtist)
    # Only ambiguous lookups (a DataFrame with more than one row) are resolved here.
    if not isinstance(mdata, DataFrame) or mdata.shape[0] <= 1:
        continue

    matches = mdata["Name"].index
    charted = fullResults[prevArtist]
    artistAlbums = set(charted["Songs"].keys()) | set(charted["Albums"].keys())
    results = getBestArtistIDMatch(artist, artistAlbums, matches, N=3, cutoff=cutoff)
    score = results[2]
    if score is not None and score >= cutoff:
        print(i,"/",len(slimResults),'  \t',artist,results)
        # Store everything except the last element of the result.
        multiMatchResult[artist] = list(results[:-1])

In [None]:
len(multiMatchResult)

In [None]:
# Merge the multi-match results into knownArtists (sizes printed before and
# after) and write the updated map back to artistMap.p.
print(len(knownArtists))
knownArtists.update(multiMatchResult)
print(len(knownArtists))
saveFile(idata=knownArtists, ifile="artistMap.p", debug=True)

# Single Results

In [None]:
knownArtists["The Kingston Trio"]

In [None]:
cutoff = 0.60
singleResult = {}
# Walk the artists from least to most common and look for a best match for
# those that resolve to exactly one name without DataFrame music data.
# Stops once more than 1000 matches have been collected.
byAscendingCount = sorted(slimResults.items(), key=lambda pair: pair[1], reverse=False)
for i, (artist, cnt) in enumerate(byAscendingCount):
    # Progress line every 100 artists.
    if i > 0 and i % 100 == 0:
        print("Passed",i,'/',len(slimResults))
    if cnt < 1:
        continue

    # prevArtist keeps the chart spelling when a multi rename applies;
    # otherwise it is the (possibly single-renamed) artist name.
    multiName = multiRenames.get(artist)
    prevArtist = None
    if multiName is not None:
        prevArtist = artist
        artist = multiName
    singleID = singleRenames.get(artist)
    if singleID is not None:
        idx    = str(singleID)
        artist = artistIDToName[idx]
    if prevArtist is None:
        prevArtist = artist
    manualName = manualRenames.get(prevArtist)
    if manualName is not None:
        prevArtist = manualName

    if knownArtists.get(prevArtist) is not None or knownArtists.get(artist) is not None:
        continue

    # NOTE(review): this indexes fullResults by the renamed `artist`, whereas the
    # multi-result cells index by prevArtist — confirm the renamed key always exists.
    charted = fullResults[artist]
    artistAlbums = set(charted["Songs"].keys()) | set(charted["Albums"].keys())
    match = mulArts.getArtistNames(artist)
    if len(match) != 1:
        continue

    mdata = getMusicData("DiscArtist", list(match.keys())[0])
    if isinstance(mdata, DataFrame):
        continue

    results = getBestArtistMatch(artist, artistAlbums, N=3, cutoff=cutoff)
    score = results[2]
    if score is not None and score >= cutoff:
        print(i,"/",len(slimResults),'  \t',artist,'\t',results)
        singleResult[artist] = results
        if len(singleResult) > 1000:
            break

In [None]:
# Discard matches that should not be kept. pop(..., None) instead of del so
# the cell can be re-run (or run when a name was never matched) without KeyError.
for rejected in (
    "Murk vs. Kristine W",
    "Chris Cox Vs. Happy Clappers",
    "T.M.Revolution × Nana Mizuki",
    "Bob Marley Vs. Funkstar De Luxe",
):
    singleResult.pop(rejected, None)

In [None]:
# Keep only the first element of each single match, merge into knownArtists
# (sizes printed before and after) and save the map.
singleResults = {}
for matchedArtist, matchResult in singleResult.items():
    singleResults[matchedArtist] = matchResult[0]
print("Found {0} single results".format(len(singleResults)))
print("Found {0} all results".format(len(knownArtists)))
knownArtists.update(singleResults)
print("Found {0} all results".format(len(knownArtists)))
saveFile(idata=knownArtists, ifile="artistMap.p", debug=True)

## Known Results

In [None]:
from collections import Counter
cutoff = 0.8
multiMatchResult = {}
# Same ambiguity resolution as the multi-result cell, but without consulting manualRenames.
for i, (artist, cnt) in enumerate(slimResults.most_common()):
    if i > 0 and i % 1000 == 0:
        print("Passed",i,'/',len(slimResults))

    # prevArtist keeps the chart spelling when a multi rename applies;
    # otherwise it is the (possibly single-renamed) artist name.
    multiName = multiRenames.get(artist)
    prevArtist = None
    if multiName is not None:
        prevArtist = artist
        artist = multiName
    singleID = singleRenames.get(artist)
    if singleID is not None:
        idx    = str(singleID)
        artist = artistIDToName[idx]
    if prevArtist is None:
        prevArtist = artist

    if knownArtists.get(prevArtist) is not None:
        continue

    mdata = getMusicData("DiscArtist", prevArtist)
    # Only ambiguous lookups (a DataFrame with more than one row) are resolved here.
    if not isinstance(mdata, DataFrame) or mdata.shape[0] <= 1:
        continue

    matches = mdata["Name"].index
    charted = fullResults[prevArtist]
    artistAlbums = set(charted["Songs"].keys()) | set(charted["Albums"].keys())
    results = getBestArtistIDMatch(artist, artistAlbums, matches, N=3, cutoff=cutoff)
    score = results[2]
    if score is not None and score >= cutoff:
        print(i,"/",len(slimResults),'  \t',artist,results)
        # Store everything except the last element of the result.
        multiMatchResult[artist] = list(results[:-1])

In [None]:
# Keep only the first element of each multi match and merge into knownArtists
# (sizes printed before and after).
singleResults = {}
for matchedArtist, matchResult in multiMatchResult.items():
    singleResults[matchedArtist] = matchResult[0]
print("Found {0} single results".format(len(singleResults)))
print("Found {0} all results".format(len(knownArtists)))
knownArtists.update(singleResults)
print("Found {0} all results".format(len(knownArtists)))

In [None]:
# NOTE(review): knownArtists was loaded from (and elsewhere saved to) artistMap.p,
# but here it is written to singleRenames.p — confirm this target file is intended.
saveFile(idata=knownArtists, ifile="singleRenames.p")

# Check For Misnames

In [None]:
ma = multiartist(cutoff=0.9, discdata=discdata)

In [None]:
# Spot checks for multiartist.getArtistNames: each entry is
# (artist string, extra keyword arguments, expected {name: [ids]} result).
# One True/False line is printed per check.
artistNameChecks = [
    ('Lipps, Inc.', {},
     {'Lipps, Inc.': ['159617', '26641', '101850']}),
    ('Bob Wills & His Texas Playboys', {},
     {'Bob Wills & His Texas Playboys': ['670000','786114','1134146','804668','804679','1004309','935875','935907']}),
    ('Brad Paisley Duet With Carrie Underwood', {'debug': False},
     {'Carrie Underwood': ['1011680'], 'Brad Paisley': ['313755']}),
    ('Ray Charles With Willie Nelson', {'debug': False},
     {'Willie Nelson': ['249449'], 'Ray Charles': ['521963', '30552']}),
]
for artistString, extraArgs, expected in artistNameChecks:
    print(ma.getArtistNames(artistString, **extraArgs) == expected)

# Album Data

In [None]:
# Load the saved album data.
# NOTE(review): `savedir` is assigned in the "Show Data" cell further down and in the
# per-year parse loop earlier, to different directories — confirm which value this cell relies on.
savename = join(savedir, "billboard.p")
albumData = getFile(join(savedir, "billboard-album.p"))
#chartdata = getFile(savename, debug=True)

In [None]:
# Print every artist followed by an indented list of that artist's albums.
for artist in albumData:
    print(artist)
    for album in albumData[artist].keys():
        print("\t",album)

# Show Data

In [None]:
togets = {}
combos = []

savedir  = join(basedir, "results")
savename = join(savedir, "billboard.p")
chartdata = getFile(savename, debug=True)
chartArtists = sorted(chartdata.keys(), reverse=True)

# Merge the discogs search history into the local one and write it back.
# Context managers make sure each JSON file is closed (and the written one
# flushed) before the next step.
import json
with open("prevs.json", "r") as prevsFile:
    prevs = json.load(prevsFile)
with open("../discogs/prevs.json", "r") as artPrevFile:
    artPrev = json.load(artPrevFile)
prevs.update(artPrev)
with open("prevs.json", "w") as prevsFile:
    json.dump(prevs, prevsFile)
print("There are {0} artists previously searched.".format(len(prevs)))

ma = multiartist(cutoff=0.9, discdata=discdata, exact=False)

# Visit the chart artists in random order and collect names that still need a
# discogs search; stop and print the search calls once more than 50 are collected.
from random import shuffle
print("There are {0} chart artists".format(len(chartArtists)))
shuffle(chartArtists)
for i,artist in enumerate(chartArtists):
    if prevs.get(artist) is not None:
        continue
    retval = ma.getArtistNames(artist)
    if retval is None:
        togets[artist] = artist
    else:
        for name,value in retval.items():
            if prevs.get(name) is not None:
                continue
            # A None value marks a component name that still needs a search.
            if value is None:
                togets[name.upper()] = artist
                print("{0: <3}{1: <40}{2: <60}{3}".format(len(togets),name,artist,retval))
                print('\t---->',i,'/',len(chartArtists))

    if len(togets) > 50:
        print("\n\n\n\n")
        for name,artist in togets.items():
            print("art.searchDiscogForArtist(\"{0}\")  ## {1}".format(name, artist))
        break

###### artists = [x.strip() for x in vals.split("\n")]
# NOTE(review): `artists` is not assigned anywhere in this cell (the line that
# would build it is commented out), so this presumably relies on a value left
# over from an earlier run — confirm.
artists = [x for x in artists if len(x) > 0]

In [None]:
Pitbull Featuring Ne-Yo, Afrojack & Nayer
Drake Featuring Kanye West, Lil Wayne & Eminem

In [None]:
def testName(name, discdata):
    if discdata.get(name) is not None:
        return discdata[name]
    return name

def findName(name, discArtists, threshold):
    """
    Find the nearest entry to `name` among `discArtists`.

    Calls findNearest(name, discArtists, 1, threshold). Returns its result
    when it is non-empty, and `name` itself when nothing was found.
    """
    nearest = findNearest(name, discArtists, 1, threshold)
    return nearest if len(nearest) != 0 else name
    

def splitArtist(name):
    names = [name]
    names = [[x.strip() for x in y.split(" Featuring ")]
results = {}
    
for name in artists:
    if " & " in name:
        retvals = [testName(x, discdata) for x in 
        retvals = [findName(x, discArtists, 0.95) if not isinstance(x, list) else x for x in retvals]
        results[name] = retvals        

In [None]:
# Build one search call per result entry that is not a list, then print the
# distinct calls.
togets = [
    "art.searchDiscogForArtist(\"{0}\")".format(val)
    for v in results.values()
    for val in v
    if not isinstance(val, list)
]

for toget in set(togets):
    print(toget)

In [None]:
tmp = [['5070865', '90037', '4135543', '1507065'], ['Katy XXX Perry', ['1201210']]]

In [None]:
tmp

In [None]:
def _stripNumericSuffix(name):
    """
    Return `name` without a trailing "(<number>)" suffix.

    Only names ending in ")" are considered. The opening parenthesis is looked
    for 3, 4 and 5 characters from the end (in that order); the first position
    whose enclosed text converts with int() wins, and the text before it is
    returned stripped of surrounding whitespace. Any other name is returned
    unchanged.
    """
    if not name.endswith(")"):
        return name
    for offset in (-3, -4, -5):
        if abs(offset) > len(name):
            continue
        if name[offset] != "(":
            continue
        try:
            int(name[(offset + 1):-1])
        except ValueError:
            # Parenthesised text is not a number; try the next position.
            continue
        return name[:offset].strip()
    return name


# Group IDs and refs by artist name, with any numeric "(N)" suffix removed.
# Dicts are used as ordered sets so duplicates collapse and first-seen order is kept.
nameids = {}
namerefs = {}
for name, nameid in iddata.items():
    artist = _stripNumericSuffix(name)
    ref = refdata[name]
    nameids.setdefault(artist, {})[nameid] = 1
    namerefs.setdefault(artist, {})[ref] = 1

# Flatten the ordered sets into plain lists.
nameids = {k: list(v.keys()) for k,v in nameids.items()}
namerefs = {k: list(v.keys()) for k,v in namerefs.items()}