# Billboard Functions

In [14]:
## Basic stuff
%load_ext autoreload
%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))

## Python Version
import sys
from glob import glob
from os import getcwd
from os.path import join
from fileUtils import getBasename, getDirname, getBaseFilename
from fsUtils import isFile, isDir, moveFile, removeFile
from webUtils import getHTML, getWebData
from timeUtils import getDateTime, isDate, printDateTime
from ioUtils import saveJoblib, loadJoblib, saveFile, getFile
import urllib
from searchUtils import findNearest
from collections import Counter



from discogsBase import discogs
from multiArtist import multiArtist
# Record which interpreter produced the outputs stored in this notebook.
print("Python: " + sys.version)

# Timestamp the start of this run so stale outputs are easy to recognise.
import datetime as dt
start = dt.datetime.now()
print("Notebook Last Run Initiated: {0}".format(start))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Python: 3.7.3 (default, Mar 27 2019, 16:54:48) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
Notebook Last Run Initiated: 2020-01-21 19:30:36.438020


# Global Vars

In [2]:
# Root directory: the current working directory at the time this cell runs.
basedir = getcwd()
# Default results folder under the root. Note that several later cells
# reassign `savedir` to other folders.
savedir  = join(basedir, "results")

In [3]:
from pandas import DataFrame
def getMusicData(key, artist):
    """Return the rows of the global ``discdf`` whose ``key`` column equals ``artist``.

    Args:
        key: Column label of ``discdf`` to compare against.
        artist: Value to look for in that column.

    Returns:
        The filtered frame when at least one row matches, otherwise ``None``.
    """
    matches = discdf[discdf[key] == artist]
    return matches if matches.shape[0] > 0 else None
    
from difflib import SequenceMatcher

def getRowByIndex(pdf, idx):
    """Look up the entry of ``pdf`` stored under index label ``idx`` via ``.loc``."""
    row = pdf.loc[idx]
    return row

In [13]:
from searchUtils import findNearest
from collections import Counter
def _scoreAlbumOverlap(myAlbumNames, artistAlbums, cutoff):
    """Score how well my album names match one database artist's albums.

    For every name in ``myAlbumNames`` the best ``SequenceMatcher`` ratio against
    all album names in ``artistAlbums`` (a mapping of album type -> {album ID:
    album name}) is taken; best ratios strictly above ``cutoff`` are summed.
    """
    score = 0
    for myAlbumName in myAlbumNames:
        maxRatio = 0
        for albumTypeData in artistAlbums.values():
            for dbAlbumName in albumTypeData.values():
                ratio = SequenceMatcher(None, myAlbumName, dbAlbumName).ratio()
                if ratio > maxRatio:
                    maxRatio = ratio
        if maxRatio > cutoff:
            score += maxRatio
    return score


def _selectBestMatch(idxResults, matchValue):
    """Return ``(idx, name, rounded score)`` of the top-scoring candidate.

    Ties go to the candidate inserted first. With no candidates the result is
    ``(None, None, None)``.
    """
    if not idxResults:
        return (None, None, None)
    idx, score = max(idxResults.items(), key=lambda item: item[1])
    return (idx, matchValue[idx], round(score, 2))


def getBestArtistIDMatch(artistName, artistResults, possibleMatches, N=3, cutoff=0.7, debug=False):
    """Pick, among candidate artist IDs, the one whose albums best match mine.

    Args:
        artistName: Name reported back for whichever candidate wins.
        artistResults: Iterable of my album/song names for this artist.
        possibleMatches: Iterable of index labels into the global ``artistAlbumsDB``.
        N: Unused here; kept so the signature mirrors ``getBestArtistMatch``.
        cutoff: Minimum (exclusive) per-title similarity for a title to count.
        debug: When true, print each candidate's score.

    Returns:
        ``(idx, artistName, score)`` for the best candidate, with the score
        rounded to two decimals, or ``(None, None, None)`` without candidates.
    """
    myAlbumNames = list(artistResults)
    idxResults = {}
    matchValue = {}
    for idx in possibleMatches:
        matchValue[idx] = artistName
        artistAlbums = getRowByIndex(artistAlbumsDB, idx)["Albums"]
        idxResults[idx] = _scoreAlbumOverlap(myAlbumNames, artistAlbums, cutoff)

        if debug:
            print("\t{0: <5}{1: <15}{2}".format(idxResults[idx], idx, matchValue[idx]))

    return _selectBestMatch(idxResults, matchValue)



def getBestArtistMatch(artistName, artistResults, N=3, cutoff=0.7, debug=False):
    """Find the database artist that best matches a name and its album list.

    Candidate names come from ``findNearest(artistName, artistNameToID.keys(), N, cutoff)``;
    every ID registered under each candidate name is scored by album overlap.

    Args:
        artistName: Artist name to search for.
        artistResults: Iterable of my album/song names for this artist.
        N: Passed to ``findNearest`` as its third positional argument.
        cutoff: Passed to ``findNearest`` and also used as the minimum
            (exclusive) per-title similarity for a title to count.
        debug: When true, print the searched name and each candidate's score.

    Returns:
        ``(idx, matched name, score)`` for the best candidate, with the score
        rounded to two decimals, or ``(None, None, None)`` without candidates.
    """
    myAlbumNames = list(artistResults)
    if debug:
        print(artistName)

    idxResults = {}
    matchValue = {}

    for artist in findNearest(artistName, artistNameToID.keys(), N, cutoff):
        for idx in artistNameToID[artist]:
            matchValue[idx] = artist
            artistAlbums = getRowByIndex(artistAlbumsDB, idx)["Albums"]
            idxResults[idx] = _scoreAlbumOverlap(myAlbumNames, artistAlbums, cutoff)

            if debug:
                print("\t{0: <5}{1: <15}{2}".format(idxResults[idx], idx, matchValue[idx]))

    return _selectBestMatch(idxResults, matchValue)

# Starter Class Section

In [None]:

# Locate the saved Billboard landing page. Fail with a clear error when it is
# missing instead of printing a message and then crashing on an undefined name.
starterFiles = glob(join(basedir, "data", "billboard", "starter.html"))
if not starterFiles:
    raise FileNotFoundError("Could not find starter HTML file!")
filename = starterFiles[0]
fdata = getHTML(filename)

In [None]:
# Map each chart's link text to [site root, directory part of its href with the
# leading character dropped, last path component of its href].
chartData = {}
baseURL = "https://www.billboard.com"
for ul in fdata.findAll("ul"):
    for li in ul.findAll("li", {"class": "chart-group__item"}):
        a = li.find('a')
        if a is None:
            # List item without a link: nothing to record.
            continue
        href = a['href']
        chartData[a.text] = [baseURL, getDirname(href)[1:], getBasename(href)]

# Download Yearly Information

In [None]:
from time import sleep
# `import urllib` alone does not guarantee that the `request` submodule is loaded.
import urllib.request

# NOTE: the full archive run used [str(x) for x in range(1958, 2014)]; this
# cell currently fetches 2019 only.
years = ['2019']

# Request settings and output folder do not change between years.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers = {'User-Agent': user_agent}
savedir = join(basedir, "data", "billboard", "yearly")

for year in years:
    savename = join(savedir, "{0}.p".format(year))
    # Skip years that already have a saved page.
    if isFile(savename):
        continue

    url = "https://www.billboard.com/archive/charts/{0}".format(year)
    request = urllib.request.Request(url, None, headers)
    # The context manager closes the HTTP response once the body has been read.
    with urllib.request.urlopen(request) as response:
        data = response.read()

    print("Saving {0}".format(savename))
    saveJoblib(data=data, filename=savename, compress=True)
    # Pause between requests.
    sleep(1)

# Parse Yearly Information

In [None]:
# For every saved yearly archive page, collect its chart links as
# downloads[year][href] = [site root, href directory (leading character
# dropped), last path component of the href].
downloads = {}
baseURL = "https://www.billboard.com"
for ifile in sorted(glob(join(basedir, "data", "billboard", "yearly", "*.p"))):
    # Yearly files are saved as "<year>.p", so the base file name is the year.
    year = getBaseFilename(ifile)
    yearLinks = downloads.setdefault(year, {})
    fdata = getHTML(ifile)
    for ul in fdata.findAll("ul"):
        for li in ul.findAll("li", {"class": "chart-group__item"}):
            a = li.find('a')
            if a is None:
                continue
            href = a['href']
            yearLinks[href] = [baseURL, getDirname(href)[1:], getBasename(href)]

# Download Category Data

In [None]:
# Imported here so the cell does not depend on an earlier cell having run.
from time import sleep
from urllib.parse import urljoin
import urllib.request

baseURL = "https://www.billboard.com"

# Request settings and output folder do not change between downloads.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers = {'User-Agent': user_agent}
savedir = join(basedir, "data", "billboard", "categories")

for yearData in downloads.values():
    for href, hrefData in yearData.items():
        # urljoin avoids the doubled slash that "{0}/{1}".format(...) produces
        # when href already starts with "/" (presumably the case — confirm).
        url = urljoin(baseURL, href)
        year = getBasename(hrefData[1])
        category = hrefData[2]

        savename = join(savedir, "{0}-{1}.p".format(year, category))

        # Skip charts that already have a saved page.
        if isFile(savename):
            continue

        print("  Trying to download and save {0}".format(savename))
        try:
            request = urllib.request.Request(url, None, headers)
            # The context manager closes the HTTP response after reading.
            with urllib.request.urlopen(request) as response:
                data = response.read()
        except Exception as err:
            # Best effort: report the failure and move on to the next chart.
            # Unlike a bare `except`, this lets KeyboardInterrupt stop the loop.
            print("  Skipping {0}: {1}".format(url, err))
            continue

        print("Saving {0}".format(savename))
        saveJoblib(data=data, filename=savename, compress=True)
        # Pause between requests.
        sleep(1)

# Parse Category Data

In [None]:
data = {}

baseURL = "https://www.billboard.com"

# Category files are saved as "<year>-<chart>.p": the first four characters of
# the base name are the year and everything after the dash is the chart name.
# The directory is scanned once and both views are derived from the same list.
names = [getBaseFilename(x) for x in sorted(glob(join(basedir, "data", "billboard", "categories", "*.p")))]
categories = {x[5:] for x in names}
years = sorted({x[:4] for x in names})

# Charts whose name ends in "albums" are album charts; the rest are song charts.
albumCategories = [x for x in categories if x.endswith("albums")]
songCategories = [x for x in categories if not x.endswith("albums")]

print("There are {0} years".format(len(years)))
print("There are {0} charts".format(len(categories)))
print("There are {0} album charts".format(len(albumCategories)))
print("There are {0} song charts".format(len(songCategories)))

In [None]:
#categories

In [None]:
from searchUtils import findPatternExt

In [None]:
def parseBillboardFile(ifile, year=None):
    """Parse one saved Billboard chart page into ``{date: {"Artist": ..., "Name": ...}}``.

    Every table row after the first is read. A row with three cells is taken
    as (date, name, artist); a row with a single cell is a date that reuses
    the name and artist of the preceding full row.

    Args:
        ifile: Path of the saved chart page, handed to ``getHTML``.
        year: Year appended to each row's date text before it is parsed. When
            omitted it is taken from the first four characters of the file's
            base name (category files are saved as ``<year>-<chart>.p``), so
            the function no longer depends on a global ``year``.

    Returns:
        dict keyed by the ``printDateTime`` rendering of each row's date.

    Raises:
        ValueError: If a row has an unexpected number of cells, a date-only
            row precedes any full row, a date cannot be parsed, or the same
            date occurs twice.
    """
    if year is None:
        year = getBaseFilename(ifile)[:4]
    year = str(year)

    fdata = getHTML(ifile)
    data  = {}
    # Carried across rows so that date-only rows can reuse the last full row.
    artist = None
    name   = None
    for table in fdata.findAll('table'):
        # The first row is skipped, as in the original (presumably a header row).
        for tr in table.findAll("tr")[1:]:
            vals = [td.text for td in tr.findAll('td')]

            if len(vals) == 3:
                dateText, name, artist = vals
            elif len(vals) == 1:
                dateText = vals[0]
            else:
                raise ValueError("Unexpected row layout in {0}: {1}".format(ifile, vals))

            if artist is None:
                raise ValueError("Date-only row before any full row in {0}: {1}".format(ifile, vals))

            dateString = ", ".join([dateText, year])
            try:
                date = getDateTime(dateString)
            except Exception as err:
                # Report the offending row instead of forcing a ZeroDivisionError.
                raise ValueError("Could not parse date '{0}' from row {1}".format(dateString, vals)) from err

            if not isDate(date):
                raise ValueError("Could not form date for {0}".format(date))

            date = printDateTime(date)
            if date in data:
                raise ValueError("Already seen this date!!!")
            data[date] = {"Artist": artist, "Name": name}

    return data

In [None]:
# Parse every saved category page, one output file per year.
catdir = join(basedir, "data", "billboard", "categories")
for year in years:
    # {chart name: parsed chart} for the current year.
    data  = {}
    files = findPatternExt(catdir, pattern=year, ext='.p')
    for ifile in files:
        # Base names look like "<year>-<chart>"; drop the leading year part.
        nameParts = getBaseFilename(ifile).split('-')
        chart     = "-".join(nameParts[1:])
        results   = parseBillboardFile(ifile)
        data[chart] = results
        print("{0: <10}{1: <30}{2}".format(year, chart, len(results)))

    savedir  = join(basedir, "results", "billboard")
    savename = join(savedir, "{0}.p".format(year))
    print("Saving {0}".format(savename))
    saveJoblib(data=data, filename=savename, compress=True)

# Aggregate Charts

In [None]:
# Build results[artist]["Songs" | "Albums"][title][chart][date] = 0 from the
# per-year files written by the parsing step.
results = {}
from searchUtils import findExt
savedir = join(basedir, "results", "billboard")
files = sorted(findExt(savedir, ".p"))
for ifile in files:
    print(ifile)
    fdata = getFile(ifile)
    for cname, cnameResults in fdata.items():
        print('\t',cname)
        for date, dResults in cnameResults.items():
            artist = dResults["Artist"]
            # Normalise the artist name: multi-artist rename first, then the
            # ID-based rename, then the manual overrides.
            if multiRenames.get(artist) is not None:
                prevArtist = artist
                artist = multiRenames[artist]
            if singleRenames.get(artist) is not None:
                idx    = str(singleRenames[artist])
                artist = artistIDToName[idx]
            if manualRenames.get(artist) is not None:
                artist = manualRenames[artist]
            album = dResults["Name"]
            # Charts whose name ends in "albums" are filed under "Albums".
            key = "Albums" if cname.endswith("albums") else "Songs"
            artistEntry = results.setdefault(artist, {"Songs": {}, "Albums": {}})
            chartDates  = artistEntry[key].setdefault(album, {}).setdefault(cname, {})
            chartDates[date] = 0

In [None]:
from collections import Counter
# Per artist, count every (title, chart, date) entry in the aggregated results.
slimResults = Counter()
for artist, artistData in results.items():
    for key, keyData in artistData.items():
        for album, albumData in keyData.items():
            nEntries = sum(len(chartDates) for chartDates in albumData.values())
            slimResults[artist] += nEntries

In [None]:
# Persist the per-artist counts and the full nested chart results under
# relative file names; the "Get Billboard Results" cell reloads both with getFile.
saveFile(idata = slimResults, ifile="billboardCounter.p", debug=True)
saveFile(idata = results,     ifile="billboardResults.p", debug=True)

# Get Billboard Results

In [5]:
# Reload the saved aggregation so the matching sections below can run without
# re-parsing the charts. `fullResults` is the nested per-artist structure that
# the aggregation cell saved from `results`.
slimResults = getFile(ifile="billboardCounter.p", debug=True)
fullResults = getFile(ifile="billboardResults.p", debug=True)

Loading data from billboardCounter.p
  --> This file is 96.0kB.
Loading billboardCounter.p
Loading data from billboardResults.p
  --> This file is 1.2MB.
Loading billboardResults.p


# Get Discogs

In [6]:
# Load the Discogs artist table and the per-artist album database.
disc = discogs()
discdf = disc.getMasterSlimArtistDiscogsDB()
artistIDToName = discdf["DiscArtist"].to_dict()
from masterdb import getArtistAlbumsDB, discConv
artistAlbumsDB = getArtistAlbumsDB(disc)

print("Found {0} ID -> Name entries".format(len(artistIDToName)))
# Invert the ID -> name map. Several IDs can share one name, so every name
# keeps the list of IDs registered under it.
artistNameToID = {}
for artistID, artistName in artistIDToName.items():
    artistNameToID.setdefault(artistName, []).append(artistID)
print("Found {0} Name -> ID entries".format(len(artistNameToID)))
mulArts  = multiArtist(cutoff=0.9, discdata=artistNameToID, exact=False)

Saved Discog Directory /Volumes/Music/Discog is Available
Local Discog Directory /Users/tgadfort/Music/Discog is Available
/Volumes/Music/Discog/collections exists
/Volumes/Music/Discog/artists exists
/Volumes/Music/Discog/albums exists
/Volumes/Music/Discog/collections-db exists
/Volumes/Music/Discog/artists-db exists
/Volumes/Music/Discog/albums-db exists
/Volumes/Music/Discog/artists-db/metadata exists
/Volumes/Music/Discog/albums-db/metadata exists
/Volumes/Music/Discog/diagnostic exists
/Volumes/Music/Discog/db exists
Loading data from /Volumes/Music/Discog/db/MasterSlimArtistDB.p
  --> This file is 53.5MB.
Loading /Volumes/Music/Discog/db/MasterSlimArtistDB.p
Current Time is Tue Jan 21, 2020 19:24:37 for 
Loading ArtistID Data
Loading data from /Volumes/Music/Discog/db/ArtistIDToAlbumNames.p
  --> This file is 180.4MB.
Loading /Volumes/Music/Discog/db/ArtistIDToAlbumNames.p
Creating Pandas DataFrame for 669087 Artists
	Shape --> (669087, 1)
DataFrame Shape is (669087, 1)
Current 

# Check Renames

In [7]:
# Saved lookup tables used when normalising artist names:
#   singleRenames: name -> value that is stringified and looked up in artistIDToName
#   multiRenames:  name -> replacement name, applied before singleRenames
#   knownArtists:  artists that already have a match and are skipped by the search loops
singleRenames = getFile(ifile="singleRenames.p", debug=True)
multiRenames  = getFile(ifile="multiRenames.p", debug=True)
knownArtists  = getFile(ifile="artistMap.p", debug=True)

Loading data from singleRenames.p
  --> This file is 107.3kB.
Loading singleRenames.p
Loading data from multiRenames.p
  --> This file is 622B.
Loading multiRenames.p
Loading data from artistMap.p
  --> This file is 158.8kB.
Loading artistMap.p


In [8]:
# Hand-maintained overrides: artist name as it appears in the chart data ->
# replacement name, which the matching cells then look up in the "DiscArtist"
# column. The `manualRenames["..."] = "..."` lines have the same format that
# the "Near Renames Artists" cell prints for its nearest-name suggestions.
# NOTE(review): several targets read like a different artist rather than a
# respelling (e.g. "Chase Rice" -> "Casey Rice", "Jon Batiste" -> "John Baptiste",
# "Roy Woods" -> "Roy Wood", "Sisters Of Mercy" -> "Sisters Of No Mercy") —
# presumably unreviewed nearest-name suggestions; confirm before relying on them.
manualRenames = {}
manualRenames["Michael Buble"] = "Michael Bublé"
manualRenames["Celine Dion"] = "Céline Dion"
manualRenames["Los Tigres del Norte"] = "Los Tigres Del Norte"
manualRenames["Marco Antonio Solis"] = "Marco Antonio Solís"
manualRenames["Banda El Recodo de Cruz Lizarraga"] = "Banda El Recodo De Crúz Lizárraga"
manualRenames["Alejandro Fernandez"] = "Alejandro Fernández"
manualRenames["Juan Luis Guerra 440"] = "Juan Luis Guerra 4.40"
manualRenames["Banda Sinaloense MS de Sergio Lizarraga"] = "Banda Sinaloense MS De Sergio Lizárraga"
manualRenames["J Balvin"] = "J. Balvin"
manualRenames["Los Tucanes de Tijuana"] = "Los Tucanes De Tijuana"
manualRenames["M.C. Hammer"] = "MC Hammer"
manualRenames["Ke$ha"] = "Ke$$ha"  # NOTE(review): doubled "$" in the target — confirm it is intended
manualRenames["Grupo Montez de Durango"] = "Grupo Montez De Durango"
manualRenames["Jose Feliciano"] = "José Feliciano"
manualRenames["Jennifer Pena"] = "Jennifer Peña"
manualRenames["Louie DeVito"] = "Louie Devito"
manualRenames["John Mellencamp"] = "John Mellencamp*"
manualRenames["Michel Telo"] = "Michel Teló"
manualRenames["La Adictiva Banda San Jose de Mesillas"] = "La Adictiva Banda San José De Mesillas"
manualRenames["VaShawn Mitchell"] = "Vashawn Mitchell"
manualRenames["Los Huracanes del Norte"] = "Los Huracanes Del Norte"
manualRenames["Daryl Hall John Oates"] = "Daryl Hall & John Oates"
manualRenames["A$AP Rocky"] = "ASAP Rocky"
manualRenames["Los Rieleros del Norte"] = "Los Rieleros Del Norte"
manualRenames["Gente de Zona"] = "Gente De Zona"
manualRenames["Los Horoscopos de Durango"] = "Los Horóscopos de Durango"
manualRenames["Jeff Lorber Fusion"] = "The Jeff Lorber Fusion"
manualRenames["Grupo Mania"] = "Grupo Manía"
manualRenames["Voz de Mando"] = "Voz De Mando"
manualRenames["La Maquinaria Nortena"] = "La Maquinaria Norteña"
manualRenames["Wang Qing"] = "Wang Qiang"
manualRenames["A Boogie Wit da Hoodie"] = "A Boogie Wit Da Hoodie"
manualRenames["Los Yonic's"] = "Los Yonics"
manualRenames["Sinead O'Connor"] = "Sinéad O'Connor"
manualRenames["VeggieTales"] = "Veggietales"
manualRenames["ScHoolboy Q"] = "Schoolboy Q"
manualRenames["Sergio Mendes"] = "Sérgio Mendes"
manualRenames["The B-52s"] = "The B-52's"
manualRenames["Franco de Vita"] = "Franco De Vita"
manualRenames["Ultra Nate"] = "Ultra Naté"
manualRenames["Alejandra Guzman"] = "Alejandra Guzmán"
manualRenames["Dareyes de La Sierra"] = "Dareyes De La Sierra"

manualRenames["Ray Charles and his Orchestra"] = "Ray Charles And His Orchestra"
manualRenames["Angelique Kidjo"] = "Angélique Kidjo"
manualRenames["Alegres de La Sierra"] = "Alegres De La Sierra"
manualRenames["Ray LaMontagne"] = "Ray Lamontagne"
manualRenames["La Original Banda el Limon de Salvador Lizarraga"] = "La Original Banda Limon De Salvador Lizarraga"
manualRenames["Mac DeMarco"] = "Mac Demarco"
manualRenames["N.W.A"] = "N.W.A."
manualRenames["El Trono de Mexico"] = "El Trono De Mexico"

manualRenames["The Chi-lites"] = "The Chi-Lites"
manualRenames["Janie Frickie"] = "Janie Fricke"
manualRenames["CeCe Peniston"] = "Ce Ce Peniston"
manualRenames["J Moss"] = "J. Moss"
manualRenames["Los Pikadientes de Caborca"] = "Los Pikadientes De Caborca"
manualRenames["Chaz Jankel"] = "Chas Jankel"

manualRenames["David Crowder Band"] = "David Crowder*Band"
manualRenames['Hector "El Father"'] = "Hector El Father"
manualRenames["Pedro Fernandez"] = "Pedro Fernández"
manualRenames["La Septima Banda"] = "La Séptima Banda"
manualRenames["Adan Chalino Sanchez"] = "Adan Chalino Sánchez"

manualRenames["LuHan"] = "Lu Han"
manualRenames["Paul and Paula"] = "Paul Und Paula"
manualRenames["Sisters Of Mercy"] = "Sisters Of No Mercy"
manualRenames["Bell Biv DeVoe"] = "Bell Biv Devoe"
manualRenames["Kid 'N Play"] = "Kid 'N' Play"
manualRenames["SoulDecision"] = "soulDecision"
manualRenames['Hector Acosta "El Torito"'] = 'Héctor Acosta "El Torito"'
manualRenames["El Chaval de la Bachata"] = "El Chaval De La Bachata"

manualRenames["Sixx: A.M."] = "Sixx:A.M."
manualRenames["Andre Rieu"] = "André Rieu"
manualRenames["Kany Garcia"] = "Kany García"
manualRenames["Lloyd Price and His Orchestra"] = "Lloyd Price And His Orchestra"
manualRenames["New Vaudeville Band"] = "The New Vaudeville Band"
manualRenames['Billy "Crash" Craddock'] = "Billy 'Crash' Craddock"
manualRenames["Lipps Inc."] = "Lipps, Inc."
manualRenames["Wreckx-N-Effect"] = "Wrecks-N-Effect"
manualRenames["49-ers"] = "49ers"
manualRenames["Father M.C."] = "Father MC"

manualRenames["Los Fantasmas del Caribe"] = "Los Fantasmas Del Caribe"
manualRenames["St. Germain"] = "St Germain"
manualRenames["Lil Bow Wow"] = "Lil' Bow Wow"
manualRenames["Marie-Elaine Thibert"] = "Marie Élaine Thibert"
manualRenames["DragonForce"] = "Dragonforce"
manualRenames["Bettye LaVette"] = "Bettye Lavette"
manualRenames["El Chapo de Sinaloa"] = "El Chapo De Sinaloa"
manualRenames["Renee Fleming"] = "Renée Fleming"
manualRenames["Mr. Children"] = "Mr.Children"
manualRenames["Los Inquietos del Norte"] = "Los Inquietos Del Norte"
manualRenames["Chase Rice"] = "Casey Rice"
manualRenames["BrockHampton"] = "Brockhampton"

manualRenames["Kenny Ball and his Jazzmen"] = "Kenny Ball And His Jazzmen"
manualRenames["Mary Macgregor"] = "Mary MacGregor"
manualRenames['Richard "Dimples" Fields'] = "Richard 'Dimples' Fields"
manualRenames["Camper van Beethoven"] = "Camper Van Beethoven"
manualRenames["The Sounds Of Blackness"] = "Sounds Of Blackness"
manualRenames["Tiranos del Norte"] = "Tiranos Del Norte"
manualRenames["Jaguares"] = "Jaguars"

manualRenames["Pierre LaPointe"] = "Pierre Lapointe"
manualRenames["La Energia Nortena"] = "La Energia Norteña"
manualRenames["Jon Batiste"] = "John Baptiste"
manualRenames["Sebastian Yatra"] = "Sebastián Yatra"

manualRenames["Four80East"] = "Four 80 East"
manualRenames['Little" Jimmy Dickens'] = "Little Jimmy Dickens"  # NOTE(review): key has a single unbalanced double quote — confirm it matches the chart text
manualRenames["Michael Murphey"] = "Michael Murphy"
manualRenames["USA-European Connection"] = "USA-European connection"
manualRenames["ConFunkShun"] = "Con Funk Shun"
manualRenames["Randy VanWarmer"] = "Randy Vanwarmer"
manualRenames["Blue Oyster Cult"] = "Blue Öyster Cult"
manualRenames["Andrae Crouch"] = "Andraé Crouch"
manualRenames["Jean Carne"] = "Jean Carn"
manualRenames["Force M.D.'s"] = "Force MD's"
manualRenames["Samuelle"] = "Samuele"

manualRenames["M.C. Brains"] = "M.C. Brain"
manualRenames["Del The Funky Homosapien"] = "Del The Funkee Homosapien"
manualRenames["Domingo Quinones"] = "Domingo Quiñones"
manualRenames["Los Angeles de Charly"] = "Los Angeles De Charly"
manualRenames["Josh Gracin"] = "Joshua Gracin"
manualRenames["Africanism All Stars"] = "African All Stars"
manualRenames["Pepper MaShay"] = "Pepper Mashay"
manualRenames["Trick-Trick"] = "Trick - Trick"
manualRenames["Graciela Beltran"] = "Graciela Beltrán"

manualRenames["PeeWee"] = "Pee-Wee"
manualRenames["blessthefall"] = "Blessthefall"
manualRenames["Arcangel"] = "Archangel"

manualRenames["La Tribu de Abrante"] = "La Tribu De Abrante"
manualRenames["The Turnpike Troubadours"] = "Turnpike Troubadours"
manualRenames["Ricky Duran"] = "Ricky Doran"
manualRenames["David Rose and His Orchestra"] = "David Rose & His Orchestra"
manualRenames["Cannonball Adderley Quintet"] = "The Cannonball Adderley Quintet"

manualRenames["Charlie Mccoy"] = "Charlie McCoy"
manualRenames["Edgar Winter Group"] = "The Edgar Winter Group"
manualRenames["Peoples Choice"] = "People's Choice"
manualRenames["Christy Lane"] = "Cristy Lane"
manualRenames["Rocker's Revenge"] = "Rockers Revenge"
manualRenames["Dexy's Midnight Runners"] = "Dexys Midnight Runners"
manualRenames["Todd Terry Project"] = "The Todd Terry Project"

manualRenames["Timmy T."] = "Timmy T"
manualRenames["Queensryche"] = "Queensrÿche"
manualRenames["Industria del Amor"] = "Industria Del Amor"
manualRenames["Lil' Mo' Yin Yang"] = "Lil Mo' Yin Yang"
manualRenames["C.J. Bolland"] = "CJ Bolland"

manualRenames["Lil Troy"] = "Lil' Troy"
manualRenames["Lo Fidelity Allstars"] = "Lo-Fidelity Allstars"
manualRenames["Mr. Serv-on"] = "Mr. Serv-On"
manualRenames["DJ Blass"] = "DJ Blas"
manualRenames["Killa Beez"] = "Killa Benz"
manualRenames['Kierra "KiKi" Sheard'] = 'Kierra "Kiki" Sheard'

manualRenames['Hector "El Bambino"'] = "Hector El Bambino"
manualRenames["Reggaeton Ninos"] = "Reggaeton Niños"
manualRenames["Jose Alfredo Jimenez"] = "José Alfredo Jiménez"
manualRenames["Eric Lapointe"] = "Éric Lapointe"
manualRenames["Gaither Vocal Band"] = "The Gaither Vocal Band"
manualRenames["iwrestledabearonce"] = "Iwrestledabearonce"
manualRenames["Paul Van Dyk"] = "Paul van Dyk"
manualRenames["Tercer Cielo"] = "tercer Cielo"

manualRenames["Los Titanes de Durango"] = "Los Titanes De Durango"
manualRenames["The Devin Townsend Project"] = "Devin Townsend Project"
manualRenames["Godspeed You! Black Emperor"] = "Godspeed You Black Emperor!"

manualRenames["Mayra Veronica"] = "Mayra Verónica"
manualRenames["R.A The Rugged Man"] = "R.A. The Rugged Man"
manualRenames["Super Junior-M"] = "Super Junior M"
manualRenames["Rokia Traore"] = "Rokia Traoré"
manualRenames["brentalfloss"] = "Brentalfloss"
manualRenames["The Dead Rabbitts"] = "The Dead Rabbits"
manualRenames["Pablo Alboran"] = "Pablo Alborán"
manualRenames["Anthony Santos"] = "Antony Santos"

manualRenames["Malea"] = "Malena"
manualRenames["Houndmouth."] = "Houndmouth"
manualRenames["Solido"] = "Solid"
manualRenames["Phora"] = "Phoera"
manualRenames["Steve'N'Seagulls"] = "Steve'n'Seagulls"
manualRenames["Livre"] = "Livrer"
manualRenames["Deon Kipping"] = "Dion Kipping"

manualRenames["Royce da 5'9"] = "Royce Da 5'9"
manualRenames["Gord Downie"] = "Gordon Downie"
manualRenames["Dominican Sisters Of Mary - Mother Of The Eucharist"] = "Dominican Sisters Of Mary, Mother Of The Eucharist"
manualRenames["Roy Woods"] = "Roy Wood"
manualRenames["Satsang"] = "Satsangi"

manualRenames["Seven Sense "] = "Seven Sense"  # NOTE(review): key ends with a space — confirm the chart text really carries it
manualRenames["Andrew Schulz"] = "Andrew Schultz"
manualRenames["$uicideBoy$"] = "$uicideboy$"
manualRenames["Janelle Monae"] = "Janelle Monáe"
manualRenames["Lonesome River Band"] = "The Lonesome River Band"
manualRenames["Claypool Lennon Delirium"] = "The Claypool Lennon Delirium"
manualRenames["half alive"] = "half•alive"

# Near Renames Artists

In [None]:
from searchUtils import findNearest
cutoff = 0.9
# Artists at or before this rank in the most_common() ordering are skipped.
skipThroughRank = 4500
artistsToGet = {}
for i, (artist, cnt) in enumerate(slimResults.most_common()):
    if i <= skipThroughRank:
        continue
    if i % 250 == 0:
        print("==>",i,len(slimResults))
    # Apply the saved renames before deciding whether a suggestion is needed.
    if multiRenames.get(artist) is not None:
        artist = multiRenames[artist]
    if singleRenames.get(artist) is not None:
        idx    = str(singleRenames[artist])
        artist = artistIDToName[idx]
    # Already matched, or already covered by a manual override.
    if knownArtists.get(artist) is not None:
        continue
    if manualRenames.get(artist) is not None:
        continue

    # Names that resolve to more than one artist are left to the multi-artist logic.
    matches = mulArts.getArtistNames(artist)
    if len(matches) > 1:
        continue

    mdata = getMusicData("DiscArtist", artist)
    if mdata is None:
        # Stored under its own name: assigning to `results` here would clobber
        # the aggregated chart dictionary built in the aggregation cell.
        nearest = findNearest(artist, artistNameToID.keys(), num=1, cutoff=cutoff)
        if len(nearest) > 0:
            artistsToGet[artist] = nearest
            print("manualRenames[\"{0}\"] = \"{1}\"".format(artist, nearest[0]))

In [None]:
# Inspection cell: a bare name makes the notebook display the imported findNearest object.
findNearest

In [None]:
# The suggestions were requested with num=1, so a value may hold a single
# entry. Print the runner-up only when one exists instead of raising IndexError.
for k, v in artistsToGet.items():
    runnerUp = v[1] if len(v) > 1 else ""
    print("keep[\"{0}\"] = {1}  #{2}".format(k, v[0], runnerUp))



# Get Multi Results

In [None]:
from collections import Counter
# For artist names that map to several Discogs rows, pick the row whose albums
# best overlap with the charted titles.
cutoff = 0.8
multiMatchResult = {}
for i, (artist, cnt) in enumerate(slimResults.most_common()):
    if i > 0 and i % 1000 == 0:
        print("Passed",i,'/',len(slimResults))

    # prevArtist is the name before the multi-artist rename when that rename
    # applies; otherwise it is the name after the ID-based rename.
    originalName   = artist
    renamedByMulti = multiRenames.get(artist) is not None
    if renamedByMulti:
        artist = multiRenames[artist]
    if singleRenames.get(artist) is not None:
        idx    = str(singleRenames[artist])
        artist = artistIDToName[idx]
    prevArtist = originalName if renamedByMulti else artist

    if knownArtists.get(prevArtist) is not None:
        continue
    if manualRenames.get(prevArtist) is not None:
        prevArtist = manualRenames[prevArtist]

    mdata = getMusicData("DiscArtist", prevArtist)
    # Only names with more than one database row need disambiguation.
    if not isinstance(mdata, DataFrame) or mdata.shape[0] <= 1:
        continue

    matches = mdata["Name"].index
    artistAlbums = set(list(fullResults[prevArtist]["Songs"].keys()) + list(fullResults[prevArtist]["Albums"].keys()))
    results = getBestArtistIDMatch(artist, artistAlbums, matches, N=3, cutoff=cutoff)
    if results[2] is not None and results[2] >= cutoff:
        print(i,"/",len(slimResults),'  \t',artist,results)
        # Keep (index, name) and drop the score.
        multiMatchResult[artist] = list(results[:-1])

In [None]:
# Inspection cell: number of artists resolved by the multi-match search.
len(multiMatchResult)

In [None]:
# Merge the multi-match results into the known-artist map (sizes printed
# before and after) and persist it.
# NOTE(review): the values merged here are [index, name] lists, whereas the
# single-result cell stores only the first element (v[0]) — confirm which
# shape readers of artistMap.p expect.
print(len(knownArtists))
knownArtists.update(multiMatchResult)
print(len(knownArtists))
saveFile(idata=knownArtists, ifile="artistMap.p", debug=True)

# Single Results

In [None]:
# Spot check of one mapping; presumably raises KeyError when the artist has not been mapped yet.
knownArtists["The Kingston Trio"]

In [16]:
# Search for a best Discogs match for artists whose name resolves to exactly
# one candidate but has no exact "DiscArtist" row.
cutoff = 0.60
singleResult = {}
# Walk the artists from fewest to most chart entries.
for i, (artist, cnt) in enumerate(sorted(slimResults.items(), key=lambda pair: pair[1])):
    if i > 0 and i % 100 == 0:
        print("Passed",i,'/',len(slimResults))
    if cnt < 1:
        continue

    # prevArtist is the name before the multi-artist rename when that rename
    # applies; otherwise it is the name after the ID-based rename.
    originalName   = artist
    renamedByMulti = multiRenames.get(artist) is not None
    if renamedByMulti:
        artist = multiRenames[artist]
    if singleRenames.get(artist) is not None:
        idx    = str(singleRenames[artist])
        artist = artistIDToName[idx]
    prevArtist = originalName if renamedByMulti else artist
    if manualRenames.get(prevArtist) is not None:
        prevArtist = manualRenames[prevArtist]

    # Skip artists already matched under either name.
    alreadyKnown = knownArtists.get(prevArtist) is not None or knownArtists.get(artist) is not None
    if alreadyKnown:
        continue

    artistAlbums = set(list(fullResults[artist]["Songs"].keys()) + list(fullResults[artist]["Albums"].keys()))
    match = mulArts.getArtistNames(artist)
    if len(match) != 1:
        continue

    mdata = getMusicData("DiscArtist", list(match.keys())[0])
    if isinstance(mdata, DataFrame):
        # An exact row exists, so no fuzzy search is needed.
        continue

    results = getBestArtistMatch(artist, artistAlbums, N=3, cutoff=cutoff)
    if results[2] is not None and results[2] >= cutoff:
        print(i,"/",len(slimResults),'  \t',artist,'\t',results)
        singleResult[artist] = results
        # Stop once more than 1000 matches have been collected.
        if len(singleResult) > 1000:
            break

Passed 100 / 6870
Passed 200 / 6870
Passed 300 / 6870
Passed 400 / 6870
Passed 500 / 6870
Passed 600 / 6870
Passed 700 / 6870
Passed 800 / 6870
Passed 900 / 6870
Passed 1000 / 6870
Passed 1100 / 6870
1121 / 6870   	 24 Hrs 	 ('5245200', '24hrs', 0.62)
Passed 1200 / 6870
Passed 1300 / 6870
Passed 1400 / 6870
Passed 1500 / 6870
Passed 1600 / 6870
Passed 1700 / 6870
Passed 1800 / 6870
1841 / 6870   	 Bryan Bautista 	 ('3388076', 'Juan Bautista', 0.62)
Passed 1900 / 6870
Passed 2000 / 6870
Passed 2100 / 6870
Passed 2200 / 6870
Passed 2300 / 6870
Passed 2400 / 6870
Passed 2500 / 6870
Passed 2600 / 6870
Passed 2700 / 6870
Passed 2800 / 6870
Passed 2900 / 6870
Passed 3000 / 6870
3011 / 6870   	 VASSY 	 ('40430', 'VAST', 0.64)
Passed 3100 / 6870
Passed 3200 / 6870
Passed 3300 / 6870
Passed 3400 / 6870
Passed 3500 / 6870
Passed 3600 / 6870
Passed 3700 / 6870
Passed 3800 / 6870
Passed 3900 / 6870
3943 / 6870   	 Phil Collins and Marilyn Martin 	 ('253972', 'Marilyn Martin', 0.94)
Passed 4000 / 6

In [None]:
# Discard hand-rejected matches. pop(..., None) keeps the cell re-runnable:
# `del` raised KeyError whenever a run had not produced one of these names.
for rejected in (
    "Murk vs. Kristine W",
    "Chris Cox Vs. Happy Clappers",
    "T.M.Revolution × Nana Mizuki",
    "Bob Marley Vs. Funkstar De Luxe",
):
    singleResult.pop(rejected, None)

In [20]:
# Keep only the first element of each match tuple, merge into the known-artist
# map and persist it.
singleResults = {}
for k, v in singleResult.items():
    singleResults[k] = v[0]
print("Found {0} single results".format(len(singleResults)))
print("Found {0} all results".format(len(knownArtists)))
knownArtists.update(singleResults)
print("Found {0} all results".format(len(knownArtists)))
saveFile(idata=knownArtists, ifile="artistMap.p", debug=True)

Found 5 single results
Found 6041 all results
Found 6046 all results
Saving data to artistMap.p
  --> This file is 158.9kB.
Saved data to artistMap.p
  --> This file is 158.9kB.


## Known Results

In [None]:
from collections import Counter
# Minimum value of results[2] required to accept a match
# (presumably the match score returned by getBestArtistIDMatch -- confirm).
cutoff = 0.8
multiMatchResult = {}
for i, (artist, cnt) in enumerate(slimResults.most_common()):
    # Progress line every 1000 artists.
    if i > 0 and i % 1000 == 0:
        print("Passed",i,'/',len(slimResults))

    # Apply renames. prevArtist holds the name as it was before the multiRenames
    # substitution, or the final name when multiRenames had no entry; it is the key
    # used for every lookup below, while `artist` is the name handed to the matcher.
    prevArtist = None
    if multiRenames.get(artist) is not None:
        prevArtist = artist
        artist = multiRenames[artist]
    if singleRenames.get(artist) is not None:
        idx    = str(singleRenames[artist])
        artist = artistIDToName[idx]
    if prevArtist is None:
        prevArtist = artist

    # Already resolved in an earlier pass.
    if knownArtists.get(prevArtist) is not None:
        continue

    # getMusicData returns None when nothing matches; this pass only handles
    # artists with more than one candidate row.
    mdata = getMusicData("DiscArtist", prevArtist)
    if not isinstance(mdata, DataFrame):
        continue
    if mdata.shape[0] <= 1:
        continue

    matches = mdata["Name"].index
    # Union of the artist's song keys and album keys.
    artistAlbums = set(fullResults[prevArtist]["Songs"].keys()) | set(fullResults[prevArtist]["Albums"].keys())
    results = getBestArtistIDMatch(artist, artistAlbums, matches, N=3, cutoff=cutoff)
    if results[2] is not None and results[2] >= cutoff:
        print(i,"/",len(slimResults),'  \t',artist,results)
        # Store everything except the last element of the result.
        multiMatchResult[artist] = list(results[:-1])

In [None]:
# Keep only the first element of each stored multi-match result and merge the
# mapping into knownArtists, reporting the size before and after.
singleResults = dict((name, match[0]) for name, match in multiMatchResult.items())
print("Found {0} single results".format(len(singleResults)))

print("Found {0} all results".format(len(knownArtists)))
knownArtists.update(singleResults)
print("Found {0} all results".format(len(knownArtists)))

In [None]:
# Persist the merged knownArtists map.
# NOTE(review): an earlier cell saves this same knownArtists dict to "artistMap.p";
# confirm that "singleRenames.p" is the intended target here and that this does not
# overwrite a separately maintained rename file.
saveFile(idata=knownArtists, ifile="singleRenames.p")

# Check For Misnames

In [None]:
# Build the object whose getArtistNames() is exercised in the next cell.
# NOTE(review): the import cell at the top of the notebook brings in `multiArtist`
# (capital A); confirm that a lowercase `multiartist` is actually defined, otherwise
# this raises NameError on a fresh kernel.
ma = multiartist(cutoff=0.9, discdata=discdata)

In [None]:
# Spot checks for ma.getArtistNames: each entry is (query, extra keyword arguments,
# expected return value). One True/False line is printed per case, in order.
_nameChecks = [
    ('Lipps, Inc.', {},
     {'Lipps, Inc.': ['159617', '26641', '101850']}),
    ('Bob Wills & His Texas Playboys', {},
     {'Bob Wills & His Texas Playboys': ['670000','786114','1134146','804668','804679','1004309','935875','935907']}),
    ('Brad Paisley Duet With Carrie Underwood', {'debug': False},
     {'Carrie Underwood': ['1011680'], 'Brad Paisley': ['313755']}),
    ('Ray Charles With Willie Nelson', {'debug': False},
     {'Willie Nelson': ['249449'], 'Ray Charles': ['521963', '30552']}),
]
for _query, _kwargs, _expected in _nameChecks:
    print(ma.getArtistNames(_query, **_kwargs) == _expected)

# Album Data

In [None]:
# Load the contents of results/billboard-album.p into albumData.
# savename is assigned but not used in this cell: the only statement that reads it
# (the chartdata load below) is commented out.
savename = join(savedir, "billboard.p")
albumData = getFile(join(savedir, "billboard-album.p"))
#chartdata = getFile(savename, debug=True)

In [None]:
# Dump albumData as an outline: each artist on its own line, followed by that
# artist's album keys, one per line and prefixed with a tab.
for artist in albumData.keys():
    artistData = albumData[artist]
    print(artist)
    for album in artistData.keys():
        print("\t", album)

# Show Data

In [None]:
# Collect chart artists that have not been searched yet and print ready-to-paste
# art.searchDiscogForArtist(...) calls for them.
togets = {}
combos = []

savedir  = join(basedir, "results")
savename = join(savedir, "billboard.p")
chartdata = getFile(savename, debug=True)
chartArtists = sorted(chartdata.keys(), reverse=True)

import json
# Merge the previously-searched names from ../discogs/prevs.json into the local
# prevs.json. Context managers close every handle deterministically, so the rewritten
# file is fully flushed to disk instead of waiting for garbage collection.
with open("prevs.json", "r") as prevsFile:
    prevs = json.load(prevsFile)
with open("../discogs/prevs.json", "r") as artPrevFile:
    artPrev = json.load(artPrevFile)
prevs.update(artPrev)
with open("prevs.json", "w") as prevsFile:
    json.dump(prevs, prevsFile)
print("There are {0} artists previously searched.".format(len(prevs)))

# NOTE(review): the import cell at the top of the notebook brings in `multiArtist`
# (capital A); confirm that a lowercase `multiartist` is defined before running.
ma = multiartist(cutoff=0.9, discdata=discdata, exact=False)

from random import shuffle
print("There are {0} chart artists".format(len(chartArtists)))
# Visit the artists in random order (the sort above only serves to build the list).
shuffle(chartArtists)
for i,artist in enumerate(chartArtists):
    # Skip anything already recorded as searched.
    if prevs.get(artist) is not None:
        continue
    retval = ma.getArtistNames(artist)
    if retval is None:
        # No result at all: queue the full chart name.
        togets[artist] = artist
    else:
        # Queue every individual name that came back without a value, keyed by its
        # upper-cased form and pointing at the chart artist it came from.
        for name,value in retval.items():
            if prevs.get(name) is not None:
                continue
            if value is None:
                togets[name.upper()] = artist
                print("{0: <3}{1: <40}{2: <60}{3}".format(len(togets),name,artist,retval))
                print('\t---->',i,'/',len(chartArtists))


    # Stop once more than 50 names are queued and emit the search calls.
    if len(togets) > 50:
        print("\n\n\n\n")
        for name,artist in togets.items():
            print("art.searchDiscogForArtist(\"{0}\")  ## {1}".format(name, artist))
        break

###### artists = [x.strip() for x in vals.split("\n")]
# NOTE(review): with the line above commented out, `artists` must already exist in the
# kernel for the next statement to work; on a fresh kernel it raises NameError.
artists = [x for x in artists if len(x) > 0]

In [None]:
Pitbull Featuring Ne-Yo, Afrojack & Nayer
Drake Featuring Kanye West, Lil Wayne & Eminem

In [None]:
def testName(name, discdata):
    if discdata.get(name) is not None:
        return discdata[name]
    return name

def findName(name, discArtists, threshold):
    """Return the findNearest(name, discArtists, 1, threshold) result, or `name` if it is empty.

    Callers can tell the two outcomes apart by type only when `name` is not itself
    the same type as the findNearest result: an empty result yields `name` back
    unchanged, anything else yields the findNearest return value as-is.
    """
    nearest = findNearest(name, discArtists, 1, threshold)
    return nearest if len(nearest) != 0 else name
    

# NOTE(review): splitArtist is incomplete as saved -- the list expression on its last
# line is never closed and the function has no return, so this cell does not parse.
def splitArtist(name):
    names = [name]
    names = [[x.strip() for x in y.split(" Featuring ")]
results = {}
    
# NOTE(review): the first `retvals = [...` line below is also cut off after `for x in`;
# the iterable (presumably the pieces of `name` split on " & ") is missing -- restore
# it before running.
for name in artists:
    if " & " in name:
        retvals = [testName(x, discdata) for x in 
        retvals = [findName(x, discArtists, 0.95) if not isinstance(x, list) else x for x in retvals]
        results[name] = retvals        

In [None]:
# Every entry in `results` that is not a list gets a search call generated for it;
# the calls are de-duplicated through a set before printing.
togets = [
    "art.searchDiscogForArtist(\"{0}\")".format(entry)
    for entries in results.values()
    for entry in entries
    if not isinstance(entry, list)
]

for toget in set(togets):
    print(toget)

In [None]:
# Hand-written sample: a flat list of ID-like strings next to a [name, [id]] pair.
# NOTE(review): only the display cell that follows reads this; likely debugging
# residue -- confirm before removing.
tmp = [['5070865', '90037', '4135543', '1507065'], ['Katy XXX Perry', ['1201210']]]

In [None]:
# Display the current value of tmp as the cell output.
tmp

In [None]:
def _stripNumericSuffix(name):
    """Return `name` without a trailing "(N)" marker, where N is 1-3 characters that parse as an int.

    Names that do not end in ")" or whose parenthesised tail is not an integer are
    returned unchanged. The text before the "(" is stripped of surrounding whitespace.
    """
    if not name.endswith(")"):
        return name
    # Try a "(" at 3, 4, then 5 characters from the end, i.e. 1-3 characters inside.
    for x in (-3, -4, -5):
        if abs(x) > len(name):
            continue
        if name[x] != "(":
            continue
        try:
            int(name[(x+1):-1])
        except ValueError:
            # Not a number: int() on a str only raises ValueError, so nothing else
            # (e.g. KeyboardInterrupt) is swallowed here.
            continue
        return name[:x].strip()
    return name


# Group every ID and every ref under the artist name with its numeric suffix removed.
# Inner dicts are used as ordered sets (insertion order kept, duplicates collapsed).
nameids = {}
namerefs = {}
for name,nameid in iddata.items():
    artist = _stripNumericSuffix(name)

    # Looked up before either map is touched, so a missing ref leaves both unchanged
    # for this name (the KeyError propagates as before).
    ref = refdata[name]

    nameids.setdefault(artist, {})[nameid] = 1
    namerefs.setdefault(artist, {})[ref] = 1


# Flatten the ordered-set dicts into plain lists.
nameids = {k: list(v.keys()) for k,v in nameids.items()}
namerefs = {k: list(v.keys()) for k,v in namerefs.items()}