# Discogs Functions

In [243]:
## Basic stuff
%load_ext autoreload
%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))

## Python Version
import sys
print("Python: {0}".format(sys.version))

from ioUtils import saveJoblib, loadJoblib, saveFile, getFile
import urllib

from discogs import discogs
from discogsUtils import discogsUtils
from collection import collections
from artist import artist
from timeUtils import clock, elapsed

import datetime as dt
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Python: 3.6.6 |Anaconda custom (64-bit)| (default, Jun 28 2018, 11:07:29) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
Notebook Last Run Initiated: 2019-05-14 20:07:09.370618


# Discogs Base Class

In [244]:
disc = discogs()

Saved Discog Directory /Volumes/Music/Discog is Available
/Volumes/Music/Discog/base exists
/Volumes/Music/Discog/collections exists
/Volumes/Music/Discog/artists exists
/Volumes/Music/Discog/albums exists
/Volumes/Music/Discog/base-db exists
/Volumes/Music/Discog/collections-db exists
/Volumes/Music/Discog/artists-db exists
/Volumes/Music/Discog/albums-db exists
/Volumes/Music/Discog/artists-extra exists
/Volumes/Music/Discog/search exists
/Volumes/Music/Discog/search-artists exists
/Volumes/Music/Discog/special exists
/Volumes/Music/Discog/artist-special exists
/Volumes/Music/Discog/db exists


# Collection Section

In [65]:
col = collections(disc)

In [66]:
#col = collections(disc)
#col.downloadCollection(maxPages=1, country="US", decade="2010", year="2019")
#for year in range(1950, 2018):
#    col.downloadCollectionsByYear(maxPages=2, year=str(year), Ncountries=50, Ngenres=100, Nstyles=100)

In [67]:
#col.createCollectionsData()
#col.downloadCollections()
#col.parseCollections()
#col.mergeCollections(debug=True, force=True)
#col.createCollectionDBs(debug=True)

# Artist Section

In [241]:
from fsUtils import setFile, isFile, setDir, isDir, mkDir, mkSubDir
from fileUtils import getBasename, getBaseFilename
from ioUtils import getFile, saveFile
from webUtils import getWebData, getHTML, getURL
from searchUtils import findExt, findPattern
from timeUtils import clock, elapsed, update
from collections import Counter
from math import ceil
from time import sleep
from artist import artist

class artists():
    def __init__(self, discog, basedir=None):
        self.disc = discog
        self.name = "artists"
        
        self.artist = artist()
        
        self.maxCollections = 500

        ## General Imports
        self.getCodeDir          = self.disc.getCodeDir
        self.getArtistsDir       = self.disc.getArtistsDir
        self.getArtistsDBDir     = self.disc.getArtistsDBDir
        self.getDiscogDBDir      = self.disc.getDiscogDBDir
        self.discogsUtils        = discogsUtils()
        
        self.modVal = self.disc.getMaxModVal
        
        self.starterDir = setDir(self.getCodeDir(), self.name)
        if not isDir(self.starterDir):
            print("Creating {0}".format(self.starterDir))
            mkDir(self.starterDir, debug=True)
        
    
    ###############################################################################
    # Find Known (Downloaded) Artists (0)
    ###############################################################################
    def findKnownArtists(self, debug=False):
        if debug:
            print("Finding Known (Downloaded) Artists")
        artistIDs = []
        artistDir = self.disc.getArtistsDir()
        maxModVal = self.disc.getMaxModVal()
        for i in range(maxModVal):
            dirVal       = setDir(artistDir, str(i))
            files        = findExt(dirVal, ext='.p')
            regArtistIDs = [getBaseFilename(x) for x in files] 
            artistIDs   += regArtistIDs

            
        if debug:
            print("Found {0} artist IDs in {1}".format(len(artistIDs), artistDir))
            
        artistDir = self.disc.getArtistsExtraDir()
        files     = findExt(artistDir, ext='.p')
        extraArtistIDs = list(set([getBaseFilename(x).split('-')[0] for x in files]))  
        if debug:
            print("Found {0} artist IDs in {1}".format(len(extraArtistIDs), artistDir))
                  
        artistIDs += extraArtistIDs
        
        savename = setFile(self.disc.getDiscogDBDir(), "KnownArtistIDs.p")
        print("Saving {0} known artists to {1}".format(len(artistIDs), savename))
        saveFile(ifile=savename, idata=artistIDs, debug=True)
        
    
    ###############################################################################
    # Find Unknown Artists (1)
    ###############################################################################
    def findUnknownArtists(self, minVal=0, debug=False):
        refCounts = Counter(self.disc.getArtistRefCountsData())
        if debug:
            print("There are {0} potential artists".format(len(refCounts)))
        
        check = {self.discogsUtils.getArtistID(k): k for k,v in refCounts.most_common() if v > minVal}
        checkSet = set(check.keys())
        if debug:
            print("There are {0} potential artists > {1} counts".format(len(checkSet), minVal))
        
        knownArtistIDs = self.disc.getKnownArtistIDsData()
        knownSet = set(knownArtistIDs)
        if debug:
            print("There are {0} known artists".format(len(knownSet)))

            
        toget = {check[k]: k for k in list(checkSet - knownSet)}
        if debug:
            print("There are {0} artists > {1} counts".format(len(toget), minVal))
        
        savename = setFile(self.disc.getDiscogDBDir(), "ToGet.p")
        print("Saving {0} known artists to {1}".format(len(toget), savename))
        saveFile(ifile=savename, idata=toget, debug=True)
        
    
    ###############################################################################
    # Download Unknown Artists (2)
    ###############################################################################
    def downloadUnknownArtists(self, forceWrite=False, debug=False):
        artistDir = self.disc.getArtistsDir()
        toget     = self.disc.getToGetData()
        if debug:
            print("There are {0} artists to download".format(len(toget)))
        

        from urllib.parse import quote
        baseURL = self.disc.discogURL
        for artistRef,discID in toget.items():            
            url = urllib.parse.urljoin(baseURL, quote(artistRef))
            
            modValue  = self.discogsUtils.getDiscIDHashMod(discID=discID, modval=self.disc.getMaxModVal())
            outdir    = mkSubDir(artistDir, str(modValue))
            savename  = setFile(outdir, discID+".p")
            if isFile(savename) and not forceWrite:
                continue

            user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
            headers={'User-Agent':user_agent,} 

            sleep(1)
            
            print("Downloading: {0}".format(url))
            
            request=urllib.request.Request(url,None,headers) #The assembled request
            response = urllib.request.urlopen(request)
            data = response.read() # The data u need
            
            print("Saving {0}".format(savename))
            saveJoblib(data=data, filename=savename, compress=True)
            print("Done. Sleeping for 3 seconds")
            sleep(2)
            
            


    ################################################################################
    # Parse Artist Data (3)
    ################################################################################
    def parseArtistFile(ifile):
        bsdata     = getHTML(get(ifile))
        artistData = self.parse(bsdata) 
        return artistData
    

    def parseArtistFiles(self, debug=False):        
        artistInfo = artist()

        artistDir = self.disc.getArtistsDir()
        maxModVal = self.disc.getMaxModVal()
                    
        artistDBDir = self.disc.getArtistsDBDir()        
        
        totalSaves = 0
        for i in range(86, maxModVal):            
            dirVal       = setDir(artistDir, str(i))
            files        = findExt(dirVal, ext='.p')
            
            dbname = setFile(artistDBDir, "{0}-DB.p".format(i))            
            dbdata = getFile(dbname, version=2)
            
            saveIt = 0
            for ifile in files:
                discID = getBaseFilename(ifile)
                if dbdata.get(discID) is None:
                    saveIt += 1
                    info   = artistInfo.getData(ifile)
                    dbdata[discID] = info

            if saveIt > 0 or True:
                savename = setFile(artistDBDir, "{0}-DB.p".format(i))     
                print("Saving {0} new artist IDs to {1}".format(saveIt, savename))
                saveJoblib(data=dbdata, filename=savename, compress=True)
                totalSaves += saveIt
            
        print("Saved {0} new artist IDs".format(totalSaves))
        
    
    ################################################################################
    # Collect Metadata About Artists (4)
    ################################################################################
    def buildMetadata(self):
        start, cmt = clock("Building Artist Metadata DB")
        
        artistNames = {}
        artistYears = {}
        artistDBDir = self.disc.getArtistsDBDir()   
        files       = findExt(artistDBDir, ext='.p')     
        for i,ifile in enumerate(files):
            if i % 50 == 0:
                print(i,'/',len(files),'\t',elapsed(start, cmt))
            db = getFile(ifile)
            for discID,artistData in db.items():
                years = None
                artistName       = artistData['Artist']
                if artistNames.get(artistName) is None:
                    artistNames[artistName] = {}
                artistNames[artistName][discID] = 1
                artistVariations = artistData['Variations']
                artistMedia      = artistData['Media']
                if artistMedia is not None:
                    for mediaName,mediaData in artistMedia.items():
                        for mediaID, mediaValues in mediaData.items():
                            year = mediaValues['Year']
                            try:
                                year = int(year)
                            except:
                                continue
                            if years is None:
                                years = [year, year]
                            else:
                                years[0] = min([year, years[0]])
                                years[1] = max([year, years[1]])
                        
                if len(artistVariations) > 0:
                    vardata = artistVariations.values()
                    for var in vardata:
                        for varval in var:
                            varname = varval[0]
                            if artistNames.get(varname) is None:
                                artistNames[varname] = {}
                            artistNames[varname][discID] = 1
                                                        
        for artistName in artistNames.keys():
            artistNames[artistName] = list(artistNames[artistName].keys())
                            
        savename = setFile(self.disc.getDiscogDBDir(), "VariationNameToIDs.p")
        print("Saving {0} known artists to {1}".format(len(artistNames), savename))
        saveFile(ifile=savename, idata=artistNames, debug=True)
        elapsed(start, cmt)

In [242]:
art = artists(disc)
#art.findKnownArtists(debug=True)
#art.findUnknownArtists(minVal=200, debug=True)
#art.downloadUnknownArtists(debug=True)
#art.parseArtistFiles()
art.buildMetadata()

Current Time is Tue May 14, 2019 19:55:24 for Building Artist Metadata DB
Saving 906781 known artists to /Volumes/Music/Discog/db/VariationNameToIDs.p
Saving data to /Volumes/Music/Discog/db/VariationNameToIDs.p
  --> This file is 25.0MB.
Saved data to /Volumes/Music/Discog/db/VariationNameToIDs.p
  --> This file is 25.0MB.
Current Time is Tue May 14, 2019 20:05:34 for Done with Building Artist Metadata DB
Process [Done with Building Artist Metadata DB] took 10.2 minutes.


In [245]:
from searchUtils import findNearest
data = disc.getArtistVariationNameToIDsData()


Loading data from /Volumes/Music/Discog/db/VariationNameToIDs.p
  --> This file is 25.0MB.
Loading /Volumes/Music/Discog/db/VariationNameToIDs.p


In [255]:
artists = [x for x in data.keys() if x is not None]
findNearest("N'Sync", artists, 10, 0.9)

["N'Sync", "N' Sync", "N'Syc"]

In [145]:
from ioUtils import getFile
from fsUtils import isFile, isDir
from webUtils import getHTML, isBS4
from strUtils import fixName

from discogs import discogs

class artist(discogs):
    def __init__(self):
        self.name = "artist"
        
    def getData(self, inputdata):
        if isinstance(inputdata, str):
            if isFile(inputdata):
                bsdata = getHTML(getFile(fname))
            else:
                raise ValueError("Not sure about string input: {0}".format(inputdata))
        elif isBS4(inputdata):
            bsdata = inputdata
            pass
        else:
            raise ValueError("Not sure about input type: {0}".format(type(inputdata)))

        self.bsdata = bsdata
        
        return self.parse()
        
        
    def getNamesAndURLs(self, content):
        data = []
        for ref in content.findAll("a"):
            url    = ref.attrs['href']
            name   = ref.text

            discID = self.getArtistDiscID(url)
            data.append([name,url,discID])                    
        return data



    def getArtistDiscID(self, suburl, debug = False):
        ival = "/artist"
        if not isinstance(suburl, str) and not isinstance(suburl, unicode):
            return None

        pos = suburl.find(ival)
        if pos == -1:
            return None

        data = suburl[pos+len(ival)+1:]
        pos  = data.find("-")
        discID = data[:pos]
        try:
            int(discID)
        except:
            return None

        return str(discID)



    def getArtistMediaCounts(self, bsdata, debug = False):
        mediaCounts = {}
        results = bsdata.findAll("ul", {"class": "facets_nav"})
        for result in results:
            for li in result.findAll("li"):
                ref = li.find("a")
                if ref:
                    attrs = ref.attrs
                    span = ref.find("span", {"class": "facet_count"})
                    count = None
                    if span:
                        count = span.text
                        credittype    = attrs.get("data-credit-type")
                        creditsubtype = attrs.get("data-credit-subtype")
                        if credittype and creditsubtype:
                            if mediaCounts.get(credittype) == None:
                                mediaCounts[credittype] = {}
                            if mediaCounts[credittype].get(creditsubtype) == None:
                                mediaCounts[credittype][creditsubtype] = count

        return mediaCounts



    def getArtistName(self, bsdata, debug = False):
        ## 1st Try
        result = bsdata.find("h1", {'class':'hide_desktop'})
        if result:
            artist = result.text
            if len(artist) > 0:
                artist = fixName(artist)
                return artist

        ## 2nd Try
        result = bsdata.find("h1", {'class':'hide_mobile'})
        if result:
            artist = result.text
            if len(artist) > 0:
                artist = fixName(artist)
                return artist

        return None



    def getArtistURL(self, bsdata, debug = False):
        # 1st Try
        result = bsdata.find("link", {"rel": "canonical"})
        if result:
            url = result.attrs["href"]
            url = url.replace("https://www.discogs.com", "")
            if url.find("/artist/") > -1:
                return url

        # 2nd Try
        result = bsdata.find("link", {"hreflang": "en"})
        if result:
            url = result.attrs["href"]
            url = url.replace("https://www.discogs.com", "")
            if url.find("/artist/") > -1:
                return url            

        return None    


    def getArtistMediaAlbum(self, td, debug = False):
        retval = {"URL": None, "Album": None, "Format": None}
        for span in td.findAll("span"):
            attrs = span.attrs
            if attrs.get("class"):
                if 'format' in attrs["class"]:
                    albumformat = span.text
                    albumformat = albumformat.replace("(", "")
                    albumformat = albumformat.replace(")", "")
                    retval["Format"] = albumformat
                    continue
            span.replaceWith("")

        ref = td.find("a")
        if ref:
            retval["URL"]   = ref.attrs['href']
            retval["Album"] = ref.text

        return retval


    def getArtistMedia(self, bsdata, debug = False):
        table = bsdata.find("table", {"id": "artist"})
        if table == None:
            return None

        media = {}
        name  = None
        for tr in table.findAll("tr"):
            h3 = tr.find("h3")
            if h3:
                name = h3.text
                media[name] = []
                continue


            # Album, Class, Format
            result = tr.find("td", {"class": "title"})
            album  = None
            url    = None
            albumformat = name
            if result:
                retval      = self.getArtistMediaAlbum(result)
                album       = fixName(retval.get("Album"))
                url         = retval.get("URL")
                albumformat = retval.get("Format")

            if album == None:
                continue

            # Code
            code = tr.attrs.get("data-object-id")

            # AlbumClass
            albumclass = tr.attrs.get("data-object-type")

            # AlbumURL
            result  = tr.find("td", {"class": "artist"})
            artists = None
            if result:
                artists = self.getNamesAndURLs(result)

            # Year
            result = tr.find("td", {"class": "year"})
            year   = None
            if result:
                year = result.text

            data = {}
            data["Album"]  = album
            data["URL"]    = url
            data["Class"]  = albumclass
            data["Format"] = albumformat
            data["Artist"] = artists
            data["Code"]   = code
            data["Year"]   = year
            media[name].append(data)
            #if debug: print "  Found album:",album,"of type:",name


        newMedia = {}
        for name,v in media.items():
            newMedia[name] = {}
            for item in v:
                code = item['Code']
                del item['Code']
                newMedia[name][code] = item

        media = newMedia

        return media


    def getArtistVariations(self, bsdata, debug = False):
        result = bsdata.find("div", {"class": "profile"})
        variations = {}
        if result:
            heads = result.findAll("div", {"class": "head"})
            heads = [x.text for x in heads]
            heads = [x.replace(":","") for x in heads]

            content = result.findAll("div", {"class": "content"})
            if len(heads) != len(content):
                raise("Mismatch in head/content")

            for i in range(len(heads)):
                if heads[i] == "Sites":
                    content[i] = self.getNamesAndURLs(content[i])
                elif heads[i] == "In Groups":
                    content[i] = self.getNamesAndURLs(content[i])
                elif heads[i] == "Variations":
                    content[i] = self.getNamesAndURLs(content[i])
                elif heads[i] == "Aliases":
                    content[i] = self.getNamesAndURLs(content[i])
                else:
                    content[i] = content[i].text
                    content[i] = content[i].strip()
                variations[heads[i]] = content[i]

        return variations



    def getArtistPages(self, bsdata, debug = False):
        result = bsdata.find("div", {"class": "pagination bottom "})
        total = 0
        num   = 0
        if result:
            pages = result.find("strong").text
            pages = pages.strip()
            pages = pages.split()[-1]
            pages = pages.replace(",", "")
            try:
                total = int(pages)
                num = int(ceil(float(total)/500))
            except:
                raise("Can not parse pages",pages)


        return num,total


    def parse(self, debug = False):
        bsdata = self.bsdata
        
        retval = {}
        retval["Artist"]      = self.getArtistName(bsdata, debug)
        retval["URL"]         = self.getArtistURL(bsdata, debug)
        retval["ID"]          = self.getArtistDiscID(retval["URL"], debug)
        retval["Pages"]       = self.getArtistPages(bsdata, debug)
        retval["Variations"]  = self.getArtistVariations(bsdata, debug)
        retval["MediaCounts"] = self.getArtistMediaCounts(bsdata, debug)
        retval["Media"]       = self.getArtistMedia(bsdata, debug)

        #print retval
        return retval    

In [None]:




################################################################################
#
# Download Artist Files
#
################################################################################
def downloadArtists(minCounts = 1, debug = False, forceWrite = False):
    basedir      = getBaseDBDir()    
    countsname   = setFile(basedir, "artistFrequency.yaml")
    artistCounts = Counter(get(countsname))
    
    discdbname   = setFile(basedir, "downloadedArtists.yaml")
    discdb       = get(discdbname)
    

    print "Finding artists to download..."
    artistsToGet = []
    for item in artistCounts.most_common():
        href = makeUnicode(item[0])
        cnt  = item[1]
        if cnt < minCounts: continue
        discID = getArtistDiscID(href)
        if discdb.get(discID): continue
        artistsToGet.append(href)

    print "Finding artists to download...",len(artistsToGet)
        
    if debug: print "Loading",dbname
    dbdata = get(dbname)
    if debug: print "Found",len(dbdata),"unique artists from",dbname

    dbname  = setFile(basedir, "artistDB.json")
    if debug: print "Loading",dbname
    db      = get(dbname)
    if debug: print "Found",len(db),"known artists from",dbname
    
    errDB = getBadIDs(getDiscogDir())

    downloads = {}
    downloads = getMissing(dbdata, db, errDB)    
    if debug: print "Downloading",len(downloads),"new artists for missing artists."


    if len(downloads) == 0:
        print "Looping over",len(dbdata),"artist counts data..."
        for k,v in dbdata.iteritems():
            if v["CNT"] < minCounts:
                continue
            discID = getArtistDiscID(v["URL"])
            if db.get(discID):
                continue
            if errDB.get(discID):
                continue
            downloads[k] = v
            
        if debug:
            print "Downloading",len(downloads),"new artists with >=",minCounts,"counts."


    
    baseurl = u"https://www.discogs.com"
    useSafari = True
    dtime = 4
    problems = {}
    
    curDir  = setDir(base, artistDir)
    curDirs = [x for x in findAll(curDir) if isDir(x)]
    subDir  = setDir(base, artistDir)
    savedir = mkSubDir(subDir, str(len(curDirs)))
    print "\n\n===========> Saving files to",savedir," <===========\n\n"
    
    for i,artist in enumerate(downloads.keys()):
        v = downloads[artist]
        if i % 10 == 0:
            print '\n',i,'/',len(downloads),'\t',artist,'\t',v,'\n'
        suburl = v["URL"]
        artistSaveName = getSaveName(artist)
        savename = join(savedir, artistSaveName+"-1.p")
        if isFile(savename):
            continue
        retval = getData(baseurl, suburl, extra=None, savename=savename, 
                         useSafari=useSafari, dtime=dtime, debug=debug)
        if not retval:
            problems[artist] = suburl


    print "Found",len(problems),"download problems."
    if len(problems) > 0:
        savename = setFile(getDiscogDir(), "problems.json")
        print "Saving them to",savename
        save(savename, problems)
    



################################################################################
#
# Process the ArtistDB Files
#
################################################################################
def processArtistDBFiles(debug = False):
    basedir = getBaseDBDir()
    
    artistCntr   = Counter()    
    artistCntrDB = {}

    if debug:
        print "Looking for files in",basedir
    files = findPatternExt(basedir, pattern="artistDB-", ext=".p")
    #files = glob(join(basedir, "artistDB-*.p"))
    if debug:
        print "Found",len(files),"files in",basedir
                         
    nerr = 0
    print "  ",nice("   Progress", 20),nice("#Artists", 10),nice("#Errors", 10),"Filename"
    for i,ifile in enumerate(files):
        if i % 1000 == 0 or i == 10 or i == 100:
            print "  ",nicerate(i,len(files), 20),
            print nice(len(artistCntr), 10),
            print nice(nerr, 10),
            print ifile
        data = get(ifile)
        
        for artist,suburl in data.iteritems():
            discID = getArtistDiscID(suburl)
            if discID == None:
                if debug:
                    nerr += 1
                    #print "    No discID for",artist,suburl
                continue
            artistCntr[artist] += 1
            if artistCntrDB.get(artist) == None:
                artistCntrDB[artist] = suburl

    if debug:
        print "Found ",len(artistCntrDB),"unique artists from files in",basedir

    savename = setFile(basedir, "artistCountsAll.p")
    print "Saving",len(artistCntrDB),"artists to",savename
    save(savename, artistCntrDB)
    print savename,'size ->',getSize(savename, unit='MB'),"MB."

    savename = setFile(basedir, "artistCountsRaw.p")
    print "Saving",len(artistCntr),"artists to",savename
    save(savename, artistCntr)
    print savename,'size ->',getSize(savename, unit='MB'),"MB."


    
def findArtists(minCounts = 1, debug = True):
    basedir = getDiscogBaseDBDir()

    savename = setFile(basedir, "artistCountsAll.p")
    if debug: print "Loading",savename
    artistCntrDB = get(savename)
    if debug: print "Found",len(artistCntrDB),"artists in",savename
    
    savename = setFile(basedir, "artistCountsRaw.p")
    if debug: print "Loading",savename
    artistCntr = get(savename)
    if debug: print "Found",len(artistCntr),"artists in",savename
    

    artistCounter = {}                 
    for artist in artistCntrDB.keys():
        val = artistCntr[artist]
        if val < minCounts:
            continue
        url = artistCntrDB[artist]
        artistCounter[artist] = {"URL": url, "CNT": val}
        
        
    savename = setFile(basedir, "artistCounts.p")
    print "Saving",len(artistCounter),"/",len(artistCntrDB),
    print "unique artists with >=",minCounts,"counts to",savename
    save(savename, artistCounter)
    print savename,'size ->',getSize(savename, unit='kB'),"kB."
    
                                    



################################################################################
#
# Artist Helpers
#
################################################################################
def saveArtistData(artistData, debug = False, ifile = None, forceWrite = False):
    artistDBDir = getArtistsDBDir()

    artist   = artistData["Artist"]
    discID   = artistData["ID"]
    if artist == None or discID == None:
        removeFile(ifile)
        print " --> Removing due to artist/discID error:",ifile
        return

    modValue = getDiscIDHashMod(discID, modval=500)
    subDir   = mkSubDir(artistDBDir, str(modValue))
    outdir   = subDir
    savename  = setFile(outdir, discID+".p")
    if isFile(savename) and not forceWrite:
        return
    save(savename, artistData)
    print " --> Saved",savename




################################################################################
#
# Special Artists
#
################################################################################
def parseSpecialArtists(base = "/Volumes/Music/Discog", debug = False):  
    basedbdir   = getDiscogBaseDBDir()        
    dbname  = setFile(basedbdir, "artistDB.json")
    dbdata  = get(dbname)
    
    specialdir  = getDiscogSpecialDir()
    files = findExt(specialdir, ext=".html")
    for i,ifile in enumerate(files):
        if i % 250 == 0 or i == 50 or i == 10:
            print "\n====>",i,"/",len(files),'\t',ifile
        if getsize(ifile) < 1000:
            print " --> Removing due to low size:",ifile
            removeFile(ifile)
            continue
        
        bsdata         = getHTML(get(ifile))
        artistData     = parse(bsdata, debug)
        saveArtistData(artistData, dbdata, debug, ifile, forceWrite = False)

        artist         = artistData["Artist"]
        artistSaveName = getSaveName(artist)
        savename = setSubFile(base, "artists-special", artistSaveName+"-1.p")
        save(savename, open(ifile).read())

        if isFile(ifile):
            if debug:
                print " --> Removing special artist:",ifile
            removeFile(ifile)




################################################################################
#
# Special Artists
#
################################################################################
def downloadMultipageArtist(debug = False, forceWrite = False):
    
def parseMultipageArtists(debug = False, forceWrite = False):
    files = findPatternExt(getArtistsExtraDir(), pattern='-1', ext='.p')
    artists = [x.replace("-1.p", "") for x in files]
    discIDs = [getBasename(x) for x in artists if x.endswith('.p') == False]
    
    for i,discID in enumerate(discIDs):
        print i,'/',len(discIDs)
        files = findPatternExt(getArtistsExtraDir(), pattern=discID+"-", ext='.p')
        fullArtistData = None
        print "  Found",len(files),"for discID:",discID
        for j,ifile in enumerate(files):
            print "    -->",j,"/",len(files)
            bsdata         = getHTML(ifile)
            artistData     = artistdata.parse(bsdata, debug)
        
            if j == 0:
                fullArtistData = artistData
                continue
            else:
                #fullArtistData["Pages"] = max(int(fullArtistData["Pages"]), int(artistData["Pages"]))
                for media,mediaData in artistData["Media"].iteritems():
                    if fullArtistData["Media"].get(media) == None:
                        fullArtistData["Media"][media] = mediaData
                    else:
                        for item in mediaData:
                            fullArtistData["Media"][media].append(item)

        
        modValue = getDiscIDHashMod(discID, modval=500)
        dbname = setFile(getArtistsDBDir(), str(modValue)+"-DB.p")
        dbdata = get(dbname, debug)
        dbdata[discID] = fullArtistData
        save(dbname, dbdata, debug = True)
        
    




################################################################################
#
# Show ArtistData
#
################################################################################
def showArtistData(artistData):
    print nice("Artist:",10),artistData.get("Artist")
    for key in artistData.keys():
        print key




################################################################################
#
# Update Artist DBs
#
################################################################################
def updateArtistDBs():
    artistDBDir = getArtistsDBDir()
    files = findExt(artistDBDir, ext=".p")
    for ifile in files:
        data = get(ifile)
        for discID,artistData in data.iteritems():
            media = artistData['Media']
            for mediatype in media.keys():
                tmp = {}
                for item in media[mediatype]:
                    code = item['Code']
                    del item['Code']
                    tmp[code] = item
                media[mediatype] = tmp
                     
        save(ifile, data)



################################################################################
#
# Error handlers
#
################################################################################
def getBadIDs(base, debug = True):
    errDBname = setSubFile(base, "artists-db-err", "errDB.json")
    errDB = get(errDBname)
    
    if debug: print "  Found ",len(errDB),"bad IDs."
    
    files = findSubExt(base, "artists-db-err", ext=".p")
    #for ifile in glob(join(base, "artists-db-err", "*.p")):
    for ifile in files:
        data = get(ifile)
        try:
            discID = data["ID"]
            errDB[discID] = 1
        except:
            continue
        
        removeFile(ifile)

    if debug: print "  Saving",len(errDB),"bad IDs."
    save(errDBname, errDB)
    return errDB
    

    
def removeKnownArtists(allDB, known, debug = False):
    print "Removing Known Artists:",len(allDB)
    print "         Known Artists:",len(known)
    for artist in known.keys():
        if allDB.get(artist):
            del allDB[artist]
    print "     New Known Artists:",len(allDB)
    raise ValueError("Done.")
    return allDB




def getMissing(dbdata, db, errDB, debug = False):
    missingDir  = mkSubDir(getMusicDir(), "missing")
    missingFile = setFile(missingDir, "missing.json")
    missing     = get(missingFile)
    
    downloads = {}
    #allDB = removeKnownArtists(dbdata, db)
    artists = dbdata.keys()
    #artists = allDB.keys()
    
    for i,artist in enumerate(reversed(missing.keys())):
        if len(artist) < 2: continue
        name = artist.replace("The ", "")        
        matches1 = findMatchingWord(name, artists)
        #matches2 = []
        matches2 = findNearest(name, artists, 100, 0.75)
        matches  = list(set(matches1 + matches2))
        #matches  = matches[:min(len(matches), 10)]
        if len(matches) > 0:
            print i,'/',len(missing),'\t',len(downloads),'\t',artist,' \t---> ',len(matches)
        for x in matches:
            v = dbdata[x]
            discID = getArtistDiscID(v["URL"])
            if db.get(discID):
                continue
            if errDB.get(discID):
                continue
            downloads[x] = v
        
        if len(downloads) > 100000:
            break
        
    return downloads



def splitArtistFilesByHashval(artistdir = "artists", N = 2000):
    base   = getDiscogDir()
    files  = glob(join(base, "artistFiles", "artists[0-9]", "*.p"))
    files += glob(join(base, "artistFiles", "artists[1-9][0-9]", "*.p"))
    print "Found",len(files)

    modN  = int(ceil(len(files) / N)) + 2
    fvals = {}    
    for ifile in files:
        hexval = int(sha1(ifile).hexdigest(), 16)
        modval = hexval % modN
        if fvals.get(modval) == None:
            fvals[modval] = []
        fvals[modval].append(ifile)
        
    for k,v in fvals.iteritems():
        outdir = mkSubDir(base, [artistdir, str(k)])
        print "Moving",len(v),"to",outdir
        for ifile in v:
            src = ifile
            dst = setFile(outdir, getBasename(src))
            moveFile(src, dst)
            
            
            


def addNewArtistsDBToDB():
    startVal       = start()
    artistDB       = getArtistDB()
    tmpdir = setDir(getDiscogDir(), "artists-db")
    files = findSubExt(getDiscogDir(), "artist-db", ext=".p")
    newToDB  = {}
    for i,ifile in enumerate(files):
        if (i+1) % 25 == 0: inter(startVal, i+1,len(files))
        artistData = get(ifile)
        #artistData = parseArtistFile(ifile)
        #print artistData
        discID = artistData.get("ID")
        if discID == None:
            removeFile(ifile, debug = True)
            continue
        if newToDB.get(discID):
            removeFile(ifile, debug = True)
            continue
        if artistDB.get(discID) == None:
            ref    = artistData.get("URL")
            name   = makeStrFromUnicode(makeUnicode(artistData.get("Artist")))
            newToDB[discID] = {"URL": ref, "Name": name}

        outfile = setFile(tmpdir, discID+".p")
        if isFile(outfile):
            removeFile(ifile, debug = True)
            continue
        moveFile(ifile, outfile, debug = True)
        
            
    saveNewDBs(newToDB)
    end(startVal)



def addNewArtistsToDB():
        
    startVal       = start()
    artistDB       = getArtistDB()
    knownArtistIDs = getKnownArtistIDs()
    
    tmpdir = setDir(getDiscogDir(), "artists-tmp")
    files = findSubExt(getDiscogDir(), "artists-special", ext=".p")
    newToDB  = {}
    for i,ifile in enumerate(files):
        if (i+1) % 25 == 0: inter(startVal, i+1,len(files))
        artistData = parseArtistFile(ifile)
        #print artistData
        discID = artistData.get("ID")
        if discID == None:
            removeFile(ifile, debug = True)
            continue
            
        if knownArtistIDs.get(discID) or newToDB.get(discID):
            removeFile(ifile, debug = True)
            continue
        
        outfile = setFile(tmpdir, discID+".p")
        moveFile(ifile, outfile, debug = True)

        if artistDB.get(discID) == None or True:
            ref    = artistData.get("URL")
            name   = makeStrFromUnicode(makeUnicode(artistData.get("Artist")))
            newToDB[discID] = {"URL": ref, "Name": name}
        #print ifile,'\t\t',discID,'\t',ref,'\t\t',name

    saveNewDBs(newToDB)
    end(startVal)



###############################################################################
#
# Re-arrange Artists by ModVal
#
###############################################################################
def moveArtistsByHash():
    artistsDir = getArtistsDBDir()
    #artistNameDB = getArtistNameDB(slim = False, debug = True)
    #files = findSubExt(setDir(getDiscogDir(), "artistNew"), "*", ext=".p")
    files = findSubExt(getDiscogDir(), "artists-db", ext=".p")
    #cnts = [0, 0]
    for ifile in files:
        discID = getBaseFilename(ifile)
        modValue = getDiscIDHashMod(discID, modval=500)
        subDir   = mkSubDir(artistsDir, str(modValue))
        outdir   = subDir
        outname  = setFile(outdir, discID+".p")
        if isFile(outname):
            removeFile(ifile, debug = True)
            continue
        moveFile(ifile, outname, debug = True)
        

In [None]:




###############################################################################
#
# Search Results
#
###############################################################################
def downloadSearchResults(searchTerm, forceWrite = False, debug = True):
    if debug:
        print "downloadSearchResults(",searchTerm,")"
    baseURL   = u"https://www.discogs.com/search/"
    #subURL    = "?limit=250&q="+pathname2url(makeUnicode(searchTerm).encode("utf-8"))+"&type=artist&layout=sm"
    subURL    = "?limit=250&q="+pathname2url(makeUnicode(searchTerm).encode("utf-8"))+"&layout=sm"
    #subURL    = "?q="+searchTerm+"&type=artist&layout=sm"
    URL       = baseURL + subURL
    savename  = setFile(getSearchDir(),getSaveName(searchTerm)+".p")
    if isFile(savename) and forceWrite == False:
        return
    
    if debug:
        print "Saving search results for",searchTerm,"to",savename
    dtime     = 5
    useSafari = True    
    attempts  = 1
    retval    = False
    
    while not retval and attempts <= 3:
        retval = getData(base=URL, suburl=None, extra=None, savename=savename, 
                         useSafari=useSafari, dtime=dtime, debug=debug)
        if not retval:
            print "  There was an error. Logging it."
            attempts += 1
            if isFile(savename):
                removeFile(savename, debug)

    if retval and debug:
        print "Downloaded search results for",searchTerm


        
def parseSearchResultsFile(bsdata):
    refDB    = {}
    for h4 in bsdata.findAll("h4"):
        ref = h4.find("a")
        if ref:
            attrs  = ref.attrs
            href   = attrs.get('href')
            if href:
                if href.find("anv=") != -1:
                    continue
            discID = getArtistDiscID(href)
            
            refDB[discID] = href
                     
    return refDB




def parseDownloadedSearchResults(forceWrite = False, debug = True):
    files = findExt("/Volumes/Music/Discog/search", ".html")
    for ifile in files:
        savename = ifile.replace(".html", ".p")
        data = open(ifile).read()
        save(savename, data)    
    
    #artistDB = getArtistDB()
    files    = findExt(getSearchDir(), ext=".p")

    toGet = {}
    print "Searching through",len(files),"search files."
    startVal = start()
    for i,ifile in enumerate(files):
        if (i+1) % 25 == 0: inter(startVal, i+1, len(files))
        bsdata = getHTML(ifile)
        refDB = parseSearchResultsFile(bsdata)
        for discID,href in refDB.iteritems():
            if toGet.get(discID): continue
            #if artistDB.get(discID) or toGet.get(discID): continue
            toGet[discID] = href

    end(startVal)

    savename = setFile(getSearchDir(), "toGet.yaml")    
    print "Downloading",len(toGet),"new disc IDs."
    save(savename, toGet, debug = True)
    


def downloadSearchResultArtists(debug = False, dtime = 4):
    artistDB = getArtistDB()
    outdir   = getSearchArtistsDir()
    outdbdir = getSearchArtistsDBDir()
    baseURL  = u"https://www.discogs.com"
    newToDB  = {}
    useSafari = True
    
    
    savename = setFile(getSearchDir(), "toGet.yaml")
    toGet = get(savename, debug = True)
    print "Downloading",len(toGet),"new disc IDs."

    startVal = start()

    for i,discID in enumerate(toGet.keys()):
        
        if (i+1) % 100 == 0: inter(startVal, i+1, len(toGet))
        
        print i,'/',len(toGet)
        if discID == None:
            continue
    
        href = toGet[discID]
        
        if artistDB.get(discID) or newToDB.get(discID): continue
        #if newToDB.get(discID): continue

        try:
            savename = setFile(outdir, discID+".p")
        except:
            continue
        
        if isFile(savename):
            continue
        
        URL = baseURL + href
        URL = baseURL + pathname2url(makeUnicode(href).encode("utf-8"))
        URL = URL + "?sort=year%2Casc&limit=500&page=1"

        retval   = False
        attempts = 0
        while not retval and attempts < 3:
            retval = getData(base=URL, suburl=None, extra=None, savename=savename, 
                             useSafari=useSafari, dtime=dtime+2*attempts, debug=debug)
            attempts += 1
            #sleep()
            
        if retval:
            bsdata = getHTML(savename)
            artistData     = parse(bsdata, debug)
            savename = setFile(outdbdir, discID+".p")
            if isFile(savename):
                continue
            save(savename, artistData, debug = True)

        newToDB[discID] = 1
               
    end(startVal)
               

def mergeSearchResults(debug = True):
    newToDB  = {}
    artistDBData = {}

    modVal   = 500
        
    files    = findExt(getSearchArtistsDBDir(), ext=".p")
    for i,ifile in enumerate(files):
        if i % 100 == 0:
            print i,'/',len(files),'\t',ifile
        artistData = get(ifile)
        discID   = artistData["ID"]
        href     = artistData["URL"]
        artist   = makeStrFromUnicode(artistData["Artist"])
        modValue = getDiscIDHashMod(discID, modval=modVal)
        if artistDBData.get(modValue) == None:
            artistDBData[modValue] = {}
        newToDB[discID] = {"URL": href, "Name": artist}
        artistDBData[modValue][discID] = artistData

    saveNewDBs(newToDB)    
    mergeArtistDBs(False)

    for modValue in artistDBData.keys():
        modDBfile = setFile(getArtistsDBDir(), str(modValue)+"-DB.p")
        modDB = get(modDBfile)
        for discID in artistDBData[modValue].keys():
            modDB[discID] = artistDBData[modValue][discID]
        try:
            save(modDBfile, modDB)
        except:
            continue

    moveSearchResults()        

def moveSearchResults(debug = True):    
    modVal   = 500

    files    = findExt(getSearchArtistsDir(), ext=".p")
    for ifile in files:
        discID   = getBaseFilename(ifile)
        modValue = getDiscIDHashMod(discID, modval=modVal)
        outfile  = setSubFile(getArtistsDir(), str(modValue), discID+".p")
        moveFile(ifile, outfile, forceMove = True, debug = True)

    files    = findExt(getSearchArtistsDBDir(), ext=".p")
    for ifile in files:
        removeFile(ifile, debug = True)

        

In [None]:
savename = setFile(disc.getDiscogDBDir(), "{0}.p".format("NameToID"))
data = getFile(savename)
