# Master Discogs Database

In [1]:
## Basic stuff
%load_ext autoreload
%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))

## Python Version
import sys
print("Python: {0}".format(sys.version))

from ioUtils import saveJoblib, loadJoblib, saveFile, getFile
import urllib
from urllib.parse import quote

from discogsBase import discogs
from discogsUtils import discogsUtils
from collection import collections
from artist import artist
from searchUtils import findExt, findSubExt, findPatternExt
from timeUtils import clock, elapsed
from fsUtils import moveFile, setFile, setDir, setSubDir, isFile, isDir, mkDir
from fileUtils import getFileBasics, getBasename
from artists import artists
from artist import artist
from albums import albums
from album import album, albumURLInfo
from time import sleep

import datetime as dt
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))

Python: 3.7.3 (default, Mar 27 2019, 16:54:48) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
Notebook Last Run Initiated: 2019-11-07 18:49:37.894367


In [2]:
%load_ext autoreload
%autoreload
disc = discogs()
arts = artists(disc)
art  = artist()
albs = albums(disc)
alb  = album()
dutils = discogsUtils()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Saved Discog Directory /Volumes/Music/Discog is Available
Local Discog Directory /Users/tgadfort/Music/Discog is Available
/Volumes/Music/Discog/collections exists
/Volumes/Music/Discog/artists exists
/Volumes/Music/Discog/albums exists
/Volumes/Music/Discog/collections-db exists
/Volumes/Music/Discog/artists-db exists
/Volumes/Music/Discog/albums-db exists
/Volumes/Music/Discog/artists-db/metadata exists
/Volumes/Music/Discog/albums-db/metadata exists
/Volumes/Music/Discog/diagnostic exists
/Volumes/Music/Discog/db exists


# Master DB Functions

In [96]:
from pandas import Series, DataFrame

def directoryName(x):
    if x is None:
        return x
    if "..." in x:
        x = x.replace("...", "")
    if "/" in x:
        x = x.replace("/", "-")
    return x

def realName(x):
    if x is None:
        return [None,-1]
    
    lenx = len(x)
    if len(x) < 1:
        return [x,-1]

    if x[-1] != ")":
        return [x, None]
    

    if lenx >=5:
        if x[-3] == "(":
            try:
                num = int(x[-2:-1])
                val = x[:-3].strip()
                return [val, num]
            except:
                return [x, None]
            
    if lenx >= 6:
        if x[-4] == "(":
            try:
                num = int(x[-3:-1])
                val = x[:-4].strip()
                return [val, num]
            except:
                return [x, None]
            
    if lenx >= 7:
        if x[-4] == "(":
            try:
                num = int(x[-3:-1])
                val = x[:-4].strip()
                return [val, num]
            except:
                return [x, None]

    return [x, None]

# Create Dictionary Lookup Files

## Artist ID --> Ref and Name

In [3]:
start, cmt = clock("Creating Artist DBs")
from searchUtils import findPatternExt

artistIDToName       = {}
artistIDToRef        = {}
artistIDToVariations = {}

artistMetadataDBDir = disc.getArtistsMetadataDBDir()
files = findPatternExt(artistMetadataDBDir, pattern="-Metadata", ext='.p')

for i,ifile in enumerate(files):
    print(ifile,' \t',end="")
    db = getFile(ifile)
    artistIDToName.update({k: v[0] for k,v in db.items()})
    artistIDToRef.update({k: v[1] for k,v in db.items()})    
    artistIDToVariations.update({k: v[2] for k,v in db.items()})

    print(i,len(artistIDToName))
print("\n\n==============================================\n")
    
savenames = {"IDToRef": artistIDToRef, "IDToName": artistIDToName, "IDToVariations": artistIDToVariations}
for basename,savedata in savenames.items():
    savename = setFile(disc.getDiscogDBDir(), "Artist{0}.p".format(basename))
    print("Saving {0} entries to {1}\n".format(len(savedata), savename))
    saveFile(ifile=savename, idata=savedata, debug=True)
    
elapsed(start, cmt)

Current Time is Thu Nov 07, 2019 18:51:02 for Creating Artist DBs
/Volumes/Music/Discog/artists-db/metadata/0-Metadata.p  	0 5278
/Volumes/Music/Discog/artists-db/metadata/1-Metadata.p  	1 10851
/Volumes/Music/Discog/artists-db/metadata/10-Metadata.p  	2 16298
/Volumes/Music/Discog/artists-db/metadata/11-Metadata.p  	3 21747
/Volumes/Music/Discog/artists-db/metadata/12-Metadata.p  	4 27279
/Volumes/Music/Discog/artists-db/metadata/13-Metadata.p  	5 32855
/Volumes/Music/Discog/artists-db/metadata/14-Metadata.p  	6 38259
/Volumes/Music/Discog/artists-db/metadata/15-Metadata.p  	7 43780
/Volumes/Music/Discog/artists-db/metadata/16-Metadata.p  	8 49248
/Volumes/Music/Discog/artists-db/metadata/17-Metadata.p  	9 54826
/Volumes/Music/Discog/artists-db/metadata/18-Metadata.p  	10 60328
/Volumes/Music/Discog/artists-db/metadata/19-Metadata.p  	11 65713
/Volumes/Music/Discog/artists-db/metadata/2-Metadata.p  	12 71142
/Volumes/Music/Discog/artists-db/metadata/20-Metadata.p  	13 76646
/Volumes/M

## Artist ID --> Albums

In [12]:
start, cmt = clock("Creating Artist DBs")

artistIDAlbumNames     = {}
artistIDAlbumRefs      = {}
artistIDCoreAlbumNames = {}
artistIDCoreAlbumRefs  = {}

artistMetadataDBDir = disc.getArtistsMetadataDBDir()
files = findPatternExt(artistMetadataDBDir, pattern="-MediaMetadata", ext='.p')

core = ["Albums"]
nAllAlbums  = 0
nCoreAlbums = 0
for i,ifile in enumerate(files):
    print(ifile,'\t',end="")
    db = getFile(ifile)
    
    for j,(artistID,artistData) in enumerate(db.items()):
        artistIDAlbumNames[artistID]     = {}
        artistIDAlbumRefs[artistID]      = {}
        artistIDCoreAlbumNames[artistID] = {}
        artistIDCoreAlbumRefs[artistID]  = {}
        
        for mediaName,mediaData in artistData.items():
            artistIDAlbumNames[artistID].update({mediaName: mediaData[0]})
            artistIDAlbumRefs[artistID].update({mediaName: mediaData[1]})
            nAllAlbums += len(artistIDAlbumNames[artistID].values())
            if mediaName in core:
                artistIDCoreAlbumNames[artistID].update({mediaName: mediaData[0]})
                artistIDCoreAlbumRefs[artistID].update({mediaName: mediaData[1]})
                nCoreAlbums += len(artistIDCoreAlbumNames[artistID].values())

    print("{0: <10}{1: <10}{2: <10}".format(len(artistIDAlbumNames),nCoreAlbums,nAllAlbums))
print("\n\n==============================================\n")
    
    
savenames = {"IDToAlbumNames": artistIDAlbumNames, "IDToAlbumRefs": artistIDAlbumRefs, 
             "IDToCoreAlbumNames": artistIDCoreAlbumNames, "IDToCoreAlbumRefs": artistIDCoreAlbumRefs}
for basename,savedata in savenames.items():
    savename = setFile(disc.getDiscogDBDir(), "Artist{0}.p".format(basename))
    print("Saving {0} entries to {1}\n".format(len(savedata), savename))
    saveFile(ifile=savename, idata=savedata, debug=True)
    
    
elapsed(start, cmt)

Current Time is Sun Oct 27, 2019 22:18:09 for Creating Artist DBs
/Volumes/Music/Discog/artists-db/metadata/0-MediaMetadata.p 	5278      2905      15285     
/Volumes/Music/Discog/artists-db/metadata/1-MediaMetadata.p 	10851     5917      31991     
/Volumes/Music/Discog/artists-db/metadata/10-MediaMetadata.p 	16298     8888      48108     
/Volumes/Music/Discog/artists-db/metadata/11-MediaMetadata.p 	21747     11842     64132     
/Volumes/Music/Discog/artists-db/metadata/12-MediaMetadata.p 	27279     14854     80260     
/Volumes/Music/Discog/artists-db/metadata/13-MediaMetadata.p 	32855     17847     96494     
/Volumes/Music/Discog/artists-db/metadata/14-MediaMetadata.p 	38259     20780     112245    
/Volumes/Music/Discog/artists-db/metadata/15-MediaMetadata.p 	43780     23734     128810    
/Volumes/Music/Discog/artists-db/metadata/16-MediaMetadata.p 	49248     26699     145090    
/Volumes/Music/Discog/artists-db/metadata/17-MediaMetadata.p 	54826     29754     161413    
/Volum

## Artist ID --> Genre, Style, Artists Lookup Table

In [3]:
start, cmt = clock("Creating Artist DBs")

artistIDGenre          = {}
artistIDStyle          = {}
artistIDCollaborations = {}

albumsMetadataDBDir = disc.getAlbumsMetadataDBDir()
files = findPatternExt(albumsMetadataDBDir, pattern="-ArtistMetadata", ext='.p')

for ifile in files:
    print(ifile,'\t',end="")
    for artistID,artistData in getFile(ifile).items():
        genre   = artistData['Genre']
        artistIDGenre[artistID] = genre
        artists = artistData['Artists']
        artistIDCollaborations[artistID] = artists
        style   = artistData['Style']
        artistIDStyle[artistID] = style
    print(len(artistIDGenre))
print("\n\n==============================================\n")
    
    
savenames = {"IDToGenre": artistIDGenre, "IDToStyle": artistIDStyle, "IDToCollaborations": artistIDCollaborations}
for basename,savedata in savenames.items():
    savename = setFile(disc.getDiscogDBDir(), "Artist{0}.p".format(basename))
    print("Saving {0} entries to {1}\n".format(len(savedata), savename))
    saveFile(ifile=savename, idata=savedata, debug=True)   
    
elapsed(start, cmt)

Current Time is Mon Nov 04, 2019 19:28:19 for Creating Artist DBs
/Volumes/Music/Discog/albums-db/metadata/0-ArtistMetadata.p 	3697
/Volumes/Music/Discog/albums-db/metadata/1-ArtistMetadata.p 	7656
/Volumes/Music/Discog/albums-db/metadata/10-ArtistMetadata.p 	10556
/Volumes/Music/Discog/albums-db/metadata/11-ArtistMetadata.p 	14430
/Volumes/Music/Discog/albums-db/metadata/12-ArtistMetadata.p 	19495
/Volumes/Music/Discog/albums-db/metadata/13-ArtistMetadata.p 	22738
/Volumes/Music/Discog/albums-db/metadata/14-ArtistMetadata.p 	28307
/Volumes/Music/Discog/albums-db/metadata/15-ArtistMetadata.p 	34087
/Volumes/Music/Discog/albums-db/metadata/16-ArtistMetadata.p 	38735
/Volumes/Music/Discog/albums-db/metadata/17-ArtistMetadata.p 	42689
/Volumes/Music/Discog/albums-db/metadata/18-ArtistMetadata.p 	45684
/Volumes/Music/Discog/albums-db/metadata/19-ArtistMetadata.p 	51331
/Volumes/Music/Discog/albums-db/metadata/2-ArtistMetadata.p 	55057
/Volumes/Music/Discog/albums-db/metadata/20-ArtistMetad

## Album ID --> Name, Ref, Artists Lookup Table

In [4]:
start, cmt = clock("Creating Artist DBs")

albumIDToName    = {}
albumIDToRef     = {}
albumIDToArtists = {}
    
albumsMetadataDBDir = disc.getAlbumsMetadataDBDir()
files = findPatternExt(albumsMetadataDBDir, pattern="-ArtistAlbums", ext='.p')
for ifile in files:
    print(ifile,'\t',end="")
    for artistID,artistData in getFile(ifile).items():
        for albumID,albumData in artistData.items():
            albumName    = albumData[0]
            albumRef     = albumData[1]
            albumCountry = albumData[2].most_common(1)[0]
            albumYear    = albumData[3].most_common(1)[0]

            
            albumIDToName[albumID] = albumName
            albumIDToRef[albumID]  = albumRef

            if albumIDToArtists.get(albumID) is None:                
                albumIDToArtists[albumID] = []
            albumIDToArtists[albumID].append(artistID)
    print(len(albumIDToArtists))
print("\n\n==============================================\n")

for albumID in albumIDToArtists.keys():
    albumIDToArtists[albumID] = list(set(albumIDToArtists[albumID]))
print("\n\n==============================================\n")

    
savenames = {"IDToName": albumIDToName, "IDToRef": albumIDToRef, "IDToArtists": albumIDToArtists}
for basename,savedata in savenames.items():
    savename = setFile(disc.getDiscogDBDir(), "Album{0}.p".format(basename))
    print("Saving {0} entries to {1}\n".format(len(savedata), savename))
    saveFile(ifile=savename, idata=savedata, debug=True) 
    
elapsed(start, cmt)

Current Time is Thu Nov 07, 2019 18:57:12 for Creating Artist DBs
/Volumes/Music/Discog/albums-db/metadata/0-ArtistAlbums.p 	14621
/Volumes/Music/Discog/albums-db/metadata/1-ArtistAlbums.p 	31674
/Volumes/Music/Discog/albums-db/metadata/10-ArtistAlbums.p 	41053
/Volumes/Music/Discog/albums-db/metadata/11-ArtistAlbums.p 	54762
/Volumes/Music/Discog/albums-db/metadata/12-ArtistAlbums.p 	69554
/Volumes/Music/Discog/albums-db/metadata/13-ArtistAlbums.p 	79949
/Volumes/Music/Discog/albums-db/metadata/14-ArtistAlbums.p 	101393
/Volumes/Music/Discog/albums-db/metadata/15-ArtistAlbums.p 	117568
/Volumes/Music/Discog/albums-db/metadata/16-ArtistAlbums.p 	134020
/Volumes/Music/Discog/albums-db/metadata/17-ArtistAlbums.p 	145568
/Volumes/Music/Discog/albums-db/metadata/18-ArtistAlbums.p 	160732
/Volumes/Music/Discog/albums-db/metadata/19-ArtistAlbums.p 	179266
/Volumes/Music/Discog/albums-db/metadata/2-ArtistAlbums.p 	194073
/Volumes/Music/Discog/albums-db/metadata/20-ArtistAlbums.p 	204980
/Volu

# Pandas DB

## Artist DB

In [24]:
def getArtistDB():
    start, cmt = clock("\n===================================== Creating Artist DB =====================================")
    from pandas import Series, DataFrame
    print("Loading ArtistID Data")
    artistIDtoName  = Series(disc.getArtistIDToNameData())
    artistIDtoRef   = Series(disc.getArtistIDToRefData())
    artistIDToVariations  = Series(disc.getArtistIDToVariationsData())

    print("Creating Pandas DataFrame for {0} Artists".format(artistIDtoName.shape[0]))
    cols = ["Name"]
    discdf = DataFrame(artistIDtoName)
    discdf.columns = cols
    print("\tShape --> {0}".format(discdf.shape))

    print("  Joining Ref")
    discdf = discdf.join(DataFrame(artistIDtoRef))
    cols += ["Ref"]
    discdf.columns = cols
    print("\tShape --> {0}".format(discdf.shape))

    print("  Joining Variations")
    discdf = discdf.join(DataFrame(artistIDToVariations))
    cols += ["Variations"]
    discdf.columns = cols
    print("\tShape --> {0}".format(discdf.shape))

    discdf["Known"] = True
    
    print("  Finding Real Artist Name")
    discdf[["Artist", "Num"]] = DataFrame(discArtistDB['Name'].apply(realName).tolist(), index=discArtistDB.index)
    print("\tShape --> {0}".format(discdf.shape))

    

    print("DataFrame Shape is {0}".format(discdf.shape))
    elapsed(start, cmt)
    
    return discdf

In [25]:
discArtistDB = getArtistDB()
discArtistDB.head()

Current Time is Thu Nov 07, 2019 19:50:21 for 
Loading ArtistID Data
Loading data from /Volumes/Music/Discog/db/ArtistIDToName.p
  --> This file is 10.3MB.
Loading /Volumes/Music/Discog/db/ArtistIDToName.p
Loading data from /Volumes/Music/Discog/db/ArtistIDToRef.p
  --> This file is 12.2MB.
Loading /Volumes/Music/Discog/db/ArtistIDToRef.p
Loading data from /Volumes/Music/Discog/db/ArtistIDToVariations.p
  --> This file is 21.0MB.
Loading /Volumes/Music/Discog/db/ArtistIDToVariations.p
Creating Pandas DataFrame for 544462 Artists
	Shape --> (544462, 1)
  Joining Ref
	Shape --> (544462, 2)
  Joining Variations
	Shape --> (544462, 3)
  Finding Real Artist Name
	Shape --> (544462, 6)
DataFrame Shape is (544462, 6)
Current Time is Thu Nov 07, 2019 19:50:56 for Done with 
Process [{0}] took 35 seconds.


## Artist Metadata DB

In [27]:
def getArtistMetadataDB():
    start, cmt = clock("\n===================================== Creating Artist Metadata DB =====================================")
    from pandas import Series, DataFrame
    print("Loading ArtistID Data")
    artistIDtoGenre          = Series(disc.getArtistIDToGenreData())
    artistIDtoStyle          = Series(disc.getArtistIDToStyleData())
    artistIDToCollaboration  = Series(disc.getArtistIDToCollaborationData())

    print("Creating Pandas DataFrame for {0} Artists".format(artistIDtoGenre.shape[0]))
    cols = ["Genre"]
    discdf = DataFrame(artistIDtoGenre)
    discdf.columns = cols
    print("\tShape --> {0}".format(discdf.shape))

    print("  Joining Style")
    discdf = discdf.join(DataFrame(artistIDtoStyle))
    cols += ["Style"]
    discdf.columns = cols
    print("\tShape --> {0}".format(discdf.shape))

    print("  Joining Collaboration")
    discdf = discdf.join(DataFrame(artistIDToCollaboration))
    cols += ["Collaboration"]
    discdf.columns = cols
    print("\tShape --> {0}".format(discdf.shape))

    print("DataFrame Shape is {0}".format(discdf.shape))
    elapsed(start, cmt)
    
    return discdf

In [28]:
discArtistMetadataDB = getArtistMetadataDB()
discArtistMetadataDB.head()

Current Time is Thu Nov 07, 2019 19:53:21 for 
Loading ArtistID Data
Loading data from /Volumes/Music/Discog/db/ArtistIDToGenre.p
  --> This file is 9.0MB.
Loading /Volumes/Music/Discog/db/ArtistIDToGenre.p
Loading data from /Volumes/Music/Discog/db/ArtistIDToStyle.p
  --> This file is 13.1MB.
Loading /Volumes/Music/Discog/db/ArtistIDToStyle.p
Loading data from /Volumes/Music/Discog/db/ArtistIDToCollaborations.p
  --> This file is 28.9MB.
Loading /Volumes/Music/Discog/db/ArtistIDToCollaborations.p
Creating Pandas DataFrame for 426594 Artists
	Shape --> (426594, 1)
  Joining Style
	Shape --> (426594, 2)
  Joining Collaboration
	Shape --> (426594, 3)
DataFrame Shape is (426594, 3)
Current Time is Thu Nov 07, 2019 19:54:32 for Done with 
Process [{0}] took 1.2 minutes.


Unnamed: 0,Genre,Style,Collaboration
1000,"{'Electronic': 53, 'Hip Hop': 1, 'Non-Music': ...","{'Techno': 45, 'Electro': 16, 'Downtempo': 1, ...","{'Dave Clarke': 52, 'Bang The Future': 1, 'Mr...."
1000500,{},{},{}
100200,{'Hip Hop': 4},{'Conscious': 4},"{'Dike': 4, 'ABS (2)': 2, 'Creutzfeld&Jakob': ..."
1002000,{},{},{}
1005400,{'Rock': 3},"{'Alternative Rock': 1, 'Indie Rock': 1, 'Math...","{'Gapeseed': 3, 'Gerling': 1}"


## Artist Albums DB

In [79]:
def getArtistAlbumsDB(loadRefs=False):
    start, cmt = clock("\n===================================== Creating Artist Albums DB =====================================")
    from pandas import Series, DataFrame
    print("Loading ArtistID Data")
    artistIDtoAlbumNames  = Series(disc.getArtistIDToAlbumNamesData())
    if loadRefs:
        artistIDtoAlbumRefs   = Series(disc.getArtistIDToAlbumRefsData())

    print("Creating Pandas DataFrame for {0} Artists".format(artistIDtoAlbumNames.shape[0]))
    cols = ["Albums"]
    discdf = DataFrame(artistIDtoAlbumNames)
    discdf.columns = cols
    print("\tShape --> {0}".format(discdf.shape))

    print("DataFrame Shape is {0}".format(discdf.shape))
    elapsed(start, cmt)
    
    return discdf

In [80]:
discArtistAlbumsDB = getArtistAlbumsDB()
discArtistAlbumsDB.head()

Current Time is Thu Nov 07, 2019 20:59:44 for 
Loading ArtistID Data
Loading data from /Volumes/Music/Discog/db/ArtistIDToAlbumNames.p
  --> This file is 161.6MB.
Loading /Volumes/Music/Discog/db/ArtistIDToAlbumNames.p
Creating Pandas DataFrame for 544462 Artists
	Shape --> (544462, 1)
DataFrame Shape is (544462, 1)
Current Time is Thu Nov 07, 2019 21:02:38 for Done with 
Process [{0}] took 2.9 minutes.


Unnamed: 0,Albums
1000,"{'Albums': {'44815': 'Archive One', '44831': '..."
1000500,{'Miscellaneous': {'1178875': 'Peak Controller'}}
100200,"{'Albums': {'50511': 'PottpÃ¼ree'}, 'Singles &..."
1002000,{'Instruments & Performance': {'1002108': 'Are...
1002600,{'Vocals': {'1180905': 'Quiero'}}


## Albums DB

In [32]:
def getAlbumDB():
    start, cmt = clock("\n===================================== Creating Artist Album DB =====================================")
    from pandas import Series, DataFrame
    print("Loading AlbumID Data")
    albumIDtoName    = Series(disc.getAlbumIDToNameData())
    albumIDtoRef     = Series(disc.getAlbumIDToRefData())
    albumIDToArtists = Series(disc.getAlbumIDToArtistsData())

    print("Creating Pandas DataFrame for {0} Albums".format(albumIDtoName.shape[0]))
    cols = ["Name"]
    discdf = DataFrame(albumIDtoName)
    discdf.columns = cols
    print("\tShape --> {0}".format(discdf.shape))

    print("  Joining Ref")
    discdf = discdf.join(DataFrame(albumIDtoRef))
    cols += ["Ref"]
    discdf.columns = cols
    print("\tShape --> {0}".format(discdf.shape))

    print("  Joining Artists")
    discdf = discdf.join(DataFrame(albumIDToArtists))
    cols += ["Artists"]
    discdf.columns = cols
    print("\tShape --> {0}".format(discdf.shape))

    print("DataFrame Shape is {0}".format(discdf.shape))
    elapsed(start, cmt)
    
    return discdf

In [33]:
discAlbumDB = getAlbumDB()
discAlbumDB.head()

Current Time is Thu Nov 07, 2019 20:05:29 for 
Loading AlbumID Data
Loading data from /Volumes/Music/Discog/db/AlbumIDToName.p
  --> This file is 30.2MB.
Loading /Volumes/Music/Discog/db/AlbumIDToName.p
Loading data from /Volumes/Music/Discog/db/AlbumIDToRef.p
  --> This file is 42.4MB.
Loading /Volumes/Music/Discog/db/AlbumIDToRef.p
Loading data from /Volumes/Music/Discog/db/AlbumIDToArtists.p
  --> This file is 17.2MB.
Loading /Volumes/Music/Discog/db/AlbumIDToArtists.p
Creating Pandas DataFrame for 1302715 Albums
	Shape --> (1302715, 1)
  Joining Ref
	Shape --> (1302715, 2)
  Joining Artists
	Shape --> (1302715, 3)
DataFrame Shape is (1302715, 3)


Unnamed: 0,Name,Ref,Artists
1012077,Live,/Dave-Clarke-Live/master/1012077,[1000]
1258261,Walls Of Genius And Miracle,/Walls-Of-Genius-and-Miracle-Walls-Of-Genius-A...,"[1000, 182635, 1009227]"
2546978,RA.EX007 Dave Clarke,/Dave-Clarke-RAEX007-Dave-Clarke/release/2546978,[1000]
44815,Archive One,/Dave-Clarke-Archive-One/master/44815,[1000]
44831,Devil's Advocate,/Dave-Clarke-Devils-Advocate/master/44831,[1000]


## Artist Album ID --> Known Albums

In [86]:
def getArtistAlbumKnownDB(discArtistAlbumsDB):
    start, cmt = clock("\n===================================== Creating Artist Album DB =====================================")
    from pandas import Series, DataFrame
    
    tmpdb = discArtistAlbumsDB["Albums"].copy()
    print("Creating Pandas DataFrame for {0} Arist Albums".format(tmpdb.shape[0]))
    discdf = DataFrame(tmpdb.apply(isKnownAlbum).tolist(), index=tmpdb.index)
    discdf.columns = ["Known Albums", "All Albums", "Albums"]
    print("\tShape --> {0}".format(discdf.shape))
    
    print("DataFrame Shape is {0}".format(discdf.shape))
    elapsed(start, cmt)
    
    return discdf
    
    
def isKnownAlbum(x):
    retval = {}
    albumSummary = [0, 0]
    for mediaType in x.keys():
        for albumID in x[mediaType].keys():
            albumName = x[mediaType][albumID]
            #print(mediaType,albumID,albumName,'\t\t',end="")
            known     = albumID in discAlbumDB.index
            #print(known)
            
            retval[albumID] = [albumName, mediaType, known]
            albumSummary[0] += known
            albumSummary[1] += 1
            
    return [albumSummary[0], albumSummary[1], retval]



In [87]:
discArtistAlbumKnownDB = getArtistAlbumKnownDB(discArtistAlbumsDB)
discArtistAlbumKnownDB.head()

Current Time is Thu Nov 07, 2019 21:19:24 for 
Creating Pandas DataFrame for 544462 Arist Albums
	Shape --> (544462, 3)
DataFrame Shape is (544462, 3)


Unnamed: 0,Known Albums,All Albums,Albums
1000,53,53,"{'44815': ['Archive One', 'Albums', True], '44..."
1000500,0,1,"{'1178875': ['Peak Controller', 'Miscellaneous..."
100200,2,9,"{'50511': ['PottpÃ¼ree', 'Albums', True], '219..."
1002000,2,3,"{'1002108': ['Are You Faithful?', 'Instruments..."
1002600,0,1,"{'1180905': ['Quiero', 'Vocals', False]}"


# Joining Artist ID DataFrame

In [94]:
print("Creating Pandas DataFrame for {0} Arist IDs".format(discArtistDB.shape[0]))
print("  Joining Artist Metadata")
discdf = discArtistDB.join(discArtistMetadataDB)
print("\tShape --> {0}".format(discdf.shape))
print("  Joining Artist Albums")
discdf = discdf.join(discArtistAlbumKnownDB)
print("\tShape --> {0}".format(discdf.shape))
discdf.head()

Creating Pandas DataFrame for 544462 Arist IDs
  Joining Artist Metadata
	Shape --> (544462, 9)
  Joining Artist Albums
	Shape --> (544462, 12)


Unnamed: 0,Name,Ref,Variations,Known,Artist,Num,Genre,Style,Collaboration,Known Albums,All Albums,Albums
1000,Dave Clarke,/artist/1000-Dave-Clarke,"[Dave Clarke, 7 Red 7, Clarck, Clarke, Clarke,...",True,Dave Clarke,,"{'Electronic': 53, 'Hip Hop': 1, 'Non-Music': ...","{'Techno': 45, 'Electro': 16, 'Downtempo': 1, ...","{'Dave Clarke': 52, 'Bang The Future': 1, 'Mr....",53,53,"{'44815': ['Archive One', 'Albums', True], '44..."
1000500,Club Pulse,/artist/1000500-Club-Pulse,[Club Pulse],True,Club Pulse,,{},{},{},0,1,"{'1178875': ['Peak Controller', 'Miscellaneous..."
100200,Dike,/artist/100200-Dike,"[Dike, D.I.K.E., Dike D, Uchegdu]",True,Dike,,{'Hip Hop': 4},{'Conscious': 4},"{'Dike': 4, 'ABS (2)': 2, 'Creutzfeld&Jakob': ...",2,9,"{'50511': ['PottpÃ¼ree', 'Albums', True], '219..."
1002000,Larry Stokes,/artist/1002000-Larry-Stokes,[Larry Stokes],True,Larry Stokes,,{},{},{},2,3,"{'1002108': ['Are You Faithful?', 'Instruments..."
1002600,Gidd Sanchez,/artist/1002600-Gidd-Sanchez,[Gidd Sanchez],True,Gidd Sanchez,,,,,0,1,"{'1180905': ['Quiero', 'Vocals', False]}"


# Save Master Database DataFrame

In [95]:
savename = disc.getMasterDiscogsDBFilename()
saveFile(idata=discdf, ifile=savename, debug=True)

Saving data to /Volumes/Music/Discog/db/MasterDB.p
  --> This file is 516.3MB.
Saved data to /Volumes/Music/Discog/db/MasterDB.p
  --> This file is 516.3MB.


# Missing / ToDo

In [None]:
from fsUtils import removeFile

In [None]:
%load_ext autoreload
%autoreload
disc = discogs()
arts = artists(disc)

toget = discdf[discdf['Name'].isna()]['Ref'].to_dict()
for artistID,artistRef in toget.items():
    url = arts.getArtistURL(artistRef)
    savename = arts.getArtistSavename(artistID)
    if isFile(savename):
        removeFile(savename)
    try:
        arts.downloadArtistURL(url=url, savename=savename, debug=True, force=True)
    except:
        continue
