# Master Discogs Database

In [60]:
## Basic stuff
%load_ext autoreload
%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))

## Python Version
import sys
print("Python: {0}".format(sys.version))

from ioUtils import saveJoblib, loadJoblib, saveFile, getFile
import urllib
from urllib.parse import quote

from discogsBase import discogs
from discogsUtils import discogsUtils
from collection import collections
from artist import artist
from searchUtils import findExt, findSubExt, findPatternExt
from timeUtils import clock, elapsed
from fsUtils import moveFile, setFile, setDir, setSubDir, isFile, isDir, mkDir
from fileUtils import getFileBasics, getBasename
from artists import artists
from artist import artist
from albums import albums
from album import album, albumURLInfo
from time import sleep

from masterdb import discConv, isKnownAlbum, directoryName

import datetime as dt
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Python: 3.7.3 (default, Mar 27 2019, 16:54:48) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
Notebook Last Run Initiated: 2019-12-14 18:32:23.343038


In [2]:
# Create the shared helper objects used by the rest of the notebook.
# The autoreload magics are repeated here; when the extension is already loaded
# IPython prints the "already loaded" notice seen in this cell's output.
%load_ext autoreload
%autoreload
# `disc` is handed to artists()/albums() below and is the object every later
# cell asks for directories and lookup data.
disc = discogs()
arts = artists(disc)
art  = artist()
albs = albums(disc)
alb  = album()
dutils = discogsUtils()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Saved Discog Directory /Volumes/Music/Discog is Available
Local Discog Directory /Users/tgadfort/Music/Discog is Available
/Volumes/Music/Discog/collections exists
/Volumes/Music/Discog/artists exists
/Volumes/Music/Discog/albums exists
/Volumes/Music/Discog/collections-db exists
/Volumes/Music/Discog/artists-db exists
/Volumes/Music/Discog/albums-db exists
/Volumes/Music/Discog/artists-db/metadata exists
/Volumes/Music/Discog/albums-db/metadata exists
/Volumes/Music/Discog/diagnostic exists
/Volumes/Music/Discog/db exists


# Create Dictionary Lookup Files

## Artist ID --> Ref and Name

In [9]:
# Build three ArtistID-keyed lookup tables (name, ref, variations) from every
# "-Metadata" shard in the artist metadata directory, then save each table
# as Artist<suffix>.p in the Discog DB directory.
start, cmt = clock("Creating Artist DBs")
from searchUtils import findPatternExt

artistIDToName       = {}
artistIDToRef        = {}
artistIDToVariations = {}

artistMetadataDBDir = disc.getArtistsMetadataDBDir()
files = findPatternExt(artistMetadataDBDir, pattern="-Metadata", ext='.p')

for shardIndex, shardFile in enumerate(files):
    print(shardFile,' \t',end="")
    shard = getFile(shardFile)
    # Shard values are read positionally: [0] name, [1] ref, [2] variations.
    for artistID, record in shard.items():
        artistIDToName[artistID]       = record[0]
        artistIDToRef[artistID]        = record[1]
        artistIDToVariations[artistID] = record[2]

    # Running total of distinct artist IDs seen so far.
    print(shardIndex,len(artistIDToName))
print("\n\n==============================================\n")

# Persist in the same order as before: Ref, Name, Variations.
savenames = {"IDToRef": artistIDToRef, "IDToName": artistIDToName, "IDToVariations": artistIDToVariations}
for suffix in savenames:
    table    = savenames[suffix]
    savename = setFile(disc.getDiscogDBDir(), "Artist{0}.p".format(suffix))
    print("Saving {0} entries to {1}\n".format(len(table), savename))
    saveFile(ifile=savename, idata=table, debug=True)

elapsed(start, cmt)

Current Time is Thu Dec 12, 2019 20:01:20 for Creating Artist DBs
/Volumes/Music/Discog/artists-db/metadata/0-Metadata.p  	0 5799
/Volumes/Music/Discog/artists-db/metadata/1-Metadata.p  	1 11840
/Volumes/Music/Discog/artists-db/metadata/10-Metadata.p  	2 17717
/Volumes/Music/Discog/artists-db/metadata/11-Metadata.p  	3 23668
/Volumes/Music/Discog/artists-db/metadata/12-Metadata.p  	4 29655
/Volumes/Music/Discog/artists-db/metadata/13-Metadata.p  	5 35669
/Volumes/Music/Discog/artists-db/metadata/14-Metadata.p  	6 41527
/Volumes/Music/Discog/artists-db/metadata/15-Metadata.p  	7 47533
/Volumes/Music/Discog/artists-db/metadata/16-Metadata.p  	8 53440
/Volumes/Music/Discog/artists-db/metadata/17-Metadata.p  	9 59434
/Volumes/Music/Discog/artists-db/metadata/18-Metadata.p  	10 65426
/Volumes/Music/Discog/artists-db/metadata/19-Metadata.p  	11 71304
/Volumes/Music/Discog/artists-db/metadata/2-Metadata.p  	12 77191
/Volumes/Music/Discog/artists-db/metadata/20-Metadata.p  	13 83151
/Volumes/M

## Artist ID --> Albums

In [None]:
# Build ArtistID -> {mediaName: albums} lookup tables (names and refs), plus the
# same tables restricted to the "core" media types, from every "-MediaMetadata"
# shard. Each table is then saved as Artist<suffix>.p in the Discog DB directory.
start, cmt = clock("Creating Artist DBs")

artistIDAlbumNames     = {}
artistIDAlbumRefs      = {}
artistIDCoreAlbumNames = {}
artistIDCoreAlbumRefs  = {}

artistMetadataDBDir = disc.getArtistsMetadataDBDir()
files = findPatternExt(artistMetadataDBDir, pattern="-MediaMetadata", ext='.p')

# Media types that count as "core" albums.
core = ["Albums"]
nAllAlbums  = 0
nCoreAlbums = 0
for ifile in files:
    print(ifile,'\t',end="")
    db = getFile(ifile)

    for artistID,artistData in db.items():
        albumNames     = {}
        albumRefs      = {}
        coreAlbumNames = {}
        coreAlbumRefs  = {}

        for mediaName,mediaData in artistData.items():
            # mediaData is read positionally: [0] album names, [1] album refs.
            albumNames[mediaName] = mediaData[0]
            albumRefs[mediaName]  = mediaData[1]
            # Count the albums in THIS media type. The previous code added
            # len(<per-artist dict>.values()), i.e. the number of media types
            # accumulated so far, which is not an album count.
            nAllAlbums += len(mediaData[0])
            if mediaName in core:
                coreAlbumNames[mediaName] = mediaData[0]
                coreAlbumRefs[mediaName]  = mediaData[1]
                nCoreAlbums += len(mediaData[0])

        artistIDAlbumNames[artistID]     = albumNames
        artistIDAlbumRefs[artistID]      = albumRefs
        artistIDCoreAlbumNames[artistID] = coreAlbumNames
        artistIDCoreAlbumRefs[artistID]  = coreAlbumRefs

    # Columns: artists so far, core albums so far, all albums so far.
    print("{0: <10}{1: <10}{2: <10}".format(len(artistIDAlbumNames),nCoreAlbums,nAllAlbums))
print("\n\n==============================================\n")


savenames = {"IDToAlbumNames": artistIDAlbumNames, "IDToAlbumRefs": artistIDAlbumRefs,
             "IDToCoreAlbumNames": artistIDCoreAlbumNames, "IDToCoreAlbumRefs": artistIDCoreAlbumRefs}
for basename,savedata in savenames.items():
    savename = setFile(disc.getDiscogDBDir(), "Artist{0}.p".format(basename))
    print("Saving {0} entries to {1}\n".format(len(savedata), savename))
    saveFile(ifile=savename, idata=savedata, debug=True)


elapsed(start, cmt)

## Artist ID --> Genre, Style, Artists Lookup Table

In [None]:
# Build ArtistID -> genre / style / collaborations lookup tables from every
# "-ArtistMetadata" shard in the album metadata directory, then save each
# table as Artist<suffix>.p in the Discog DB directory.
start, cmt = clock("Creating Artist DBs")

artistIDGenre          = {}
artistIDStyle          = {}
artistIDCollaborations = {}

albumsMetadataDBDir = disc.getAlbumsMetadataDBDir()
files = findPatternExt(albumsMetadataDBDir, pattern="-ArtistMetadata", ext='.p')

for ifile in files:
    print(ifile,'\t',end="")
    for artistID,artistData in getFile(ifile).items():
        # Assign straight from the record. The previous code staged the
        # 'Artists' value in a variable named `artists`, which rebound the
        # notebook-global `artists` class imported in the setup cell and broke
        # any later re-run of `arts = artists(disc)`.
        artistIDGenre[artistID]          = artistData['Genre']
        artistIDCollaborations[artistID] = artistData['Artists']
        artistIDStyle[artistID]          = artistData['Style']
    # Running total of artist IDs seen so far.
    print(len(artistIDGenre))
print("\n\n==============================================\n")


savenames = {"IDToGenre": artistIDGenre, "IDToStyle": artistIDStyle, "IDToCollaborations": artistIDCollaborations}
for basename,savedata in savenames.items():
    savename = setFile(disc.getDiscogDBDir(), "Artist{0}.p".format(basename))
    print("Saving {0} entries to {1}\n".format(len(savedata), savename))
    saveFile(ifile=savename, idata=savedata, debug=True)

elapsed(start, cmt)

## Album ID --> Name, Ref, Artists Lookup Table

In [None]:
# Build AlbumID -> name / ref / contributing-artist-IDs lookup tables from every
# "-ArtistAlbums" shard in the album metadata directory, then save each table
# as Album<suffix>.p in the Discog DB directory.
# The clock label previously said "Creating Artist DBs" (copy-paste).
start, cmt = clock("Creating Album DBs")

albumIDToName    = {}
albumIDToRef     = {}
albumIDToArtists = {}

albumsMetadataDBDir = disc.getAlbumsMetadataDBDir()
files = findPatternExt(albumsMetadataDBDir, pattern="-ArtistAlbums", ext='.p')
for ifile in files:
    print(ifile,'\t',end="")
    for artistID,artistData in getFile(ifile).items():
        for albumID,albumData in artistData.items():
            # albumData is read positionally: [0] name, [1] ref.
            # The former country/year lookups (albumData[2] / albumData[3] via
            # .most_common(1)[0]) were never used and raised IndexError when a
            # counter was empty, so they are dropped.
            albumIDToName[albumID] = albumData[0]
            albumIDToRef[albumID]  = albumData[1]
            albumIDToArtists.setdefault(albumID, []).append(artistID)
    # Running total of album IDs seen so far.
    print(len(albumIDToArtists))
print("\n\n==============================================\n")

# Drop duplicate artist IDs per album. dict.fromkeys keeps first-seen order, so
# the saved lists are deterministic across runs (list(set(...)) was not).
for albumID, artistIDs in albumIDToArtists.items():
    albumIDToArtists[albumID] = list(dict.fromkeys(artistIDs))
print("\n\n==============================================\n")


savenames = {"IDToName": albumIDToName, "IDToRef": albumIDToRef, "IDToArtists": albumIDToArtists}
for basename,savedata in savenames.items():
    savename = setFile(disc.getDiscogDBDir(), "Album{0}.p".format(basename))
    print("Saving {0} entries to {1}\n".format(len(savedata), savename))
    saveFile(ifile=savename, idata=savedata, debug=True)

elapsed(start, cmt)

# Master Lookup Tests

In [None]:
# Smoke-test the ArtistID lookup tables: load each one and report its size.
print("Testing ArtistID --> Name")
discdf = disc.getArtistIDToNameData()
print("\tDim = {0}".format(len(discdf)))
print("")

print("Testing ArtistID --> Ref")
discdf = disc.getArtistIDToRefData()
print("\tDim = {0}".format(len(discdf)))
print("")

print("Testing ArtistID --> Variations")
# This check used to call getArtistIDToRefData() a second time (copy-paste), so
# the variations table was never loaded. Read the file that the
# "Artist ID --> Ref and Name" cell writes for the IDToVariations table.
# NOTE(review): swap for the dedicated `disc` getter if one exists.
discdf = getFile(setFile(disc.getDiscogDBDir(), "ArtistIDToVariations.p"))
print("\tDim = {0}".format(len(discdf)))
print("")

In [None]:
# Smoke-test the ArtistID genre / style / collaboration lookups: for each one,
# announce it, load it through `disc`, and report how many entries it holds.
lookupChecks = (
    ("Testing ArtistID --> Genre",          "getArtistIDToGenreData"),
    ("Testing ArtistID --> Style",          "getArtistIDToStyleData"),
    ("Testing ArtistID --> Collaborations", "getArtistIDToCollaborationData"),
)
for banner, getterName in lookupChecks:
    print(banner)
    # Resolve the getter only after the banner is printed, as before.
    discdf = getattr(disc, getterName)()
    print("\tDim = {0}".format(len(discdf)))
    print("")

In [None]:
# Smoke-test the four ArtistID album lookups (all/core x names/refs): announce
# each one, load it through `disc`, and report how many entries it holds.
albumLookupChecks = (
    ("Testing ArtistID --> Album Names",      "getArtistIDToAlbumNamesData"),
    ("Testing ArtistID --> Album Refs",       "getArtistIDToAlbumRefsData"),
    ("Testing ArtistID --> Core Album Names", "getArtistIDToCoreAlbumNamesData"),
    ("Testing ArtistID --> Core Album Refs",  "getArtistIDToCoreAlbumRefsData"),
)
for banner, getterName in albumLookupChecks:
    print(banner)
    # Resolve the getter only after the banner is printed, as before.
    discdf = getattr(disc, getterName)()
    print("\tDim = {0}".format(len(discdf)))
    print("")

In [None]:
# Smoke-test the AlbumID lookups (names, refs, artists): announce each one,
# load it through `disc`, and report how many entries it holds.
albumIDChecks = (
    ("Testing AlbumID --> Album Names", "getAlbumIDToNameData"),
    ("Testing AlbumID --> Album Refs",  "getAlbumIDToRefData"),
    ("Testing AlbumID --> Artists",     "getAlbumIDToArtistsData"),
)
for banner, getterName in albumIDChecks:
    print(banner)
    # Resolve the getter only after the banner is printed, as before.
    discdf = getattr(disc, getterName)()
    print("\tDim = {0}".format(len(discdf)))
    print("")

# Pandas DB

## Slim Artist DB

In [24]:
from masterdb import getSlimArtistDB

In [25]:
# Build the slim artist table from `disc` and preview its first rows.
# Per the recorded output, the call loads the ArtistIDToName/Ref files and
# also saves a "Master Artist DB" file as a side effect.
discSlimArtistDB = getSlimArtistDB(disc)
discSlimArtistDB.head()

Current Time is Sat Dec 14, 2019 17:41:14 for 
Loading ArtistID Data
Loading data from /Volumes/Music/Discog/db/ArtistIDToName.p
  --> This file is 11.2MB.
Loading /Volumes/Music/Discog/db/ArtistIDToName.p
Loading data from /Volumes/Music/Discog/db/ArtistIDToRef.p
  --> This file is 13.3MB.
Loading /Volumes/Music/Discog/db/ArtistIDToRef.p
Creating Pandas DataFrame for 589990 Artists
	Shape --> (589990, 1)
  Finding Real Artist Name
	Shape --> (589990, 3)
  Removing None Artist
	Shape --> (589990, 3)
  Finding Disc Artist Name
	Shape --> (589990, 4)
DataFrame Shape is (589990, 4)
Current Time is Sat Dec 14, 2019 17:41:30 for Done with 
Process [{0}] took 16 seconds.
Saving Master Artist DB File
  --> This file is 47.0MB.


Unnamed: 0,Name,Artist,Num,DiscArtist
1000,Dave Clarke,Dave Clarke,,Dave Clarke
1000500,Club Pulse,Club Pulse,,Club Pulse
100200,Dike,Dike,,Dike
1002000,Larry Stokes,Larry Stokes,,Larry Stokes
1002600,Gidd Sanchez,Gidd Sanchez,,Gidd Sanchez


## Artist DB

In [29]:
from masterdb import getArtistDB

In [31]:
# Build the full artist table from `disc` and preview its first rows.
# `discArtistDB` is consumed later by createMasterDB(...).
discArtistDB = getArtistDB(disc)
discArtistDB.head()

Current Time is Sat Dec 14, 2019 17:44:20 for 
Loading ArtistID Data
Loading data from /Volumes/Music/Discog/db/ArtistIDToName.p
  --> This file is 11.2MB.
Loading /Volumes/Music/Discog/db/ArtistIDToName.p
Loading data from /Volumes/Music/Discog/db/ArtistIDToRef.p
  --> This file is 13.3MB.
Loading /Volumes/Music/Discog/db/ArtistIDToRef.p
Loading data from /Volumes/Music/Discog/db/ArtistIDToVariations.p
  --> This file is 22.6MB.
Loading /Volumes/Music/Discog/db/ArtistIDToVariations.p
Creating Pandas DataFrame for 589990 Artists
	Shape --> (589990, 1)
  Joining Ref
	Shape --> (589990, 2)
  Joining Variations
	Shape --> (589990, 3)
  Finding Real Artist Name
	Shape --> (589990, 6)
DataFrame Shape is (589990, 6)
Current Time is Sat Dec 14, 2019 17:44:53 for Done with 
Process [{0}] took 32 seconds.
Saving Master Artist DB File
  --> This file is 88.9MB.


Unnamed: 0,Name,Ref,Variations,Known,Artist,Num
1000,Dave Clarke,/artist/1000-Dave-Clarke,"[Dave Clarke, 7 Red 7, Clarck, Clarke, Clarke,...",True,Dave Clarke,
1000500,Club Pulse,/artist/1000500-Club-Pulse,[Club Pulse],True,Club Pulse,
100200,Dike,/artist/100200-Dike,"[Dike, D.I.K.E., Dike D, Uchegdu]",True,Dike,
1002000,Larry Stokes,/artist/1002000-Larry-Stokes,[Larry Stokes],True,Larry Stokes,
1002600,Gidd Sanchez,/artist/1002600-Gidd-Sanchez,[Gidd Sanchez],True,Gidd Sanchez,


## Artist Metadata DB

In [34]:
from masterdb import getArtistMetadataDB

In [35]:
# Build the artist metadata table (Genre / Style / Collaboration columns in the
# recorded output) from `disc` and preview its first rows.
# `discArtistMetadataDB` is consumed later by createMasterDB(...).
discArtistMetadataDB = getArtistMetadataDB(disc)
discArtistMetadataDB.head()

Current Time is Sat Dec 14, 2019 17:47:09 for 
Loading ArtistID Data
Loading data from /Volumes/Music/Discog/db/ArtistIDToGenre.p
  --> This file is 9.3MB.
Loading /Volumes/Music/Discog/db/ArtistIDToGenre.p
Loading data from /Volumes/Music/Discog/db/ArtistIDToStyle.p
  --> This file is 13.6MB.
Loading /Volumes/Music/Discog/db/ArtistIDToStyle.p
Loading data from /Volumes/Music/Discog/db/ArtistIDToCollaborations.p
  --> This file is 30.4MB.
Loading /Volumes/Music/Discog/db/ArtistIDToCollaborations.p
Creating Pandas DataFrame for 442853 Artists
	Shape --> (442853, 1)
  Joining Style
	Shape --> (442853, 2)
  Joining Collaboration
	Shape --> (442853, 3)
DataFrame Shape is (442853, 3)
Current Time is Sat Dec 14, 2019 17:48:11 for Done with 
Process [{0}] took 1.0 minutes.
Saving Master Artist Metadata DB File
  --> This file is 90.5MB.


Unnamed: 0,Genre,Style,Collaboration
1000,"{'Electronic': 53, 'Hip Hop': 1, 'Non-Music': ...","{'Techno': 45, 'Electro': 16, 'Downtempo': 1, ...","{'Dave Clarke': 52, 'Bang The Future': 1, 'Mr...."
1000500,{'Electronic': 1},{'Progressive Trance': 1},{'Club Pulse': 1}
100200,{'Hip Hop': 11},"{'Conscious': 7, 'Pop Rap': 2, '': 3}","{'Dike': 11, 'ABS (2)': 2, 'Creutzfeld&Jakob':..."
1002000,{},{},{}
1005400,{'Rock': 5},"{'Alternative Rock': 1, 'Indie Rock': 1, 'Math...","{'Gapeseed': 5, 'Gerling': 1}"


## Artist Albums DB

In [37]:
from masterdb import getArtistAlbumsDB

In [39]:
# Build the per-artist albums table from `disc` and preview its first rows.
# `discArtistAlbumsDB` is an input to getArtistAlbumKnownDB(...) further down.
discArtistAlbumsDB = getArtistAlbumsDB(disc)
discArtistAlbumsDB.head()

Current Time is Sat Dec 14, 2019 17:50:16 for 
Loading ArtistID Data
Loading data from /Volumes/Music/Discog/db/ArtistIDToAlbumNames.p
  --> This file is 168.0MB.
Loading /Volumes/Music/Discog/db/ArtistIDToAlbumNames.p
Creating Pandas DataFrame for 589677 Artists
	Shape --> (589677, 1)
DataFrame Shape is (589677, 1)
Current Time is Sat Dec 14, 2019 17:51:53 for Done with 
Process [{0}] took 1.6 minutes.


Unnamed: 0,Albums
1000,"{'Albums': {'44815': 'Archive One', '44831': '..."
1000500,{'Miscellaneous': {'1178875': 'Peak Controller'}}
100200,"{'Albums': {'50511': 'PottpÃ¼ree'}, 'Singles &..."
1002000,{'Instruments & Performance': {'1002108': 'Are...
1002600,{'Vocals': {'1180905': 'Quiero'}}


## Albums DB

In [45]:
from masterdb import getAlbumDB

In [46]:
# Build the album table (Name / Ref / Artists columns in the recorded output)
# from `disc` and preview its first rows.
# `discAlbumDB` is an input to getArtistAlbumKnownDB(...) further down.
discAlbumDB = getAlbumDB(disc)
discAlbumDB.head()

Current Time is Sat Dec 14, 2019 17:53:02 for 
Loading AlbumID Data
Loading data from /Volumes/Music/Discog/db/AlbumIDToName.p
  --> This file is 31.9MB.
Loading /Volumes/Music/Discog/db/AlbumIDToName.p
Loading data from /Volumes/Music/Discog/db/AlbumIDToRef.p
  --> This file is 44.8MB.
Loading /Volumes/Music/Discog/db/AlbumIDToRef.p
Loading data from /Volumes/Music/Discog/db/AlbumIDToArtists.p
  --> This file is 18.1MB.
Loading /Volumes/Music/Discog/db/AlbumIDToArtists.p
Creating Pandas DataFrame for 1372657 Albums
	Shape --> (1372657, 1)
  Joining Ref
	Shape --> (1372657, 2)
  Joining Artists
	Shape --> (1372657, 3)
DataFrame Shape is (1372657, 3)
Current Time is Sat Dec 14, 2019 17:54:02 for Done with 
Process [{0}] took 59 seconds.


Unnamed: 0,Name,Ref,Artists
1012077,Live,/Dave-Clarke-Live/master/1012077,[1000]
1258261,Walls Of Genius And Miracle,/Walls-Of-Genius-and-Miracle-Walls-Of-Genius-A...,"[1009227, 1000, 182635]"
2546978,RA.EX007 Dave Clarke,/Dave-Clarke-RAEX007-Dave-Clarke/release/2546978,[1000]
44815,Archive One,/Dave-Clarke-Archive-One/master/44815,[1000]
44831,Devil's Advocate,/Dave-Clarke-Devils-Advocate/master/44831,[1000]


## Artist ID --> Known Albums

In [55]:
from masterdb import getArtistAlbumKnownDB

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
# Combine the album table and the per-artist albums table into the
# known-albums table, then preview its first rows.
# Requires `discAlbumDB` and `discArtistAlbumsDB` from the cells above to exist
# in the kernel; `discArtistAlbumKnownDB` is consumed by createMasterDB(...).
discArtistAlbumKnownDB = getArtistAlbumKnownDB(discAlbumDB, discArtistAlbumsDB)
discArtistAlbumKnownDB.head()

Current Time is Sat Dec 14, 2019 18:29:20 for 
Creating Pandas DataFrame for 589677 Arist Albums
	Shape --> (589677, 3)
DataFrame Shape is (589677, 3)
Current Time is Sat Dec 14, 2019 18:29:51 for Done with 
Process [{0}] took 31 seconds.


Unnamed: 0,Known Albums,All Albums,Albums
1000,53,53,"{'44815': ['Archive One', 'Albums', True], '44..."
1000500,1,1,"{'1178875': ['Peak Controller', 'Miscellaneous..."
100200,9,9,"{'50511': ['PottpÃ¼ree', 'Albums', True], '219..."
1002000,3,3,"{'1002108': ['Are You Faithful?', 'Instruments..."
1002600,1,1,"{'1180905': ['Quiero', 'Vocals', True]}"


# Joining Artist ID DataFrame

In [61]:
from masterdb import createMasterDB

In [62]:
# Join the artist, artist-metadata and known-albums tables into the master DB.
# Requires the three frames built in the cells above to exist in the kernel.
# Per the recorded output, the result is saved to MasterDB.p in the Discog db
# directory.
createMasterDB(disc, discArtistDB, discArtistMetadataDB, discArtistAlbumKnownDB)

Current Time is Sat Dec 14, 2019 18:32:34 for 
Creating Pandas DataFrame for 589990 Arist IDs
  Joining Artist Metadata
	Shape --> (589990, 9)
  Joining Artist Albums
	Shape --> (589990, 12)
Current Time is Sat Dec 14, 2019 18:32:35 for Done with 
Process [{0}] took 902.0 millseconds.
Saving data to /Volumes/Music/Discog/db/MasterDB.p
  --> This file is 543.5MB.
Saved data to /Volumes/Music/Discog/db/MasterDB.p
  --> This file is 543.5MB.
