# Master Discogs Database

In [4]:
## Basic stuff
%load_ext autoreload
%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))

## Python Version
# Print the interpreter version string so the saved cell output records
# which Python produced this run.
import sys
print("Python: {0}".format(sys.version))

from ioUtils import saveJoblib, loadJoblib, saveFile, getFile
import urllib
from urllib.parse import quote

from discogsBase import discogs
from discogsUtils import discogsUtils
from collection import collections
from artist import artist
from timeUtils import clock, elapsed
from fsUtils import moveFile, setFile, setDir, setSubDir, isFile, isDir, mkDir
from fileUtils import getFileBasics, getBasename
from artists import artists
from artist import artist
from albums import albums
from album import album, albumURLInfo
from time import sleep

# Record and print the wall-clock time at which this run of the notebook began.
# `start` is kept as a module-level name so later cells can reference it.
import datetime as dt
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Python: 3.7.3 (default, Mar 27 2019, 16:54:48) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
Notebook Last Run Initiated: 2019-09-14 20:16:23.707550


In [5]:
# NOTE(review): the autoreload extension is already loaded by the first cell, so
# this %load_ext only produces the "already loaded" notice seen in the output.
%load_ext autoreload
# Reload edited project modules before instantiating the helpers below.
%autoreload
# Shared Discogs handle; constructing it prints the directory checks seen in the output.
disc = discogs()
# Artist/album helpers; the plural forms take the shared `disc` handle.
arts = artists(disc)
art  = artist()
albs = albums(disc)
alb  = album()
dutils = discogsUtils()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Saved Discog Directory /Volumes/Music/Discog is Available
Local Discog Directory /Users/tgadfort/Music/Discog is Available
/Volumes/Music/Discog/base exists
/Volumes/Music/Discog/collections exists
/Volumes/Music/Discog/artists exists
/Volumes/Music/Discog/albums exists
/Volumes/Music/Discog/base-db exists
/Volumes/Music/Discog/collections-db exists
/Volumes/Music/Discog/artists-db exists
/Volumes/Music/Discog/albums-db exists
/Volumes/Music/Discog/artists-extra exists
/Volumes/Music/Discog/search exists
/Volumes/Music/Discog/search-artists exists
/Volumes/Music/Discog/db exists
Found 100 artist DB files and that is equal to the max mod value


In [6]:
from pandas import Series, DataFrame

def realName(x):
    """
    Split an artist name into its base name and trailing disambiguation number.

    A name ending in a parenthesised integer, e.g. "John Scott (10)", is split
    into the text before the parenthesis (whitespace-stripped) and that integer.
    The number may have any width ("(3)", "(10)", "(100)", ...).

    Parameters
    ----------
    x : str or None
        Artist name.

    Returns
    -------
    list
        A two-element list [name, num]:
          * [None, -1]  when x is None
          * [x, -1]     when x is empty
          * [base, N]   when x ends in "(N)" and at least two characters
                        precede the opening parenthesis
          * [x, None]   in every other case
    """
    if x is None:
        return [None, -1]
    if len(x) < 1:
        return [x, -1]

    # Only names that end with a closing parenthesis can carry a number suffix.
    if x[-1] != ")":
        return [x, None]

    # Find the parenthesis that opens the trailing group. At least two characters
    # must precede it; this keeps the minimum-length rule of the fixed-width
    # checks this replaces (e.g. "A(1)" is left untouched, "X (3)" is split).
    openPos = x.rfind("(")
    if openPos < 2:
        return [x, None]

    # int() decides whether the group is a number; anything else (for example
    # "(At The Apollo)") means the parenthesis belongs to the name itself.
    try:
        num = int(x[openPos + 1:-1])
    except ValueError:
        return [x, None]

    return [x[:openPos].strip(), num]
            

def getAlbumNames(x):
    """
    Return the values of `x` as a list when `x` is a dict (album ID -> album
    name in this notebook); return an empty list for any other input.
    """
    return list(x.values()) if isinstance(x, dict) else []
    
    
def splitMetaData(x):
    """
    Reduce each counter in a metadata dict to its three most common entries.

    `x` is expected to map a key to an object offering `most_common(n)`
    (e.g. collections.Counter). The result maps each key to the list of
    entries (without their counts) returned by `most_common(3)`.
    Anything that is not a dict yields None.
    """
    if not isinstance(x, dict):
        return None
    return {
        key: [entry[0] for entry in counter.most_common(3)]
        for key, counter in x.items()
    }

def createArtistAlbumsDB(disc):
    """
    Build the per-artist DataFrame from the lookup tables exposed by `disc`.

    Parameters
    ----------
    disc : object
        Must provide getArtistIDToNameData(), getArtistIDToRefData(),
        getArtistIDAlbumNames(), getArtistIDAlbumIDs() and
        getAlbumArtistMetaData(). Each result is turned into a Series and
        joined on its index, so they are presumably mappings keyed by artist
        ID -- verify against the discogs class.

    Returns
    -------
    DataFrame
        Indexed like the artist -> ref table, with columns "Ref", "Name",
        "Extra Artists", "Genres", "Styles", "Albums Data" and "Known"
        (set to True on every row).
    """
    print("Creating ArtistAlbums DB")

    print("  Loading ArtistID Data")
    artistIDtoName  = disc.getArtistIDToNameData()
    artistIDtoRefs  = disc.getArtistIDToRefData()

    # The album ID -> name and album ID -> ref tables are intentionally not
    # loaded here: nothing in this frame is built from them.

    print("  Loading ArtistID <-> AlbumID Data")
    artistIDtoAlbumNames = disc.getArtistIDAlbumNames()
    artistIDtoAlbumIDs   = disc.getArtistIDAlbumIDs()

    print("  Loading Artist MetaData")
    artistMetaData = disc.getAlbumArtistMetaData()

    sArtistToRef  = Series(artistIDtoRefs)
    sArtistToName = Series(artistIDtoName)

    sArtistToAlbums     = Series(artistIDtoAlbumIDs)
    sArtistToAlbumNames = Series(artistIDtoAlbumNames)
    # Pair each artist's album IDs with the album names at the same position.
    # NOTE(review): the pairing is positional, so it assumes both tables list
    # the artists in the same order -- confirm against the data producers.
    sArtistAlbums = Series(
        [dict(zip(albumIDs, albumNames))
         for albumIDs, albumNames in zip(sArtistToAlbums.values, sArtistToAlbumNames)],
        index=sArtistToAlbums.index)

    # Keep only the three most common entries of every metadata counter.
    sArtistMetaData = Series(artistMetaData).apply(splitMetaData)

    print("  Creating Pandas DataFrame for {0} Artists".format(sArtistToRef.shape[0]))
    # Columns are (re)named by position after every join, so the join order
    # below must stay in step with the names appended to `cols`.
    cols = ["Ref"]
    discdf = DataFrame(sArtistToRef)
    discdf.columns = cols
    discdf = discdf.join(DataFrame(sArtistToName))
    cols += ["Name"]
    discdf.columns = cols
    # Expand the per-artist metadata dicts into one column per key.
    # NOTE(review): assumes exactly three keys arriving in the order
    # extra artists, genres, styles -- confirm.
    tmp = DataFrame(DataFrame(sArtistMetaData)[0].tolist())
    tmp.index = sArtistMetaData.index
    discdf = discdf.join(tmp)
    cols += ["Extra Artists", "Genres", "Styles"]
    discdf.columns = cols
    discdf = discdf.join(DataFrame(sArtistAlbums))
    cols += ["Albums Data"]
    discdf.columns = cols
    discdf["Known"] = True
    print("  DataFrame Shape is {0}".format(discdf.shape))
    return discdf


def createArtistName(discdf):
    """
    Add "Artist" and "Artist Num" columns derived from the "Name" column.

    Each name is passed through realName(); the first element of its result
    becomes "Artist" and the second becomes "Artist Num". The frame is
    modified in place and also returned.
    """
    parsed = DataFrame([realName(name) for name in discdf["Name"]],
                       index=discdf.index)
    discdf["Artist"]     = parsed[0]
    discdf["Artist Num"] = parsed[1]
    return discdf


def createAlbums(discdf):
    """
    Add an "Albums" column holding, per row, the list produced by
    getAlbumNames() from that row's "Albums Data" entry.

    The frame is modified in place and also returned.
    """
    albumNames = discdf["Albums Data"].apply(getAlbumNames)
    discdf["Albums"] = albumNames
    return discdf


def createCollectionsDB(disc):
    """
    Build the collections DataFrame from the lookup tables exposed by `disc`.

    Parameters
    ----------
    disc : object
        Must provide getCollectionIDToNameData(), getCollectionIDToRefData()
        (both joined on their index) and getCollectionRefCountsData()
        (matched against the "Ref" column).

    Returns
    -------
    DataFrame
        Indexed like the ID -> ref table, with columns "Ref", "Name" and
        "Counts". Rows whose ref has no entry in the counts table are kept
        with a missing "Counts" value.
    """
    print("Creating Collections DB")

    print("  Loading Collection Data")
    colArtistIDtoName  = disc.getCollectionIDToNameData()
    colArtistIDtoRefs  = disc.getCollectionIDToRefData()
    colArtistReftoCnts = disc.getCollectionRefCountsData()

    sColArtistToRef     = Series(colArtistIDtoRefs)
    sColArtistToName    = Series(colArtistIDtoName)
    sColArtistRefToCnts = Series(colArtistReftoCnts)

    print("  Creating Pandas DataFrame for {0} Artists".format(sColArtistToRef.shape[0]))
    coldiscdf = DataFrame(sColArtistToRef)
    coldiscdf.columns = ["Ref"]
    coldiscdf = coldiscdf.join(DataFrame(sColArtistToName))
    coldiscdf.columns = ["Ref", "Name"]

    # Turn the ref -> count mapping into a two-column frame for merging.
    colrefdf = DataFrame(sColArtistRefToCnts).reset_index()
    colrefdf.columns = ["Ref", "Counts"]

    # merge() discards the left index, so remember it and restore it afterwards.
    # A left merge keeps one row per artist even when a ref has no count; an
    # inner merge would drop such rows and make the index restore fail with a
    # length mismatch.
    artistIndex = coldiscdf.index
    coldiscdf = coldiscdf.merge(colrefdf, on="Ref", how="left")
    coldiscdf.index = artistIndex

    print("  DataFrame Shape is {0}".format(coldiscdf.shape))
    return coldiscdf


def mergeDBs(discdf, coldiscdf):
    """
    Left-merge the album-artist frame onto the collections frame.

    Rows are matched on both "Ref" and "Name"; every row of `coldiscdf` is
    kept and the result is given `coldiscdf`'s index. Shapes are printed
    before and after the merge.
    """
    print("  Merging AlbumArtists {0} and Collections {1} DBs".format(discdf.shape, coldiscdf.shape))
    merged = coldiscdf.merge(discdf, how='left', on=["Ref", "Name"])
    # merge() returns a fresh positional index; put the collections index back.
    merged.index = coldiscdf.index
    print("  Merged DataFrame Shape is {0}".format(merged.shape))
    return merged


# NOTE(review): autoreload is already loaded by the first cell; repeating
# %load_ext here only triggers the "already loaded" notice seen in the output.
%load_ext autoreload
%autoreload
# Fresh Discogs handle (re-created here even though an earlier cell built one).
disc      = discogs()
# Build the master frame step by step; each helper adds columns and returns
# the same frame, so the order of these three calls matters.
discdf    = createArtistAlbumsDB(disc)
discdf    = createArtistName(discdf)   # adds "Artist" / "Artist Num"
discdf    = createAlbums(discdf)       # adds "Albums"
# Last expression of the cell: show the first rows as a sanity check.
discdf.head()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Saved Discog Directory /Volumes/Music/Discog is Available
Local Discog Directory /Users/tgadfort/Music/Discog is Available
/Volumes/Music/Discog/base exists
/Volumes/Music/Discog/collections exists
/Volumes/Music/Discog/artists exists
/Volumes/Music/Discog/albums exists
/Volumes/Music/Discog/base-db exists
/Volumes/Music/Discog/collections-db exists
/Volumes/Music/Discog/artists-db exists
/Volumes/Music/Discog/albums-db exists
/Volumes/Music/Discog/artists-extra exists
/Volumes/Music/Discog/search exists
/Volumes/Music/Discog/search-artists exists
/Volumes/Music/Discog/db exists
Found 100 artist DB files and that is equal to the max mod value
Creating ArtistAlbums DB
  Loading ArtistID Data
Loading data from /Volumes/Music/Discog/db/ArtistIDToName.p
  --> This file is 9.1MB.
Loading /Volumes/Music/Discog/db/ArtistIDToName.p
Loading data from /Volumes/Music/Discog/db/ArtistIDToRef.p
  --> This file i

Unnamed: 0,Ref,Name,Extra Artists,Genres,Styles,Albums Data,Known,Artist,Artist Num,Albums
1003338,/artist/1003338-St-Pauls-Cathedral-Choir,St. Paul's Cathedral Choir,"[St Paul's Cathedral Choir, John Scott (10), B...",[Classical],"[Modern, Romantic, ]","{'1003580': 'The Sounds Of The Pipes', '985835...",True,St. Paul's Cathedral Choir,,"[The Sounds Of The Pipes, Unaccompanied Choral..."
100338,/artist/100338-Pain-Nail,Pain Nail,,,,"{'1825350': 'Promo Tape 1998', '114864': 'Magn...",True,Pain Nail,,"[Promo Tape 1998, Magneettinen Kohtalo, Strang..."
1003938,/artist/1003938-Fabiana,Fabiana,"[Fabiana, Shabba Ranks, Dennis Brown]",[Reggae],[Dancehall],,True,Fabiana,,[]
1005138,/artist/1005138-CC-Productions,C.C. Productions,,,,"{'988036': 'Highlights From High Society', '79...",True,C.C. Productions,,"[Highlights From High Society, Highlights From..."
1005638,/artist/1005638-Randy-Sandke,Randy Sandke,,,,"{'9961790': 'The Music Of Bob Haggart', '76081...",True,Randy Sandke,,"[The Music Of Bob Haggart, Tribute To Louis Ar..."


In [7]:
# Ask the Discogs handle where the master DB file lives, then write the frame
# there. With debug=True, saveFile reports the target path and file size
# (see the cell output).
savename = disc.getMasterDiscogsDBFilename()
saveFile(idata=discdf, ifile=savename, debug=True)

Saving data to /Volumes/Music/Discog/db/MasterDB.p
  --> This file is 400.5MB.
Saved data to /Volumes/Music/Discog/db/MasterDB.p
  --> This file is 400.5MB.
