In [1]:
## Basic stuff
%load_ext autoreload
%autoreload

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))
#IPython.Cell.options_default.cm_config.lineNumbers = true;

In [3]:
# Project-local I/O and filesystem helpers.
from fileIO import fileIO
# NOTE(review): later cells call `fileInfo(...)`, but no visible cell imports or defines it
# (only `fileUtil` is imported here) — confirm which fsUtils name is intended.
from fsUtils import fileUtil
from pandas import read_csv, DataFrame, Series, concat, to_datetime, isna, Timestamp


from timeUtils import timestat
from webUtils import getHTML
from fsUtils import setFile, setDir, mkDir

# NOTE(review): duplicate of the `timestat` import a few lines above.
from timeUtils import timestat
from xml.etree import ElementTree
from glob import glob

# Shared reader/writer used by the cells below through io.get(...) / io.save(...).
io = fileIO()

# Number of artist buckets: the cells below group artists by int(artistID) % Nmod.
Nmod = 100
#basedir = "."
# Root directory that the setDir/setFile calls below build their paths from.
basedir = "/Volumes/Seagate/DB"
#savedir = "DiscogsArtistAlbumData"

In [4]:
def roundTo(increment, value):
    """Round ``value`` to the nearest multiple of ``increment`` and return it as an int."""
    steps = round(value / float(increment))
    return int(steps * increment)


def getElement(element,nC):
    """Convert an ElementTree element into plain Python data.

    Parameters
    ----------
    element : xml.etree.ElementTree.Element or None
        Element to convert.
    nC : int
        Current recursion depth; conversion stops once it exceeds 5.

    Returns
    -------
    None
        If ``element`` is None, the depth limit is exceeded, or the element has
        no text, no children and no attributes.
    str
        The element text, when it has text and no attributes.
    list
        ``[text, attribute_items]`` when it has text and attributes; a list of
        ``{child_tag: converted_child}`` dicts when it has no text but has children
        (children converting to None are skipped); or the attribute ``(name, value)``
        items when it has neither text nor children.
    """
    if element is None or nC > 5:
        return None

    attributes = element.items()

    # Text takes precedence: children of an element that carries text are ignored.
    if element.text is not None:
        return element.text if len(attributes) == 0 else [element.text, attributes]

    children = list(element)
    if len(children) > 0:
        retval = []
        for child in children:
            val = getElement(child, nC+1)
            if val is not None:
                retval.append({child.tag: val})
        return retval

    return attributes if len(attributes) > 0 else None

In [5]:
def getTree(ifile):
    """Parse the XML file ``ifile`` and return ``(tree, root)``.

    The parse is wrapped in a ``timestat`` timer labelled with the file name.
    """
    timer = timestat("Getting/Parsing XML Data For {0}".format(ifile))
    parsed = ElementTree.parse(ifile)
    rootElement = parsed.getroot()
    timer.stop()
    return parsed, rootElement


def getTreeData(tree, key, keepKeys=None, N=None):
    """Convert the elements of ``tree`` matching ``key`` into ``{elementID: Series}``.

    tree     : parsed ElementTree (anything offering ``findall``).
    key      : path passed to ``tree.findall``.
    keepKeys : optional collection of child tags to keep (all children when None).
    N        : optional cap on the number of matched elements to convert.

    Elements for which ``getElementData`` yields the same ID overwrite each other.
    """
    matches = tree.findall(key)
    if N is not None:
        matches = matches[:N]
    N = len(matches)
    # Progress interval: a fifth of the element count rounded to the nearest 10000
    # (0 disables the progress updates).
    modval = roundTo(10000, N/5)

    tsParse = timestat("Parsing {0} Elements".format(N))
    treeData = {}
    for i, element in enumerate(matches):
        if modval > 0 and i > 0 and i % modval == 0:
            tsParse.update(n=i, N=N)
        elementID, elementData = getElementData(element, keepKeys)
        treeData[elementID] = Series(elementData)
    tsParse.stop()
    return treeData


def getElementData(element, keepKeys):
    """Return ``(elementID, elementData)`` for one record element.

    ``elementData`` maps each kept child tag to its ``getElement`` conversion.
    ``elementID`` is the element's ``id`` attribute, falling back to the converted
    ``id`` child; anything that is not a string becomes None.
    """
    children = list(element)
    if keepKeys is not None:
        children = [child for child in children if child.tag in keepKeys]
    elementData = {child.tag: getElement(child, 0) for child in children}

    elementID = element.get("id")
    if elementID is None:
        elementID = elementData.get("id")
    if not isinstance(elementID, str):
        elementID = None
    return elementID, elementData


def getTreeDataFrame(treeData):
    """Build a DataFrame with one row per entry of ``treeData``.

    ``treeData`` maps an element ID to a Series of its fields; the frame is built
    column-wise and transposed so that IDs become the index.
    """
    ts = timestat("Creating DataFrame For {0} Elements".format(len(treeData)))
    df = DataFrame(treeData).T
    # Stop the timer (as every other timer in this notebook does) instead of only
    # emitting an intermediate update and leaving it running.
    ts.stop()
    return df

In [5]:
from numpy import isnan


def checkTypeAndLength(x, typeval, length=None):
    """Return True when ``x`` is an instance of ``typeval`` (and, if given, has ``length`` items).

    None and float NaN are always rejected.
    """
    if x is None or (isinstance(x, float) and isnan(x)):
        return False
    if not isinstance(x, typeval):
        return False
    if length is None:
        return True
    return len(x) == length


def fixListOfDict(x, key):
    """Collect ``entry[key]`` from every dict in the list ``x``.

    Returns an empty list when ``x`` is not a list; non-dict entries are skipped.
    """
    if not checkTypeAndLength(x, list):
        return []
    return [entry[key] for entry in x if checkTypeAndLength(entry, dict)]


def fixGenresList(x):
    """Extract the 'genre' values from a list of single-tag dicts."""
    return fixListOfDict(x, key='genre')

def fixStylesList(x):
    """Extract the 'style' values from a list of single-tag dicts."""
    return fixListOfDict(x, key='style')

def fixVariationsList(x):
    """Extract the 'name' values from a list of single-tag dicts."""
    return fixListOfDict(x, key='name')

def fixArtistsList(x):
    """Flatten a converted artists list into one dict per artist.

    Each dict entry of ``x`` is expected to hold an 'artist' list of field dicts,
    which are merged into a single dict. Returns None when ``x`` is not a list.
    """
    if not checkTypeAndLength(x, list):
        return None

    masterArtistData = []
    for entry in x:
        if not checkTypeAndLength(entry, dict):
            continue
        merged = {}
        for field in entry['artist']:
            for key, val in field.items():
                merged[key] = val
        masterArtistData.append(merged)
    return masterArtistData


def fixGroupItem(item):
    """Turn ``[name, [('id', value)]]`` into ``{value: name}``.

    Returns None for any other shape; prints a message when the single attribute
    is not named 'id'.
    """
    if not checkTypeAndLength(item, list, 2):
        return None
    name, attributes = item
    if not (checkTypeAndLength(name, str) and checkTypeAndLength(attributes, list, 1)):
        return None

    attribute = attributes[0]
    if attribute[0] == 'id':
        return {attribute[1]: name}
    print("Unsure how to parse alias data: {0}".format(item))
    return None


def fixGroupsList(x):
    """Convert a converted groups/aliases list into a list of ``{id: name}`` dicts.

    Only single-key dicts with a non-None 'name' entry are considered; entries that
    ``fixGroupItem`` cannot parse are dropped. Returns None when ``x`` is not a list.
    """
    if not checkTypeAndLength(x, list):
        return None

    masterGroupData = []
    for groupData in x:
        if not checkTypeAndLength(groupData, dict, 1):
            continue
        rawName = groupData.get('name')
        if rawName is None:
            continue
        groupItem = fixGroupItem(rawName)
        if groupItem is not None:
            masterGroupData.append(groupItem)
    return masterGroupData


def fixMembersList(x):
    """Convert a converted members list into a list of ``{id: name}`` dicts.

    Dict entries with a non-None 'name' value are passed to ``fixGroupItem``; entries
    it cannot parse are dropped. Returns None when ``x`` is not a list.
    """
    if not checkTypeAndLength(x, list):
        return None

    masterMemberData = []
    for memberData in x:
        if not checkTypeAndLength(memberData, dict):
            continue
        rawName = memberData.get('name')
        if rawName is None:
            continue
        memberItem = fixGroupItem(rawName)
        if memberItem is not None:
            masterMemberData.append(memberItem)
    return masterMemberData


def _parseReleaseDate(text, fmt):
    """Parse ``text`` with the strptime format ``fmt``; return ``text`` unchanged on failure.

    Mirrors the former ``errors='ignore'`` behaviour without using that deprecated option.
    """
    try:
        return to_datetime(text, format=fmt)
    except (ValueError, TypeError, OverflowError):
        return text


def fixReleaseDate(x):
    """Normalise a raw release-date value.

    Parameters
    ----------
    x : str, float or None
        Raw value such as "1999", "1999-05", "1999-05-17", "1999-00-00" or "?".

    Returns
    -------
    pandas.Timestamp
        For parseable dates. A zero month or day is treated as unknown and the
        date is parsed from the remaining parts.
    None
        For None, NaN and "?".
    object
        The (stripped) input itself when it cannot be interpreted; a message is printed.
    """
    if x is None or (isinstance(x, float) and isnan(x)):
        return None
    if not isinstance(x, str):
        print("Unknown Type: [{0}] / [{1}]".format(x, type(x)))
        return x

    x = x.strip()

    #### Just Year
    if x.isdigit():
        return _parseReleaseDate(x, '%Y')

    splitX = x.split("-")
    #### Just Year/Month
    if len(splitX) == 2:
        return _parseReleaseDate(x, '%Y-%m')

    #### Year/Month/Day, where a zero month or day means "unknown"
    if len(splitX) == 3 and splitX[1].isdigit() and splitX[2].isdigit():
        if int(splitX[1]) == 0:
            return _parseReleaseDate(splitX[0], '%Y')
        if int(splitX[2]) == 0:
            return _parseReleaseDate("-".join(splitX[:2]), '%Y-%m')
        return _parseReleaseDate(x, '%Y-%m-%d')

    if x == "?":
        return None

    # Unrecognised string: report it once (previously this also fell through to the
    # misleading "Unknown Type" message) and hand the value back unchanged.
    print("Unknown Format: {0}".format(x))
    return x



def fixMasterArtists(x):
    """Reduce a list of artist dicts to the list of their 'id' values.

    Anything that is not a list is returned unchanged.
    """
    if not checkTypeAndLength(x, list):
        return x
    return [artistData['id'] for artistData in x]

def fixMasterGenre(x):
    """Return the row's "MasterGenres" when its "MasterID" is missing, else None."""
    if isna(x["MasterID"]):
        return x["MasterGenres"]
    return None

def fixMasterStyle(x):
    """Return the row's "MasterStyles" when its "MasterID" is missing, else None."""
    if isna(x["MasterID"]):
        return x["MasterStyles"]
    return None


def fixFormat(x):
    """Extract format descriptions from a converted formats list.

    Walks the single-key dicts in ``x`` and, for the first usable 'format' part,
    returns either the list of its 'description' values (dict part holding a
    'descriptions' list) or ``[name]`` (a ``('name', value)`` tuple part).
    Returns None when ``x`` is not a list; otherwise, when nothing matches, prints
    a message and returns ``x`` unchanged.
    """
    if not checkTypeAndLength(x, list):
        return None

    for entry in x:
        if not checkTypeAndLength(entry, dict, 1):
            continue
        parts = entry.get('format')
        if not checkTypeAndLength(parts, list):
            continue
        for part in parts:
            if checkTypeAndLength(part, dict):
                descriptions = part.get('descriptions')
                if checkTypeAndLength(descriptions, list):
                    return fixListOfDict(descriptions, 'description')
            elif checkTypeAndLength(part, tuple):
                if len(part) == 2 and part[0] == "name":
                    return [part[1]]

    print("Could not parse format: {0}".format(x))
    return x
                        

def mergeListOfDicts(x):
    """Merge every dict in the list ``x`` into one dict (later keys win).

    Returns an empty dict when ``x`` is not a list; non-dict entries are skipped.
    """
    merged = {}
    if not checkTypeAndLength(x, list):
        return merged
    for entry in x:
        if checkTypeAndLength(entry, dict):
            merged.update(entry)
    return merged

def fixTrackList(x):
    """Flatten a converted tracklist.

    Parameters
    ----------
    x : list or None
        List of single-key ``{'track': [field dicts]}`` entries.

    Returns
    -------
    dict or None
        ``{"Artists": [ids of all track-level extra artists], "Tracks": [merged track dicts]}``,
        or None when ``x`` is not a list. Each track's 'extraartists' entry, when
        present, is replaced by the result of ``fixArtistsList``.
    """
    if not checkTypeAndLength(x, list):
        return None

    # Dict used as an insertion-ordered set of artist IDs.
    trackArtists = {}
    tracks = []
    for trackData in x:
        if not checkTypeAndLength(trackData, dict, 1):
            continue
        if not checkTypeAndLength(trackData.get('track'), list):
            continue

        trackValue = mergeListOfDicts(trackData['track'])
        if trackValue.get('extraartists') is not None:
            extraArtists = fixArtistsList(trackValue['extraartists'])
            trackValue['extraartists'] = extraArtists
            # fixArtistsList returns None for non-list input, and an artist entry
            # may lack an 'id'; neither should abort the whole tracklist.
            for artist in extraArtists or []:
                artistID = artist.get('id')
                if artistID is not None:
                    trackArtists[artistID] = True
        tracks.append(trackValue)

    return {"Artists": list(trackArtists.keys()), "Tracks": tracks}


def fixMasterID(x):
    """Turn ``[master_id, [(attr_name, attr_value)]]`` into ``[master_id, attr_value == 'true']``.

    Returns None for None or any value that is not a two-item list.
    """
    if x is None or not checkTypeAndLength(x, list, 2):
        return None
    masterID, attributes = x
    return [masterID, attributes[0][1] == 'true']

In [6]:
def fixDataFrame(df):
    """Replace raw parsed columns with cleaned ``Master*`` columns.

    For every raw column that is present, the matching fixer is applied and the
    result stored under the ``Master*`` name; "master_id" is split into "MasterID"
    and "IsMaster". The raw columns (and "id") are then dropped in place.
    Cleaned columns are added to the passed frame itself; the returned frame is a
    new object only when "master_id" was present (because of the join).
    """
    ts = timestat("Fixing Artists/Genres/Styles Data For {0} Elements".format(df.shape[0]))

    # (raw column, cleaned column, fixer) in the order the cleaned columns are added.
    columnFixers = [
        ("artists",        "MasterArtists",        fixArtistsList),
        ("genres",         "MasterGenres",         fixGenresList),
        ("styles",         "MasterStyles",         fixStylesList),
        ("aliases",        "MasterAliases",        fixGroupsList),
        ("groups",         "MasterGroups",         fixGroupsList),
        ("members",        "MasterMembers",        fixMembersList),
        ("namevariations", "MasterNameVariations", fixVariationsList),
        ("released",       "MasterReleaseDate",    fixReleaseDate),
    ]

    coldrops = []
    for rawName, fixedName, fixer in columnFixers:
        if rawName in df.columns:
            df[fixedName] = df[rawName].apply(fixer)
            coldrops.append(rawName)

    if "master_id" in df.columns:
        masterIDs = df["master_id"].apply(lambda x: x[0] if isinstance(x,list) else None)
        masterIDs.name = "MasterID"
        isMaster = df["master_id"].apply(lambda x: x[1][0][1] == 'true' if isinstance(x,list) else None)
        isMaster.name = "IsMaster"
        df = df.join(masterIDs).join(isMaster)
        coldrops.append("master_id")

    if "id" in df.columns:
        coldrops.append("id")
    ts.stop()

    if len(coldrops) > 0:
        df.drop(coldrops, axis=1, inplace=True)
    return df



def saveModData(df):
    """Bucket the rows of ``df`` by artist ID modulo 10 and save one file per bucket.

    ``df`` must have a "MasterArtists" column; rows whose value is a list of dicts
    with an 'id' key are assigned to bucket ``int(id) % 10`` for every artist.
    When a bucket file already exists its content is merged in and rows with a
    duplicated index are dropped before saving. Returns nothing.

    NOTE(review): `ifile` is neither a parameter nor a local — the save name is
    built from whatever global `ifile` holds when this runs; confirm intent.
    NOTE(review): `fileInfo` is not imported in any visible cell.
    """
    io = fileIO()  # local instance; shadows the module-level `io`
    Nmod = 10  # local bucket count; shadows the module-level `Nmod`
    releaseMap = {x: [] for x in range(Nmod)}
    ts = timestat("Saving Mod Data")
    for releaseID,releaseData in df.iterrows():
        artistIDs = [artistData['id'] for artistData in releaseData["MasterArtists"]] if isinstance(releaseData["MasterArtists"],list) else []
        for artistID in artistIDs:
            releaseMap[int(artistID)%Nmod].append(releaseID)

    ts.update()
    for modVal,modValData in releaseMap.items():
        savename  = "{0}-ArtistMod{1}.p".format(ifile.split("-")[0], modVal)
        dfModData = df.loc[modValData]
        if fileInfo(savename).exists is True:
            previousData = io.get(savename)
            print("Found Previous {0} Releases".format(previousData.shape[0]))
            # New rows come first, so they win over previously saved rows with the same index.
            dfModData = concat([dfModData, previousData])
            dfModData = dfModData[~dfModData.index.duplicated()]
            print("Saving {0} Releases To {1}".format(dfModData.shape[0], savename))
            io.save(idata=dfModData, ifile=savename)
        elif fileInfo(savename).exists is False:
            print("Saving {0} Releases To {1}".format(dfModData.shape[0], savename))
            io.save(idata=dfModData, ifile=savename)
        ts.update()
    ts.stop()

In [7]:
def splitList(ilist, N):
    """Deal the items of ``ilist`` round-robin into ``N`` lists, returned as ``{0..N-1: list}``."""
    buckets = dict((slot, []) for slot in range(N))
    for position, value in enumerate(ilist):
        buckets[position % N].append(value)
    return buckets

# Artists

## Split Code

In [None]:
# Dataset label for the Artists section; lower-cased into file names and used in timer labels.
discKey = "Artists"

In [None]:
# Split the artists XML dump with the external `xml_split` tool
# (presumably `-s 200MB` caps the size of each output chunk — confirm against the tool's docs).
!xml_split -s 200MB /Volumes/Seagate/DB/discogs_20211001_artists.xml
#  xml_split -s 1000MB discogs_20211001_artists.xml

## Parse Code

In [None]:
# Collect the split XML chunk files for this dataset, skipping any whose base
# filename ends in "00" (presumably the index file written by xml_split — confirm).
files      = [x for x in glob("/Volumes/Seagate/DB/discogs_20211001_{0}-*.xml".format(discKey.lower())) if fileInfo(x).basefilename.endswith("00") is False]
print("Found {0} Files".format(len(files)))
#splitFiles = splitList(files, N=3)

In [None]:
#keys = ["groups", "members", "namevariations", "aliases"]
#keys = ["profile", "urls"]
# Child tags of each <artist> record that are kept when parsing.
keepKeys    = ["id", "name", "realname"] + ["groups", "members", "namevariations", "aliases"]

In [None]:
# Parse every split artists XML file into a cleaned DataFrame and save it under
# the same name with a ".p" extension.
ts          = timestat("Parsing {0} Data".format(discKey))
sfiles      = files
N           = len(sfiles)
for i,ifile in enumerate(sfiles):
    savename = ifile.replace(".xml", ".p")
    # Skip chunks whose output file already exists.
    if fileInfo(savename).exists is True:
        continue

    tree,root = getTree(ifile)
    treeData  = getTreeData(tree, key='artist', keepKeys=keepKeys)
    
    df = getTreeDataFrame(treeData)
    df = fixDataFrame(df)
    io.save(idata=df, ifile=savename)
    
    # Drop the references to the per-file objects before the next iteration.
    del df
    del treeData
    del tree
    del root
    
    ts.update(n=i+1,N=N)
    print("\n\n")
ts.stop()

# Process [Parsing Artists Data] Took 69.9 Minutes

## Merge Code

In [None]:
# Load every per-chunk file for this dataset, concatenate them and save the
# combined frame as discogs_20211001_<dataset>.p.
files      = [x for x in glob("/Volumes/Seagate/DB/discogs_20211001_{0}-*.p".format(discKey.lower())) if fileInfo(x).basefilename.endswith("00") is False]
print("Found {0} Files".format(len(files)))

ts = timestat("Loading {0} DataFrames".format(len(files)))
pdata = [io.get(ifile) for ifile in files]
ts.update()
pdf = concat(pdata)
ts.update()
io.save(idata=pdf, ifile="/Volumes/Seagate/DB/discogs_20211001_{0}.p".format(discKey.lower()))
ts.stop()
# Process [Loading 10 DataFrames] Took 1.8 Minutes

# Master

In [None]:
# Dataset label for the Master section; lower-cased into file names and used in timer labels.
discKey = "masters"

## Split Code

In [None]:
# Split the masters XML dump with the external `xml_split` tool
# (presumably `-s 250MB` caps the size of each output chunk — confirm against the tool's docs).
!xml_split -s 250MB /Volumes/Seagate/DB/discogs_20211001_masters.xml
#  xml_split -s 1000MB discogs_20211001_releases.xml

## Parse Code

In [None]:
# Collect the split XML chunk files for this dataset, skipping any whose base
# filename ends in "00" (presumably the index file written by xml_split — confirm).
files      = [x for x in glob("/Volumes/Seagate/DB/discogs_20211001_{0}-*.xml".format(discKey.lower())) if fileInfo(x).basefilename.endswith("00") is False]
print("Found {0} Files".format(len(files)))
#splitFiles = splitList(files, N=3)

In [None]:
# Parse every split masters XML file into a cleaned DataFrame and save it under
# the same name with a ".p" extension.
ts          = timestat("Parsing {0} Data".format(discKey))
#sfiles      = splitFiles[0]
sfiles      = files
N           = len(sfiles)
# Child tags of each <master> record that are kept when parsing.
keepKeys    = ["main_release", "artists", "year", "title", 'styles', 'genres']


for i,ifile in enumerate(sfiles):
    savename = ifile.replace(".xml", ".p")
    # Skip chunks whose output file already exists.
    if fileInfo(savename).exists is True:
        continue

    tree,root = getTree(ifile)
    treeData  = getTreeData(tree, key='master', keepKeys=keepKeys)
    
    df = getTreeDataFrame(treeData)
    df = fixDataFrame(df)
    io.save(idata=df, ifile=savename)
    
    # Drop the references to the per-file objects before the next iteration.
    del df
    del treeData
    del tree
    del root
    
    ts.update(n=i+1,N=N)
    print("\n\n")
ts.stop()

# Process [Parsing masters Data] Took 18.2 Minutes

## Merge Code

In [None]:
# Load every per-chunk file for this dataset, concatenate them and save the
# combined frame as discogs_20211001_<dataset>.p.
files      = [x for x in glob("/Volumes/Seagate/DB/discogs_20211001_{0}-*.p".format(discKey.lower())) if fileInfo(x).basefilename.endswith("00") is False]
print("Found {0} Files".format(len(files)))

ts = timestat("Loading {0} DataFrames".format(len(files)))
pdata = [io.get(ifile) for ifile in files]
ts.update()
pdf = concat(pdata)
ts.update()
io.save(idata=pdf, ifile="/Volumes/Seagate/DB/discogs_20211001_{0}.p".format(discKey.lower()))
ts.stop()
# Process [Loading 10 DataFrames] Took 1.8 Minutes

In [None]:
# Reload the combined masters file written by the merge cell above.
masterData = io.get("/Volumes/Seagate/DB/discogs_20211001_masters.p")

In [None]:
# Spot-check a single master record by its (string) ID.
masterData.loc['45284']

# Release

In [None]:
# Dataset label for the Release section; used in the timer label of the parse cell.
discKey = "Releases"

## Split Code

In [None]:
# !xml_split -s 3GB /Volumes/Seagate/DB/discogs_20211001_releases.xml
#  xml_split -s 1000MB discogs_20211001_releases.xml

## Parse Code

In [None]:
# Collect the split release XML files, skipping any whose base filename ends in "00".
files      = [x for x in glob("/Volumes/Seagate/DB/discogs_20211001_releases-*.xml") if fileInfo(x).basefilename.endswith("00") is False]
# Key each file by the part of its base filename after the first "-".
# NOTE(review): `files[1:]` drops the first entry returned by glob, and glob does not
# guarantee any ordering — confirm which file is meant to be skipped.
releaseFiles = {fileInfo(ifile).basefilename.split("-")[1]: ifile for ifile in files[1:]}
print("Found {0} Files".format(len(releaseFiles)))

In [None]:
# Parse every split release XML file and save four Series per file (extra, track,
# artist and general data), each keyed by release ID, under DiscogsReleaseData.
tsAll       = timestat("Parsing {0} Data".format(discKey))
N           = len(releaseFiles)
keepKeys    = ["artists", "title", "genres", "styles", "country", "released", "master_id", 'formats', 'tracklist', 'extraartists']
savedir     = setDir(basedir, "DiscogsReleaseData")

# `i` is the file key taken from the file name (a string); `n` counts files for progress.
for n,(i,ifile) in enumerate(releaseFiles.items()):
    ts = timestat("Getting Data From {0}".format(ifile))
    tree,root = getTree(ifile)
    treeData  = getTreeData(tree, key='release', keepKeys=keepKeys)
    ts.update()
    
    extraData = {releaseID: {"Genres": fixGenresList(releaseData.get('genres')),
                             "Styles": fixStylesList(releaseData.get("styles")),
                             "Country": releaseData.get('country'),
                             "Master": fixMasterID(releaseData.get("master_id"))} for releaseID,releaseData in treeData.items()}
    # .get() like every other field: a release without a <tracklist> must not abort the file.
    trackData = {releaseID: fixTrackList(releaseData.get('tracklist')) for releaseID,releaseData in treeData.items()}
    artistData = {releaseID: {"Artists": fixArtistsList(releaseData.get('artists')),
                              "ExtraArtists": fixArtistsList(releaseData.get('extraartists'))} for releaseID,releaseData in treeData.items()}
    generalData = {releaseID: {"Title": releaseData.get('title'),
                               "Format": fixFormat(releaseData.get('formats')),
                               "Released": fixReleaseDate(releaseData.get('released'))} for releaseID,releaseData in treeData.items()}
    
    extraData   = Series(extraData)
    trackData   = Series(trackData)
    artistData  = Series(artistData)
    generalData = Series(generalData)
    ts.update()
    
    io.save(idata=extraData, ifile=setFile(savedir, "{0}-{1}.p".format("ExtraReleaseData", i)))
    io.save(idata=trackData, ifile=setFile(savedir, "{0}-{1}.p".format("TrackReleaseData", i)))
    io.save(idata=artistData, ifile=setFile(savedir, "{0}-{1}.p".format("ArtistReleaseData", i)))
    io.save(idata=generalData, ifile=setFile(savedir, "{0}-{1}.p".format("GeneralReleaseData", i)))

    # Drop the references to all per-file objects before the next iteration.
    del extraData
    del trackData
    del artistData
    del generalData
    del treeData
    del tree
    del root
    
    ts.stop()
    # Report progress as a file count rather than the string file key.
    tsAll.update(n=n+1,N=N)
tsAll.stop()

## Create Slimmed Release Data

In [None]:
# Split the per-chunk release frames into 10 groups, clean each frame and save one
# combined frame per group under DiscogsArtistAlbumData.
# NOTE(review): this reads discogs_20211001_releases-*.p, but the visible release
# parse cell writes its output into DiscogsReleaseData/ instead — confirm these inputs still exist.
sFiles = splitList([x for x in glob("/Volumes/Seagate/DB/discogs_20211001_releases-*.p") if fileInfo(x).basefilename.endswith("00") is False], 10)

tsMods = timestat("Creating ReleaseModData")
for modVal,files in sFiles.items():
    tsAll = timestat("Creating ReleaseData For ModVal={0}".format(modVal))
    # Stays None (and is saved as None) when this group has no files.
    releaseData = None
    for i,ifile in enumerate(files):
        ts = timestat("Fixing {0}".format(ifile))
        xData = io.get(ifile)
        ts.update()
        xData = fixDataFrame(xData)
        xData["MasterArtists"] = xData["MasterArtists"].apply(fixMasterArtists)
        # Genres/styles are kept only for rows without a MasterID (see fixMasterGenre/fixMasterStyle).
        xData["MasterGenres"]  = xData.apply(fixMasterGenre, axis=1)
        xData["MasterStyles"]  = xData.apply(fixMasterStyle, axis=1)
        xData.drop(['IsMaster'], axis=1, inplace=True)
        # NOTE(review): concatenating inside the loop re-copies the accumulated frame each time.
        releaseData = xData if releaseData is None else concat([releaseData,xData])
        ts.stop()
        print(i,'\t',len(files),'\t',releaseData.shape,'\n')
        
    tsAll.stop()
    
    io.save(idata=releaseData, ifile="/Volumes/Seagate/DB/DiscogsArtistAlbumData/ReleaseData-{0}.p".format(modVal))
    tsMods.update()
    print("\n",'='*150,'\n')
tsMods.stop()

# Create Artist-Keyed Mod Data

## Artist Mod Data

In [None]:
# Split the combined artists frame into Nmod bucket files
# (DiscogsMetadata/<modVal>/Artist-<modVal>.p), bucketing by int(artistID) % Nmod.
tsFile = timestat("Loading Artist Data")
ifile = setFile(basedir, "discogs_20211001_artists.p")
artistsData = io.get(ifile)
savedir = setDir(basedir, "DiscogsMetadata")
sIndex = Series(artistsData.index)
# Bucket of every row, computed once instead of re-scanning the whole index per bucket.
artistModVals = sIndex.apply(lambda x: int(x)%Nmod).values
tsFile.stop()

ts = timestat("Creating Artist Data")
for modVal in range(Nmod):
    modDir = mkDir(setDir(savedir, str(modVal)))
    savename  = setFile(modDir, "{0}-{1}.p".format("Artist", modVal))
    # Buckets already written by an earlier run are left untouched. (The former
    # "merge with previous data" branch below this check could never run.)
    if fileInfo(savename).exists:
        continue

    modValData = artistsData[artistModVals == modVal]
    io.save(idata=modValData, ifile=savename)

    if (modVal+1) % 10 == 0 or (modVal+1) == 5:
        ts.update(n=modVal+1,N=Nmod)
ts.stop()

#Current Time is Thu Oct 21, 2021 09:24 for Loading Artist Data
#  Process [Loading Artist Data] Took 1.1 Minutes
#Current Time is Thu Oct 21, 2021 09:25 for Creating Artist Data
#  50/100     : Process [Creating Artist Data] Has Run For 3.9 Minutes
# 100/100     : Process [Creating Artist Data] Has Run For 7.9 Minutes
#  Process [Creating Artist Data] Took 7.9 Minutes

## Master Mod Data

In [None]:
# Build, per artist bucket, (a) the master albums of every artist and (b) each
# artist's three most common genres/styles, and save both under DiscogsMetadata/<modVal>/.
from collections import Counter

tsAll = timestat("Creating Master Mod Data")
tsFile = timestat("Loading Master Data")
ifile = setFile(basedir, "discogs_20211001_masters.p")
mastersData = io.get(ifile)
mastersData["ArtistIDs"] = mastersData["MasterArtists"].apply(fixMasterArtists)
savedir = setDir(basedir, "DiscogsMetadata")
tsFile.stop()


artistModData      = {x: {} for x in range(Nmod)}
artistModGenreData = {x: {} for x in range(Nmod)}
N = mastersData.shape[0]
ts = timestat("Getting Artist/Master Mod Data")
for i,(idx,row) in enumerate(mastersData.iterrows()):
    if (i+1) % 250000 == 0 or (i+1) == 100000:
        ts.update(n=i+1, N=N)

    # fixMasterArtists passes non-list values (e.g. None) through unchanged;
    # such masters have no artist to file them under.
    artistIDs = row["ArtistIDs"]
    if not isinstance(artistIDs, list):
        continue
    genres = row["MasterGenres"] if isinstance(row["MasterGenres"], list) else []
    styles = row["MasterStyles"] if isinstance(row["MasterStyles"], list) else []

    for artistID in artistIDs:
        modVal = int(artistID) % Nmod
        artistModData[modVal].setdefault(artistID, {})[idx] = {"Artists": artistIDs, "Album": row["title"],
                                                               "Year": row["year"], "MainRelease": row["main_release"]}
        if artistModGenreData[modVal].get(artistID) is None:
            artistModGenreData[modVal][artistID] = {"Genres": Counter(), "Styles": Counter()}
        artistModGenreData[modVal][artistID]["Genres"].update(genres)
        artistModGenreData[modVal][artistID]["Styles"].update(styles)
ts.stop()


artistModData = {modVal: Series(modData) for modVal,modData in artistModData.items()}
artistModGenreData = {modVal: Series({artistID: {"Genres": [item[0] for item in artistData["Genres"].most_common(3)],
                                                 "Styles": [item[0] for item in artistData["Styles"].most_common(3)]} for artistID,artistData in modValData.items()}) for modVal,modValData in artistModGenreData.items()}

# Both save loops overwrite any existing bucket file with the freshly computed data.
# The previous version also loaded and concatenated the old file but then saved only
# the new data, so that load was wasted work; the saved result is unchanged.
ts = timestat("Saving Artist Master Mod Data")
for modVal,modValData in artistModData.items():
    modDir = mkDir(setDir(savedir, str(modVal)))
    savename  = setFile(modDir, "{0}-{1}.p".format("MasterAlbum", modVal))
    io.save(idata=modValData, ifile=savename)
    if (modVal+1) % 10 == 0 or (modVal+1) == 5:
        ts.update(n=modVal+1,N=Nmod)
ts.stop()

ts = timestat("Saving Artist Master Genre Mod Data")
for modVal,modValData in artistModGenreData.items():
    modDir = mkDir(setDir(savedir, str(modVal)))
    savename  = setFile(modDir, "{0}-{1}.p".format("MasterGenre", modVal))
    io.save(idata=modValData, ifile=savename)
    if (modVal+1) % 10 == 0 or (modVal+1) == 5:
        ts.update(n=modVal+1,N=Nmod)
ts.stop()
tsAll.stop()

# Current Time is Thu Oct 21, 2021 09:36 for Loading Master Data
#   Process [Loading Master Data] Took 25.4 Seconds
# Current Time is Thu Oct 21, 2021 09:36 for Getting Artist/Master Mod Data
#   Process [Getting Artist/Master Mod Data] Took 6.1 Minutes
# Current Time is Thu Oct 21, 2021 09:36 for Getting Artist/Master Mod Data
#   Process [Saving Artist Master Mod Data] Took 40.9 Seconds
# Current Time is Thu Oct 21, 2021 09:43 for Saving Artist Master Genre Mod Data
#   Process [Saving Artist Master Genre Mod Data] Took 3.8 Seconds
#   Process [Creating Master Mod Data] Took 7.4 Minutes

## Release Mod Data

### Create Master ReleaseID <-> ArtistID Map

In [None]:
# Build {releaseID: {artist bucket: artistID}} from every ArtistReleaseData file and
# save it as DiscogsReleaseData/ArtistReleaseMap.p.
masterReleaseArtistModMap = {}
ts = timestat("Creating Master Release ArtistID Map")
artistReleaseFiles = glob(setFile(setDir(basedir, "DiscogsReleaseData"), "ArtistReleaseData-*.p"))
for i,ifile in enumerate(artistReleaseFiles):
    releaseArtistData = io.get(ifile)
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for releaseID,releaseData in releaseArtistData.items():
        ## Ignoring Extra Artists For Now
        artistIDs = fixMasterArtists(releaseData["Artists"])
        # NOTE(review): keyed by bucket, so two artists of one release that share a
        # bucket collapse into a single entry (the later one wins) — confirm intended.
        masterReleaseArtistModMap[releaseID] = {int(artistID)%Nmod: artistID for artistID in artistIDs} if isinstance(artistIDs,list) else {}
    if (i+1) % 10 == 0 or (i+1) == 5:
        ts.update(n=i+1,N=len(artistReleaseFiles))
ts.update()
io.save(idata=masterReleaseArtistModMap, ifile=setFile(setDir(basedir, "DiscogsReleaseData"), "ArtistReleaseMap.p"))
del masterReleaseArtistModMap
ts.stop()

In [None]:
# Switches for the release mod cells below: whether the TrackReleaseData and
# ExtraReleaseData files are loaded and merged into each release row.
useTracks = False
useExtras = True

In [None]:
# Load the release -> artist map and locate the per-chunk release files, keyed by
# the part of their base filename after the first "-".
tsAll = timestat("Creating Release Mod Data")

tsMap = timestat("Loading Release ID <-> ArtistID Map Map")
masterReleaseArtistModMap = io.get(ifile=setFile(setDir(basedir, "DiscogsReleaseData"), "ArtistReleaseMap.p"))
tsMap.stop()
# Every file group that will be used; all of them must have the same number of files.
fCheck = []
artistReleaseFiles  = glob(setFile(setDir(basedir, "DiscogsReleaseData"), "ArtistReleaseData-*.p"))
artistReleaseFiles  = {fileInfo(ifile).basefilename.split("-")[1]: ifile for ifile in artistReleaseFiles}
fCheck.append(artistReleaseFiles)
print("Found {0} Artist Release Files".format(len(artistReleaseFiles)))
generalReleaseFiles = glob(setFile(setDir(basedir, "DiscogsReleaseData"), "GeneralReleaseData-*.p"))
generalReleaseFiles = {fileInfo(ifile).basefilename.split("-")[1]: ifile for ifile in generalReleaseFiles}
fCheck.append(generalReleaseFiles)
print("Found {0} General Release Files".format(len(generalReleaseFiles)))
extraReleaseFiles   = glob(setFile(setDir(basedir, "DiscogsReleaseData"), "ExtraReleaseData-*.p")) if useExtras else []
extraReleaseFiles   = {fileInfo(ifile).basefilename.split("-")[1]: ifile for ifile in extraReleaseFiles}
if useExtras:
    fCheck.append(extraReleaseFiles)
    # Report the extra-file count (this used to print the general-file count).
    print("Found {0} Extra Release Files".format(len(extraReleaseFiles)))
trackReleaseFiles   = glob(setFile(setDir(basedir, "DiscogsReleaseData"), "TrackReleaseData-*.p")) if useTracks else []
trackReleaseFiles   = {fileInfo(ifile).basefilename.split("-")[1]: ifile for ifile in trackReleaseFiles}
if useTracks:
    fCheck.append(trackReleaseFiles)
    print("Found {0} Track Release Files".format(len(trackReleaseFiles)))


if len(set([len(x) for x in fCheck])) == 1:
    Nrelmods = list(generalReleaseFiles.keys())
else:
    raise ValueError("Didn't find an equal number of files")
    
#Current Time is Sat Oct 23, 2021 12:20 for Loading Release ID <-> ArtistID Map Map
#  Process [Loading Release ID <-> ArtistID Map Map] Took 3.1 Minutes

In [None]:
def saveArtistModData(artistModData, savedir):
    """Save each bucket of ``artistModData`` to ``<savedir>/<modVal>/ReleaseAlbum-<modVal>.p``.

    When the bucket file already exists, the new data is concatenated in front of
    the previously saved data and the combined result is written back.
    Uses the module-level ``io`` and ``Nmod``.
    """
    ts = timestat("Saving Artist Mod Data")
    for modVal, modValData in artistModData.items():
        modDir = mkDir(setDir(savedir, str(modVal)))
        savename = setFile(modDir, "{0}-{1}.p".format("ReleaseAlbum", modVal))

        alreadySaved = fileInfo(savename).exists
        if alreadySaved is True:
            merged = concat([modValData, io.get(savename)])
            io.save(idata=merged, ifile=savename)
            del merged
        elif alreadySaved is False:
            io.save(idata=modValData, ifile=savename)

        if (modVal+1) % 25 == 0:
            ts.update(n=modVal+1, N=Nmod)
    ts.stop()

In [None]:
# File every release under each of its artists' buckets and flush the buckets to
# disk every 15 input files (and after the last one) via saveArtistModData.
savedir = setDir(basedir, "DiscogsMetadata")
tsTotal = timestat("Getting Artist/Release Mod Data")
artistModData = {x: {} for x in range(Nmod)}
for n,relmodVal in enumerate(Nrelmods):
    artistRelModData  = io.get(artistReleaseFiles[relmodVal])
    generalRelModData = io.get(generalReleaseFiles[relmodVal])
    extraRelModData   = io.get(extraReleaseFiles[relmodVal]) if useExtras else None
    trackRelModData   = io.get(trackReleaseFiles[relmodVal]) if useTracks else None

    
    N = generalRelModData.shape[0]
    ts = timestat("Splitting {0} Release Data For RelMod={1}".format(N,relmodVal))
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for i,(idx,row) in enumerate(generalRelModData.items()):
        artistRow = artistRelModData.get(idx)
        rowData = {"Album": row["Title"], "Format": row["Format"], "Year": row["Released"].year if isinstance(row["Released"], Timestamp) else None}
        rowData.update({"Artists": artistRow['Artists'], "ExtraArtists": artistRow['ExtraArtists']})
        if useExtras:
            rowData.update(extraRelModData[idx])
        if useTracks:
            rowData.update(trackRelModData[idx])
        
        modVals = masterReleaseArtistModMap.get(idx)
        for modVal,artistID in modVals.items():
            if artistModData[modVal].get(artistID) is None:
                artistModData[modVal][artistID] = {}
            artistModData[modVal][artistID][idx] = rowData

        if (i+1) % 500000 == 0 or (i+1) == 250000:
            ts.update(n=i+1,N=N)
    ts.stop()
    
    # Flush periodically so the in-memory buckets do not grow without bound.
    if (n+1) % 15 == 0 or (n+1) == len(Nrelmods):
        artistModData = {modVal: Series(modData) for modVal,modData in artistModData.items()}
        saveArtistModData(artistModData, savedir)
        del artistModData
        artistModData = {x: {} for x in range(Nmod)}
    
    # n is zero-based; report the number of files completed.
    tsTotal.update(n=n+1, N=len(Nrelmods))
#del masterReleaseArtistModMap
tsTotal.stop()
 
# Current Time is Sat Oct 23, 2021 12:23 for Getting Artist/Release Mod Data
#   14/66      : Process [Getting Artist/Release Mod Data] Has Run For 17.3 Minutes
#   29/66      : Process [Getting Artist/Release Mod Data] Has Run For 63.8 Minutes
#   44/66      : Process [Getting Artist/Release Mod Data] Has Run For 122.9 Minutes
#   65/66      : Process [Getting Artist/Release Mod Data] Has Run For 221.3 Minutes
#   Process [Getting Artist/Release Mod Data] Took 221.3 Minutes

### Merge Multi Index

# DB Data

In [None]:
# Spot-check the saved artist data of one bucket.
modVal = 0
# Build the bucket directory explicitly; relying on a `modDir` left behind by an
# earlier loop would point at that loop's last bucket, not at bucket `modVal`.
modDir = setDir(setDir(basedir, "DiscogsMetadata"), str(modVal))
artistModData       = io.get(setFile(modDir, "{0}-{1}.p".format("Artist", modVal)))
artistModData

In [None]:
from artistDBBase import artistDBBase, artistDBDataClass
from artistDBBase import artistDBNameClass, artistDBMetaClass, artistDBIDClass, artistDBURLClass, artistDBPageClass
from artistDBBase import artistDBProfileClass, artistDBMediaClass, artistDBMediaAlbumClass
from artistDBBase import artistDBMediaDataClass, artistDBMediaCountsClass, artistDBFileInfoClass
from artistDBBase import artistDBTextClass, artistDBLinkClass
from strUtils import fixName
from dbUtils import utilsDiscogs

def getMediaCounts(self, media):
    """Count the entries stored under each media sub-type.

    Args:
        self: Unused; the signature mirrors a method even though this is
            defined at module level.
        media: Object whose ``media`` attribute maps a sub-type name to a
            sized collection of entries.

    Returns:
        A new artistDBMediaCountsClass whose ``counts["Releases"]`` maps each
        sub-type name to the number of entries held for it.
    """
    mediaCounts = artistDBMediaCountsClass()

    credittype = "Releases"
    # Make sure the top-level bucket exists before filling it.
    if mediaCounts.counts.get(credittype) is None:
        mediaCounts.counts[credittype] = {}

    bucket = mediaCounts.counts[credittype]
    for creditsubtype, entries in media.media.items():
        bucket[creditsubtype] = int(len(entries))

    return mediaCounts

# Build one artistDBDataClass record per artist for every mod-value shard found
# under <basedir>/DiscogsMetadata and pickle the records, one Series per shard,
# to <basedir>/DiscogsDBData/<modVal>-DB.p.
savedir = setDir(basedir, "DiscogsMetadata")
tsAll = timestat("Creating DB Data")
for modVal in range(Nmod):

    # Load the four per-shard pickles this step consumes.
    tsLoad = timestat("Loading ModValData")
    modDir = setDir(savedir, str(modVal))
    artistModData       = io.get(setFile(modDir, "{0}-{1}.p".format("Artist", modVal)))
    masterAlbumModData  = io.get(setFile(modDir, "{0}-{1}.p".format("MasterAlbum", modVal)))
    masterGenreModData  = io.get(setFile(modDir, "{0}-{1}.p".format("MasterGenre", modVal)))
    releaseAlbumModData = io.get(setFile(modDir, "{0}-{1}.p".format("ReleaseAlbum", modVal)))
    tsLoad.stop()


    modValData = {}
    N = artistModData.shape[0]
    tsMod = timestat("Creating DB Data From {0} Artists For ModVal={1}".format(N,modVal))
    for i,(artistID,artistData) in enumerate(artistModData.iterrows()):
        artistName  = artistData["name"]
        url         = "https://www.discogs.com/artist/{0}".format(artistID)

        # Profile fields; entries that are None are dropped, and an empty
        # profile collapses to None.
        generalData = {}
        generalData["RealName"]   = artistData["realname"]
        generalData["Aliases"]    = artistData["MasterAliases"]
        generalData["Groups"]     = artistData["MasterGroups"]
        generalData["Members"]    = artistData["MasterMembers"]
        generalData["Variations"] = artistData["MasterNameVariations"]
        generalData = {k: v for k,v in generalData.items() if v is not None}
        generalData = generalData if len(generalData) > 0 else None

        masterGenreData = masterGenreModData.get(artistID)
        # NOTE(review): masterAlbumData is looked up but never used below.
        masterAlbumData = masterAlbumModData.get(artistID)
        releaseAlbumData = releaseAlbumModData.get(artistID)

        genresData = masterGenreData.get("Genres") if isinstance(masterGenreData,dict) else None
        tagsData   = masterGenreData.get("Styles") if isinstance(masterGenreData,dict) else None

        if isinstance(releaseAlbumData,Series):
            # Several rows share this artistID: flatten them into one
            # {releaseID: info} mapping (each row is assumed to be such a dict).
            # Fixed: this used to iterate `releaseArtistAlbumData`, a global
            # that is only assigned in a later scratch cell, so it either
            # raised NameError or copied another artist's releases.
            artistReleaseData = {releaseID: releaseIDData for releaseData in releaseAlbumData for releaseID, releaseIDData in releaseData.items()}
        elif isinstance(releaseAlbumData,dict):
            artistReleaseData = releaseAlbumData
        elif releaseAlbumData is None:
            artistReleaseData = {}
        else:
            print("Unsure how to parse [{0}] / [{1}]".format(artistID,releaseAlbumData))
            artistReleaseData = {}


        ########################################################################
        # Get Releases
        ########################################################################
        mediaName = "Releases"
        mediaData = {mediaName: []}
        for code,releaseInfo in artistReleaseData.items():
            album        = releaseInfo["Album"]
            albumURL     = "https://www.discogs.com/release/{0}".format(code)

            # Parsed as (Artists + ExtraArtists) if ExtraArtists is a list, else Artists.
            albumArtists = releaseInfo["Artists"] + releaseInfo["ExtraArtists"] if isinstance(releaseInfo["ExtraArtists"],list) else releaseInfo["Artists"]
            # NOTE(review): albumFormat is read but not passed on to the media record.
            albumFormat  = releaseInfo["Format"]
            albumYear    = releaseInfo["Year"]
            amdc = artistDBMediaDataClass(album=album, url=albumURL, artist=albumArtists, code=code, year=albumYear)
            mediaData[mediaName].append(amdc)

        credittype = "Releases"
        counts = {credittype: {mediaName: len(mediaData[mediaName])}}



        artist      = artistDBNameClass(name=artistName, err=None)
        meta        = artistDBMetaClass(title=None, url=url)
        # From here on `url` is the wrapper object, not the string built above.
        url         = artistDBURLClass(url=url)
        ID          = artistDBIDClass(ID=artistID)
        pages       = artistDBPageClass(ppp=1, tot=1, redo=False, more=False)
        profile     = artistDBProfileClass(general=generalData, genres=genresData, tags=tagsData)
        media       = artistDBMediaClass()
        media.media = mediaData
        mediaCounts = artistDBMediaCountsClass()
        mediaCounts.counts = counts
        info        = artistDBFileInfoClass(info=None)

        adc = artistDBDataClass(artist=artist, meta=meta, url=url, ID=ID, pages=pages, profile=profile, mediaCounts=mediaCounts, media=media, info=info)
        modValData[artistID] = adc

        if (i+1) % 35000 == 0 or (i+1) == 10000:
            tsMod.update(n=i+1, N=N)
    tsMod.stop()

    outdir = setDir(basedir, "DiscogsDBData")
    io.save(idata=Series(modValData), ifile=setFile(outdir, "{0}-{1}.p".format(modVal, "DB")))
    tsAll.update()
    print("\n")

# Merge DB Data With Downloaded DB Data

In [9]:
# Merge each freshly built shard with the previously downloaded shard of the
# same mod value and save the combined Series under .../full/.
# NOTE(review): the loop starts at modVal 1, so shard 0 is not merged by this
# cell -- confirm that is intentional.
modVals = range(1,100)
ts = timestat("Merging DBs")
for n,modVal in enumerate(modVals):
    newDB = Series(io.get("/Volumes/Seagate/DB/DiscogsDBData/{0}-DB.p".format(modVal)))
    known = io.get("/Users/tgadfort/dbdiscogs/artists-discogs-db/{0}-DB.p".format(modVal))

    # Entries already in the known DB win; only index values not yet known are added.
    toMerge = newDB[~newDB.index.isin(known.index)]
    fullDB = concat([known,toMerge]).sort_index()
    io.save(idata=fullDB, ifile="/Users/tgadfort/dbdiscogs/artists-discogs-db/full/{0}-DB.p".format(modVal))
    # Progress total now follows the real shard count (it was a hard-coded 100
    # for 99 iterations).
    ts.update(n=n+1,N=len(modVals))
ts.stop()

Current Time is Wed Oct 27, 2021 07:41 for Merging DBs
1/100      : Process [Merging DBs] Has Run For 57.4 Seconds.  ETA is 5682.6 Seconds
2/100      : Process [Merging DBs] Has Run For 1.9 Minutes.  ETA is 93.1 Minutes
3/100      : Process [Merging DBs] Has Run For 2.9 Minutes.  ETA is 93.8 Minutes
4/100      : Process [Merging DBs] Has Run For 3.9 Minutes.  ETA is 93.6 Minutes
5/100      : Process [Merging DBs] Has Run For 4.9 Minutes.  ETA is 93.1 Minutes
6/100      : Process [Merging DBs] Has Run For 6.0 Minutes.  ETA is 94.0 Minutes
7/100      : Process [Merging DBs] Has Run For 7.1 Minutes.  ETA is 94.3 Minutes
8/100      : Process [Merging DBs] Has Run For 8.3 Minutes.  ETA is 95.5 Minutes
9/100      : Process [Merging DBs] Has Run For 9.4 Minutes.  ETA is 95.0 Minutes
10/100     : Process [Merging DBs] Has Run For 10.5 Minutes.  ETA is 94.5 Minutes
11/100     : Process [Merging DBs] Has Run For 11.7 Minutes.  ETA is 94.7 Minutes
12/100     : Process [Merging DBs] Has Run For 12

In [None]:
# Display {artistID: [artist name, artist URL]} for every record in `tmp`.
# NOTE(review): `tmp` is not assigned in any cell visible here -- it must already exist in the kernel.
{artistID: [artistData.artist.name, artistData.url.url] for artistID,artistData in tmp.items()}

In [None]:
!python setup.py install

In [None]:
# Inspect masterGenreData; in the visible cells it is only assigned inside the
# "Creating DB Data" loop, so this shows whatever the last iteration left behind.
masterGenreData

In [None]:
# Show the first 40 entries of the release shard after sorting by index.
releaseAlbumModData.sort_index().head(40)

In [None]:
# Look up the release-shard entry stored under the key '400'.
releaseAlbumModData.get('400')

In [None]:
# Keep the release-shard entry for key '400' in a global for the inspection cells below.
#releaseAlbumModData.get('400').values
releaseArtistAlbumData = releaseAlbumModData.get('400')

In [None]:
# Print every element of the entry fetched for key '400' (assigned in the previous cell).
for item in releaseArtistAlbumData:
    print(item)

In [None]:
# Print each key of releaseAlbumData together with the size of its value.
# releaseAlbumData is whatever the "Creating DB Data" loop last assigned.
# .items() is used because Series.iteritems() was removed in pandas 2.0 and
# never existed on dict; .items() works for both a Series and a dict.
for k,v in releaseAlbumData.items():
    print(k,len(v))

In [None]:
# Look up the release-shard entry stored under the key '300'.
releaseAlbumModData.get('300')

In [None]:

# Dump releaseAlbumData up to three levels deep: key and size at the first two
# levels, key and value at the third.
# .items() replaces Series.iteritems(), which was removed in pandas 2.0 and
# never existed on dict; .items() works for both a Series and a dict.
for k,v in releaseAlbumData.items():
    print(k,'\t',len(v))
    if isinstance(v, dict):
        for k2,v2 in v.items():
            print("\t",k2,'\t',len(v2))
            if isinstance(v2, dict):
                for k3,v3 in v2.items():
                    print("\t\t",k3,'\t',v3)

In [None]:
# Load the four per-artist tables for the first mod value only; the slice
# limits the loop to a single pass, and modVal stays bound afterwards.
modFileFormat = "{0}/{1}-{2}.p"
for modVal in range(Nmod)[:1]:
    artistModData  = io.get(modFileFormat.format(savedir, "ArtistModData", modVal))
    genreModData   = io.get(modFileFormat.format(savedir, "ArtistModGenreData", modVal))
    masterModData  = io.get(modFileFormat.format(savedir, "ArtistModMasterAlbumData", modVal))
    releaseModData = io.get(modFileFormat.format(savedir, "ArtistModReleaseData", modVal))

In [None]:
# Sorted union of the index values that appear in any of the four tables.
modValIdxs = sorted(
    set().union(
        artistModData.index,
        genreModData.index,
        masterModData.index,
        releaseModData.index,
    )
)
len(modValIdxs)

In [None]:
# Feed every artist row, plus its genre/master/release lookups, through
# artistDiscogsFull.getData.
# NOTE(review): artistDiscogsFull is defined in a later cell of this notebook,
# so that cell has to be run first.
for artistID,artistDataFrame in artistModData.iterrows():
    dbData = {
        "ID": artistID,
        "Artist": artistDataFrame,
        "Genre": genreModData.get(artistID),
        "Master": masterModData.get(artistID),
        "Release": releaseModData.get(artistID),
    }
    art = artistDiscogsFull()
    art.getData(dbData)

In [None]:
from artistDBBase import artistDBBase, artistDBDataClass
from artistDBBase import artistDBNameClass, artistDBMetaClass, artistDBIDClass, artistDBURLClass, artistDBPageClass
from artistDBBase import artistDBProfileClass, artistDBMediaClass, artistDBMediaAlbumClass
from artistDBBase import artistDBMediaDataClass, artistDBMediaCountsClass, artistDBFileInfoClass
from artistDBBase import artistDBTextClass, artistDBLinkClass
from strUtils import fixName
from dbUtils import utilsDiscogs


class artistDiscogsFull(artistDBBase):
    """Builds an artistDBDataClass record from already-parsed Discogs data.

    The input is a plain dictionary (see getData) rather than a downloaded
    page, so everything is assembled from the values handed in.
    """
    def __init__(self, debug=False):
        super().__init__(debug)
        self.dutils = utilsDiscogs()


    ##############################################################################################################################
    ## Parse Data
    ##############################################################################################################################
    def getData(self, inputdata):
        """Assemble the DB record for one artist.

        Args:
            inputdata: dict with the keys "ID", "Artist", "Genre", "Master"
                and "Release". "Artist" must support item access for "name",
                "realname", "MasterAliases", "MasterGroups", "MasterMembers"
                and "MasterNameVariations". "Master" is either None or a
                mapping of code -> info with "Album", "Artists" and "Year".

        Returns:
            The artistDBDataClass built from the input.

        Raises:
            ValueError: if inputdata is not a dict.
        """
        if not isinstance(inputdata, dict):
            raise ValueError("Input must be a Discogs Dictionary")

        artistID    = inputdata["ID"]
        artistData  = inputdata["Artist"]
        genreData   = inputdata["Genre"]
        masterData  = inputdata["Master"]
        releaseData = inputdata["Release"]

        # NOTE(review): unconditional debug output kept from the original;
        # releaseData is only used here.
        print(artistData)
        print(genreData)
        print(masterData)
        print(releaseData)
        artistName  = artistData["name"]
        url         = "https://www.discogs.com/artist/{0}".format(artistID)

        # Profile fields; None entries are dropped and an empty profile becomes None.
        generalData = {}
        generalData["RealName"]   = artistData["realname"]
        generalData["Aliases"]    = artistData["MasterAliases"]
        generalData["Groups"]     = artistData["MasterGroups"]
        generalData["Members"]    = artistData["MasterMembers"]
        generalData["Variations"] = artistData["MasterNameVariations"]
        generalData = {k: v for k,v in generalData.items() if v is not None}
        generalData = generalData if len(generalData) > 0 else None

        # Same genre/style extraction the "Creating DB Data" cell performs;
        # previously genreData was read but never reached the profile.
        genresData  = genreData.get("Genres") if isinstance(genreData, dict) else None
        tagsData    = genreData.get("Styles") if isinstance(genreData, dict) else None

        artist      = artistDBNameClass(name=artistName, err=None)
        meta        = artistDBMetaClass(title=None, url=url)
        url         = artistDBURLClass(url=url)
        ID          = artistDBIDClass(ID=artistID)
        pages       = artistDBPageClass(ppp=1, tot=1, redo=False, more=False)
        profile     = artistDBProfileClass(general=generalData, genres=genresData, tags=tagsData)

        # NOTE(review): the original indexed the media dict with an undefined
        # `name`; "Masters" is a guess at the intended key -- confirm.
        mediaName = "Masters"
        mediaData = {mediaName: []}
        # A missing master entry is treated as "no albums".
        masterItems = masterData.items() if masterData is not None else []
        for code,masterInfo in masterItems:
            album        = masterInfo["Album"]
            # NOTE(review): master codes are formatted into a /release/ URL -- confirm.
            albumURL     = "https://www.discogs.com/release/{0}".format(code)
            albumArtists = masterInfo["Artists"]
            albumYear    = masterInfo["Year"]
            # Fixed: was year=year (undefined name).
            amdc = artistDBMediaDataClass(album=album, url=albumURL, artist=albumArtists, code=code, year=albumYear)
            # Fixed: the append used to sit after the loop, keeping at most one album.
            mediaData[mediaName].append(amdc)

        # Fixed: use the container built above; it used to be discarded in
        # favour of self.getMedia(), which this class does not define.
        media       = artistDBMediaClass()
        media.media = mediaData
        mediaCounts = self.getMediaCounts(media)
        info        = artistDBFileInfoClass(info=None)

        adc = artistDBDataClass(artist=artist, meta=meta, url=url, ID=ID, pages=pages, profile=profile, mediaCounts=mediaCounts, media=media, info=info)

        return adc

In [None]:
%autoreload

In [None]:
# Run the primary parser for "Discogs" on mod value 0 only, forced, in a single process.
# NOTE(review): masterPoolPrimaryParser is not imported in the cells visible here -- confirm where it comes from.
masterPoolPrimaryParser("Discogs", modVals=[0], force=True, numProcs=1)

In [None]:
# Wrap one artist pickle in a fileInfo object for inspection.
# NOTE(review): fileInfo is not imported in the cells visible here -- confirm where it comes from.
fInfo = fileInfo("/Volumes/Piggy/Discog/artists-discogs/0/1000500.p")

In [None]:
# Show the `created` attribute of the fileInfo object from the previous cell.
fInfo.created

In [None]:
# Read one artist pickle through ioUtils.getFile with version=2, the same
# fallback reader the re-pickling loop below uses.
from ioUtils import getFile
data = getFile("/Volumes/Piggy/Discog/artists-discogs/0/1000500.p", version=2)

In [None]:
%autoreload
from dbArtistsDiscogs import dbArtistsDiscogs
dbDiscogs = dbArtistsDiscogs()
from glob import glob
from ioUtils import getFile
io = fileIO()
# Re-pickle artist files that the current io.get cannot read: anything io.get
# loads is left alone; otherwise the file is read with getFile(version=2) and
# written back in place through io.save.
# NOTE(review): the range starts at 17, presumably resuming an interrupted run -- confirm.
for modVal in range(17,100):
    files = glob(setFile(setDir(dbDiscogs.disc.getArtistsDir(), str(modVal)), "*.p"))
    ts = timestat("Repickling {0} Files For ModVal={1}".format(len(files), modVal))
    N  = len(files)
    n2 = 0  # number of files recovered through the version=2 reader
    for i,ifile in enumerate(files):
        if (i+1) % 2000 == 0 or (i+1) == 1000:
            ts.update(n=i+1, N=N)

        try:
            artistData = io.get(ifile)
            continue
        except Exception:
            # Narrowed from a bare except so Ctrl-C still interrupts the loop.
            try:
                artistData = getFile(ifile, version=2)
                n2 += 1
            except Exception:
                print("Can't open {0}".format(ifile))
                # Fixed: skip the save. Falling through used to write the
                # previous file's data (or raise NameError) over this file.
                continue

        io.save(ifile=ifile, idata=artistData)

            #print("Fixed {0}/{1} Files".format(n2,i+1))
    ts.stop()
#data = getFile("/Volumes/Piggy/Discog/artists-discogs/0/1000500.p", version=2)

In [None]:
# Number of pickle files matched for the last mod value the re-pickling loop reached.
len(files)