In [None]:
# Load IPython's autoreload extension.
# NOTE(review): no `%autoreload <mode>` line follows, so automatic reloading may not actually be switched on — confirm.
%load_ext autoreload
from IPython.display import display, HTML
# Inject CSS that forces the `.container` element to the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Basic Functions

In [None]:
from utils import FileIO, CSVIO, PickleIO
from utils import DirInfo, FileInfo
from utils import Timestat
from pandas import read_csv, DataFrame, Series, to_datetime, NaT, isna, concat, merge
from numpy import nan
from glob import glob
import gc

def loadData(ifile):
    r"""Load one headerless, tab-delimited dump file.

    The file is read through the module-level ``cio`` reader with
    ``on_bad_lines='skip'``; every cell equal to the literal ``\N`` is then
    replaced by ``nan``.

    Parameters
    ----------
    ifile : path-like
        Location of the dump file, passed straight to ``cio.get``.

    Returns
    -------
    The object returned by ``cio.get`` after the ``\N`` -> ``nan``
    replacement (presumably a pandas DataFrame — confirm against ``CSVIO``).
    """
    rawTable = cio.get(ifile, delimiter="\t", header=None, on_bad_lines='skip')
    return rawTable.replace('\\N', nan)


def getData(files, colnames):
    """Load several dump files and keep only the named columns of each.

    Parameters
    ----------
    files : iterable of path-like
        Dump files to load; each is keyed by ``FileInfo(ifile).basename``.
    colnames : dict or None
        ``{table name: {column position: new column name}}``. Tables missing
        from this mapping are discarded. When ``None``, every table is
        returned untouched.

    Returns
    -------
    dict
        ``{table name: table}`` with columns selected and renamed.

    The table names are printed once after loading and once after filtering.
    """
    tables = {}
    for ifile in files:
        tableName = FileInfo(ifile).basename
        tables[tableName] = loadData(ifile)
    print("Keys: {0}".format(tables.keys()))

    if colnames is not None:
        kept = {}
        for tableName, table in tables.items():
            if tableName not in colnames:
                continue
            mapping = colnames[tableName]
            kept[tableName] = table[list(mapping.keys())].rename(columns=mapping)
        tables = kept

    print("Keys: {0}".format(tables.keys()))
    return tables


def setIndex(data):
    """Promote the first column of every table to its index, in place.

    Parameters
    ----------
    data : dict
        ``{table name: DataFrame}``. Each DataFrame is mutated: its first
        column becomes the index and is removed from the columns.

    Returns
    -------
    dict
        The same ``data`` object that was passed in.
    """
    for table in data.values():
        firstCol = table.columns[0]
        # pop() removes the column from the frame and hands back its values.
        table.index = table.pop(firstCol)
    return data


def createDate(year, month, day):
    """Build a timestamp from string date parts, using as many as are given.

    The most specific format whose parts are all strings is used:
    year-month-day, then year-month, then year alone.

    Parameters
    ----------
    year, month, day : str or other
        Date components. Any non-string value (e.g. ``None`` or ``nan``)
        counts as missing.

    Returns
    -------
    pandas.Timestamp or NaT
        The parsed date, or ``NaT`` when ``year`` is not a string or when the
        text cannot be parsed.

    Notes
    -----
    Parsing uses ``errors='coerce'``. The previous ``errors='ignore'`` is
    deprecated in pandas and returned the unparsed input string on failure,
    which made the return type inconsistent with the ``NaT`` fallback.
    """
    def _allStr(*parts):
        return all(isinstance(part, str) for part in parts)

    if _allStr(year, month, day):
        return to_datetime('{0}-{1}-{2}'.format(year, month, day), format='%Y-%m-%d', errors='coerce')
    if _allStr(year, month):
        return to_datetime('{0}-{1}'.format(year, month), format='%Y-%m', errors='coerce')
    if _allStr(year):
        return to_datetime('{0}'.format(year), format='%Y', errors='coerce')
    return NaT


def convertToDatetime(year, month, day):
    """Combine three Series of date parts into one datetime Series.

    Parameters
    ----------
    year, month, day : pandas.Series
        Date components, aligned on their index (left join on ``year``).
        Values may be digit strings or whole numbers; anything else (missing,
        non-digit text, fractional numbers) is treated as invalid.

    Returns
    -------
    pandas.Series
        Datetime values; a row is ``NaT`` when any of its parts is invalid.
    """
    def _toInt(value):
        # Invalid parts become -1, which to_datetime(errors='coerce') turns into NaT.
        if isna(value):
            return -1
        if isinstance(value, str):
            return int(value) if value.isdigit() else -1
        try:
            asInt = int(value)
        except (TypeError, ValueError):
            return -1
        return asInt if asInt == value else -1

    # Name each Series explicitly: DataFrame.join refuses unnamed Series and
    # clashes on identically named ones.
    parts = year.apply(_toInt).rename("year").to_frame()
    parts = parts.join(month.apply(_toInt).rename("month"))
    parts = parts.join(day.apply(_toInt).rename("day"))
    return to_datetime(parts, errors='coerce')


# Shared reader/writer helpers used by every cell below.
# NOTE: the name `io` shadows the standard-library `io` module within this notebook.
cio = CSVIO()
pio = PickleIO()
io  = FileIO()

# Directory layout. The former `basedir = "./"` assignment was removed: it was
# overwritten on the very next line and never took effect.
basedir = DirInfo("/Volumes/Seagate/DB")
saveDir = DirInfo(basedir.join("MusicBrainzData"))
lookDir = DirInfo(saveDir.join("lookup"))
dumpDir = DirInfo(basedir.join("mbdump"))

# Name -> numeric ID for a handful of artists (presumably MusicBrainz artist IDs used for spot checks — confirm).
aIDs = {
    "ArianaGrande": 823336,
    "BuddyHolly": 10937,
    "Rupaul": 34318,
    "U2": 197,
    "DMB": 502,
    "Bono": 35575,
    "Mozart": 11285,
    "JohnMayer": 33563,
}

In [None]:
# Symlink the dump directory into the working directory; several later cells glob the relative path "mbdump/...".
!ln -s '/Volumes/Seagate/DB/mbdump'

# DB Data

In [None]:
# Parse flength.csv: whitespace-separated "<integer> <path>" lines.
# The file is opened in a `with` block so the handle is always closed.
with open("flength.csv") as lengthFile:
    lengthRows = [line.split() for line in lengthFile]

# Key: second "/"-separated component of the path; value: the leading integer.
# Lines with fewer than two fields (e.g. blank lines) or a path without "/" are skipped.
lengthData = Series({
    row[1].split("/")[1]: int(row[0])
    for row in lengthRows
    if len(row) > 1 and len(row[1].split("/")) > 1
})
lengthData

In [None]:
# List every key stored in lengthData.
list(lengthData.index)

In [None]:
# Show the entries whose integer value is 6 or less.
#lengthData['release_label']
lengthData[(lengthData <= 6)]

# Lookup Dictionary

In [None]:
# Column position -> name for the gender table.
colnames = {'gender': {0: "GenderID", 1: "GenderName"}}

ts = Timestat("Loading Gender Data")
files = list(dumpDir.glob("gender*"))
genderData = setIndex(getData(files, colnames))
ts.stop()

# Persist the GenderID -> GenderName lookup, then free the loaded tables.
io.save(idata=genderData['gender']['GenderName'], ifile=lookDir.join("Gender.p"))
del genderData
_ = gc.collect()

In [None]:
# Column position -> name for the area tables that are kept.
colnames = {
    "area_type": {0: "AreaTypeID", 1: "AreaTypeName", 3: "AlsoAreaTypeID", 4: "AreaTypeDescr", 5: "AreaTypeGID"},
    "area": {0: "AreaID", 1: "AreaGID", 2: "AreaName", 3: "AreaTypeID"},
}
# Mappings for tables that are currently not loaded:
#colnames["area_gid_redirect"] = {0: "AreaGIDUUID", 1: "AreaGID"}
#colnames["area_alias_type"] = {0: "AreaAliasTypeID", 1: "AreaAliasTypeName", 5: "AreaAliasTypeUUID"}
#colnames["area_alias"] = {0: "AreaAliasID", 1: "NA1", 2: "AreaAlias", 3: "AreaLang", 6: "AliasTypeID", 7: "AreaSortName"}

ts = Timestat("Loading Area Data")
files = dumpDir.glob("area*")
areaData = setIndex(getData(files, colnames))
ts.stop()

# Persist the AreaID -> AreaName lookup, then free the loaded tables.
io.save(idata=areaData['area']['AreaName'], ifile=lookDir.join("Area.p"))
del areaData
_ = gc.collect()

In [None]:
# Column position -> name for the ISRC / ISWC / ISO-3166 tables.
colnames = {}
colnames['isrc'] = {0: "ISRCID", 1: "RecordingID", 2: "ISRC"}
colnames['iswc'] = {0: "ISWCID", 1: "WorkID", 2: "ISWC"}
colnames['iso_3166_1'] = {0: "ISO31661ID", 1: "ISO31661"}
colnames['iso_3166_2'] = {0: "ISO31662ID", 1: "ISO31662"}
colnames['iso_3166_3'] = {0: "ISO31663ID", 1: "ISO31663"}

ts = Timestat("Loading i* Code Data")
files = dumpDir.glob("is*")
icodeData = getData(files, colnames)
icodeData = setIndex(icodeData)
ts.stop()

# ISWC code indexed by WorkID, with repeated ISWC values dropped.
iSWCData = icodeData['iswc']["ISWC"].copy(deep=True)
iSWCData.index = icodeData['iswc']['WorkID']
iSWCData = iSWCData.drop_duplicates()

io.save(idata=iSWCData, ifile=lookDir.join("iSWC.p"))
# Free the loaded tables as well as the derived Series, as the other lookup
# cells do; previously icodeData stayed in memory.
del iSWCData, icodeData
_ = gc.collect()

In [None]:
# Column position -> name for the language table.
colnames = {}
colnames["language"] = {0: "LanguageID", 1: "LanguageShort1", 2: "LanguageShort2", 3: "LanguageShort3", 4: "LanguageName", 5: "NA5", 6: "LanguageShort"}

# Timer label corrected: it previously read "Loading URL Data" (copy/paste).
ts = Timestat("Loading Language Data")
files = dumpDir.glob("language*")
languageData = getData(files, colnames)
languageData = setIndex(languageData)
ts.stop()

# Persist the LanguageID -> LanguageName lookup, then free the loaded tables.
io.save(idata=languageData['language']['LanguageName'], ifile=lookDir.join("Language.p"))
del languageData
_ = gc.collect()

In [None]:
# Column position -> name for the script table.
colnames = {}
colnames["script"] = {0: "ScriptID", 1: "ScriptName", 2: "NA2", 3: "ScriptDescr", 4: "NA4"}

# Timer label corrected: it previously read "Loading URL Data" (copy/paste).
ts = Timestat("Loading Script Data")
files = dumpDir.glob("script")
scriptData = getData(files, colnames)
scriptData = setIndex(scriptData)
ts.stop()

# Persist the ScriptID -> ScriptName lookup, then free the loaded tables.
io.save(idata=scriptData['script']['ScriptName'], ifile=lookDir.join("Script.p"))
del scriptData
_ = gc.collect()

In [None]:
# Column position -> name for the label tables.
colnames = {
    "label_alias": {0: "LabelAliasID", 1: "LabelID", 2: "LabelAliasName", 7: "LabelAliasName2"},
    "label_alias_type": {0: "LabelAliasTypeID", 1: "LabelAliasTypeName", 5: "LabelAliasGID"},
    "label_ipi": {0: "LabelIPIID", 1: "LabelIPI"},
    "label_isni": {0: "LabelISNIID", 1: "LabelISNI"},
    "label_type": {0: "LabelTypeID", 1: "LabelTypeName", 5: "LabelTypeGID"},
    "label": {0: "LabelID", 1: "LabelGID", 2: "LabelName"},
}

ts = Timestat("Loading Label Data")
files = dumpDir.glob("label*")
labelData = setIndex(getData(files, colnames))
ts.stop()

# Persist the LabelID -> LabelName lookup, then free the loaded tables.
io.save(idata=labelData['label']['LabelName'], ifile=lookDir.join("Label.p"))
del labelData
_ = gc.collect()

# Gender

In [None]:
# Column position -> name for the gender table.
colnames = {'gender': {0: "GenderID", 1: "GenderName"}}

ts = Timestat("Loading Gender Data")
files = dumpDir.glob("gender*")
genderData = setIndex(getData(files, colnames))
ts.stop()

# Save the whole {table name: table} dict.
savename = saveDir.join("GenderData.p")
io.save(idata=genderData, ifile=savename)

# Lookup

In [None]:
# Column position -> name for the label/release link table.
colnames = {}
colnames["l_label_release"] = {0: "Index", 1: "NA1", 2: "NA2", 3: "ReleaseID"}

# Timer label corrected: it previously read "Loading Area Data" (copy/paste).
ts = Timestat("Loading Label-Release Lookup Data")
files = glob("mbdump/l_label_release")
# Use the shared loader instead of a hand-copied version of its body; it also
# ignores any table that has no entry in `colnames` instead of raising KeyError.
lookupData = getData(files, colnames)
ts.stop()

In [None]:
# Rows of l_label_release whose ReleaseID appears in `releaseIDs`.
# NOTE(review): `releaseIDs` is not defined in any cell shown before this one — it must already exist in the kernel; confirm where it comes from.
lookupData['l_label_release'][lookupData['l_label_release']['ReleaseID'].isin(releaseIDs)]

In [None]:
# Column-wise maximum of the l_label_release table.
lookupData['l_label_release'].max()

# Area

In [None]:
# Column position -> name for the area tables that are kept.
colnames = {
    "area_type": {0: "AreaTypeID", 1: "AreaTypeName", 3: "AlsoAreaTypeID", 4: "AreaTypeDescr", 5: "AreaTypeGID"},
    "area": {0: "AreaID", 1: "AreaGID", 2: "AreaName", 3: "AreaTypeID"},
}
# Mappings for tables that are currently not loaded:
#colnames["area_gid_redirect"] = {0: "AreaGIDUUID", 1: "AreaGID"}
#colnames["area_alias_type"] = {0: "AreaAliasTypeID", 1: "AreaAliasTypeName", 5: "AreaAliasTypeUUID"}
#colnames["area_alias"] = {0: "AreaAliasID", 1: "NA1", 2: "AreaAlias", 3: "AreaLang", 6: "AliasTypeID", 7: "AreaSortName"}

ts = Timestat("Loading Area Data")
files = dumpDir.glob("area*")
areaData = setIndex(getData(files, colnames))
ts.stop()

# Save the whole {table name: table} dict.
savename = saveDir.join("AreaData.p")
io.save(idata=areaData, ifile=savename)

# Event

In [None]:
# Column position -> name for the event tables.
colnames = {
    "event_type": {0: "EventTypeID", 1: "EventTypeName", 4: "EventTypeDescr"},
    "event_alias_type": {0: "EventAliasTypeID", 1: "EventAliasTypeName", 5: "EventAliasTypeGID"},
    "event_alias": {0: "EventAliasID", 1: "EventID", 2: "EventAliasName", 3: "EventAliasLang", 7: "EventAliasName2"},
    "event": {
        0: "EventID", 1: "EventGID", 2: "EventName",
        3: "EventStartYear", 4: "EventStartMonth", 5: "EventStartDay",
        6: "EventEndYear", 7: "EventEndMonth", 8: "EventEndDay",
    },
}

ts = Timestat("Loading Event Data")
files = glob("mbdump/event*")
# getData performs the same load / print / select-and-rename / print sequence
# that was previously written out inline here. No setIndex: the ID stays a column.
eventData = getData(files, colnames)
ts.stop()

In [None]:
# Display the renamed event table.
eventData['event']

# ICode

In [None]:
# Column position -> name for the ISRC / ISWC / ISO-3166 tables.
colnames = {
    'isrc': {0: "ISRCID", 1: "RecordingID", 2: "ISRC"},
    'iswc': {0: "ISWCID", 1: "WorkID", 2: "ISWC"},
    'iso_3166_1': {0: "ISO31661ID", 1: "ISO31661"},
    'iso_3166_2': {0: "ISO31662ID", 1: "ISO31662"},
    'iso_3166_3': {0: "ISO31663ID", 1: "ISO31663"},
}

ts = Timestat("Loading i* Code Data")
files = glob("mbdump/is*")
icodeData = setIndex(getData(files, colnames))
ts.stop()

# ISWC code indexed by WorkID, with repeated ISWC values dropped.
iSWCData = icodeData['iswc'].set_index('WorkID')["ISWC"].drop_duplicates()
del icodeData

# Medium

In [None]:
# Column position -> name for the medium tables that are kept.
colnames = {
    "medium_format": {0: "MediumFormatID", 1: "MediumName", 2: "MediumGroupID", 3: "NA3", 4: "MediumIntroYear", 5: "MediumDescr", 6: "MediumGID"},
    "medium": {0: "ReleaseID_1", 1: "ReleaseID", 2: "SideNum", 3: "MediumFormatID", 4: "NA4", 5: "NA5", 7: "NumTracks"},
}
# Mapping for a table that is currently not loaded:
#colnames["medium_cdtoc"]  = {0: "NA0", 1: "NA1", 2: "NA2", 3: "NA3"}

ts = Timestat("Loading Medium Data")
files = glob("mbdump/medium*")
# getData performs the same load / print / select-and-rename / print sequence
# that was previously written out inline here. No setIndex: the ID stays a column.
mediumData = getData(files, colnames)
ts.stop()

In [None]:
# Number of distinct values in each column of medium_format.
mediumData['medium_format'].nunique()

In [None]:
# Number of distinct values in each column of medium.
mediumData['medium'].nunique()

In [None]:
## 12" Vinyl ; ReleaseID=1310220
#mediumData['medium'][mediumData['medium'].eq(1310220).any(1)]
# MediumID  ReleaseID  Sides?  Format?   ?
# 1310220   1274390    1       33        4
# 1351220   1310220    1       31        11      <- Matches https://musicbrainz.org/release/6b5f33a8-fc5e-4c1e-b379-c659ce20a1c8


## Digital Media : ReleaseID=1792210
#mediumData['medium'][mediumData['medium'].eq(1792210).any(1)]
# MediumID  ReleaseID  Sides?  Format?   ?
# 1792210   1694923    1      12	NaN	0	2015-11-30 00:32:22.335038+00	13
# 1904010   1792210    1      12	NaN	0	2016-06-20 18:41:01.106192+00	11   Both Match


# 2x12" Vinyl ; ReleaseID=1680415

#mediumData['medium'][mediumData['medium'].eq(1680415).any(1)]
# MediumID  ReleaseID  Sides?  Format?   ?
# 1680415	1598741	1	1	NaN	0	2015-04-27 04:37:49.277705+00	11
# 1775088	1680415	1	31	NaN	0	2015-10-31 12:02:35.843886+00	5
# 1775089	1680415	2	31	NaN	0	2015-10-31 12:02:35.843886+00	6

# 1792210   1694923    1      12	NaN	0	2015-11-30 00:32:22.335038+00	13
# 1904010   1792210    1      12	NaN	0	2016-06-20 18:41:01.106192+00	11   Both Match

# Pixies Velouria CD w/ 4 Tracks
# 3099, 1162482, 3097, 2259927

## CD
# mediumData['medium'][mediumData['medium'].eq(3099).any(1)]
# 3099	3099	1	1	NaN	0	2011-05-16 14:57:06.530063+00	4

## CD (Status=Promotional)
# mediumData['medium'][mediumData['medium'].eq(1162482).any(1)]
# 1162482	1146184	1	12	NaN	0	2012-04-14 05:35:56.931961+00	1
# 1181517	1162482	1	1	NaN	0	2012-05-30 00:53:24.512335+00	4

## CD
# mediumData['medium'][mediumData['medium'].eq(3097).any(1)]
# 3097	3097	1	1	NaN	0	2012-10-18 19:49:17.567219+00	4

## 12" Vinyl
# mediumData['medium'][mediumData['medium'].eq(2259927).any(1)]
# 2259927	2099781	1	12	NaN	0	2018-01-11 08:03:13.107915+00	5
# 2441117	2259927	1	31	NaN	0	2018-09-28 23:20:29.279923+00	4  <-- This matches Web

In [None]:
# Number of distinct values in each column of medium (same check as in an earlier cell).
mediumData['medium'].nunique()

In [None]:
# (rows, columns) of the medium table.
mediumData['medium'].shape

In [None]:
# Rows of medium_cdtoc in which any column equals 1598741.
# The axis is passed by keyword: pandas 2.x no longer accepts `.any(1)`.
# NOTE(review): the 'medium_cdtoc' mapping is commented out in the medium loading cell, so that table is
# filtered out of mediumData and this lookup raises KeyError until the mapping is re-enabled.
mediumData['medium_cdtoc'][mediumData['medium_cdtoc'].eq(1598741).any(axis=1)]

In [None]:
# Rows of medium_format in which any column equals the integer 38.
# The axis is passed by keyword: pandas 2.x no longer accepts `.any(1)`.
mediumData['medium_format'][mediumData['medium_format'].eq(38).any(axis=1)]

In [None]:
# Rows of medium_format in which any column equals the string '38'.
# The axis is passed by keyword: pandas 2.x no longer accepts `.any(1)`.
mediumData['medium_format'][mediumData['medium_format'].eq('38').any(axis=1)]

In [None]:
# Rows of medium_format in which any column equals the integer 63.
# The axis is passed by keyword: pandas 2.x no longer accepts `.any(1)`.
mediumData['medium_format'][mediumData['medium_format'].eq(63).any(axis=1)]

In [None]:
# Rows of medium_format in which any column equals the integer 64.
# The axis is passed by keyword: pandas 2.x no longer accepts `.any(1)`.
mediumData['medium_format'][mediumData['medium_format'].eq(64).any(axis=1)]

# Language

In [None]:
# Column position -> name for the language table.
colnames = {}
colnames["language"] = {0: "LanguageID", 1: "LanguageShort1", 2: "LanguageShort2", 3: "LanguageShort3", 4: "LanguageName", 5: "NA5", 6: "LanguageShort"}

# Timer label corrected: it previously read "Loading URL Data" (copy/paste).
ts = Timestat("Loading Language Data")
files = dumpDir.glob("language*")
languageData = getData(files, colnames)
languageData = setIndex(languageData)
ts.stop()

# Save the whole {table name: table} dict.
savename = saveDir.join("LanguageData.p")
io.save(idata=languageData, ifile=savename)

# Script

In [None]:
# Column position -> name for the script table.
colnames = {"script": {0: "ScriptID", 1: "ScriptName", 2: "NA2", 3: "ScriptDescr", 4: "NA4"}}

ts = Timestat("Loading Script Data")
files = dumpDir.glob("script")
scriptData = setIndex(getData(files, colnames))
ts.stop()

# Save the whole {table name: table} dict.
savename = saveDir.join("ScriptData.p")
io.save(idata=scriptData, ifile=savename)

# URL

In [None]:
# Column position -> name for the url table.
colnames = {"url": {0: "URLID", 1: "URLGID", 2: "URLName"}}
# Mapping for a table that is currently not loaded:
#colnames["url_gid_redirect"] = {0: "URLGIDUUID", 1: "URLGIDID"}

ts = Timestat("Loading URL Data")
files = dumpDir.glob("url*")
urlData = setIndex(getData(files, colnames))
ts.stop()

# Save the whole {table name: table} dict.
savename = saveDir.join("URLData.p")
io.save(idata=urlData, ifile=savename)

# Work

In [None]:
# Column position -> name for the work tables that are kept.
colnames = {
    "work_type": {0: "WorkTypeID", 1: "WorkTypeName", 3: "WorkTypeRanking", 4: "WorkTypeDescr", 5: "WorkTypeGID"},
    'work_attribute_type_allowed_value': {0: 'WorkAttributeTypeValueID', 1: "WorkAttributeTypeID", 2: "WorkAttributeTypeValue", 6: "WorkAttributeTypeValueGID"},
    "work_attribute_type": {0: "WorkAttributeTypeID", 1: "WorkAttributeTypeName", 6: "WorkAttributeTypeDescr"},
    "work_attribute": {0: "WorkAttributeID", 1: "WorkID", 2: "WorkAttributeTypeID", 3: "WorkAttributeTypeValueID", 4: "WorkAttributeCode"},
    "work_language": {0: "WorkID", 1: "LanguageID"},
    "work": {0: "WorkID", 1: "WorkGID", 2: "WorkName", 3: "WorkTypeID"},  # , 4: "WorkDescr"
}
# Mappings for tables that are currently not loaded:
#colnames["work_alias"] = {0: "WorkAliasID", 1: "WorkID", 2: "WorkName", 3: "WorkLang", 7: "WorkName2"}
#colnames["work_alias_type"] = {0: "WorkAliasTypeID", 1: "WorkAliasTypeName", 5: "WorkAliasTypeGID"}

ts = Timestat("Loading Work Data")
files = dumpDir.glob("work*")
workData = setIndex(getData(files, colnames))
ts.stop()

## Append Data And Create Master Work DataFrame

In [None]:
# Replace the ID columns of the work tables with readable names, then save.
# NOTE: the ID columns are dropped in place, so this cell cannot be re-run
# without re-running the work loading cell first.

ts = Timestat("Joining Work Language Name")
try:
    languageData  = io.get(saveDir.join("LanguageData.p"))
    dLanguageName = languageData['language']["LanguageName"]
except Exception as err:
    # Chain the original error so the real cause (missing file, missing key, ...) stays visible.
    raise ValueError("Error loading language data") from err
workData['work_language']["Language"] = workData['work_language']['LanguageID'].apply(lambda x: dLanguageName.get(x) if not isna(x) else None)
workData['work_language'].drop(["LanguageID"], axis=1, inplace=True)
ts.stop()

# The attribute-type name lookup is shared by the next two joins.
dWorkAttributeTypeName = workData['work_attribute_type']["WorkAttributeTypeName"].to_dict()

ts = Timestat("Joining Work Attribute Type (Allowed Values)")
workData['work_attribute_type_allowed_value']["WorkAttributeType"] = workData['work_attribute_type_allowed_value']['WorkAttributeTypeID'].apply(lambda x: dWorkAttributeTypeName.get(x) if not isna(x) else None)
workData['work_attribute_type_allowed_value'].drop(["WorkAttributeTypeID"], axis=1, inplace=True)
ts.stop()

ts = Timestat("Joining Work Attribute Type (Attributes)")
workData['work_attribute']["WorkAttributeType"] = workData['work_attribute']['WorkAttributeTypeID'].apply(lambda x: dWorkAttributeTypeName.get(x) if not isna(x) else None)
workData['work_attribute'].drop(["WorkAttributeTypeID"], axis=1, inplace=True)
ts.stop()

ts = Timestat("Joining Work Type")
dWorkTypeName = workData['work_type']["WorkTypeName"].to_dict()
workData['work']["WorkTypeName"] = workData['work']['WorkTypeID'].apply(lambda x: dWorkTypeName.get(int(x)) if not isna(x) else None)
workData['work'].drop(["WorkTypeID"], axis=1, inplace=True)
ts.stop()

###
# Ignore Work Attributes
###

ts = Timestat("Dropping Last Columns")
workData['work'].drop(["WorkGID"], axis=1, inplace=True)
ts.stop()

savename = saveDir.join("WorkDataFrame.p")
# Timer label corrected: it previously said "Release" (copy/paste).
ts = Timestat("Saving Master Work DataFrame To {0}".format(savename.str))
io.save(idata=workData, ifile=savename)
ts.stop()

# Label

In [None]:
# Column position -> name for the label tables.
colnames = {
    "label_alias": {0: "LabelAliasID", 1: "LabelID", 2: "LabelAliasName", 7: "LabelAliasName2"},
    "label_alias_type": {0: "LabelAliasTypeID", 1: "LabelAliasTypeName", 5: "LabelAliasGID"},
    "label_ipi": {0: "LabelIPIID", 1: "LabelIPI"},
    "label_isni": {0: "LabelISNIID", 1: "LabelISNI"},
    "label_type": {0: "LabelTypeID", 1: "LabelTypeName", 5: "LabelTypeGID"},
    "label": {0: "LabelID", 1: "LabelGID", 2: "LabelName"},
}

ts = Timestat("Loading Label Data")
files = dumpDir.glob("label*")
labelData = setIndex(getData(files, colnames))
ts.stop()

# Save the whole {table name: table} dict.
savename = saveDir.join("LabelData.p")
io.save(idata=labelData, ifile=savename)

# Recording

In [None]:
# Column position -> name for the recording table.
colnames = {
    "recording": {0: "RecordingID", 1: "RecordingGID", 2: "RecordingName", 3: "ArtistID", 4: "TimeLength"},  # , 5: "RecordingDescr"
}
# Mappings for tables that are currently not loaded:
#colnames["recording_alias_type"] = {0: "RecordingAliasTypeID", 1: "RecordingAliasTypeName"}
#colnames["recording_alias"] = {0: "RecordingAliasID", 1: "RecordingID", 2: "RecordingAliasName", 3: "RecordingAliasLang", 7: "recordingAliasName2"}

ts = Timestat("Loading Recording Data")
files = dumpDir.glob("recording*")
recordingData = setIndex(getData(files, colnames))
ts.stop()

## Append Data And Create Master Recording DataFrame

In [None]:
# Remove the GID column (in place), then save the whole {table name: table} dict.
ts = Timestat("Dropping Last Columns")
recordingData['recording'].drop(columns=["RecordingGID"], inplace=True)
ts.stop()

savename = saveDir.join("RecordingDataFrame.p")
saveLabel = "Saving Master Recording DataFrame To {0} (~1.1 min)".format(savename.str)
ts = Timestat(saveLabel)
io.save(idata=recordingData, ifile=savename)
ts.stop()

# Track

In [None]:
# Column position -> name for the track table.
colnames = {}
colnames["track"] = {0: "TrackID", 1: "TrackGID", 2: "RecordingID", 3: "NA3", 4: "TrackNum", 5: "TrackNumName", 6: "TrackName", 7: "ArtistID", 8: "TimeLength"}

# Sample raw rows, kept for reference when matching columns across tables.
# Release
# 2373946	7c5d14b4-cf40-4eb1-89e6-d448125d94f3	1987-12-12: Hampton Coliseum, Hampton, VA, USA	197	2128729	3	\N	120	28	\N		0	-1	2019-03-15 09:46:50.446876+00

# Release Group
# 2128729	f979a1c6-b6c2-4aad-b5b6-709c6c216752	1987-12-12: Hampton Coliseum, Hampton, VA, USA	197	1		0	2019-03-15 09:46:45.458509+00

# Recording
# 24365864	6a355a78-06c0-4e06-a15e-05d6e533255e	Sunday Bloody Sunday	197	366000		0	2019-03-15 12:29:22.049215+00	f

# Track
# 27710495	fc969b10-fdba-4b0c-82d9-aa6a7179e41f	24365864	2571008	7	7	Sunday Bloody Sunday	197	366000	0	2019-03-15 12:29:22.049215+00	f

# Fixed: `timestat(...)` and `fileInfo(...)` were undefined names (the imported
# classes are `Timestat` and `FileInfo`), so this cell raised NameError.
# The timer label also said "Recording" although the track table is loaded.
ts = Timestat("Loading Track Data")
files = glob("mbdump/track")
# getData performs the load / print / select-and-rename / print sequence that
# was previously written out inline here.
trackData = getData(files, colnames)
ts.stop()

# Release

In [None]:
# Column position -> name for the release tables (release_group* files are excluded below).
colnames = {}
#colnames["release_alias_type"] = {0: "ReleaseAliasTypeID", 1: "ReleaseAliasTypeName"}
#colnames["release_alias"] = {0: "ReleaseAliasID", 1: "ReleaseID", 2: "ReleaseAliasName", 3: "ReleaseAliasLang", 7: "ReleaseAliasName2"}
# Fixed: key 3 appeared twice ("NA3" and "ReleaseStatusDescr"), so the later entry silently
# won and the description/GID names landed one column too early. The layout now mirrors
# release_packaging (3: unused, 4: description, 5: GID).
# NOTE(review): assumes release_status has the same six-column layout as release_packaging — confirm against the dump.
colnames["release_status"] = {0: "ReleaseStatusID", 1: "ReleaseStatusName", 3: "NA3", 4: "ReleaseStatusDescr", 5: "ReleaseStatusGID"}
# Fixed: the GID column was named "ReleaseStatusGID" (copy/paste from release_status).
colnames["release_packaging"] = {0: "ReleasePackagingID", 1: "ReleasePackagingName", 3: "NA3", 4: "ReleasePackagingDescr", 5: "ReleasePackagingGID"}
colnames["release_label"] = {0: "Index", 1: "ReleaseID", 2: "LabelID", 3: "CatalogNumber"}
colnames["release_country"] = {0: "ReleaseID", 1: "ReleaseCountryID", 2: "ReleaseCountryYear", 3: "ReleaseCountryMonth", 4: "ReleaseCountryDay"}
colnames["release_unknown_country"] = {0: "ReleaseID", 1: "ReleaseCountryYear", 2: "ReleaseCountryMonth", 3: "ReleaseCountryDay"}
colnames["release"] = {0: "ReleaseID", 1: "ReleaseGID", 2: "ReleaseName", 3: "ArtistID", 4: "ReleaseGroupID", 5: "ReleaseStatusID",
                       6: "ReleasePackagingID", 7: "LanguageID", 8: "ScriptID", 9: "ReleaseBarcode", 10: "ReleaseComment", 11: "NA11", 12: "NA12"}

ts = Timestat("Loading Release Data")
files = [ifile for ifile in dumpDir.glob("release*") if "group" not in str(ifile)]
releaseData = getData(files, colnames)
releaseData = setIndex(releaseData)
ts.stop()

## Append Data And Create Master Release DataFrame

In [None]:
# Enrich releaseData['release'] with readable names (packaging, status, language,
# script), release date/country and label, then save it.
# NOTE: ID columns are dropped in place, so this cell cannot be re-run without
# re-running the release loading cell first.
tsRelease = Timestat("Appending Release Data")

# Source column -> name expected by to_datetime when assembling dates from a frame.
releaseDateCols = {"ReleaseCountryYear": "year", "ReleaseCountryMonth": "month", "ReleaseCountryDay": "day"}

ts = Timestat("Creating Release Country DateTime For {0} Releases".format(releaseData['release_country'].shape[0]))
tmp = releaseData['release_country'][list(releaseDateCols)].rename(columns=releaseDateCols)
# errors='coerce' yields NaT for incomplete/invalid dates. The former errors='ignore'
# is deprecated and handed back the unparsed input on failure.
releaseData['release_country']['ReleaseDate'] = to_datetime(tmp, errors='coerce')
releaseData['release_country'].drop(list(releaseDateCols), axis=1, inplace=True)
ts.stop()

ts = Timestat("Creating Release Unknown Country DateTime For {0} Releases".format(releaseData['release_unknown_country'].shape[0]))
tmp = releaseData['release_unknown_country'][list(releaseDateCols)].rename(columns=releaseDateCols)
releaseData['release_unknown_country']['ReleaseDate'] = to_datetime(tmp, errors='coerce')
releaseData['release_unknown_country'].drop(list(releaseDateCols), axis=1, inplace=True)
ts.stop()

ts = Timestat("Joining Country Area")
try:
    areaData  = io.get(saveDir.join("AreaData.p"))
    dAreaName = areaData['area']['AreaName']
except Exception as err:
    # Chain the original error so the real cause stays visible.
    raise ValueError("Error loading area data") from err
releaseData['release_country']["Country"] = releaseData['release_country']['ReleaseCountryID'].apply(lambda x: dAreaName.get(x) if not isna(x) else None)
releaseData['release_country'].drop(["ReleaseCountryID"], axis=1, inplace=True)
ts.stop()

ts = Timestat("Joining Release Packaging Name")
dReleasePackagingName = releaseData['release_packaging']['ReleasePackagingName'].to_dict()
releaseData['release']["Packaging"] = releaseData['release']['ReleasePackagingID'].apply(lambda x: dReleasePackagingName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
releaseData['release'].drop(["ReleasePackagingID"], axis=1, inplace=True)
ts.stop()

ts = Timestat("Joining Release Status Name")
dReleaseStatusName = releaseData['release_status']['ReleaseStatusName'].to_dict()
releaseData['release']["Status"] = releaseData['release']['ReleaseStatusID'].apply(lambda x: dReleaseStatusName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
releaseData['release'].drop(["ReleaseStatusID"], axis=1, inplace=True)
ts.stop()

ts = Timestat("Joining Release Language Name")
try:
    # Fixed: the pickle was loaded into `areaData` while `languageData` (possibly
    # undefined or stale) was read on the next line.
    languageData  = io.get(saveDir.join("LanguageData.p"))
    dLanguageName = languageData['language']['LanguageName']
except Exception as err:
    raise ValueError("Error loading language data") from err
releaseData['release']["Language"] = releaseData['release']['LanguageID'].apply(lambda x: dLanguageName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
releaseData['release'].drop(["LanguageID"], axis=1, inplace=True)
ts.stop()

ts = Timestat("Joining Release Script Name")
try:
    # Fixed: same wrong-variable problem as for the language data.
    scriptData  = io.get(saveDir.join("ScriptData.p"))
    dScriptName = scriptData['script']['ScriptName']
except Exception as err:
    raise ValueError("Error loading script data") from err
releaseData['release']["Script"] = releaseData['release']['ScriptID'].apply(lambda x: dScriptName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
releaseData['release'].drop(["ScriptID"], axis=1, inplace=True)
ts.stop()

ts = Timestat("Joining Country/Release Date")
tmp = concat([releaseData['release_country'], releaseData['release_unknown_country']]).reset_index()
# Keep one row per release: the earliest dated one (NaT sorts last).
releaseIDDate = tmp.sort_values(by="ReleaseDate").drop_duplicates(subset="ReleaseID").set_index("ReleaseID")
releaseData['release'] = releaseData['release'].join(releaseIDDate)
ts.stop()

ts = Timestat("Joining Label Data")
try:
    # Fixed: same wrong-variable problem as for the language data.
    labelData  = io.get(saveDir.join("LabelData.p"))
    dLabelName = labelData['label']['LabelName']
except Exception as err:
    raise ValueError("Error loading label data") from err
releaseData['release_label']["Label"] = releaseData['release_label']['LabelID'].apply(lambda x: dLabelName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
# Fixed: the frame is now really indexed by ReleaseID (assigning to
# `releaseLabel["Label"].index` did not re-index the frame) and the join result
# is stored (DataFrame.join returns a new frame; it was being discarded).
releaseLabel = releaseData['release_label'].drop_duplicates(subset="ReleaseID").set_index("ReleaseID")[["Label"]]
releaseData['release'] = releaseData['release'].join(releaseLabel)
ts.stop()

ts = Timestat("Dropping Last Columns")
releaseData['release'].drop(["ReleaseComment", "ReleaseBarcode", "NA11", "NA12"], axis=1, inplace=True)
ts.stop()


savename = saveDir.join("ReleaseDataFrame.p")
ts = Timestat("Saving Master Release DataFrame To {0}".format(savename.str))
io.save(idata=releaseData['release'], ifile=savename)
ts.stop()


tsRelease.stop()

# Release-Group

In [None]:
# Column position -> name for the release_group tables.
colnames = {}
#colnames["release_group_alias_type"] = {0: "ReleaseGroupAliasTypeID", 1: "ReleaseGroupAliasTypeName"}
#colnames["release_group_alias"] = {0: "ReleaseGroupAliasID", 1: "NA1", 2: "ReleaseGroupAliasName", 3: "ReleaseGroupAliasLang", 7: "ReleaseGroupAliasName2"}
colnames["release_group_primary_type"] = {0: "ReleaseGroupPrimaryTypeID", 1: "ReleaseGroupPrimaryTypeName", 3: "NA3"}
colnames["release_group_secondary_type"] = {0: "ReleaseGroupSecondaryTypeID", 1: "ReleaseGroupSecondaryTypeName"}
colnames["release_group"] = {0: "ReleaseGroupID", 1: "ReleaseGroupGID", 2: "ReleaseGroupName", 3: "ArtistID", 4: "ReleaseGroupPrimaryTypeID", 5: "ReleaseGroupComment", 6: "NA6"}
colnames["release_group_secondary_type_join"] = {0: "ReleaseGroupID", 1: "ReleaseGroupSecondaryTypeID"}

# Timer label corrected: it was identical to the one used by the release loading cell.
ts = Timestat("Loading Release Group Data")
# Only the release* files whose path contains "group".
files = [ifile for ifile in dumpDir.glob("release*") if "group" in str(ifile)]
releaseGroupData = getData(files, colnames)
releaseGroupData = setIndex(releaseGroupData)
ts.stop()

## Append Data And Create Master ReleaseGroup DataFrame

In [None]:
tsReleaseGroup = Timestat("Appending ReleaseGroup Data")

# Replace secondary-type IDs on the join table with their names.
ts = Timestat("Joining Secondary Type Names")
dReleaseGroupSecondaryTypeName = releaseGroupData['release_group_secondary_type']['ReleaseGroupSecondaryTypeName'].to_dict()
releaseGroupData['release_group_secondary_type_join']['ReleaseGroupSecondaryType'] = releaseGroupData['release_group_secondary_type_join']['ReleaseGroupSecondaryTypeID'].apply(lambda x: None if isna(x) else dReleaseGroupSecondaryTypeName.get(x))
releaseGroupData['release_group_secondary_type_join'].drop(columns=["ReleaseGroupSecondaryTypeID"], inplace=True)
ts.stop()

# Replace primary-type IDs on the release-group table with their names;
# missing or non-digit IDs become None.
ts = Timestat("Joining Primary Type Names")
dReleaseGroupPrimaryTypeName = releaseGroupData['release_group_primary_type']['ReleaseGroupPrimaryTypeName'].to_dict()
releaseGroupData['release_group']['ReleaseGroupPrimaryType'] = releaseGroupData['release_group']['ReleaseGroupPrimaryTypeID'].apply(lambda x: None if (isna(x) or not x.isdigit()) else dReleaseGroupPrimaryTypeName.get(int(x)))
releaseGroupData['release_group'].drop(columns=["ReleaseGroupPrimaryTypeID"], inplace=True)
ts.stop()

# Index-aligned join of the secondary-type names onto the release groups.
ts = Timestat("Joining Release Group And Secondary Type Join Data")
releaseGroupData['release_group'] = releaseGroupData['release_group'].join(releaseGroupData['release_group_secondary_type_join'])
ts.stop()

ts = Timestat("Dropping Last Columns")
releaseGroupData['release_group'].drop(columns=["NA6", "ReleaseGroupComment"], inplace=True)
ts.stop()


# Persist the master release-group frame.
savename = saveDir.join("ReleaseGroupDataFrame.p")
ts = Timestat(f"Saving Master ReleaseGroup DataFrame To {savename.str} (~20 sec)")
io.save(idata=releaseGroupData['release_group'], ifile=savename)
ts.stop()


tsReleaseGroup.stop()

# Drop the reference to the raw tables now that the master frame is saved.
del releaseGroupData

# Artist

In [None]:
# Column index -> column name for each artist dump table that is loaded.
colnames = {
    "artist_credit": {0: "ArtistCreditID", 1: "ArtistCreditName", 2: "ArtistCreditNum", 3: "NA3"},
    "artist_type": {0: "ArtistTypeID", 1: "ArtistTypeName", 2: "NA2", 3: "NA3", 4: "ArtistTypeDescr", 5: "ArtistTypeGID"},
    "artist_isni": {0: "ArtistID", 1: "ISNICode"},
    "artist_alias_type": {0: "ArtistAliasTypeID", 1: "ArtistAliasTypeName", 5: "ArtistAliasTypeGID"},
    "artist_alias": {0: "ArtistAliasID", 1: "ArtistID", 2: "ArtistAliasName", 3: "ArtistAliasLang", 7: "ArtistAliasSortName"},
    "artist": {
        0: "ArtistID", 1: "ArtistGID", 2: "ArtistName", 3: "ArtistSortName",
        4: "FormedYear", 5: "FormedMonth", 6: "FormedDay",
        7: "DisbandedYear", 8: "DisbandedMonth", 9: "DisbandedDay",
        10: "ArtistTypeID", 11: "CountryAreaID", 12: "GenderID", 13: "ArtistDescr", 14: "NA14",
        17: "FoundedInAreaID", 18: "DisbandedInAreaID",
    },
}

ts = Timestat("Loading Artist Data (~20 sec)")
# Every "artist*" dump file except the one whose path is exactly mbdump/artist_credit_name.
files = [ifile for ifile in dumpDir.glob("artist*") if str(ifile) != "mbdump/artist_credit_name"]
# setIndex promotes the first column of every table to its index.
artistData = setIndex(getData(files, colnames))
ts.stop()

## Append Data And Create Master Artist DataFrame

In [None]:
tsArtist = Timestat("Appending Artist Data")

# Collapse the year/month/day columns into one Formed and one Disbanded column.
ts = Timestat("Creating Formed/Disbanded DateTime For {0} Artists (~7 sec)".format(artistData['artist'].shape[0]))
artistData['artist']['Formed']    = convertToDatetime(artistData['artist']["FormedYear"], artistData['artist']["FormedMonth"], artistData['artist']["FormedDay"])
artistData['artist']['Disbanded'] = convertToDatetime(artistData['artist']["DisbandedYear"], artistData['artist']["DisbandedMonth"], artistData['artist']["DisbandedDay"])
artistData['artist'].drop(["FormedYear", "FormedMonth", "FormedDay", "DisbandedYear", "DisbandedMonth", "DisbandedDay"], axis=1, inplace=True)
ts.stop()


# Replace ArtistTypeID with the type name; missing or non-digit IDs become None.
ts = Timestat("Joining Artist Type")
dArtistTypeName = artistData['artist_type']["ArtistTypeName"]
artistData['artist']["ArtistType"] = artistData['artist']['ArtistTypeID'].apply(lambda x: dArtistTypeName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
artistData['artist'].drop(["ArtistTypeID"], axis=1, inplace=True)
ts.stop()


# Replace GenderID with the gender name from the saved gender table.
ts = Timestat("Joining Gender Type")
try:
    genderData  = io.get(saveDir.join("GenderData.p"))
    dGenderName = genderData['gender']['GenderName']
except Exception as err:
    # Chain the original exception so the real cause stays visible.
    raise ValueError("Error loading gender data") from err
artistData['artist']['Gender'] = artistData['artist']['GenderID'].apply(lambda x: dGenderName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
artistData['artist'].drop(["GenderID"], axis=1, inplace=True)
ts.stop()


# Replace the three area-ID columns with area names from the saved area table.
ts = Timestat("Joining Area Type")
try:
    areaData  = io.get(saveDir.join("AreaData.p"))
    dAreaName = areaData['area']['AreaName']
except Exception as err:
    # Chain the original exception so the real cause stays visible.
    raise ValueError("Error loading area data") from err


artistData['artist']["Country"]     = artistData['artist']['CountryAreaID'].apply(lambda x: dAreaName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
artistData['artist']["FormedIn"]    = artistData['artist']['FoundedInAreaID'].apply(lambda x: dAreaName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
artistData['artist']["DisbandedIn"] = artistData['artist']['DisbandedInAreaID'].apply(lambda x: dAreaName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
artistData['artist'].drop(["CountryAreaID", "FoundedInAreaID", "DisbandedInAreaID"], axis=1, inplace=True)
ts.stop()


# Index-aligned join of the ISNI table onto the artist table.
ts = Timestat("Joining ISNI")
artistData['artist'] = artistData['artist'].join(artistData['artist_isni'])
ts.stop()


# Collect every alias name per ArtistID into a list and join it as "Aliases".
ts = Timestat("Collecting and Joining Artist Aliases (~40 sec)")
artistAliases = DataFrame(Series({artistID: df["ArtistAliasName"].to_list() for artistID,df in artistData['artist_alias'].groupby("ArtistID")}))
artistAliases.columns = ["Aliases"]
artistData['artist'] = artistData['artist'].join(artistAliases)
ts.stop()


ts = Timestat("Dropping Last Columns")
artistData['artist'].drop(["ArtistDescr", "NA14"], axis=1, inplace=True)
ts.stop()


# Persist the master artist frame.
savename = saveDir.join("ArtistDataFrame.p")
ts = Timestat(f"Saving Master Artist DataFrame To {savename.str} (~20 sec)")
io.save(idata=artistData['artist'], ifile=savename)
ts.stop()

tsArtist.stop()

## Artist <=> Work

In [None]:
# Column index -> column name for the artist/work link table.
colnames = {
    "l_artist_work": {0: "LookupID", 1: "WorkGroupID", 2: "ArtistID", 3: "WorkID", 6: "NA6", 7: "NA7"},
}

ts = Timestat("Loading Artist <=> Work Data")
files = dumpDir.glob("l_artist_work")
# setIndex promotes the first column (LookupID) to the index.
lookupData = setIndex(getData(files, colnames))
ts.stop()

In [None]:
ts = Timestat("Merging Artist <=> Work Lookup")

# Saved work table, with its index turned back into a regular column.
try:
    workData = io.get(saveDir.join("WorkDataFrame.p"))
    wData = workData['work'].reset_index()
except Exception as err:
    raise ValueError("Error loading work data") from err

# Artist/work link table loaded into `lookupData` by the preceding cell.
try:
    lData = lookupData['l_artist_work'].reset_index()
except Exception as err:
    # This failure concerns the lookup table, not the work table, so say so.
    raise ValueError("Error loading artist/work lookup data") from err

# Default (inner) merge on the shared WorkID column.
mergedWorkData = merge(wData,lData,on='WorkID')
ts.stop()

ts = Timestat("Dropping Last Columns")
mergedWorkData.drop(["WorkID", "LookupID", "NA6", "NA7"], axis=1, inplace=True)
ts.stop()

# One list of (WorkGroupID, WorkTypeName, WorkName) tuples per ArtistID.
ts = Timestat("Grouping By ArtistID (~31 sec)")
artistWorks = Series({artistID: list(zip(artistIDWorks["WorkGroupID"], artistIDWorks["WorkTypeName"], artistIDWorks["WorkName"])) for artistID,artistIDWorks in mergedWorkData.groupby("ArtistID")})
ts.stop()

savename = saveDir.join("ArtistWorkDataFrame.p")
ts = Timestat("Saving Master Artist Work DataFrame To {0} (~6 sec)".format(savename.str))
io.save(idata=artistWorks, ifile=savename)
ts.stop()

## Artist <=> Recording

In [None]:
# Column index -> column name for the artist/recording link table.
colnames = {
    "l_artist_recording": {0: "LookupID", 1: "RecordingGroupID", 2: "ArtistID", 3: "RecordingID", 6: "NA6", 7: "NA7"},
}

ts = Timestat("Loading Artist <=> Recording Data")
files = dumpDir.glob("l_artist_recording")
# setIndex promotes the first column (LookupID) to the index.
lookupData = setIndex(getData(files, colnames))
ts.stop()

In [None]:
recordingData = io.get(saveDir.join("RecordingDataFrame.p"))

In [None]:
ts = Timestat("Merging Artist <=> Recording Lookup (~16 sec)")

# Saved recording table; its own ArtistID column is dropped so that the
# ArtistID in the merged frame comes from the link table only.
try:
    recordingData = io.get(saveDir.join("RecordingDataFrame.p"))
    rData = recordingData['recording'].reset_index().drop(["ArtistID"], axis=1)
except Exception as err:
    # Chain the original exception so the real cause stays visible.
    raise ValueError("Error loading recording data") from err

lData = lookupData['l_artist_recording'].reset_index()
# Default (inner) merge on the shared RecordingID column.
mergedRecordingData = merge(rData,lData,on='RecordingID')
ts.stop()

ts = Timestat("Dropping Last Columns")
mergedRecordingData.drop(["RecordingID", "RecordingGroupID", "LookupID", "NA6", "NA7"], axis=1, inplace=True)
ts.stop()

In [None]:
# One list of (RecordingName, TimeLength) tuples per ArtistID.
ts = Timestat("Grouping By ArtistID (~31 sec)")
artistRecordings = Series({artistID: list(zip(artistIDRecordings["RecordingName"], artistIDRecordings["TimeLength"])) for artistID,artistIDRecordings in mergedRecordingData.groupby("ArtistID")})
ts.stop()

savename = saveDir.join("ArtistRecordingDataFrame.p")
# Use savename.str in the label, as the other save cells in this notebook do.
ts = Timestat("Saving Master Artist Recording DataFrame To {0} (~54 sec)".format(savename.str))
io.save(idata=artistRecordings, ifile=savename)
ts.stop()

## Artist <=> Release

In [None]:
#### Seems not to be used ####
#### Seems not to be used ####
#### Seems not to be used ####


colnames = {}
colnames["l_artist_release"] = {0: "LookupID", 1: "ReleaseGroupID", 2: "ArtistID", 3: "ReleaseID", 6: "NA6", 7: "NA7"}

ts = Timestat("Loading Artist <=> Release Data")
files = dumpDir.glob("l_artist_release")
lookupData = getData(files, colnames)
lookupData = setIndex(lookupData)
ts.stop()

In [None]:
savename = saveDir.join("ReleaseDataFrame.p")
releaseData = FileIO().get(savename)

## Artist <=> URL

In [None]:
# Column index -> column name for the artist/URL link table.
colnames = {
    "l_artist_url": {0: "LookupID", 1: "URLGroupID", 2: "ArtistID", 3: "URLID", 6: "NA6", 7: "NA7"},
}

ts = Timestat("Loading Artist <=> URL Data")
files = dumpDir.glob("l_artist_url")
# setIndex promotes the first column (LookupID) to the index.
lookupData = setIndex(getData(files, colnames))
ts.stop()

In [None]:
ts = Timestat("Joining URLs")

# Load the saved URL table and resolve the name lookup once, rather than
# re-indexing urlData['url']["URLName"] for every row of the link table.
try:
    urlData = io.get(saveDir.join("URLData.p"))
    dURLName = urlData['url']["URLName"]
except Exception as err:
    # Chain the original exception so the real cause stays visible.
    raise ValueError("Error loading URL data") from err
# URLIDs with no entry in the lookup yield None.
lookupData['l_artist_url']["URL"] = lookupData['l_artist_url']['URLID'].apply(lambda x: dURLName.get(x))
ts.stop()

urlType={'26038': 'Discogs',
 '26039': 'Myspace',
 '26040': 'IMDB',
 '26041': 'Wikipedia',
 '26042': 'Artist',
 '26048': 'LastFMMisc',
 '26052': 'Apple',
 '26055': 'YouTube',
 '26056': 'Facebook',
 '26062': 'GeniusMisc',
 '26068': 'VGMDB',
 '26316': 'DeezerSpotify',
 '28613': 'AllMusic',
 '30134': 'Soundcloud',
 '41329': 'Video',
 '49052': 'RateYourMusicMisc',
 '94979': 'SecondhandSongs',
 '106477': 'VIAG',
 '117675': 'Wikidata',
 '139284': 'Bandcamp',
 '195003': 'IMSLP',
 '199852': 'Songkick',
 '204138': 'Setlist.fm',
 '215573': 'Last.fm',
 '240791': 'BandsInTown',
 '624633': 'AppleTidalNapster',
 '697028': 'PureVolume',
 '732275': 'CDBaby',
 '748510': 'GooglePlus',
 '753046': 'GooglePlay',
 '771457': 'BBC'}

# Ordered (substring, group name) pairs. Order matters: the first substring
# found in the URL decides the group, exactly as in the former if/elif chain.
_URL_GROUP_PATTERNS = (
    ("discogs.", "Discogs"),
    ("myspace.", "Myspace"),
    ("imdb.", "IMDB"),
    ("youtube.", "YouTube"),
    ("allmusic.", "AllMusic"),
    ("last.fm", "LastFM"),
    ("soundcloud.", "Soundcloud"),
    ("directlyrics.", "DirectLyrics"),
    ("facebook.", "Facebook"),
    ("tumblr.", "Tumblr"),
    ("viaf.", "VIAF"),
    ("wikidata.", "Wikidata"),
    ("rateyourmusic.", "RateYourMusic"),
    ("muzikum.", "Muzikum"),
    ("spotify.", "Spotify"),
    ("archive.", "Archive"),
    ("play.google.", "GooglePlay"),
    ("genius.", "Genius"),
    ("musicmoz.", "Musicmoz"),
    # NOTE(review): "IMVBD" looks like a typo for "IMVDB"; left unchanged
    # because the label is written into saved data -- confirm before renaming.
    ("imvdb.", "IMVBD"),
    ("musik-sammler.", "MusikSammler"),
    ("whosampled.", "WhoSampled"),
    ("setlist.", "SetListFM"),
    ("secondhandsongs.", "SecondhandSongs"),
    ("apple.", "Apple"),
    ("deezer.", "Deezer"),
    ("twitter.", "Twitter"),
    ("songkick.", "Songkick"),
    ("instagram.", "Instagram"),
    ("tidal.", "Tidal"),
    ("bbc.", "BBC"),
    ("musixmatch.", "MusixMatch"),
    ("napster.", "Napster"),
    ("junodownload.", "JunoDownload"),
    ("beatport.", "Beatport"),
    ("bandsintown.", "BandsInTown"),
    ("bandcamp.", "BandCamp"),
)


def getURLGroupName(url):
    """Classify a URL by the first known site substring it contains.

    Parameters
    ----------
    url : str or any
        The URL to classify. Non-string values (for example the None produced
        when a URL ID has no entry in the URL lookup, or NaN) are accepted.

    Returns
    -------
    str
        The group name of the first matching pattern, or "Misc" when nothing
        matches or `url` is not a string.
    """
    # A failed URL lookup leaves None in the column; the substring test would
    # raise TypeError on it, so such values are classified as "Misc".
    if not isinstance(url, str):
        return "Misc"
    for pattern, groupName in _URL_GROUP_PATTERNS:
        if pattern in url:
            return groupName
    return "Misc"

def getURLType(urlGroupID):
    """Return the `urlType` name for `urlGroupID` (looked up by its str form), or "Misc"."""
    return urlType.get(str(urlGroupID), "Misc")

# Tag every artist/URL row with the group name derived from the URL text.
ts = Timestat("Getting URL Group Name")
lookupData['l_artist_url']['URLGroupName'] = lookupData['l_artist_url']['URL'].apply(getURLGroupName)
ts.stop()

ts = Timestat("Dropping Last Columns")
lookupData['l_artist_url'].drop(columns=["NA6", "NA7"], inplace=True)
ts.stop()

# One list of (URLGroupName, URL) tuples per ArtistID.
ts = Timestat("Grouping By ArtistID (~2 min)")
artistURLs = Series({
    artistID: list(zip(artistIDURLs["URLGroupName"], artistIDURLs["URL"]))
    for artistID,artistIDURLs in lookupData['l_artist_url'].groupby("ArtistID")
})
ts.stop()

savename = saveDir.join("ArtistURLDataFrame.p")
ts = Timestat(f"Saving Master Artist URL DataFrame To {savename.str} (~11 sec)")
io.save(idata=artistURLs, ifile=savename)
ts.stop()

# Master Artist Summary Data

# Raw MDBIO Data

In [None]:
from utils import FileInfo, DirInfo, Timestat, FileIO
from musicdb.musicbrainz import MusicDBIO
mdbio = MusicDBIO(verbose=True, mod=False, mkDirs=False)

## Artist Data

In [None]:
ts = Timestat("Loading Master Data (~30 sec)")
# Master artist frame: must exist on disk and load as a DataFrame.
artistDataFrameFile    = saveDir.join("ArtistDataFrame.p")
if not artistDataFrameFile.exists():
    raise ValueError(f"{artistDataFrameFile.str} doesn't exist")
masterArtistData       = io.get(artistDataFrameFile)
if not isinstance(masterArtistData, DataFrame):
    raise ValueError("Master Artist Data is not a DataFrame")

# Per-artist URL lists: must exist on disk and load as a Series.
artistURLDataFrameFile = saveDir.join("ArtistURLDataFrame.p")
if not artistURLDataFrameFile.exists():
    raise ValueError(f"{artistURLDataFrameFile.str} doesn't exist")
masterArtistURLData    = io.get(artistURLDataFrameFile)
if not isinstance(masterArtistURLData, Series):
    # The message now describes the check that actually failed.
    raise ValueError("Master Artist URL Data is not a Series")
# The Series name becomes the column name in the join below.
masterArtistURLData.name = "URLs"

masterArtistData       = masterArtistData.join(masterArtistURLData)
masterArtistData.index.name = "mbidx"
ts.stop()



In [None]:
ts = Timestat(f"Saving {masterArtistData.shape[0]} Artists To Raw MB Data")
mdbio.rdio.getFilename("SearchArtist")
io.save(idata=masterArtistData, ifile="/Volumes/Piggy/Discog/artists-musicbrainz/search/ArtistDataFrame.post.p")
ts.stop()

In [None]:
tmp = io.get(ifile="/Volumes/Piggy/Discog/artists-musicbrainz/search/ArtistGTRDataFrame.p")

In [None]:
io.save(idata=tmp, ifile="/Volumes/Piggy/Discog/artists-musicbrainz/search/ArtistGTRDataFrame.post.p")

In [None]:
from master import MasterParams, MusicDBPermDir
from sys import prefix
from pandas import Series, DataFrame, concat, Timestamp
from base import MusicDBDir, MusicDBData
from lib.musicbrainz import MusicDBIO
# Build an ArtistGID -> ArtistName Series and save it as the "<db>AllArtists" data.
# NOTE(review): `artistData` is indexed here like a single frame with
# "ArtistName"/"ArtistGID" columns, but the loader cell earlier in this notebook
# binds `artistData` to a dict of tables; this cell presumably expects the master
# artist frame -- confirm which object is intended before running.
artistIDs = artistData["ArtistName"].copy(deep=True)
artistIDs.index = artistData["ArtistGID"]

mdbio = MusicDBIO(verbose=True, mod=True, mkDirs=False)
mdbpd = MusicDBPermDir()
db    = mdbio.db
# Resolve the permanent directory for this DB and wrap it for MusicDBData.
permDBDir = mdbpd.getDBPermPath(db)
permDir = MusicDBDir(permDBDir)
allArtists = MusicDBData(path=permDir, fname="{0}AllArtists".format(db.lower()))
allArtists.save(data=artistIDs)

## Release Data

In [None]:
masterReleaseDataFile    = saveDir.join("ReleaseDataFrame.p")
if not masterReleaseDataFile.exists():
    raise ValueError(f"{masterReleaseDataFile.str} doesn't exist")

In [None]:
masterReleaseData = io.get(masterReleaseDataFile)

In [None]:
s = to_datetime(masterReleaseDataYearLang["Release"], errors='coerce', format="%Y%m%d")

In [None]:
from pandas import NaT, isna
# Test the element itself for missingness. The previous condition (`not NaT`)
# never looked at `x`, so it had the same value for every element.
s.map(lambda x: x.year if not isna(x) else None)

In [None]:
s.map(lambda x: x.year)

In [None]:
from pandas import to_datetime

ts = Timestat("Getting ReleaseGroup Year/Language From Releases")
masterReleaseDataYearLang = DataFrame({releaseGroupID: (df["ReleaseDate"].min(), df["Language"].unique()) for releaseGroupID,df in masterReleaseData[["ReleaseGroupID", "Language", "ReleaseDate"]].groupby("ReleaseGroupID")}).T
masterReleaseDataYearLang.columns = ["Release", "Language"]
masterReleaseDataYearLang["Year"] = to_datetime(masterReleaseDataYearLang["Release"], errors='coerce', format="%Y%m%d").map(lambda x: x.year)
def getLang(lang):
    """Return the only non-None entry of `lang`, or None if there are zero or several."""
    nonNull = [value for value in lang if value is not None]
    if len(nonNull) != 1:
        return None
    return nonNull[0]
masterReleaseDataYearLang["Language"] = masterReleaseDataYearLang["Language"].apply(getLang)
masterReleaseDataYearLang.index.name = "ReleaseGroupID"
ts.stop()

## ReleaseGroup Data

In [None]:
def createReleaseGroupKey(x):
    """Build a release-group key from a row's primary and secondary type.

    `x` is indexed by 'ReleaseGroupPrimaryType' and 'ReleaseGroupSecondaryType'.
    The string-valued parts are joined with " + " (primary first); if neither
    value is a string the key is "Unknown".
    """
    parts = [
        value
        for value in (x['ReleaseGroupPrimaryType'], x['ReleaseGroupSecondaryType'])
        if isinstance(value, str)
    ]
    if not parts:
        return "Unknown"
    return " + ".join(parts)

In [None]:
mdbio = MusicDBIO(verbose=True, mod=False, mkDirs=False)
tmp = io.get("/Volumes/Piggy/Discog/artists-musicbrainz/search/ArtistDataFrame.post.p")
ArtistIDtoGIDLookup = tmp[~tmp["ArtistGID"].duplicated()]["ArtistGID"]
io.save(idata=ArtistIDtoGIDLookup, ifile=saveDir.join("ArtistIDtoGIDLookup.p"))

In [None]:
ts = Timestat("Loading Master Data (~30 sec)")
# Master release-group frame: must exist on disk and load as a DataFrame.
masterReleaseGroupDataFile    = saveDir.join("ReleaseGroupDataFrame.p")
if not masterReleaseGroupDataFile.exists():
    # Report the actual path, not the variable name.
    raise ValueError(f"{masterReleaseGroupDataFile.str} doesn't exist")
masterReleaseGroupData    = io.get(masterReleaseGroupDataFile)
if not isinstance(masterReleaseGroupData, DataFrame):
    raise ValueError("masterReleaseGroupData is not a DataFrame")
# Row-wise key built from the primary/secondary type columns.
masterReleaseGroupData["ReleaseGroupKey"] = masterReleaseGroupData.apply(createReleaseGroupKey, axis=1)

# ArtistID -> ArtistGID lookup: must exist on disk and load as a Series.
ArtistIDtoGIDLookupFile       = saveDir.join("ArtistIDtoGIDLookup.p")
if not ArtistIDtoGIDLookupFile.exists():
    raise ValueError(f"{ArtistIDtoGIDLookupFile.str} doesn't exist")
ArtistIDtoGIDLookup       = io.get(ArtistIDtoGIDLookupFile)
if not isinstance(ArtistIDtoGIDLookup, Series):
    # The check is for a Series, so the message says Series.
    raise ValueError("ArtistIDtoGIDLookup is not a Series")
ts.stop()

In [None]:
releaseGroupData = masterReleaseGroupData.join(masterReleaseDataYearLang)
releaseGroupData["ArtistGID"] = releaseGroupData["ArtistID"].map(ArtistIDtoGIDLookup)

In [None]:
ts = Timestat("Getting ReleaseGroup Data (~8 mins)")
# For every ArtistGID, map str(row index) -> [name, key, GID, language, year].
artistReleaseGroupData = {}
# The per-group frame gets its own name so the loop does not overwrite the
# notebook-level `artistData` variable.
for i,(artistGID,artistGIDData) in enumerate(releaseGroupData.groupby("ArtistGID")):
    artistReleaseGroupData[artistGID] = {str(idx): row.to_list() for idx,row in artistGIDData[["ReleaseGroupName", "ReleaseGroupKey", "ReleaseGroupGID", "Language", "Year"]].iterrows()}
    # Progress report every 25000 groups (every multiple of 250000 is also a
    # multiple of 25000, so a single test is enough).
    if (i+1) % 25000 == 0:
        ts.update(n=i+1, cmt=f"Processed {i+1} GIDs")
ts.stop()



In [None]:
mdbio.rdio.getFilename("SearchReleaseGroup")

In [None]:
ts = Timestat(f"Saving {len(artistReleaseGroupData)} Artist ReleaseGroups To Raw MB Data")
io.save(idata=Series(artistReleaseGroupData), ifile="/Volumes/Piggy/Discog/artists-musicbrainz/search/ArtistReleaseGroupDataFrame.post.p")
# mdbio.data.saveSearchReleaseGroupData(data=Series(artistReleaseGroupData))
ts.stop()

## Work Data

In [None]:

# ArtistID -> ArtistGID lookup saved by an earlier cell.
ArtistIDtoGIDLookupFile       = saveDir.join("ArtistIDtoGIDLookup.p")
if not ArtistIDtoGIDLookupFile.exists():
    # Report the actual path, not the variable name.
    raise ValueError(f"{ArtistIDtoGIDLookupFile.str} doesn't exist")
ArtistIDtoGIDLookup       = io.get(ArtistIDtoGIDLookupFile)

In [None]:
len(ArtistIDtoGIDLookup)

In [None]:
ts = Timestat("Loading Master Data (~30 sec)")
# Per-artist work lists: must exist on disk and load as a Series.
masterWorkDataFile    = saveDir.join("ArtistWorkDataFrame.p")
if not masterWorkDataFile.exists():
    raise ValueError(f"{masterWorkDataFile.str} doesn't exist")
masterArtistWorkData    = io.get(masterWorkDataFile)
if not isinstance(masterArtistWorkData, Series):
    raise ValueError("masterWorkData is not a Series")

# ArtistID -> ArtistGID lookup: must exist on disk and load as a Series.
ArtistIDtoGIDLookupFile       = saveDir.join("ArtistIDtoGIDLookup.p")
if not ArtistIDtoGIDLookupFile.exists():
    # Report the actual path, not the variable name.
    raise ValueError(f"{ArtistIDtoGIDLookupFile.str} doesn't exist")
ArtistIDtoGIDLookup       = io.get(ArtistIDtoGIDLookupFile)
if not isinstance(ArtistIDtoGIDLookup, Series):
    # The check is for a Series, so the message says Series.
    raise ValueError("ArtistIDtoGIDLookup is not a Series")
ts.stop()

In [None]:
mdbio.data.saveSearchWorkData

In [None]:
ts = Timestat("Grouping Work Data")
# Re-key the per-artist work lists from ArtistID to ArtistGID; IDs missing
# from the lookup map to None.
artistWorkData = {
    ArtistIDtoGIDLookup.get(artistID): artistIDData
    for artistID,artistIDData in masterArtistWorkData.items()
}
# Keep only entries whose key and value are both not None.
artistWorkData = Series({k: v for k,v in artistWorkData.items() if k is not None and v is not None})
ts.stop()

ts = Timestat(f"Saving {len(artistWorkData)} Artist Work To Raw MB Data")
io.save(idata=artistWorkData, ifile="/Volumes/Piggy/Discog/artists-musicbrainz/search/ArtistWorkDataFrame.post.p")

#mdbio.data.saveSearchWorkData(data=Series(artistWorkData))
ts.stop()

## Recording Data

In [None]:
ts = Timestat("Loading Master Data (~30 sec)")
# Per-artist recording lists: must exist on disk and load as a Series.
masterRecordingDataFile    = saveDir.join("ArtistRecordingDataFrame.p")
if not masterRecordingDataFile.exists():
    raise ValueError(f"{masterRecordingDataFile.str} doesn't exist")
masterArtistRecordingData    = io.get(masterRecordingDataFile)
if not isinstance(masterArtistRecordingData, Series):
    raise ValueError("masterRecordingData is not a Series")

# ArtistID -> ArtistGID lookup: must exist on disk and load as a Series.
ArtistIDtoGIDLookupFile       = saveDir.join("ArtistIDtoGIDLookup.p")
if not ArtistIDtoGIDLookupFile.exists():
    # Report the actual path, not the variable name.
    raise ValueError(f"{ArtistIDtoGIDLookupFile.str} doesn't exist")
ArtistIDtoGIDLookup       = io.get(ArtistIDtoGIDLookupFile)
if not isinstance(ArtistIDtoGIDLookup, Series):
    # The check is for a Series, so the message says Series.
    raise ValueError("ArtistIDtoGIDLookup is not a Series")
ts.stop()

In [None]:
ts = Timestat("Grouping Recording Data")
# Re-key the per-artist recording lists from ArtistID to ArtistGID; IDs
# missing from the lookup map to None.
artistRecordingData = {
    ArtistIDtoGIDLookup.get(artistID): artistIDData
    for artistID,artistIDData in masterArtistRecordingData.items()
}
# Keep only entries whose key and value are both not None.
artistRecordingData = Series({k: v for k,v in artistRecordingData.items() if k is not None and v is not None})
ts.stop()

ts = Timestat(f"Saving {len(artistRecordingData)} Artist Recording To Raw MB Data")
io.save(idata=artistRecordingData, ifile="/Volumes/Piggy/Discog/artists-musicbrainz/search/ArtistRecordingDataFrame.post.p")
# mdbio.data.saveRawRecordingMBData(data=Series(artistRecordingData))
ts.stop()

In [None]:
masterArtistRecordingData.name = "Recording"
masterArtistRecordingData.index.name = "ArtistID"
# reset_index() returns a new two-column frame (ArtistID, Recording); keep it,
# because the Series itself has no "ArtistID" column to read from.
artistRecordingFrame = masterArtistRecordingData.reset_index()
# The lookup is the `ArtistIDtoGIDLookup` Series loaded above (capital A);
# IDs missing from it yield None.
artistRecordingFrame["ArtistGID"] = artistRecordingFrame["ArtistID"].apply(ArtistIDtoGIDLookup.get)
saveData = {artistGID: df["Recording"] for artistGID,df in artistRecordingFrame.groupby("ArtistGID")}

In [None]:
releaseGroupData['ArtistID'].count()

In [None]:
#ts = Timestat("Getting Release Group Data")
for artistID,df in masterReleaseGroupData[["ArtistID", "ReleaseGroupName", "ReleaseGroupKey", "ReleaseGroupGID"]].groupby("ArtistID"):
    print(artistID)
    print(df)
    break
#ts.stop()

In [None]:
# Join the per-artist URL lists onto the master artist frame and save it
# through the `mio` data interface.
# NOTE(review): `mio` is not defined anywhere in the visible notebook (other
# cells use `mdbio`), and `ts` is not created in this cell -- this looks like a
# leftover from an older API; confirm before running.
# NOTE(review): if the master-artist load cell above has already joined the
# "URLs" column, this second join would presumably hit a column-name overlap --
# verify.
masterArtistURLData.name = "URLs"
masterArtistData = masterArtistData.join(masterArtistURLData)
masterArtistData.index.name = "mbidx"
print("Saving Artist DataFrame With {0} Entries To {1}".format(masterArtistData.shape[0], mio.data.getSearchArtistFilename().str))
mio.data.saveSearchArtistData(data=masterArtistData)
ts.stop()


In [None]:

    
ts = timestat("Setting ReleaseGroup Key (~1/2 min)")
masterReleaseGroupData["ReleaseGroupKey"] = masterReleaseGroupData.apply(createReleaseGroupKey, axis=1)
ts.stop()

In [None]:
masterReleaseGroupData    = io.get(savedir.join("{0}.p".format("ReleaseGroupDataFrame")).path)
masterArtistRecordingData = io.get(savedir.join("{0}.p".format("ArtistRecordingDataFrame")).path)
masterArtistWorkData      = io.get(savedir.join("{0}.p".format("ArtistWorkDataFrame")).path)


## Artist Data

In [None]:
from parseRawDataBase import parseRawDataBase
from timeUtils import timestat
from fsUtils import fsInfo
from pandas import Series

        
class parseRawMusicBrainzData(parseRawDataBase):
    """Parse the master MusicBrainz artist frame into per-ModVal artist data.

    NOTE(review): `self.verbose`, `self.db`, `self.utils` and `self.dbIO` are
    read below but not set in this class; presumably they are provided by
    parseRawDataBase -- confirm.
    """

    def __init__(self, verbose=True):
        super().__init__(db="MusicBrainz", verbose=verbose)
        self.rms = []

    def parseArtistData(self, masterArtistData):
        """Convert each row of `masterArtistData` to raw artist data and save it per ModVal.

        Parameters
        ----------
        masterArtistData : DataFrame
            Must have a "ModVal" column; rows are grouped by it.

        Rows whose parsed `ID.ID` is None are skipped. Each group's result is
        saved with `self.dbIO.saveArtistModValData(modVal, modValData)`.
        """
        if self.verbose:
            ts = timestat("Parsing Raw {0} Data(masterArtistData)".format(self.db))

        for modVal,artistModValData in masterArtistData.groupby("ModVal"):
            modValData = {}
            N = artistModValData.shape[0]
            if self.verbose:
                # Arguments are ordered so that the count fills {0} and the ModVal fills {1}.
                tsParse = timestat("Parsing {0} ModVal={1} Entries".format(N, modVal))
            # Result is currently unused; the call is kept because its side
            # effects, if any, are not visible from here.
            pModVal = self.utils.getPrintModValue(N)
            for artistMBID,artistData in artistModValData.iterrows():
                rData = self.dbIO.rawIO.getArtistData(artistData)
                artistID = rData.ID.ID
                if artistID is None:
                    continue
                modValData[artistID] = rData

            if self.verbose:
                print("Saving [{0}] Artist {1} Entries".format(len(modValData), "DB Data"))
            self.dbIO.saveArtistModValData(modVal, modValData)
            if self.verbose:
                # Stop the per-ModVal timer started above.
                tsParse.stop()

        if self.verbose:
            ts.stop()

In [None]:
%autoreload
from dbIOGate import dbIOGate
from dbMusicBrainzIO import dbMusicBrainzIO
gate = dbIOGate()
gate.get("MusicBrainz")

In [None]:
prd = parseRawMusicBrainzData()
prd.parseArtistData(masterArtistData)

In [None]:

    
    
    ##########################################################################################
    # DB ModVal Data Utils
    ##########################################################################################
    def getParseArtistModValDictData(self, modVal, force=False):
        """Return {} when `force` is True, else `self.dbIO.getArtistModValData(modVal)`."""
        if force is True:
            return {}
        return self.dbIO.getArtistModValData(modVal)
    
    def saveParseArtistModValDictData(self, modVal, modValData):
        """Pass `modValData` for `modVal` to `self.dbIO.saveArtistModValData`."""
        dbIO = self.dbIO
        dbIO.saveArtistModValData(modVal, modValData)
        
    def getParseReleaseModValDictData(self, modVal, force=False):
        """Return {} when `force` is True, else `self.dbIO.getReleaseModValData(modVal)`."""
        if force is True:
            return {}
        return self.dbIO.getReleaseModValData(modVal)
    
    def saveParseReleaseModValDictData(self, modVal, modValData):
        """Pass `modValData` for `modVal` to `self.dbIO.saveReleaseModValData`."""
        dbIO = self.dbIO
        dbIO.saveReleaseModValData(modVal, modValData)
        
    def getParseWorkModValDictData(self, modVal, force=False):
        """Return {} when `force` is True, else `self.dbIO.getWorkModValData(modVal)`."""
        if force is True:
            return {}
        return self.dbIO.getWorkModValData(modVal)
    
    def saveParseWorkModValDictData(self, modVal, modValData):
        """Pass `modValData` for `modVal` to `self.dbIO.saveWorkModValData`."""
        dbIO = self.dbIO
        dbIO.saveWorkModValData(modVal, modValData)
        
    def getParseRecordingModValDictData(self, modVal, force=False):
        """Return {} when `force` is True, else `self.dbIO.getRecordingModValData(modVal)`."""
        if force is True:
            return {}
        return self.dbIO.getRecordingModValData(modVal)
    
    def saveParseRecordingModValDictData(self, modVal, modValData):
        """Pass `modValData` for `modVal` to `self.dbIO.saveRecordingModValData`."""
        dbIO = self.dbIO
        dbIO.saveRecordingModValData(modVal, modValData)
        
        
        
    #####################################################################################################################
    # Parse Raw Data
    #####################################################################################################################
    def parseArtistData(self, modVal, expr='< 0 Days', force=False):
        """Delegate to `self.parseData` with the "Artist" data type."""
        dataType = "Artist"
        self.parseData(modVal, dataType, expr, force)
    def parseReleaseData(self, modVal, expr='< 0 Days', force=False):
        """Delegate to `self.parseData` with the "Release" data type."""
        dataType = "Release"
        self.parseData(modVal, dataType, expr, force)
    def parseWorkData(self, modVal, expr='< 0 Days', force=False):
        """Delegate to `self.parseData` with the "Work" data type."""
        dataType = "Work"
        self.parseData(modVal, dataType, expr, force)
    def parseRecordingData(self, modVal, expr='< 0 Days', force=False):
        """Delegate to `self.parseData` with the "Recording" data type."""
        dataType = "Recording"
        self.parseData(modVal, dataType, expr, force)
    def parse(self, modVal, expr='< 0 Days', force=False):
        """Run the Artist, Release, Work and Recording parsers, in that order."""
        steps = (
            self.parseArtistData,
            self.parseReleaseData,
            self.parseWorkData,
            self.parseRecordingData,
        )
        for step in steps:
            step(modVal, expr, force)
        
            
    def mergeMediaData(self, prevMediaData, newMediaData):
        """Merge `newMediaData` into `prevMediaData` in place, per media type.

        Releases are keyed by their `code` attribute; for a code present in
        both, the new release replaces the previous one. Media types that
        appear only in `prevMediaData` are left untouched.
        """
        for mediaType, incoming in newMediaData.items():
            byCode = {}
            for release in prevMediaData.get(mediaType, []):
                byCode[release.code] = release
            for release in incoming:
                byCode[release.code] = release
            prevMediaData[mediaType] = list(byCode.values())
            
    def updateMediaCounts(self, artistIDData):
        """Recompute `artistIDData.mediaCounts` from the length of each media-type list."""
        counts = {}
        for mediaType, mediaTypeData in artistIDData.media.media.items():
            counts[mediaType] = len(mediaTypeData)
        artistIDData.mediaCounts = self.dbIO.rawIO.makeRawMediaCountsData(counts)
    
                            
    def createModValData(self, modVal):
        """Combine the parsed Artist/Release/Work/Recording data for `modVal` and save it.

        The first entry seen for an artist ID is stored as-is; later entries
        for the same ID have their media merged into it and its media counts
        refreshed. Entries keyed by None are skipped. The result is saved with
        `self.utils.saveModValData(modVal, modValData)`.
        """
        if self.verbose:
            ts = timestat("Creating ModValData From Parsed Raw ModVal={0} Data".format(modVal))

        # Load order (artist, release, work, recording) is also the merge order.
        parsedSources = [
            self.getParseArtistModValDictData(modVal),
            self.getParseReleaseModValDictData(modVal),
            self.getParseWorkModValDictData(modVal),
            self.getParseRecordingModValDictData(modVal),
        ]

        modValData = {}
        for parseModValData in parsedSources:
            for artistID, artistIDData in parseModValData.items():
                if artistID is None:
                    continue
                existing = modValData.get(artistID)
                if existing is None:
                    modValData[artistID] = artistIDData
                    continue
                self.mergeMediaData(existing.media.media, artistIDData.media.media)
                self.updateMediaCounts(existing)

        if self.verbose:
            print("Saving [{0}] ModVal={1} {2} Entries".format(len(modValData), modVal, "DB Data"))
        self.utils.saveModValData(modVal, modValData)
        if self.verbose:
            ts.stop()

In [None]:
ts = timestat("Creating Artist Data")
# NOTE(review): `rawMusicBrainzDataIO` and `dbIO` are not defined in the visible
# notebook; this cell presumably relies on an older import cell -- confirm.
rawIO = rawMusicBrainzDataIO()
for modVal,artistModValData in masterArtistData.groupby("ModVal"):
    modValData = {}
    N = artistModValData.shape[0]
    tsMod = timestat("Creating DB Data From {0} Artists For ModVal={1}".format(N,modVal))
    # The row variable has its own name so the loop does not overwrite the
    # notebook-level `artistData` variable.
    for artistMBID,artistRow in artistModValData.iterrows():
        rData = rawIO.getArtistData(artistRow)
        artistID = rData.ID.ID
        # Rows whose parsed data carries no ID are skipped.
        if artistID is None:
            continue
        modValData[artistID] = rData
    dbIO.saveArtistModValData(modVal, modValData)
    # Stop the per-ModVal timer started above.
    tsMod.stop()
ts.stop()

In [None]:
rData.show()

In [None]:
masterReleaseGroupData    = io.get(savedir.join("{0}.p".format("ReleaseGroupDataFrame")).path)
masterArtistRecordingData = io.get(savedir.join("{0}.p".format("ArtistRecordingDataFrame")).path)
masterArtistWorkData      = io.get(savedir.join("{0}.p".format("ArtistWorkDataFrame")).path)


In [None]:

savedir = setDir(basedir, "MusicBrainzMetadata")
tsAll = timestat("Creating DB Data")
Nmod = 100
for n,modVal in enumerate(range(Nmod)):
    ts = timestat("Creating ModData Subset")
    artistModData = masterArtistData[masterArtistData["MyArtistID"].apply(lambda x: int(x)%Nmod) == modVal]
    releaseGroupModData = masterReleaseGroupData[masterReleaseGroupData["ArtistID"].isin(artistModData.index)]
    ts.stop()

    modValData = {}


In [None]:
from artistDBBase import artistDBBase, artistDBDataClass
from artistDBBase import artistDBNameClass, artistDBMetaClass, artistDBIDClass, artistDBURLClass, artistDBPageClass
from artistDBBase import artistDBProfileClass, artistDBMediaClass, artistDBMediaAlbumClass
from artistDBBase import artistDBMediaDataClass, artistDBMediaCountsClass, artistDBFileInfoClass
from artistDBBase import artistDBTextClass, artistDBLinkClass
from strUtils import fixName
from dbUtils import utilsDiscogs
from hashlib import md5

def getMediaCounts(media):
    """
    Count the entries stored under every media sub-type of `media`.

    Parameters
    ----------
    media : object exposing a `media` dict that maps a sub-type name to a
        sized collection of entries.

    Returns
    -------
    artistDBMediaCountsClass
        A fresh counts object whose `counts["Releases"]` maps each sub-type
        name to the number of entries found for it.
    """
    amcc = artistDBMediaCountsClass()

    credittype = "Releases"
    # Identity test instead of `== None`: it cannot be fooled by a custom __eq__.
    if amcc.counts.get(credittype) is None:
        amcc.counts[credittype] = {}
    for creditsubtype, entries in media.media.items():
        amcc.counts[credittype][creditsubtype] = len(entries)

    return amcc

def _mediaCode(*parts):
    """
    Return the string code used to identify a media entry without its own ID.

    Every part is stringified, UTF-8 encoded and fed in order into a single
    MD5 digest; the hex digest is read as an integer and reduced modulo 1e6.
    """
    m = md5()
    for part in parts:
        m.update(str(part).encode('utf-8'))
    return str(int(m.hexdigest(), 16) % int(1e6))


# Build the artist DB records bucket by bucket (MyArtistID modulo Nmod) and
# save one "<modVal>-DB.p" file per bucket.
savedir = setDir(basedir, "MusicBrainzMetadata")
tsAll = timestat("Creating DB Data")
Nmod = 100
for n,modVal in enumerate(range(Nmod)):
    ts = timestat("Creating ModData Subset")
    artistModData = masterArtistData[masterArtistData["MyArtistID"].apply(lambda x: int(x)%Nmod) == modVal]
    releaseGroupModData = masterReleaseGroupData[masterReleaseGroupData["ArtistID"].isin(artistModData.index)]
    ts.stop()

    modValData = {}
    N = artistModData.shape[0]
    tsMod = timestat("Creating DB Data From {0} Artists For ModVal={1}".format(N,modVal))
    for i,(artistID,artistData) in enumerate(artistModData.iterrows()):
        artistName  = str(artistData["ArtistName"])
        artistGID   = artistData['ArtistGID']
        artistURL   = "https://musicbrainz.org/artist/{0}".format(artistGID)
        myID        = artistData["MyArtistID"]

        ########################################################################
        # Get General Profile Data
        ########################################################################
        generalData = {}
        generalData["SortName"]   = artistData["ArtistSortName"]
        generalData["Aliases"]    = artistData["Aliases"]
        generalData["Gender"]     = artistData["Gender"]
        # NOTE(review): the key is spelled "County" although it holds the
        # "Country" column; kept unchanged in case existing readers of the
        # saved DB rely on it -- confirm before renaming.
        generalData["County"]     = artistData["Country"]
        generalData["Formed"]     = artistData["Formed"]
        generalData["Disbanded"]  = artistData["Disbanded"]
        generalData["ArtistType"] = artistData["ArtistType"]
        generalData["ISNI"]       = artistData["ISNICode"]
        # Only None values are dropped here; NaN values (if any) are kept.
        generalData = {k: v for k,v in generalData.items() if v is not None}
        generalData = generalData if len(generalData) > 0 else None


        ########################################################################
        # Get URLs
        ########################################################################
        externalData = {}
        artistURLs = masterArtistURLData.get(artistID, [])
        for (urlType,url) in artistURLs:
            adblink      = artistDBLinkClass(None)
            adblink.href = url
            adblink.err  = None
            externalData.setdefault(urlType, []).append(adblink)
        externalData = externalData if len(externalData) > 0 else None


        ########################################################################
        # Get Release Groups
        ########################################################################
        artistReleaseGroupData = releaseGroupModData[releaseGroupModData["ArtistID"] == artistID]
        mediaData = {}
        for mediaName,mediaNameData in artistReleaseGroupData.groupby("ReleaseGroupKey"):
            mediaData[mediaName] = []
            for code, releaseGroupInfo in mediaNameData.iterrows():
                album        = releaseGroupInfo['ReleaseGroupName']
                albumURL     = "https://musicbrainz.org/releasegroup/{0}".format(releaseGroupInfo['ReleaseGroupGID'])
                albumArtists = [artistName]

                amdc = artistDBMediaDataClass(album=album, url=albumURL, artist=albumArtists, code=code, year=None)
                mediaData[mediaName].append(amdc)


        ########################################################################
        # Get Works
        ########################################################################
        artistWorks = masterArtistWorkData.get(artistID)
        if artistWorks:
            # The seen-codes set must live outside the loop: resetting it on
            # every work (as before) meant duplicates were never detected.
            codes = set()
            for workID,workType,workName in artistWorks:
                mediaName = "OtherWork" if workType is None else workType
                mediaData.setdefault(mediaName, [])
                code = _mediaCode(workID, mediaName, workName)
                if code in codes:
                    continue
                codes.add(code)

                amdc = artistDBMediaDataClass(album=str(workName), url=None, artist=None, code=code, year=None)
                mediaData[mediaName].append(amdc)


        ########################################################################
        # Get Recordings
        ########################################################################
        artistRecordings = masterArtistRecordingData.get(artistID)
        artistRecordings = Series(artistRecordings).drop_duplicates()
        if len(artistRecordings) > 0:
            mediaName = "Recordings"
            mediaData.setdefault(mediaName, [])
            codes = set()
            # Series.items() replaces iteritems(), which pandas 2.0 removed.
            for idx,(recName,recTime) in artistRecordings.items():
                code = _mediaCode(recName, recTime)
                if code in codes:
                    continue
                codes.add(code)

                amdc = artistDBMediaDataClass(album=str(recName), url=None, artist=None, code=code, year=None)
                mediaData[mediaName].append(amdc)


        ########################################################################
        # Assemble The Artist Record
        ########################################################################
        artist      = artistDBNameClass(name=artistName, err=None)
        meta        = artistDBMetaClass(title=None, url=artistURL)
        url         = artistDBURLClass(url=artistURL)
        ID          = artistDBIDClass(ID=myID)
        pages       = artistDBPageClass(ppp=1, tot=1, redo=False, more=False)
        profile     = artistDBProfileClass(general=generalData, external=externalData)
        media       = artistDBMediaClass()
        media.media = mediaData
        mediaCounts = getMediaCounts(media)
        info        = artistDBFileInfoClass(info=None)

        modValData[myID] = artistDBDataClass(artist=artist, meta=meta, url=url, ID=ID, pages=pages, profile=profile, mediaCounts=mediaCounts, media=media, info=info)
        if (i+1) % 7500 == 0 or (i+1) == 2500:
            tsMod.update(n=i+1, N=N)
    tsMod.stop()

    outdir = setDir(basedir, "MusicBrainzDBData")
    io.save(idata=Series(modValData), ifile=setFile(outdir, "{0}-{1}.p".format(modVal, "DB")))
    # Report completed buckets (n+1), matching the other progress updates.
    tsAll.update(n=n+1, N=Nmod)
    print("\n")
tsAll.stop()

In [None]:
modValData['108541848016828757278131944962756872900'].show()

In [None]:
modValData['251108434349887660386335524263902329399'].show()

# Merge With Known DB

In [None]:
# Merge each freshly built bucket into the known DB: known entries are kept,
# and only artists whose key is absent from the known DB are appended.
newDBTemplate  = "/Volumes/Seagate/DB/MusicBrainzDBData/{0}-DB.p"
knownTemplate  = "/Users/tgadfort/dbdiscogs/artists-musicbrainz-db/{0}-DB.p"
mergedTemplate = "/Users/tgadfort/dbdiscogs/artists-musicbrainz-db/full/{0}-DB.p"

ts = timestat("Merging DBs")
for n,modVal in enumerate(range(100)):
    newDB = Series(io.get(newDBTemplate.format(modVal)))
    known = io.get(knownTemplate.format(modVal))

    isKnown = newDB.index.isin(known.index)
    toMerge = newDB[~isKnown]
    fullDB = concat([known,toMerge]).sort_index()
    io.save(idata=fullDB, ifile=mergedTemplate.format(modVal))
    ts.update(n=n+1,N=100)
ts.stop()

In [None]:
known.shape

In [None]:
tmp['172552485256597266680385033568580864600'].show()

In [None]:
# Master artist table: identity columns from the raw artist frame, joined
# with the per-artist release-group / release counts.
artistColumns = ["ArtistGID", "ArtistName", "ArtistSortName", "Formed", "Disbanded"]
masterArtistNumAlbums = artistIDNumReleaseGroups.join(artistIDNumRelease, how='outer')
masterArtistData = artistData['artist'][artistColumns].copy(deep=True)
masterArtistData = masterArtistData.join(masterArtistNumAlbums)
# Artists missing from the count tables get 0, and counts are stored as ints.
for countColumn in ["NumReleaseGroups", "NumReleases"]:
    masterArtistData[countColumn] = masterArtistData[countColumn].fillna(0).apply(int)

In [None]:
masterArtistData

In [None]:
# The variable is defined as `masterArtistNumAlbums` (capital A); the
# lower-case spelling raised NameError.
masterArtistNumAlbums

In [None]:
artistIDNumReleaseGroups.shape

In [None]:
artistIDNumRelease.shape

In [None]:
artistData['artist'][artistData['artist']['ArtistID'] == aIDs["ArianaGrande"]]

In [None]:
artistData['artist'][artistData['artist']['ArtistID'] == aIDs["BuddyHolly"]]

In [None]:
artistData['artist'][artistData['artist']['ArtistID'] == aIDs["Bono"]]

In [None]:
artistData['artist'][artistData['artist']['ArtistID'] == aIDs["Rupaul"]]

In [None]:
artistData['artist'][artistData['artist']['ArtistID'] == aIDs["U2"]]

In [None]:
artistData['artist'][artistData['artist']['ArtistID'] == aIDs["DMB"]]

In [None]:
artistData['artist']["NA18"].value_counts()

In [None]:
artistData['artist']["NA9"].value_counts()

In [None]:
artistData['artist']['NA12'].unique()

In [None]:
artistData['artist']['NA5'].unique()

In [None]:
artistData['artist'][artistData['artist']['ArtistGID'] == '7f347782-eb14-40c3-98e2-17b6e1bfe56c']

In [None]:
artistData["artist"][artistData["artist"]["ArtistID"] == 502]

In [None]:
artistData["artist"]['NA10'].unique()

In [None]:
artistData["artist"][artistData["artist"]["ArtistID"] == 197]

# Artist Lookup

In [None]:
# Positional column -> name mappings for the artist link tables.
colnames["l_artist_url"]={0: "ArtistURLLID", 1: "URLGroupID", 2: "ArtistID", 3: "URLID"}
colnames["l_artist_release_group"]={0: "ArtistReleaseGroupLID", 1: "ReleaseGroupGroupID", 2: "ArtistID", 3: "ReleaseGroupID"}
colnames["l_artist_release"]={0: "ArtistReleaseLID", 1: "ReleaseGroupID", 2: "ArtistID", 3: "ReleaseID"}

ts = timestat("Loading Artist Data")
files = glob("mbdump/l_artist_*")
lookupData = {fileInfo(ifile).basename: loadData(ifile) for ifile in files}
# Keep only the tables that have a registered mapping, restricted to and
# renamed by their mapped columns. The former `if lookupData is not None`
# guard was dropped: a dict comprehension result is never None.
lookupData = {key: val[list(colnames[key].keys())].rename(columns=colnames[key]) for key,val in lookupData.items() if key in colnames}
print("Keys: {0}".format(lookupData.keys()))
ts.stop()

In [None]:
files = glob("mbdump/l_artist_release")
lookupData = {fileInfo(ifile).basename: loadData(ifile) for ifile in files}

In [None]:
lookupData['l_artist_release']["ReleaseGroupID"].nunique()

In [None]:
lookupData['l_artist_release'][lookupData['l_artist_release']['ArtistID'] == 502]

In [None]:
key='l_artist_url'
lookupData['l_artist_url'] = lookupData['l_artist_url'][list(colnames[key].keys())].rename(columns=colnames[key])

In [None]:
print(urlData['url'][urlData['url']["URLName"].eq("https://www.discogs.com/artist/6520")])
print(urlData['url'][urlData['url']["URLName"].eq("https://www.allmusic.com/artist/mn0000219203")])

In [None]:
lookupData["l_artist_url"][lookupData["l_artist_url"]["URLID"].isin([3017,993955])]

In [None]:
from pandas import merge
dmbAU = lookupData["l_artist_url"][lookupData["l_artist_url"]['ArtistID'] == 502].copy(deep=True)
u2AU  = lookupData["l_artist_url"][lookupData["l_artist_url"]['ArtistID'] == 197].copy(deep=True)

In [None]:
dmbURLs = merge(dmbAU, urlData['url'], how='left', on=["URLID"]).copy(deep=True)
u2URLs  = merge(u2AU, urlData['url'], how='left', on=["URLID"]).copy(deep=True)

In [None]:
# Host part of each URL. The vectorised .str calls give the same result as
# the former per-row lambda for strings, but pass NaN through instead of
# raising (the left merge yields NaN URLName for URLIDs with no match).
dmbURLs["URLDomain"] = dmbURLs["URLName"].str.replace("https://", "", regex=False).str.replace("http://", "", regex=False).str.split('/').str[0]

In [None]:
# Host part of each URL. The vectorised .str calls give the same result as
# the former per-row lambda for strings, but pass NaN through instead of
# raising (the left merge yields NaN URLName for URLIDs with no match).
u2URLs["URLDomain"] = u2URLs["URLName"].str.replace("https://", "", regex=False).str.replace("http://", "", regex=False).str.split('/').str[0]

In [None]:
u2URLs[["NA1", "URLDomain"]].sort_values(by="NA1").T

In [None]:
dmbURLs[["NA1", "URLDomain"]].sort_values(by="NA1").T

In [None]:
# Rows where any column equals 8723. `axis` is passed by keyword because
# pandas >= 2.0 rejects the positional form `.any(1)`.
artistData["artist"][artistData["artist"].eq(8723).any(axis=1)]

In [None]:
urlData['url'].shape

In [None]:
# Rows where any column equals 1025971. `axis` is passed by keyword because
# pandas >= 2.0 rejects the positional form `.any(1)`.
lookupData["l_artist_url"][lookupData["l_artist_url"].eq(1025971).any(axis=1)]

In [None]:
# Rows where any column equals 2625. `axis` is passed by keyword because
# pandas >= 2.0 rejects the positional form `.any(1)`.
lookupData["l_artist_url"][lookupData["l_artist_url"].eq(2625).any(axis=1)]

In [None]:
# Links belonging to artist 502. The single-column comparison is already a
# row mask, so no `.any(...)` is needed (a Series has no axis 1). Positional
# column 2 is named "ArtistID" by the l_artist_url column mapping.
lookupData["l_artist_url"][lookupData["l_artist_url"]["ArtistID"].eq(502)]

In [None]:
# Reference values noted for the artist abbreviated "DMB", per external site.
# The cell was free-form notes and did not parse; it is now a dict of
# (numeric id, GID) pairs.
# NOTE(review): the numeric ids look like MusicBrainz URL-table ids -- confirm.
DMB = {
    "AllMusic": (1025971, "c94225e3-2f0c-4c6d-9115-9f268fb7c31b"),
    "Discogs": (2625, "7a157b6e-d01d-4248-9995-edb05652c5b2"),
}

In [None]:
artistData['artist']

In [None]:
# Register the l_artist_artist mapping under its table key, like the other
# link tables, instead of rebinding `colnames` (which would discard every
# other table's mapping). The original literal also had ':' where ',' was
# needed and did not parse.
colnames["l_artist_artist"] = {0: "ArtistID", 1: "NA1", 2: "NA2", 3: "NA3"}
# Rows where any column equals 502; `axis` must be a keyword in pandas >= 2.0.
lookupData["l_artist_artist"][lookupData["l_artist_artist"].eq(502).any(axis=1)]

In [None]:
# Look up both URLs in one expression: with two separate expressions only the
# last one is displayed and the first result is silently discarded.
urlData['url'][urlData['url']["URLName"].isin([
    "https://www.discogs.com/artist/6520",
    "https://www.allmusic.com/artist/mn0000219203",
])]

In [None]:
502
07e748f1-075e-428d-85dc-ce3be434e906