In [1]:
###########################################################################
## Basic
###########################################################################
%load_ext autoreload
%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))
 

###########################################################################
## Warnings
###########################################################################
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 

# Basic Functions

In [4]:
from fileIO import fileIO
from fsUtils import fileUtil, dirUtil
from pandas import read_csv, DataFrame, Series
from fsUtils import setDir, setFile
from timeUtils import timestat
from numpy import nan
from pandas import to_datetime, NaT, isna, concat

from glob import glob
from fileIO import csvIO, pickleIO


def loadData(ifile):
    mbdata = csvIO().get(ifile, delimiter="\t", header=None, error_bad_lines=False)
    #mbdata = read_csv(ifile, delimiter="\t", header=None)
    mbdata = mbdata.replace('\\N', nan)
    return mbdata


def getData(files, colnames):
    data = {fileUtil(ifile).basename: loadData(ifile) for ifile in files}
    print("Keys: {0}".format(data.keys()))
    data = {key: val[list(colnames[key].keys())].rename(columns=colnames[key]) for key,val in data.items() if key in colnames} if colnames is not None else data
    print("Keys: {0}".format(data.keys()))
    return data


def setIndex(data):
    for key,df in data.items():
        colname = df.columns[0]
        df.index = df[colname]
        df.drop([colname], axis=1, inplace=True)
    return data


def createDate(year, month, day):
    if all([isinstance(x,str) for x in [year,month,day]]):
        return to_datetime('{0}-{1}-{2}'.format(year, month, day), format='%Y-%m-%d', errors='ignore')
    elif all([isinstance(x,str) for x in [year,month]]):
        return to_datetime('{0}-{1}'.format(year, month), format='%Y-%m', errors='ignore')
    elif all([isinstance(x,str) for x in [year]]):
        return to_datetime('{0}'.format(year), format='%Y', errors='ignore')
    return NaT


def convertToDatetime(year, month, day):
    year  = year.apply(lambda x: int(x) if (not isna(x) and x.isdigit()) else -1)
    month = month.apply(lambda x: int(x) if (not isna(x) and x.isdigit()) else -1)
    day   = day.apply(lambda x: int(x) if (not isna(x) and x.isdigit()) else -1)
    tmp   = DataFrame(year).join(month).join(day)
    tmp.columns = ["year", "month", "day"]
    return to_datetime(tmp, errors='coerce')


basedir = "./"
basedir = "/Volumes/Seagate/DB"

aIDs={"ArianaGrande": 823336, "BuddyHolly": 10937, "Rupaul": 34318, "U2": 197, "DMB": 502, "Bono": 35575, "Mozart": 11285, "JohnMayer": 33563}

In [None]:
!ln -s '/Volumes/Seagate/DB/mbdump'

# DB Data

In [None]:
lengthData = open("flength.csv").readlines()
lengthData = [x.replace("\n", "").strip().split() for x in lengthData]
lengthData = Series({item[1].split("/")[1]: int(item[0]) for item in lengthData if len(item[1].split("/")) > 1})
lengthData

In [None]:
list(lengthData.index)

In [None]:
#lengthData['release_label']
lengthData[(lengthData <= 6)]

# Lookup Dictionary

In [73]:
colnames = {}
colnames['gender'] = {0: "GenderID", 1: "GenderName"} #, 3: "NA3", "GenderDescr"}

ts = timestat("Loading Gender Data")
files = [ifile for ifile in glob("mbdump/gender*")]
genderData = getData(files, colnames)
genderData = setIndex(genderData)
ts.stop()

dGenderName = genderData['gender']['GenderName'].to_dict()
del genderData

Current Time is Sat Dec 11, 2021 22:17 for Loading Gender Data
Keys: dict_keys(['gender'])
Keys: dict_keys(['gender'])
Process [Loading Gender Data] Took 0.0 Seconds


In [74]:
colnames = {}
colnames["area_type"] = {0: "AreaTypeID", 1: "AreaTypeName", 3: "AlsoAreaTypeID", 4: "AreaTypeDescr", 5: "AreaTypeGID"}
colnames["area"]      = {0: "AreaID", 1: "AreaGID", 2: "AreaName", 3: "AreaTypeID"}
#colnames["area_gid_redirect"] = {0: "AreaGIDUUID", 1: "AreaGID"}
#colnames["area_alias_type"] = {0: "AreaAliasTypeID", 1: "AreaAliasTypeName", 5: "AreaAliasTypeUUID"}
#colnames["area_alias"] = {0: "AreaAliasID", 1: "NA1", 2: "AreaAlias", 3: "AreaLang", 6: "AliasTypeID", 7: "AreaSortName"}

ts = timestat("Loading Area Data")
files = glob("mbdump/area*")
areaData = getData(files, colnames)
areaData = setIndex(areaData)
ts.stop()

dAreaName = areaData['area']['AreaName'].to_dict()
del areaData

Current Time is Sat Dec 11, 2021 22:17 for Loading Area Data
Keys: dict_keys(['area', 'area_alias', 'area_alias_type', 'area_gid_redirect', 'area_type'])
Keys: dict_keys(['area', 'area_type'])
Process [Loading Area Data] Took 0.7 Seconds


In [75]:
colnames = {}
colnames['isrc'] = {0: "ISRCID", 1: "RecordingID", 2: "ISRC"}
colnames['iswc'] = {0: "ISWCID", 1: "WorkID", 2: "ISWC"}
colnames['iso_3166_1'] = {0: "ISO31661ID", 1: "ISO31661"}
colnames['iso_3166_2'] = {0: "ISO31662ID", 1: "ISO31662"}
colnames['iso_3166_3'] = {0: "ISO31663ID", 1: "ISO31663"}

ts = timestat("Loading i* Code Data")
files = glob("mbdump/is*")
icodeData = getData(files, colnames)
icodeData = setIndex(icodeData)
ts.stop()

iSWCData = icodeData['iswc']["ISWC"].copy(deep=True)
iSWCData.index = icodeData['iswc']['WorkID']
iSWCData = iSWCData.drop_duplicates()
del icodeData

Current Time is Sat Dec 11, 2021 22:17 for Loading i* Code Data
Keys: dict_keys(['iso_3166_1', 'iso_3166_2', 'iso_3166_3', 'isrc', 'iswc'])
Keys: dict_keys(['iso_3166_1', 'iso_3166_2', 'iso_3166_3', 'isrc', 'iswc'])
Process [Loading i* Code Data] Took 2.7 Seconds


In [76]:
colnames = {}
colnames["language"] = {0: "LanguageID", 1: "LanguageShort1", 2: "LanguageShort2", 3: "LanguageShort3", 4: "LanguageName", 5: "NA5", 6: "LanguageShort"}

ts = timestat("Loading URL Data")
files = glob("mbdump/language*")
languageData = getData(files, colnames)
languageData = setIndex(languageData)
ts.stop()

dLanguageName = languageData['language']['LanguageName'].to_dict()
del languageData

Current Time is Sat Dec 11, 2021 22:17 for Loading URL Data
Keys: dict_keys(['language'])
Keys: dict_keys(['language'])
Process [Loading URL Data] Took 0.0 Seconds


In [77]:
colnames = {}
colnames["script"] = {0: "ScriptID", 1: "ScriptName", 2: "NA2", 3: "ScriptDescr", 4: "NA4"}

ts = timestat("Loading URL Data")
files = glob("mbdump/script")
scriptData = getData(files, colnames)
scriptData = setIndex(scriptData)
ts.stop()

dScriptName = scriptData['script']['ScriptName'].to_dict()
del scriptData

Current Time is Sat Dec 11, 2021 22:17 for Loading URL Data
Keys: dict_keys(['script'])
Keys: dict_keys(['script'])
Process [Loading URL Data] Took 0.0 Seconds


In [78]:
colnames = {}
colnames["label_alias"] = {0: "LabelAliasID", 1: "LabelID", 2: "LabelAliasName", 7: "LabelAliasName2"}
colnames["label_alias_type"] = {0: "LabelAliasTypeID", 1: "LabelAliasTypeName", 5: "LabelAliasGID"}
colnames["label_ipi"] = {0: "LabelIPIID", 1: "LabelIPI"}
colnames["label_isni"] = {0: "LabelISNIID", 1: "LabelISNI"}
colnames["label_type"] = {0: "LabelTypeID", 1: "LabelTypeName", 5: "LabelTypeGID"}
colnames["label"] = {0: "LabelID", 1: "LabelGID", 2: "LabelName"}

ts = timestat("Loading Label Data")
files = glob("mbdump/label*")
labelData = getData(files, colnames)
labelData = setIndex(labelData)
ts.stop()

dLabelName = labelData['label']['LabelName'].to_dict()
del labelData

Current Time is Sat Dec 11, 2021 22:17 for Loading Label Data
Keys: dict_keys(['label', 'label_alias', 'label_alias_type', 'label_gid_redirect', 'label_ipi', 'label_isni', 'label_type'])
Keys: dict_keys(['label', 'label_alias', 'label_alias_type', 'label_ipi', 'label_isni', 'label_type'])
Process [Loading Label Data] Took 0.9 Seconds


# Gender

In [None]:
colnames = {}
colnames['gender'] = {0: "GenderID", 1: "GenderName"} #, 3: "NA3", "GenderDescr"}

ts = timestat("Loading Gender Data")
files = [ifile for ifile in glob("mbdump/gender*")]
genderData = getData(files, colnames)
genderData = setIndex(genderData)
ts.stop()

# Lookup

In [69]:
colnames = {}
colnames["l_label_release"] = {0: "Index", 1: "NA1", 2: "NA2", 3: "ReleaseID"}

ts = timestat("Loading Area Data")
files = glob("mbdump/l_label_release")
lookupData = {fileInfo(ifile).basename: loadData(ifile) for ifile in files}
lookupData = {key: val[list(colnames[key].keys())].rename(columns=colnames[key]) for key,val in lookupData.items()} if colnames is not None else lookupData
print("Keys: {0}".format(lookupData.keys()))
ts.stop()

Current Time is Sat Dec 11, 2021 22:17 for Loading Area Data


NameError: name 'fileInfo' is not defined

In [70]:
lookupData['l_label_release'][lookupData['l_label_release']['ReleaseID'].isin(releaseIDs)]

KeyError: 'l_label_release'

In [None]:
lookupData['l_label_release'].max()

# Area

In [None]:
colnames = {}
colnames["area_type"] = {0: "AreaTypeID", 1: "AreaTypeName", 3: "AlsoAreaTypeID", 4: "AreaTypeDescr", 5: "AreaTypeGID"}
colnames["area"]      = {0: "AreaID", 1: "AreaGID", 2: "AreaName", 3: "AreaTypeID"}
#colnames["area_gid_redirect"] = {0: "AreaGIDUUID", 1: "AreaGID"}
#colnames["area_alias_type"] = {0: "AreaAliasTypeID", 1: "AreaAliasTypeName", 5: "AreaAliasTypeUUID"}
#colnames["area_alias"] = {0: "AreaAliasID", 1: "NA1", 2: "AreaAlias", 3: "AreaLang", 6: "AliasTypeID", 7: "AreaSortName"}

ts = timestat("Loading Area Data")
files = glob("mbdump/area*")
areaData = getData(files, colnames)
areaData = setIndex(areaData)
ts.stop()

# Event

In [None]:
colnames = {}
colnames["event_type"] = {0: "EventTypeID", 1: "EventTypeName", 4: "EventTypeDescr"}
colnames["event_alias_type"] = {0: "EventAliasTypeID", 1: "EventAliasTypeName", 5: "EventAliasTypeGID"}
colnames["event_alias"] = {0: "EventAliasID", 1: "EventID", 2: "EventAliasName", 3: "EventAliasLang", 7: "EventAliasName2"}
colnames["event"] = {0: "EventID", 1: "EventGID", 2: "EventName", 
                     3: "EventStartYear", 4: "EventStartMonth", 5: "EventStartDay", 6: "EventEndYear", 7: "EventEndMonth", 8: "EventEndDay"}

ts = timestat("Loading Event Data")
files = glob("mbdump/event*")
eventData = {fileInfo(ifile).basename: loadData(ifile) for ifile in files}
print("Keys: {0}".format(eventData.keys()))
eventData = {key: val[list(colnames[key].keys())].rename(columns=colnames[key]) for key,val in eventData.items() if key in colnames} if colnames is not None else eventData
print("Keys: {0}".format(eventData.keys()))
ts.stop()

In [None]:
eventData['event']

# ICode

In [None]:
colnames = {}
colnames['isrc'] = {0: "ISRCID", 1: "RecordingID", 2: "ISRC"}
colnames['iswc'] = {0: "ISWCID", 1: "WorkID", 2: "ISWC"}
colnames['iso_3166_1'] = {0: "ISO31661ID", 1: "ISO31661"}
colnames['iso_3166_2'] = {0: "ISO31662ID", 1: "ISO31662"}
colnames['iso_3166_3'] = {0: "ISO31663ID", 1: "ISO31663"}

ts = timestat("Loading i* Code Data")
files = glob("mbdump/is*")
icodeData = getData(files, colnames)
icodeData = setIndex(icodeData)
ts.stop()

iSWCData = icodeData['iswc']["ISWC"].copy(deep=True)
iSWCData.index = icodeData['iswc']['WorkID']
iSWCData = iSWCData.drop_duplicates()
del icodeData

# Medium

In [None]:
colnames = {}
colnames["medium_format"] = {0: "MediumFormatID", 1: "MediumName", 2: "MediumGroupID", 3: "NA3", 4: "MediumIntroYear", 5: "MediumDescr", 6: "MediumGID"}
#colnames["medium_cdtoc"]  = {0: "NA0", 1: "NA1", 2: "NA2", 3: "NA3"}
colnames["medium"] = {0: "ReleaseID_1", 1: "ReleaseID", 2: "SideNum", 3: "MediumFormatID", 4: "NA4", 5: "NA5", 7: "NumTracks"}

ts = timestat("Loading Medium Data")
files = glob("mbdump/medium*")
mediumData = {fileInfo(ifile).basename: loadData(ifile) for ifile in files}
print("Keys: {0}".format(mediumData.keys()))
mediumData = {key: val[list(colnames[key].keys())].rename(columns=colnames[key]) for key,val in mediumData.items() if key in colnames} if colnames is not None else mediumData
print("Keys: {0}".format(mediumData.keys()))
ts.stop()

In [None]:
mediumData['medium_format'].nunique()

In [None]:
mediumData['medium'].nunique()

In [None]:
## 12" Vinyl ; ReleaseID=1310220
#mediumData['medium'][mediumData['medium'].eq(1310220).any(1)]
# MediumID  ReleaseID  Sides?  Format?   ?
# 1310220   1274390    1       33        4
# 1351220   1310220    1       31        11      <- Matches https://musicbrainz.org/release/6b5f33a8-fc5e-4c1e-b379-c659ce20a1c8


## Digital Media : ReleaseID=1792210
#mediumData['medium'][mediumData['medium'].eq(1792210).any(1)]
# MediumID  ReleaseID  Sides?  Format?   ?
# 1792210   1694923    1      12	NaN	0	2015-11-30 00:32:22.335038+00	13
# 1904010   1792210    1      12	NaN	0	2016-06-20 18:41:01.106192+00	11   Both Match


# 2x12" Vinyl ; ReleaseID=1680415

#mediumData['medium'][mediumData['medium'].eq(1680415).any(1)]
# MediumID  ReleaseID  Sides?  Format?   ?
# 1680415	1598741	1	1	NaN	0	2015-04-27 04:37:49.277705+00	11
# 1775088	1680415	1	31	NaN	0	2015-10-31 12:02:35.843886+00	5
# 1775089	1680415	2	31	NaN	0	2015-10-31 12:02:35.843886+00	6

# 1792210   1694923    1      12	NaN	0	2015-11-30 00:32:22.335038+00	13
# 1904010   1792210    1      12	NaN	0	2016-06-20 18:41:01.106192+00	11   Both Match

# Pixies Velouria CD w/ 4 Tracks
# 3099, 1162482, 3097, 2259927

## CD
# mediumData['medium'][mediumData['medium'].eq(3099).any(1)]
# 3099	3099	1	1	NaN	0	2011-05-16 14:57:06.530063+00	4

## CD (Status=Promotional)
# mediumData['medium'][mediumData['medium'].eq(1162482).any(1)]
# 1162482	1146184	1	12	NaN	0	2012-04-14 05:35:56.931961+00	1
# 1181517	1162482	1	1	NaN	0	2012-05-30 00:53:24.512335+00	4

## CD
# mediumData['medium'][mediumData['medium'].eq(3097).any(1)]
# 3097	3097	1	1	NaN	0	2012-10-18 19:49:17.567219+00	4

## 12" Vinyl
# mediumData['medium'][mediumData['medium'].eq(2259927).any(1)]
# 2259927	2099781	1	12	NaN	0	2018-01-11 08:03:13.107915+00	5
# 2441117	2259927	1	31	NaN	0	2018-09-28 23:20:29.279923+00	4  <-- This matches Web

In [None]:
mediumData['medium'].nunique()

In [None]:
mediumData['medium'].shape

In [None]:
mediumData['medium_cdtoc'][mediumData['medium_cdtoc'].eq(1598741).any(1)]

In [None]:
mediumData['medium_format'][mediumData['medium_format'].eq(38).any(1)]

In [None]:
mediumData['medium_format'][mediumData['medium_format'].eq('38').any(1)]

In [None]:
mediumData['medium_format'][mediumData['medium_format'].eq(63).any(1)]

In [None]:
mediumData['medium_format'][mediumData['medium_format'].eq(64).any(1)]

# Language

In [None]:
colnames = {}
colnames["language"] = {0: "LanguageID", 1: "LanguageShort1", 2: "LanguageShort2", 3: "LanguageShort3", 4: "LanguageName", 5: "NA5", 6: "LanguageShort"}

ts = timestat("Loading URL Data")
files = glob("mbdump/language*")
languageData = getData(files, colnames)
languageData = setIndex(languageData)
ts.stop()

# Script

In [None]:
colnames = {}
colnames["script"] = {0: "ScriptID", 1: "ScriptName", 2: "NA2", 3: "ScriptDescr", 4: "NA4"}

ts = timestat("Loading URL Data")
files = glob("mbdump/script")
scriptData = getData(files, colnames)
scriptData = setIndex(scriptData)
ts.stop()

# URL

In [None]:
colnames = {}
colnames["url"] = {0: "URLID", 1: "URLGID", 2: "URLName"}
#colnames["url_gid_redirect"] = {0: "URLGIDUUID", 1: "URLGIDID"}

ts = timestat("Loading URL Data")
files = glob("mbdump/url*")
urlData = getData(files, colnames)
urlData = setIndex(urlData)
ts.stop()

# Work

In [24]:
colnames = {}
colnames["work_type"] = {0: "WorkTypeID", 1: "WorkTypeName", 3: "WorkTypeRanking", 4: "WorkTypeDescr", 5: "WorkTypeGID"}
#colnames["work_alias"] = {0: "WorkAliasID", 1: "WorkID", 2: "WorkName", 3: "WorkLang", 7: "WorkName2"}
#colnames["work_alias_type"] = {0: "WorkAliasTypeID", 1: "WorkAliasTypeName", 5: "WorkAliasTypeGID"}
colnames['work_attribute_type_allowed_value'] = {0: 'WorkAttributeTypeValueID', 1: "WorkAttributeTypeID", 2: "WorkAttributeTypeValue", 6: "WorkAttributeTypeValueGID"}
colnames["work_attribute_type"] = {0: "WorkAttributeTypeID", 1: "WorkAttributeTypeName", 6: "WorkAttributeTypeDescr"}
colnames["work_attribute"] = {0: "WorkAttributeID", 1: "WorkID", 2: "WorkAttributeTypeID", 3: "WorkAttributeTypeValueID", 4: "WorkAttributeCode"}
colnames["work_language"] = {0: "WorkID", 1: "LanguageID"}
colnames["work"] = {0: "WorkID", 1: "WorkGID", 2: "WorkName", 3: "WorkTypeID"} #, 4: "WorkDescr"}

ts = timestat("Loading Work Data")
files = glob("mbdump/work*")
workData = getData(files, colnames)
workData = setIndex(workData)
ts.stop()

Current Time is Sat Dec 11, 2021 21:18 for Loading Work Data
Keys: dict_keys(['work', 'work_alias', 'work_alias_type', 'work_attribute', 'work_attribute_type', 'work_attribute_type_allowed_value', 'work_gid_redirect', 'work_language', 'work_type'])
Keys: dict_keys(['work', 'work_attribute', 'work_attribute_type', 'work_attribute_type_allowed_value', 'work_language', 'work_type'])
Process [Loading Work Data] Took 5.8 Seconds


## Append Data And Create Master Work DataFrame

In [25]:
ts = timestat("Joining Release Language Name")
workData['work_language']["Language"] = workData['work_language']['LanguageID'].apply(lambda x: dLanguageName.get(x) if not isna(x) else None)
workData['work_language'].drop(["LanguageID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Work Attribute Type")
dWorkAttributeTypeName = workData['work_attribute_type']["WorkAttributeTypeName"].to_dict()
workData['work_attribute_type_allowed_value']["WorkAttributeType"] = workData['work_attribute_type_allowed_value']['WorkAttributeTypeID'].apply(lambda x: dWorkAttributeTypeName.get(x) if not isna(x) else None)
workData['work_attribute_type_allowed_value'].drop(["WorkAttributeTypeID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Work Attribute Type")
dWorkAttributeTypeName = workData['work_attribute_type']["WorkAttributeTypeName"].to_dict()
workData['work_attribute']["WorkAttributeType"] = workData['work_attribute']['WorkAttributeTypeID'].apply(lambda x: dWorkAttributeTypeName.get(x) if not isna(x) else None)
workData['work_attribute'].drop(["WorkAttributeTypeID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Work Type")
dWorkTypeName = workData['work_type']["WorkTypeName"].to_dict()
workData['work']["WorkTypeName"] = workData['work']['WorkTypeID'].apply(lambda x: dWorkTypeName.get(int(x)) if not isna(x) else None)
workData['work'].drop(["WorkTypeID"], axis=1, inplace=True)
ts.stop()

###
# Ignore Work Attributes
###

ts = timestat("Dropping Last Columns")
workData['work'].drop(["WorkGID"], axis=1, inplace=True)
ts.stop()

savedir = setDir(basedir, "MusicBrainzData")
savename = setFile(savedir, "{0}.p".format("WorkDataFrame"))
ts = timestat("Saving Master Release DataFrame To {0}".format(savename))
pickleIO().save(idata=workData['work'], ifile=savename)

Current Time is Sat Dec 11, 2021 21:18 for Joining Release Language Name
Process [Joining Release Language Name] Took 0.5 Seconds
Current Time is Sat Dec 11, 2021 21:18 for Joining Work Attribute Type
Process [Joining Work Attribute Type] Took 0.0 Seconds
Current Time is Sat Dec 11, 2021 21:18 for Joining Work Attribute Type
Process [Joining Work Attribute Type] Took 0.3 Seconds
Current Time is Sat Dec 11, 2021 21:18 for Joining Work Type
Process [Joining Work Type] Took 0.9 Seconds
Current Time is Sat Dec 11, 2021 21:19 for Dropping Last Columns
Process [Dropping Last Columns] Took 0.1 Seconds
Current Time is Sat Dec 11, 2021 21:19 for Saving Master Release DataFrame To /Volumes/Seagate/DB/MusicBrainzData/WorkDataFrame.p


# Label

In [None]:
colnames = {}
colnames["label_alias"] = {0: "LabelAliasID", 1: "LabelID", 2: "LabelAliasName", 7: "LabelAliasName2"}
colnames["label_alias_type"] = {0: "LabelAliasTypeID", 1: "LabelAliasTypeName", 5: "LabelAliasGID"}
colnames["label_ipi"] = {0: "LabelIPIID", 1: "LabelIPI"}
colnames["label_isni"] = {0: "LabelISNIID", 1: "LabelISNI"}
colnames["label_type"] = {0: "LabelTypeID", 1: "LabelTypeName", 5: "LabelTypeGID"}
colnames["label"] = {0: "LabelID", 1: "LabelGID", 2: "LabelName"}

ts = timestat("Loading Label Data")
files = glob("mbdump/label*")
labelData = getData(files, colnames)
labelData = setIndex(labelData)
ts.stop()

# Recording

In [52]:
colnames = {}
#colnames["recording_alias_type"] = {0: "RecordingAliasTypeID", 1: "RecordingAliasTypeName"}
#colnames["recording_alias"] = {0: "RecordingAliasID", 1: "RecordingID", 2: "RecordingAliasName", 3: "RecordingAliasLang", 7: "recordingAliasName2"}
colnames["recording"] = {0: "RecordingID", 1: "RecordingGID", 2: "RecordingName", 3: "ArtistID", 4: "TimeLength"} #, 5: "RecordingDescr"}

ts = timestat("Loading Recording Data")
files = [ifile for ifile in glob("mbdump/recording*")]
recordingData = getData(files, colnames)
recordingData = setIndex(recordingData)
ts.stop()

Current Time is Sat Dec 11, 2021 21:52 for Loading Recording Data


b'Skipping line 2180777: expected 9 fields, saw 12\n'
b'Skipping line 5112821: expected 9 fields, saw 12\n'
b'Skipping line 11110739: expected 9 fields, saw 12\nSkipping line 11110746: expected 9 fields, saw 12\n'
b'Skipping line 12675230: expected 9 fields, saw 12\n'
b'Skipping line 22142824: expected 9 fields, saw 12\n'
  mbdata = csvIO().get(ifile, delimiter="\t", header=None, error_bad_lines=False)


Keys: dict_keys(['recording', 'recording_alias', 'recording_alias_type', 'recording_gid_redirect'])
Keys: dict_keys(['recording'])
Process [Loading Recording Data] Took 1.4 Minutes


## Append Data And Create Master Recording DataFrame

In [53]:
ts = timestat("Dropping Last Columns")
recordingData['recording'].drop(["RecordingGID"], axis=1, inplace=True)
ts.stop()

savedir = setDir(basedir, "MusicBrainzData")
savename = setFile(savedir, "{0}.p".format("RecordingDataFrame"))
ts = timestat("Saving Master Recording DataFrame To {0} (~1.1 min)".format(savename))
from fileIO import pickleIO
pickleIO().save(idata=recordingData['recording'], ifile=savename)
ts.stop()

Current Time is Sat Dec 11, 2021 21:54 for Dropping Last Columns
Process [Dropping Last Columns] Took 1.5 Seconds
Current Time is Sat Dec 11, 2021 21:54 for Saving Master Recording DataFrame To /Volumes/Seagate/DB/MusicBrainzData/RecordingDataFrame.p (~1.1 min)
Process [Saving Master Recording DataFrame To /Volumes/Seagate/DB/MusicBrainzData/RecordingDataFrame.p (~1.1 min)] Took 44.1 Seconds


# Track

In [None]:
colnames = {}
colnames["track"] = {0: "TrackID", 1: "TrackGID", 2: "RecordingID", 3: "NA3", 4: "TrackNum", 5: "TrackNumName", 6: "TrackName", 7: "ArtistID", 8: "TimeLength"}

# Release
# 2373946	7c5d14b4-cf40-4eb1-89e6-d448125d94f3	1987-12-12: Hampton Coliseum, Hampton, VA, USA	197	2128729	3	\N	120	28	\N		0	-1	2019-03-15 09:46:50.446876+00
            
# Release Group
# 2128729	f979a1c6-b6c2-4aad-b5b6-709c6c216752	1987-12-12: Hampton Coliseum, Hampton, VA, USA	197	1		0	2019-03-15 09:46:45.458509+00

# Recording
# 24365864	6a355a78-06c0-4e06-a15e-05d6e533255e	Sunday Bloody Sunday	197	366000		0	2019-03-15 12:29:22.049215+00	f

# Track
# 27710495	fc969b10-fdba-4b0c-82d9-aa6a7179e41f	24365864	2571008	7	7	Sunday Bloody Sunday	197	366000	0	2019-03-15 12:29:22.049215+00	f
ts = timestat("Loading Recording Data")
files = glob("mbdump/track")
trackData = {fileInfo(ifile).basename: loadData(ifile) for ifile in files}
print("Keys: {0}".format(trackData.keys()))
trackData = {key: val[list(colnames[key].keys())].rename(columns=colnames[key]) for key,val in trackData.items() if key in colnames} if colnames is not None else trackData
print("Keys: {0}".format(trackData.keys()))
ts.stop()

# Release

In [79]:
colnames = {}
#colnames["release_alias_type"] = {0: "ReleaseAliasTypeID", 1: "ReleaseAliasTypeName"}
#colnames["release_alias"] = {0: "ReleaseAliasID", 1: "ReleaseID", 2: "ReleaseAliasName", 3: "ReleaseAliasLang", 7: "ReleaseAliasName2"}
colnames["release_status"] = {0: "ReleaseStatusID", 1: "ReleaseStatusName", 3: "NA3", 3: "ReleaseStatusDescr", 4: "ReleaseStatusGID"}
colnames["release_packaging"] = {0: "ReleasePackagingID", 1: "ReleasePackagingName", 3: "NA3", 4: "ReleasePackagingDescr", 5: "ReleaseStatusGID"}
colnames["release_label"] = {0: "Index", 1: "ReleaseID", 2: "LabelID", 3: "CatalogNumber"}
colnames["release_country"] = {0: "ReleaseID", 1: "ReleaseCountryID", 2: "ReleaseCountryYear", 3: "ReleaseCountryMonth", 4: "ReleaseCountryDay"}
colnames["release_unknown_country"] = {0: "ReleaseID", 1: "ReleaseCountryYear", 2: "ReleaseCountryMonth", 3: "ReleaseCountryDay"}
colnames["release"] = {0: "ReleaseID", 1: "ReleaseGID", 2: "ReleaseName", 3: "ArtistID", 4: "ReleaseGroupID", 5: "ReleaseStatusID", 
                       6: "ReleasePackagingID", 7: "LanguageID", 8: "ScriptID", 9: "ReleaseBarcode", 10: "ReleaseComment", 11: "NA11", 12: "NA12"}

ts = timestat("Loading Release Data")
files = [ifile for ifile in glob("mbdump/release*") if "group" not in ifile]
releaseData = getData(files, colnames)
releaseData = setIndex(releaseData)
ts.stop()

Current Time is Sat Dec 11, 2021 22:17 for Loading Release Data
Keys: dict_keys(['release', 'release_alias', 'release_alias_type', 'release_country', 'release_gid_redirect', 'release_label', 'release_packaging', 'release_status', 'release_unknown_country'])
Keys: dict_keys(['release', 'release_country', 'release_label', 'release_packaging', 'release_status', 'release_unknown_country'])
Process [Loading Release Data] Took 20.9 Seconds


## Append Data And Create Master Release DataFrame

In [80]:
tsRelease = timestat("Appending Release Data")

ts = timestat("Creating Release Country DateTime For {0} Releases".format(releaseData['release_country'].shape[0]))
tmp = releaseData['release_country'][["ReleaseCountryYear", "ReleaseCountryMonth", "ReleaseCountryDay"]]
tmp.columns = ["year", "month", "day"]
releaseData['release_country']['ReleaseDate'] = to_datetime(tmp, errors='ignore')
releaseData['release_country'].drop(["ReleaseCountryYear", "ReleaseCountryMonth", "ReleaseCountryDay"], axis=1, inplace=True)
ts.stop()

ts = timestat("Creating Release Unknown Country DateTime For {0} Releases".format(releaseData['release_unknown_country'].shape[0]))
tmp = releaseData['release_unknown_country'][["ReleaseCountryYear", "ReleaseCountryMonth", "ReleaseCountryDay"]]
tmp.columns = ["year", "month", "day"]
releaseData['release_unknown_country']['ReleaseDate'] = to_datetime(tmp, errors='ignore')
releaseData['release_unknown_country'].drop(["ReleaseCountryYear", "ReleaseCountryMonth", "ReleaseCountryDay"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Country Area")
dAreaName = areaData['area']['AreaName'].to_dict()
releaseData['release_country']["Country"] = releaseData['release_country']['ReleaseCountryID'].apply(lambda x: dAreaName.get(x) if not isna(x) else None)
releaseData['release_country'].drop(["ReleaseCountryID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Release Packaging Name")
dReleasePackagingName = releaseData['release_packaging']['ReleasePackagingName'].to_dict()
releaseData['release']["Packaging"] = releaseData['release']['ReleasePackagingID'].apply(lambda x: dReleasePackagingName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
releaseData['release'].drop(["ReleasePackagingID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Release Status Name")
dReleaseStatusName = releaseData['release_status']['ReleaseStatusName'].to_dict()
releaseData['release']["Status"] = releaseData['release']['ReleaseStatusID'].apply(lambda x: dReleaseStatusName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
releaseData['release'].drop(["ReleaseStatusID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Release Language Name")
dLanguageName = languageData['language']['LanguageName'].to_dict()
releaseData['release']["Language"] = releaseData['release']['LanguageID'].apply(lambda x: dLanguageName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
releaseData['release'].drop(["LanguageID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Release Script Name")
dScriptName = scriptData['script']['ScriptName'].to_dict()
releaseData['release']["Script"] = releaseData['release']['ScriptID'].apply(lambda x: dScriptName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
releaseData['release'].drop(["ScriptID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Country/Release Date")
tmp = concat([releaseData['release_country'], releaseData['release_unknown_country']]).reset_index()
releaseIDDate = tmp.sort_values(by="ReleaseDate").drop_duplicates(subset="ReleaseID")
releaseIDDate.index = releaseIDDate['ReleaseID']
releaseIDDate.drop(["ReleaseID"], axis=1, inplace=True)
releaseData['release'] = releaseData['release'].join(releaseIDDate)
ts.stop()

ts = timestat("Joining Label Data")
dLabelName = labelData['label']['LabelName'].to_dict()
releaseData['release_label']["Label"] = releaseData['release_label']['LabelID'].apply(lambda x: dLabelName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
releaseLabel = releaseData['release_label'].drop_duplicates(subset="ReleaseID")[["ReleaseID", "Label"]].copy(deep=True)
releaseLabel["Label"].index = releaseLabel["ReleaseID"]
releaseData['release'].join(releaseLabel)
ts.stop()

ts = timestat("Dropping Last Columns")
releaseData['release'].drop(["ReleaseComment", "ReleaseBarcode", "NA11", "NA12"], axis=1, inplace=True)
ts.stop()


savedir = setDir(basedir, "MusicBrainzData")
savename = setFile(savedir, "{0}.p".format("ReleaseDataFrame"))
ts = timestat("Saving Master Release DataFrame To {0}".format(savename))
pickleIO().save(idata=releaseData['release'], ifile=savename)
fsize = fileInfo(savename).fsize
print("Master Release DataFrame is {0}{1}".format(fsize[0], fsize[1]))
ts.stop()


tsRelease.stop()

Current Time is Sat Dec 11, 2021 22:18 for Appending Release Data
Current Time is Sat Dec 11, 2021 22:18 for Creating Release Country DateTime For 6845429 Releases
Process [Creating Release Country DateTime For 6845429 Releases] Took 21.2 Seconds
Current Time is Sat Dec 11, 2021 22:18 for Creating Release Unknown Country DateTime For 205613 Releases
Process [Creating Release Unknown Country DateTime For 205613 Releases] Took 0.5 Seconds
Current Time is Sat Dec 11, 2021 22:18 for Joining Country Area


NameError: name 'areaData' is not defined

# Release-Group

In [None]:
colnames = {}
#colnames["release_group_alias_type"] = {0: "ReleaseGroupAliasTypeID", 1: "ReleaseGroupAliasTypeName"}
#colnames["release_group_alias"] = {0: "ReleaseGroupAliasID", 1: "NA1", 2: "ReleaseGroupAliasName", 3: "ReleaseGroupAliasLang", 7: "ReleaseGroupAliasName2"}
colnames["release_group_primary_type"] = {0: "ReleaseGroupPrimaryTypeID", 1: "ReleaseGroupPrimaryTypeName", 3: "NA3"}
colnames["release_group_secondary_type"] = {0: "ReleaseGroupSecondaryTypeID", 1: "ReleaseGroupSecondaryTypeName"}
colnames["release_group"] = {0: "ReleaseGroupID", 1: "ReleaseGroupGID", 2: "ReleaseGroupName", 3: "ArtistID", 4: "ReleaseGroupPrimaryTypeID", 5: "ReleaseGroupComment", 6: "NA6"}
colnames["release_group_secondary_type_join"] = {0: "ReleaseGroupID", 1: "ReleaseGroupSecondaryTypeID"}

ts = timestat("Loading Release Data")
files = [ifile for ifile in glob("mbdump/release*") if "group" in ifile]
releaseGroupData = getData(files, colnames)
releaseGroupData = setIndex(releaseGroupData)
ts.stop()

## Append Data And Create Master ReleaseGroup DataFrame

In [None]:
tsReleaseGroup = timestat("Appending ReleaseGroup Data")

ts = timestat("Joining Secondary Type Names")
dReleaseGroupSecondaryTypeName = releaseGroupData['release_group_secondary_type']['ReleaseGroupSecondaryTypeName'].to_dict()
releaseGroupData['release_group_secondary_type_join']['ReleaseGroupSecondaryType'] = releaseGroupData['release_group_secondary_type_join']['ReleaseGroupSecondaryTypeID'].apply(lambda x: dReleaseGroupSecondaryTypeName.get(x) if not isna(x) else None)
releaseGroupData['release_group_secondary_type_join'].drop(["ReleaseGroupSecondaryTypeID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Primary Type Names")
dReleaseGroupPrimaryTypeName = releaseGroupData['release_group_primary_type']['ReleaseGroupPrimaryTypeName'].to_dict()
releaseGroupData['release_group']['ReleaseGroupPrimaryType'] = releaseGroupData['release_group']['ReleaseGroupPrimaryTypeID'].apply(lambda x: dReleaseGroupPrimaryTypeName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
releaseGroupData['release_group'].drop(["ReleaseGroupPrimaryTypeID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Release Group And Secondary Type Join Data")
releaseGroupData['release_group'] = releaseGroupData['release_group'].join(releaseGroupData['release_group_secondary_type_join'])
ts.stop()

ts = timestat("Dropping Last Columns")
releaseGroupData['release_group'].drop(["NA6", "ReleaseGroupComment"], axis=1, inplace=True)
ts.stop()


savedir = setDir(basedir, "MusicBrainzData")
savename = setFile(savedir, "{0}.p".format("ReleaseGroupDataFrame"))
ts = timestat("Saving Master ReleaseGroup DataFrame To {0} (~20 sec)".format(savename))
from fileIO import pickleIO
pickleIO().save(idata=releaseGroupData['release_group'], ifile=savename)
fsize = fileInfo(savename).fsize
print("Master ReleaseGroup DataFrame is {0}{1}".format(fsize[0], fsize[1]))
ts.stop()


tsReleaseGroup.stop()

del releaseGroupData

# Artist

In [None]:
colnames = {}
colnames["artist_credit"] = {0: "ArtistCreditID", 1: "ArtistCreditName", 2: "ArtistCreditNum", 3: "NA3"}
colnames["artist_type"] = {0: "ArtistTypeID", 1: "ArtistTypeName", 2: "NA2", 3: "NA3", 4: "ArtistTypeDescr", 5: "ArtistTypeGID"}
colnames["artist_isni"] = {0: "ArtistID", 1: "ISNICode"}
colnames["artist_alias_type"] = {0: "ArtistAliasTypeID", 1: "ArtistAliasTypeName", 5: "ArtistAliasTypeGID"}
colnames["artist_alias"] = {0: "ArtistAliasID", 1: "ArtistID", 2: "ArtistAliasName", 3: "ArtistAliasLang", 7: "ArtistAliasSortName"}
colnames["artist"] = {0: "ArtistID", 1: "ArtistGID", 2: "ArtistName", 3: "ArtistSortName",
                      4: "FormedYear", 5: "FormedMonth", 6: "FormedDay", 
                      7: "DisbandedYear", 8: "DisbandedMonth", 9: "DisbandedDay", 
                      10: "ArtistTypeID", 11: "CountryAreaID", 12: "GenderID", 13: "ArtistDescr", 14: "NA14", 17: "FoundedInAreaID", 18: "DisbandedInAreaID"}

ts = timestat("Loading Artist Data")
files = [ifile for ifile in glob("mbdump/artist*") if ifile not in ["mbdump/artist_credit_name"]]
artistData = getData(files, colnames)
artistData = setIndex(artistData)
ts.stop()

## Append Data And Create Master Artist DataFrame

In [None]:
tsArtist = timestat("Appending Artist Data")

ts = timestat("Creating Formed/Disbanded DateTime For {0} Artists (~7 sec)".format(artistData['artist'].shape[0]))
artistData['artist']['Formed']    = convertToDatetime(artistData['artist']["FormedYear"], artistData['artist']["FormedMonth"], artistData['artist']["FormedDay"])
artistData['artist']['Disbanded'] = convertToDatetime(artistData['artist']["DisbandedYear"], artistData['artist']["DisbandedMonth"], artistData['artist']["DisbandedDay"])
artistData['artist'].drop(["FormedYear", "FormedMonth", "FormedDay", "DisbandedYear", "DisbandedMonth", "DisbandedDay"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Artist Type")
dArtistTypeName = artistData['artist_type']["ArtistTypeName"].to_dict()
artistData['artist']["ArtistType"] = artistData['artist']['ArtistTypeID'].apply(lambda x: dArtistTypeName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
artistData['artist'].drop(["ArtistTypeID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Gender Type")
artistData['artist']['Gender'] = artistData['artist']['GenderID'].apply(lambda x: dGenderName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
artistData['artist'].drop(["GenderID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining Area Type")
artistData['artist']["Country"]     = artistData['artist']['CountryAreaID'].apply(lambda x: dAreaName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
artistData['artist']["FormedIn"]    = artistData['artist']['FoundedInAreaID'].apply(lambda x: dAreaName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
artistData['artist']["DisbandedIn"] = artistData['artist']['DisbandedInAreaID'].apply(lambda x: dAreaName.get(int(x)) if (not isna(x) and x.isdigit()) else None)
artistData['artist'].drop(["CountryAreaID", "FoundedInAreaID", "DisbandedInAreaID"], axis=1, inplace=True)
ts.stop()

ts = timestat("Joining ISNI")
artistData['artist'] = artistData['artist'].join(artistData['artist_isni'])
ts.stop()

ts = timestat("Collecting and Joining Artist Aliases (~40 sec)")
artistAliases = DataFrame(Series({artistID: df["ArtistAliasName"].to_list() for artistID,df in artistData['artist_alias'].groupby("ArtistID")}))
artistAliases.columns = ["Aliases"]
artistData['artist'] = artistData['artist'].join(artistAliases)
ts.stop()


ts = timestat("Dropping Last Columns")
artistData['artist'].drop(["ArtistDescr", "NA14"], axis=1, inplace=True)
ts.stop()


savedir = setDir(basedir, "MusicBrainzData")
savename = setFile(savedir, "{0}.p".format("ArtistDataFrame"))
ts = timestat("Saving Master Artist DataFrame To {0} (~20 sec)".format(savename))
from fileIO import pickleIO
pickleIO().save(idata=artistData['artist'], ifile=savename)
#fsize = fileInfo(savename).fsize
#print("Master Artist DataFrame is {0}{1}".format(fsize[0], fsize[1]))
ts.stop()

tsArtist.stop()
#del artistData

In [None]:
artistData['artist'].head()

## Artist <=> Work

In [49]:
colnames = {}
colnames["l_artist_work"] = {0: "LookupID", 1: "WorkGroupID", 2: "ArtistID", 3: "WorkID", 6: "NA6", 7: "NA7"}

ts = timestat("Loading Artist <=> Work Data")
files = [ifile for ifile in glob("mbdump/l_artist_work")]
lookupData = getData(files, colnames)
lookupData = setIndex(lookupData)
ts.stop()

Current Time is Sat Dec 11, 2021 21:36 for Loading Artist <=> Work Data
Keys: dict_keys(['l_artist_work'])
Keys: dict_keys(['l_artist_work'])
Process [Loading Artist <=> Work Data] Took 2.5 Seconds


In [50]:
ts = timestat("Merging Artist <=> Work Lookup")
wData = workData['work'].reset_index()
lData = lookupData['l_artist_work'].reset_index()
mergedWorkData = merge(wData,lData,on='WorkID')
ts.stop()

ts = timestat("Dropping Last Columns")
mergedWorkData.drop(["WorkID", "LookupID", "NA6", "NA7"], axis=1, inplace=True)
ts.stop()

ts = timestat("Grouping By ArtistID (~31 sec)")
artistWorks = Series({artistID: list(zip(artistIDWorks["WorkGroupID"], artistIDWorks["WorkTypeName"], artistIDWorks["WorkName"])) for artistID,artistIDWorks in mergedWorkData.groupby(["ArtistID"])})
ts.stop()

savedir = setDir(basedir, "MusicBrainzData")
savename = setFile(savedir, "{0}.p".format("ArtistWorkDataFrame"))
ts = timestat("Saving Master Artist Work DataFrame To {0} (~6 sec)".format(savename))
from fileIO import pickleIO
pickleIO().save(idata=artistWorks, ifile=savename)
ts.stop()

Current Time is Sat Dec 11, 2021 21:36 for Merging Artist <=> Work Lookup
Process [Merging Artist <=> Work Lookup] Took 0.9 Seconds
Current Time is Sat Dec 11, 2021 21:36 for Dropping Last Columns
Process [Dropping Last Columns] Took 0.3 Seconds
Current Time is Sat Dec 11, 2021 21:36 for Grouping By ArtistID (~31 sec)
Process [Grouping By ArtistID (~31 sec)] Took 31.2 Seconds
Current Time is Sat Dec 11, 2021 21:37 for Saving Master Artist Work DataFrame To /Volumes/Seagate/DB/MusicBrainzData/ArtistWorkDataFrame.p (~26 sec)
Process [Saving Master Artist Work DataFrame To /Volumes/Seagate/DB/MusicBrainzData/ArtistWorkDataFrame.p (~26 sec)] Took 6.2 Seconds


## Artist <=> Recording

In [51]:
colnames = {}
colnames["l_artist_recording"] = {0: "LookupID", 1: "RecordingGroupID", 2: "ArtistID", 3: "RecordingID", 6: "NA6", 7: "NA7"}

ts = timestat("Loading Artist <=> Recording Data")
files = [ifile for ifile in glob("mbdump/l_artist_recording")]
lookupData = getData(files, colnames)
lookupData = setIndex(lookupData)
ts.stop()

Current Time is Sat Dec 11, 2021 21:50 for Loading Artist <=> Recording Data
Keys: dict_keys(['l_artist_recording'])
Keys: dict_keys(['l_artist_recording'])
Process [Loading Artist <=> Recording Data] Took 29.2 Seconds


In [60]:
ts = timestat("Merging Artist <=> Recording Lookup (~16 sec)")
rData = recordingData['recording'].reset_index().drop(["ArtistID"], axis=1)
lData = lookupData['l_artist_recording'].reset_index()
mergedRecordingData = merge(rData,lData,on='RecordingID')
ts.stop()

ts = timestat("Dropping Last Columns")
mergedRecordingData.drop(["RecordingID", "RecordingGroupID", "LookupID", "NA6", "NA7"], axis=1, inplace=True)
ts.stop()

Current Time is Sat Dec 11, 2021 22:05 for Merging Artist <=> Recording Lookup (~16 sec)
Process [Merging Artist <=> Recording Lookup (~16 sec)] Took 13.1 Seconds
Current Time is Sat Dec 11, 2021 22:05 for Dropping Last Columns
Process [Dropping Last Columns] Took 1.7 Seconds


In [64]:
ts = timestat("Grouping By ArtistID (~31 sec)")
artistRecordings = Series({artistID: list(zip(artistIDRecordings["RecordingName"], artistIDRecordings["TimeLength"])) for artistID,artistIDRecordings in mergedRecordingData.groupby(["ArtistID"])})
ts.stop()

savedir = setDir(basedir, "MusicBrainzData")
savename = setFile(savedir, "{0}.p".format("ArtistRecordingDataFrame"))
ts = timestat("Saving Master Artist Recording DataFrame To {0} (~54 sec)".format(savename))
from fileIO import pickleIO
pickleIO().save(idata=artistRecordings, ifile=savename)
ts.stop()

Current Time is Sat Dec 11, 2021 22:07 for Grouping By ArtistID (~31 sec)
Process [Grouping By ArtistID (~31 sec)] Took 58.1 Seconds
Current Time is Sat Dec 11, 2021 22:08 for Saving Master Artist Recording DataFrame To /Volumes/Seagate/DB/MusicBrainzData/ArtistRecordingDataFrame.p (~6 sec)
Process [Saving Master Artist Recording DataFrame To /Volumes/Seagate/DB/MusicBrainzData/ArtistRecordingDataFrame.p (~6 sec)] Took 54.1 Seconds


## Artist <=> Release

In [65]:
colnames = {}
colnames["l_artist_release"] = {0: "LookupID", 1: "ReleaseGroupID", 2: "ArtistID", 3: "ReleaseID", 6: "NA6", 7: "NA7"}

ts = timestat("Loading Artist <=> Release Data")
files = [ifile for ifile in glob("mbdump/l_artist_release")]
lookupData = getData(files, colnames)
lookupData = setIndex(lookupData)
ts.stop()

Current Time is Sat Dec 11, 2021 22:14 for Loading Artist <=> Release Data
Keys: dict_keys(['l_artist_release'])
Keys: dict_keys(['l_artist_release'])
Process [Loading Artist <=> Release Data] Took 0.9 Seconds


In [81]:
from fileIO import fileIO
savedir = setDir(basedir, "MusicBrainzData")
savename = setFile(savedir, "{0}.p".format("ReleaseDataFrame"))
releaseData = fileIO().get(savename)

In [82]:
#lookupData['l_artist_release']
#releaseData

Unnamed: 0_level_0,ReleaseGID,ReleaseName,ArtistID,ReleaseGroupID,Packaging,Status,Language,Script,ReleaseDate,Country
ReleaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,60,896742,,Official,English,Latn,2002-10-14,Germany
10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,60,896742,,Official,English,Latn,2002-10-14,United Kingdom
12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,60,104189,,Official,English,Latn,NaT,Germany
26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,20211,94299,,Official,English,Latn,1998-04-01,Japan
49,bc9afecd-685e-482c-9412-27f2d267b1dd,Out Spaced,20211,94125,,Official,[Multiple languages],Latn,1998-11-23,United Kingdom
...,...,...,...,...,...,...,...,...,...,...
1293174,c5f1344a-00a6-4e92-94b6-457f0c3935b9,Hope,3002,197236,Super Jewel Box,Official,English,Latn,2008-05-21,United States
3206280,1ea28fc2-37f4-48ca-a1ce-33f1c5764be5,Book I: The End of Ancient Lands,3116008,2783720,,Official,,,NaT,
3206281,2f923f68-d213-4f30-8a18-6b4796f39c61,Book I: The End of Ancient Lands,3116008,2783720,,Official,,,NaT,
3206282,37d5c7c4-96fd-4245-9830-cf16a5bd3cbb,Book I: The End of Ancient Lands,3116008,2783720,,Official,,,NaT,


## Artist <=> URL

In [None]:
colnames = {}
colnames["l_artist_url"] = {0: "LookupID", 1: "URLGroupID", 2: "ArtistID", 3: "URLID", 6: "NA6", 7: "NA7"}

ts = timestat("Loading Artist <=> URL Data")
files = [ifile for ifile in glob("mbdump/l_artist_url")]
lookupData = getData(files, colnames)
lookupData = setIndex(lookupData)
ts.stop()

In [None]:
ts = timestat("Joining URLs")
lookupData['l_artist_url']["URL"] = lookupData['l_artist_url']['URLID'].apply(lambda x: urlData['url']["URLName"].get(x))
ts.stop()

urlType={'26038': 'Discogs',
 '26039': 'Myspace',
 '26040': 'IMDB',
 '26041': 'Wikipedia',
 '26042': 'Artist',
 '26048': 'LastFMMisc',
 '26052': 'Apple',
 '26055': 'YouTube',
 '26056': 'Facebook',
 '26062': 'GeniusMisc',
 '26068': 'VGMDB',
 '26316': 'DeezerSpotify',
 '28613': 'AllMusic',
 '30134': 'Soundcloud',
 '41329': 'Video',
 '49052': 'RateYourMusicMisc',
 '94979': 'SecondhandSongs',
 '106477': 'VIAG',
 '117675': 'Wikidata',
 '139284': 'Bandcamp',
 '195003': 'IMSLP',
 '199852': 'Songkick',
 '204138': 'Setlist.fm',
 '215573': 'Last.fm',
 '240791': 'BandsInTown',
 '624633': 'AppleTidalNapster',
 '697028': 'PureVolume',
 '732275': 'CDBaby',
 '748510': 'GooglePlus',
 '753046': 'GooglePlay',
 '771457': 'BBC'}

def getURLGroupName(url):
    if "discogs." in url:
        return "Discogs"
    elif "myspace." in url:
        return "Myspace"
    elif "imdb." in url:
        return "IMDB"
    elif "youtube." in url:
        return "YouTube"
    elif "allmusic." in url:
        return "AllMusic"
    elif "last.fm" in url:
        return "LastFM"
    elif "soundcloud." in url:
        return "Soundcloud"
    elif "directlyrics." in url:
        return "DirectLyrics"
    elif "facebook." in url:
        return "Facebook"
    elif "tumblr." in url:
        return "Tumblr"
    elif "viaf." in url:
        return "VIAF"
    elif "wikidata." in url:
        return "Wikidata"
    elif "rateyourmusic." in url:
        return "RateYourMusic"
    elif "muzikum." in url:
        return "Muzikum"
    elif "spotify." in url:
        return "Spotify"
    elif "archive." in url:
        return "Archive"
    elif "play.google." in url:
        return "GooglePlay"
    elif "genius." in url:
        return "Genius"
    elif "musicmoz." in url:
        return "Musicmoz"
    elif "imvdb." in url:
        return "IMVBD"
    elif "musik-sammler." in url:
        return "MusikSammler"
    elif "whosampled." in url:
        return "WhoSampled"
    elif "setlist." in url:
        return "SetListFM"
    elif "secondhandsongs." in url:
        return "SecondhandSongs"
    elif "apple." in url:
        return "Apple"
    elif "deezer." in url:
        return "Deezer"
    elif "twitter." in url:
        return "Twitter"
    elif "songkick." in url:
        return "Songkick"
    elif "instagram." in url:
        return "Instagram"
    elif "tidal." in url:
        return "Tidal"
    elif "bbc." in url:
        return "BBC"
    elif "musixmatch." in url:
        return "MusixMatch"
    elif "napster." in url:
        return "Napster"
    elif "junodownload." in url:
        return "JunoDownload"
    elif "beatport." in url:
        return "Beatport"
    elif "bandsintown." in url:
        return "BandsInTown"
    elif "bandcamp." in url:
        return "BandCamp"
    else:
        return "Misc"

def getURLType(urlGroupID):
    urlGroupName = urlType.get(str(urlGroupID),"Misc")
    return urlGroupName

ts = timestat("Getting URL Group Name")
lookupData['l_artist_url']['URLGroupName'] = lookupData['l_artist_url']['URL'].apply(getURLGroupName)
ts.stop()

ts = timestat("Dropping Last Columns")
lookupData['l_artist_url'].drop(["NA6", "NA7"], axis=1, inplace=True)
ts.stop()

ts = timestat("Grouping By ArtistID (~2 min)")
artistURLs = Series({artistID: list(zip(artistIDURLs["URLGroupName"], artistIDURLs["URL"])) for artistID,artistIDURLs in lookupData['l_artist_url'].groupby(["ArtistID"])})
ts.stop()

savedir = setDir(basedir, "MusicBrainzData")
savename = setFile(savedir, "{0}.p".format("ArtistURLDataFrame"))
ts = timestat("Saving Master Artist URL DataFrame To {0} (~11 sec)".format(savename))
from fileIO import pickleIO
pickleIO().save(idata=artistURLs, ifile=savename)
ts.stop()

# Master Artist Summary Data

In [40]:
masterArtistRecordingData[2229589]

[('漸漸', '278000'), ('RUN', '251000'), ('龍舌蘭', '285000'), ('我們萬歲', '236000')]

In [41]:
str("龍舌蘭")

'龍舌蘭'

In [38]:
masterArtistData[masterArtistData["ArtistGID"] == "c4596d2d-de50-4388-aca3-f9abb313918d"]

Unnamed: 0,ArtistGID,ArtistName,ArtistSortName,Formed,Disbanded,ArtistType,Gender,Country,FormedIn,DisbandedIn,ISNICode,Aliases,MyArtistID
2229589,c4596d2d-de50-4388-aca3-f9abb313918d,米巍,"Mi, Wei",NaT,NaT,Person,,China,,,,,251108434349887660386335524263902329399


In [6]:
from fsUtils import dirUtil

savedir = dirUtil(basedir).join("MusicBrainzData")
io = fileIO()
ts = timestat("Loading Master Data (~3 sec)")
masterArtistData          = io.get(setFile(savedir, "{0}.p".format("ArtistDataFrame")))
masterReleaseGroupData    = io.get(setFile(savedir, "{0}.p".format("ReleaseGroupDataFrame")))
masterArtistRecordingData = io.get(setFile(savedir, "{0}.p".format("ArtistRecordingDataFrame")))
masterArtistWorkData      = io.get(setFile(savedir, "{0}.p".format("ArtistWorkDataFrame")))
masterArtistURLData       = io.get(setFile(savedir, "{0}.p".format("ArtistURLDataFrame")))
ts.stop()

Current Time is Mon Dec 13, 2021 12:27 for Loading Master Data (~35 sec)
Process [Loading Master Data (~35 sec)] Took 29.8 Seconds


In [7]:
Nmod = 100

In [9]:
from dbArtistsID import artistIDMusicBrainz
mbID = artistIDMusicBrainz()
ts = timestat("Setting My ArtistID From MusicBrainz GID (~12 sec)")
masterArtistData["MyArtistID"] = masterArtistData['ArtistGID'].apply(mbID.getArtistID)
ts.stop()

Current Time is Mon Dec 13, 2021 12:32 for Setting My ArtistID From MusicBrainz GID
Process [Setting My ArtistID From MusicBrainz GID] Took 12.2 Seconds


In [10]:
def createReleaseGroupKey(x):
    key = None
    primary   = x['ReleaseGroupPrimaryType']
    secondary = x['ReleaseGroupSecondaryType']
    if isinstance(primary, str) and isinstance(secondary, str):
        key = " + ".join([primary, secondary])
    elif isinstance(primary, str):
        key = primary
    elif isinstance(secondary, str):
        key = secondary
    else:
        key = "Unknown"
    return key
    
ts = timestat("Setting ReleaseGroup Key (~1/2 min)")
masterReleaseGroupData["ReleaseGroupKey"] = masterReleaseGroupData.apply(createReleaseGroupKey, axis=1)
ts.stop()

Current Time is Mon Dec 13, 2021 12:33 for Setting ReleaseGroup Key (~1 min)
Process [Setting ReleaseGroup Key (~1 min)] Took 23.2 Seconds


In [54]:
from artistDBBase import artistDBBase, artistDBDataClass
from artistDBBase import artistDBNameClass, artistDBMetaClass, artistDBIDClass, artistDBURLClass, artistDBPageClass
from artistDBBase import artistDBProfileClass, artistDBMediaClass, artistDBMediaAlbumClass
from artistDBBase import artistDBMediaDataClass, artistDBMediaCountsClass, artistDBFileInfoClass
from artistDBBase import artistDBTextClass, artistDBLinkClass
from strUtils import fixName
from dbUtils import utilsDiscogs
from hashlib import md5

def getMediaCounts(media):
    amcc = artistDBMediaCountsClass()

    credittype = "Releases"
    if amcc.counts.get(credittype) == None:
        amcc.counts[credittype] = {}
    for creditsubtype in media.media.keys():
        amcc.counts[credittype][creditsubtype] = int(len(media.media[creditsubtype]))

    return amcc

savedir = setDir(basedir, "MusicBrainzMetadata")
tsAll = timestat("Creating DB Data")
Nmod = 100
for n,modVal in enumerate(range(Nmod)):
    ts = timestat("Creating ModData Subset")
    artistModData = masterArtistData[masterArtistData["MyArtistID"].apply(lambda x: int(x)%Nmod) == modVal]
    releaseGroupModData = masterReleaseGroupData[masterReleaseGroupData["ArtistID"].isin(artistModData.index)]
    ts.stop()

    modValData = {}
    N = artistModData.shape[0]
    tsMod = timestat("Creating DB Data From {0} Artists For ModVal={1}".format(N,modVal))
    for i,(artistID,artistData) in enumerate(artistModData.iterrows()):
        artistName  = str(artistData["ArtistName"])
        artistGID   = artistData['ArtistGID']
        artistURL   = "https://musicbrainz.org/artist/{0}".format(artistGID)
        myID        = artistData["MyArtistID"]
        #if artistGID != "070d193a-845c-479f-980e-bef15710653e":
        #    continue
        #if myID != '251108434349887660386335524263902329399':
        #    continue

        generalData = {}
        generalData["SortName"]   = artistData["ArtistSortName"]
        generalData["Aliases"]    = artistData["Aliases"]
        generalData["Gender"]     = artistData["Gender"]
        generalData["County"]     = artistData["Country"]
        generalData["Formed"]     = artistData["Formed"]
        generalData["Disbanded"]  = artistData["Disbanded"]
        generalData["ArtistType"] = artistData["ArtistType"]
        generalData["ISNI"]       = artistData["ISNICode"]
        generalData = {k: v for k,v in generalData.items() if v is not None}
        generalData = generalData if len(generalData) > 0 else None

        
        ########################################################################
        # Get URLs
        ########################################################################
        externalData = {}
        artistURLs = masterArtistURLData.get(artistID, [])
        for (urlType,url) in artistURLs:
            adblink      = artistDBLinkClass(None)
            adblink.href = url
            adblink.err  = None
            if externalData.get(urlType) is None:
                externalData[urlType] = []
            externalData[urlType].append(adblink)
        externalData = externalData if len(externalData) > 0 else None
        
            
        
        ########################################################################
        # Get Release Groups
        ########################################################################
        artistReleaseGroupData = releaseGroupModData[releaseGroupModData["ArtistID"] == artistID]
        mediaData = {}
        for mediaName,mediaNameData in artistReleaseGroupData.groupby("ReleaseGroupKey"):
            mediaData[mediaName] = []
            for code, releaseGroupInfo in mediaNameData.iterrows():
                album        = releaseGroupInfo['ReleaseGroupName']
                albumURL     = "https://musicbrainz.org/releasegroup/{0}".format(releaseGroupInfo['ReleaseGroupGID'])
                albumArtists = [artistName]
            
                amdc = artistDBMediaDataClass(album=album, url=albumURL, artist=albumArtists, code=code, year=None)
                mediaData[mediaName].append(amdc)
                
            
        ########################################################################
        # Get Works
        ########################################################################
        artistWorks = masterArtistWorkData.get(artistID)  
        if artistWorks:
            for workID,workType,workName in artistWorks:
                mediaName = "OtherWork" if workType is None else workType
                if mediaData.get(mediaName) is None:
                    mediaData[mediaName] = []
                m = md5()
                codes = {}
                m.update(str(workID).encode('utf-8'))
                m.update(str(mediaName).encode('utf-8'))
                m.update(str(workName).encode('utf-8'))
                hashval = m.hexdigest()
                code    = str(int(hashval, 16) % int(1e6))
                if codes.get(code) is not None:
                    continue
                codes[code] = True

                amdc = artistDBMediaDataClass(album=str(workName), url=None, artist=None, code=code, year=None)
                mediaData[mediaName].append(amdc)
                
            
        ########################################################################
        # Get Recordings
        ########################################################################
        artistRecordings = masterArtistRecordingData.get(artistID)        
        artistRecordings = Series(artistRecordings).drop_duplicates()
        if len(artistRecordings) > 0:
            mediaName = "Recordings"
            if mediaData.get(mediaName) is None:
                mediaData[mediaName] = []
            codes = {}
            for idx,(recName,recTime) in artistRecordings.iteritems():
                m = md5()
                m.update(str(recName).encode('utf-8'))
                m.update(str(recTime).encode('utf-8'))
                hashval = m.hexdigest()
                code    = str(int(hashval, 16) % int(1e6))
                if codes.get(code) is not None:
                    continue
                codes[code] = True
                
                amdc = artistDBMediaDataClass(album=str(recName), url=None, artist=None, code=code, year=None)
                mediaData[mediaName].append(amdc)

        
        artist      = artistDBNameClass(name=artistName, err=None)
        meta        = artistDBMetaClass(title=None, url=artistURL)
        url         = artistDBURLClass(url=artistURL)
        ID          = artistDBIDClass(ID=myID)
        pages       = artistDBPageClass(ppp=1, tot=1, redo=False, more=False)
        profile     = artistDBProfileClass(general=generalData, external=externalData)
        media       = artistDBMediaClass()
        media.media = mediaData
        mediaCounts = getMediaCounts(media)
        info        = artistDBFileInfoClass(info=None)
        
        modValData[myID] = artistDBDataClass(artist=artist, meta=meta, url=url, ID=ID, pages=pages, profile=profile, mediaCounts=mediaCounts, media=media, info=info)
        if (i+1) % 7500 == 0 or (i+1) == 2500:
            tsMod.update(n=i+1, N=N)
    tsMod.stop()
            
    outdir = setDir(basedir, "MusicBrainzDBData")
    io.save(idata=Series(modValData), ifile=setFile(outdir, "{0}-{1}.p".format(modVal, "DB")))
    tsAll.update(n=n, N=Nmod)
    print("\n")
tsAll.stop()

Current Time is Wed Dec 15, 2021 12:13 for Creating DB Data
Current Time is Wed Dec 15, 2021 12:13 for Creating ModData Subset
Process [Creating ModData Subset] Took 1.1 Seconds
Current Time is Wed Dec 15, 2021 12:13 for Creating DB Data From 18813 Artists For ModVal=0
2500/18813 : Process [Creating DB Data From 18813 Artists For ModVal=0] Has Run For 3.6 Seconds.  ETA is 23.5 Seconds
7500/18813 : Process [Creating DB Data From 18813 Artists For ModVal=0] Has Run For 9.7 Seconds.  ETA is 14.6 Seconds
15000/18813 : Process [Creating DB Data From 18813 Artists For ModVal=0] Has Run For 17.8 Seconds.  ETA is 4.5 Seconds
Process [Creating DB Data From 18813 Artists For ModVal=0] Took 21.8 Seconds
0/100      : Process [Creating DB Data] Has Run For 25.6 Seconds.  ETA is ? Seconds


Current Time is Wed Dec 15, 2021 12:14 for Creating ModData Subset
Process [Creating ModData Subset] Took 1.1 Seconds
Current Time is Wed Dec 15, 2021 12:14 for Creating DB Data From 18789 Artists For ModVal=1
25

2500/18920 : Process [Creating DB Data From 18920 Artists For ModVal=11] Has Run For 3.7 Seconds.  ETA is 24.3 Seconds
7500/18920 : Process [Creating DB Data From 18920 Artists For ModVal=11] Has Run For 9.1 Seconds.  ETA is 13.9 Seconds
15000/18920 : Process [Creating DB Data From 18920 Artists For ModVal=11] Has Run For 17.9 Seconds.  ETA is 4.7 Seconds
Process [Creating DB Data From 18920 Artists For ModVal=11] Took 22.0 Seconds
11/100     : Process [Creating DB Data] Has Run For 5.1 Minutes.  ETA is 41.3 Minutes


Current Time is Wed Dec 15, 2021 12:18 for Creating ModData Subset
Process [Creating ModData Subset] Took 1.1 Seconds
Current Time is Wed Dec 15, 2021 12:18 for Creating DB Data From 18954 Artists For ModVal=12
2500/18954 : Process [Creating DB Data From 18954 Artists For ModVal=12] Has Run For 3.6 Seconds.  ETA is 23.7 Seconds
7500/18954 : Process [Creating DB Data From 18954 Artists For ModVal=12] Has Run For 9.3 Seconds.  ETA is 14.2 Seconds
15000/18954 : Process [Crea

7500/18856 : Process [Creating DB Data From 18856 Artists For ModVal=22] Has Run For 9.2 Seconds.  ETA is 13.9 Seconds
15000/18856 : Process [Creating DB Data From 18856 Artists For ModVal=22] Has Run For 17.1 Seconds.  ETA is 4.4 Seconds
Process [Creating DB Data From 18856 Artists For ModVal=22] Took 20.7 Seconds
22/100     : Process [Creating DB Data] Has Run For 10.3 Minutes.  ETA is 36.5 Minutes


Current Time is Wed Dec 15, 2021 12:23 for Creating ModData Subset
Process [Creating ModData Subset] Took 1.1 Seconds
Current Time is Wed Dec 15, 2021 12:23 for Creating DB Data From 18833 Artists For ModVal=23
2500/18833 : Process [Creating DB Data From 18833 Artists For ModVal=23] Has Run For 4.8 Seconds.  ETA is 31.4 Seconds
7500/18833 : Process [Creating DB Data From 18833 Artists For ModVal=23] Has Run For 11.2 Seconds.  ETA is 16.9 Seconds
15000/18833 : Process [Creating DB Data From 18833 Artists For ModVal=23] Has Run For 19.2 Seconds.  ETA is 4.9 Seconds
Process [Creating DB Dat

15000/18914 : Process [Creating DB Data From 18914 Artists For ModVal=33] Has Run For 21.0 Seconds.  ETA is 5.5 Seconds
Process [Creating DB Data From 18914 Artists For ModVal=33] Took 26.7 Seconds
33/100     : Process [Creating DB Data] Has Run For 15.5 Minutes.  ETA is 31.5 Minutes


Current Time is Wed Dec 15, 2021 12:29 for Creating ModData Subset
Process [Creating ModData Subset] Took 1.1 Seconds
Current Time is Wed Dec 15, 2021 12:29 for Creating DB Data From 18971 Artists For ModVal=34
2500/18971 : Process [Creating DB Data From 18971 Artists For ModVal=34] Has Run For 4.7 Seconds.  ETA is 31.0 Seconds
7500/18971 : Process [Creating DB Data From 18971 Artists For ModVal=34] Has Run For 10.2 Seconds.  ETA is 15.6 Seconds
15000/18971 : Process [Creating DB Data From 18971 Artists For ModVal=34] Has Run For 18.3 Seconds.  ETA is 4.8 Seconds
Process [Creating DB Data From 18971 Artists For ModVal=34] Took 22.2 Seconds
34/100     : Process [Creating DB Data] Has Run For 15.9 Minutes.

Process [Creating DB Data From 19113 Artists For ModVal=44] Took 20.4 Seconds
44/100     : Process [Creating DB Data] Has Run For 20.4 Minutes.  ETA is 26.0 Minutes


Current Time is Wed Dec 15, 2021 12:34 for Creating ModData Subset
Process [Creating ModData Subset] Took 1.1 Seconds
Current Time is Wed Dec 15, 2021 12:34 for Creating DB Data From 18919 Artists For ModVal=45
2500/18919 : Process [Creating DB Data From 18919 Artists For ModVal=45] Has Run For 6.0 Seconds.  ETA is 39.4 Seconds
7500/18919 : Process [Creating DB Data From 18919 Artists For ModVal=45] Has Run For 11.7 Seconds.  ETA is 17.8 Seconds
15000/18919 : Process [Creating DB Data From 18919 Artists For ModVal=45] Has Run For 21.2 Seconds.  ETA is 5.5 Seconds
Process [Creating DB Data From 18919 Artists For ModVal=45] Took 25.0 Seconds
45/100     : Process [Creating DB Data] Has Run For 20.9 Minutes.  ETA is 25.5 Minutes


Current Time is Wed Dec 15, 2021 12:34 for Creating ModData Subset
Process [Creating ModData Sub

Process [Creating ModData Subset] Took 1.5 Seconds
Current Time is Wed Dec 15, 2021 12:39 for Creating DB Data From 18923 Artists For ModVal=56
2500/18923 : Process [Creating DB Data From 18923 Artists For ModVal=56] Has Run For 3.7 Seconds.  ETA is 24.3 Seconds
7500/18923 : Process [Creating DB Data From 18923 Artists For ModVal=56] Has Run For 10.2 Seconds.  ETA is 15.5 Seconds
15000/18923 : Process [Creating DB Data From 18923 Artists For ModVal=56] Has Run For 20.5 Seconds.  ETA is 5.4 Seconds
Process [Creating DB Data From 18923 Artists For ModVal=56] Took 24.6 Seconds
56/100     : Process [Creating DB Data] Has Run For 26.5 Minutes.  ETA is 20.8 Minutes


Current Time is Wed Dec 15, 2021 12:40 for Creating ModData Subset
Process [Creating ModData Subset] Took 1.4 Seconds
Current Time is Wed Dec 15, 2021 12:40 for Creating DB Data From 18815 Artists For ModVal=57
2500/18815 : Process [Creating DB Data From 18815 Artists For ModVal=57] Has Run For 5.2 Seconds.  ETA is 33.9 Seconds


2500/18770 : Process [Creating DB Data From 18770 Artists For ModVal=67] Has Run For 4.1 Seconds.  ETA is 26.7 Seconds
7500/18770 : Process [Creating DB Data From 18770 Artists For ModVal=67] Has Run For 9.1 Seconds.  ETA is 13.7 Seconds
15000/18770 : Process [Creating DB Data From 18770 Artists For ModVal=67] Has Run For 16.4 Seconds.  ETA is 4.1 Seconds
Process [Creating DB Data From 18770 Artists For ModVal=67] Took 20.0 Seconds
67/100     : Process [Creating DB Data] Has Run For 31.2 Minutes.  ETA is 15.4 Minutes


Current Time is Wed Dec 15, 2021 12:44 for Creating ModData Subset
Process [Creating ModData Subset] Took 1.1 Seconds
Current Time is Wed Dec 15, 2021 12:44 for Creating DB Data From 19052 Artists For ModVal=68
2500/19052 : Process [Creating DB Data From 19052 Artists For ModVal=68] Has Run For 4.1 Seconds.  ETA is 27.1 Seconds
7500/19052 : Process [Creating DB Data From 19052 Artists For ModVal=68] Has Run For 9.3 Seconds.  ETA is 14.3 Seconds
15000/19052 : Process [Cre

7500/18812 : Process [Creating DB Data From 18812 Artists For ModVal=78] Has Run For 9.3 Seconds.  ETA is 14.0 Seconds
15000/18812 : Process [Creating DB Data From 18812 Artists For ModVal=78] Has Run For 16.5 Seconds.  ETA is 4.2 Seconds
Process [Creating DB Data From 18812 Artists For ModVal=78] Took 20.0 Seconds
78/100     : Process [Creating DB Data] Has Run For 35.8 Minutes.  ETA is 10.1 Minutes


Current Time is Wed Dec 15, 2021 12:49 for Creating ModData Subset
Process [Creating ModData Subset] Took 1.1 Seconds
Current Time is Wed Dec 15, 2021 12:49 for Creating DB Data From 18940 Artists For ModVal=79
2500/18940 : Process [Creating DB Data From 18940 Artists For ModVal=79] Has Run For 4.0 Seconds.  ETA is 26.3 Seconds
7500/18940 : Process [Creating DB Data From 18940 Artists For ModVal=79] Has Run For 9.1 Seconds.  ETA is 13.9 Seconds
15000/18940 : Process [Creating DB Data From 18940 Artists For ModVal=79] Has Run For 16.3 Seconds.  ETA is 4.3 Seconds
Process [Creating DB Data

15000/18844 : Process [Creating DB Data From 18844 Artists For ModVal=89] Has Run For 16.2 Seconds.  ETA is 4.2 Seconds
Process [Creating DB Data From 18844 Artists For ModVal=89] Took 19.8 Seconds
89/100     : Process [Creating DB Data] Has Run For 40.1 Minutes.  ETA is 5.0 Minutes


Current Time is Wed Dec 15, 2021 12:53 for Creating ModData Subset
Process [Creating ModData Subset] Took 1.0 Seconds
Current Time is Wed Dec 15, 2021 12:53 for Creating DB Data From 18944 Artists For ModVal=90
2500/18944 : Process [Creating DB Data From 18944 Artists For ModVal=90] Has Run For 3.2 Seconds.  ETA is 21.0 Seconds
7500/18944 : Process [Creating DB Data From 18944 Artists For ModVal=90] Has Run For 9.0 Seconds.  ETA is 13.7 Seconds
15000/18944 : Process [Creating DB Data From 18944 Artists For ModVal=90] Has Run For 16.2 Seconds.  ETA is 4.3 Seconds
Process [Creating DB Data From 18944 Artists For ModVal=90] Took 19.9 Seconds
90/100     : Process [Creating DB Data] Has Run For 40.5 Minutes.  

In [47]:
modValData['108541848016828757278131944962756872900'].show()

Artist Data Class
-------------------------
Artist:  Prince
Meta:    None
         https://musicbrainz.org/artist/070d193a-845c-479f-980e-bef15710653e
Info:    None
         None
         2021-12-15 12:06:52.455003
URL:     https://musicbrainz.org/artist/070d193a-845c-479f-980e-bef15710653e
ID:      108541848016828757278131944962756872900
Profile: {'general': {'SortName': 'Prince', 'Aliases': ['O(+>)', 'Formerly Prince', 'Prince', '(us 1) PRINCE', 'Joey Coco', '♀♂', 'The Artist Formerly Known as Prince', 'The Love Symbol', 'Symbol', 'O(+>', 'Prince Rogers Nelson', 'T.A.F.K.A.P.', 'Ƭ̵̬̊', 'Alexander Nevermind', 'Paisley Park'], 'Gender': 'Male', 'County': 'United States', 'Formed': Timestamp('1958-06-07 00:00:00'), 'Disbanded': Timestamp('2016-04-21 00:00:00'), 'ArtistType': 'Person', 'ISNI': '0000000120964892'}, 'genres': None, 'tags': None, 'external': {'BBC': [<artistDBBase.artistDBLinkClass object at 0x7fa3c1b91970>], 'RateYourMusic': [<artistDBBase.artistDBLinkClass object at 0x7fa

In [43]:
modValData['251108434349887660386335524263902329399'].show()

Artist Data Class
-------------------------
Artist:  米巍
Meta:    None
         https://musicbrainz.org/artist/c4596d2d-de50-4388-aca3-f9abb313918d
Info:    None
         None
         2021-12-15 12:03:26.427993
URL:     https://musicbrainz.org/artist/c4596d2d-de50-4388-aca3-f9abb313918d
ID:      251108434349887660386335524263902329399
Profile: {'general': {'SortName': 'Mi, Wei', 'Aliases': nan, 'County': 'China', 'Formed': NaT, 'Disbanded': NaT, 'ArtistType': 'Person', 'ISNI': nan}, 'genres': None, 'tags': None, 'external': {'Discogs': [<artistDBBase.artistDBLinkClass object at 0x7fa3c1294d30>]}, 'extra': None, 'err': None}
Pages:   {'ppp': 1, 'tot': 1, 'pages': 1, 'err': None, 'more': False, 'redo': False}
Media:   {'counts': {'Releases': {'Recordings': 4}}, 'err': None}
   Recordings
      0
      1
      2
      3


# Merge With Known DB

In [None]:
ts = timestat("Merging DBs")
for n,modVal in enumerate(range(100)):
    newDB = Series(io.get("/Volumes/Seagate/DB/MusicBrainzDBData/{0}-DB.p".format(modVal)))
    known = io.get("/Users/tgadfort/dbdiscogs/artists-musicbrainz-db/{0}-DB.p".format(modVal))
    
    toMerge = newDB[~newDB.index.isin(known.index)]
    fullDB = concat([known,toMerge]).sort_index()
    io.save(idata=fullDB, ifile="/Users/tgadfort/dbdiscogs/artists-musicbrainz-db/full/{0}-DB.p".format(modVal))
    ts.update(n=n+1,N=100)
ts.stop()

In [None]:
known.shape

In [None]:
tmp['172552485256597266680385033568580864600'].show()

In [None]:
masterArtistData = artistData['artist'][["ArtistGID", "ArtistName", "ArtistSortName", "Formed", "Disbanded"]].copy(deep=True)
masterArtistNumAlbums = artistIDNumReleaseGroups.join(artistIDNumRelease, how='outer')
masterArtistData = masterArtistData.join(masterArtistNumAlbums)
masterArtistData["NumReleaseGroups"] = masterArtistData["NumReleaseGroups"].fillna(0).apply(int)
masterArtistData["NumReleases"] = masterArtistData["NumReleases"].fillna(0).apply(int)

In [None]:
masterArtistData

In [None]:
masterartistNumAlbums

In [None]:
artistIDNumReleaseGroups.shape

In [None]:
artistIDNumRelease.shape

In [None]:
artistData['artist'][artistData['artist']['ArtistID'] == aIDs["ArianaGrande"]]

In [None]:
artistData['artist'][artistData['artist']['ArtistID'] == aIDs["BuddyHolly"]]

In [None]:
artistData['artist'][artistData['artist']['ArtistID'] == aIDs["Bono"]]

In [None]:
artistData['artist'][artistData['artist']['ArtistID'] == aIDs["Rupaul"]]

In [None]:
artistData['artist'][artistData['artist']['ArtistID'] == aIDs["U2"]]

In [None]:
artistData['artist'][artistData['artist']['ArtistID'] == aIDs["DMB"]]

In [None]:
artistData['artist']["NA18"].value_counts()

In [None]:
artistData['artist']["NA9"].value_counts()

In [None]:
artistData['artist']['NA12'].unique()

In [None]:
artistData['artist']['NA5'].unique()

In [None]:
artistData['artist'][artistData['artist']['ArtistGID'] == '7f347782-eb14-40c3-98e2-17b6e1bfe56c']

In [None]:
artistData["artist"][artistData["artist"]["ArtistID"] == 502]

In [None]:
artistData["artist"]['NA10'].unique()

In [None]:
artistData["artist"][artistData["artist"]["ArtistID"] == 197]

# Artist Lookup

In [None]:
colnames["l_artist_url"]={0: "ArtistURLLID", 1: "URLGroupID", 2: "ArtistID", 3: "URLID"}
colnames["l_artist_release_group"]={0: "ArtistReleaseGroupLID", 1: "ReleaseGroupGroupID", 2: "ArtistID", 3: "ReleaseGroupID"}
colnames["l_artist_release"]={0: "ArtistReleaseLID", 1: "ReleaseGroupID", 2: "ArtistID", 3: "ReleaseID"}

ts = timestat("Loading Artist Data")
files = glob("mbdump/l_artist_*")
lookupData = {fileInfo(ifile).basename: loadData(ifile) for ifile in files}
lookupData = {key: val[list(colnames[key].keys())].rename(columns=colnames[key]) for key,val in lookupData.items() if key in colnames} if lookupData is not None else lookupData
print("Keys: {0}".format(lookupData.keys()))
ts.stop()

In [None]:
files = glob("mbdump/l_artist_release")
lookupData = {fileInfo(ifile).basename: loadData(ifile) for ifile in files}

In [None]:
lookupData['l_artist_release']["ReleaseGroupID"].nunique()

In [None]:
lookupData['l_artist_release'][lookupData['l_artist_release']['ArtistID'] == 502]

In [None]:
key='l_artist_url'
lookupData['l_artist_url'] = lookupData['l_artist_url'][list(colnames[key].keys())].rename(columns=colnames[key])

In [None]:
print(urlData['url'][urlData['url']["URLName"].eq("https://www.discogs.com/artist/6520")])
print(urlData['url'][urlData['url']["URLName"].eq("https://www.allmusic.com/artist/mn0000219203")])

In [None]:
lookupData["l_artist_url"][lookupData["l_artist_url"]["URLID"].isin([3017,993955])]

In [None]:
from pandas import merge
dmbAU = lookupData["l_artist_url"][lookupData["l_artist_url"]['ArtistID'] == 502].copy(deep=True)
u2AU  = lookupData["l_artist_url"][lookupData["l_artist_url"]['ArtistID'] == 197].copy(deep=True)

In [None]:
dmbURLs = merge(dmbAU, urlData['url'], how='left', on=["URLID"]).copy(deep=True)
u2URLs  = merge(u2AU, urlData['url'], how='left', on=["URLID"]).copy(deep=True)

In [None]:
dmbURLs["URLDomain"] = dmbURLs["URLName"].apply(lambda x: x.replace("https://", "").replace("http://", "").split('/')[0])

In [None]:
u2URLs["URLDomain"] = u2URLs["URLName"].apply(lambda x: x.replace("https://", "").replace("http://", "").split('/')[0])

In [None]:
u2URLs[["NA1", "URLDomain"]].sort_values(by="NA1").T

In [None]:
dmbURLs[["NA1", "URLDomain"]].sort_values(by="NA1").T

In [None]:
artistData["artist"][artistData["artist"].eq(8723).any(1)]

In [None]:
urlData['url'].shape

In [None]:
lookupData["l_artist_url"][lookupData["l_artist_url"].eq(1025971).any(1)]

In [None]:
lookupData["l_artist_url"][lookupData["l_artist_url"].eq(2625).any(1)]

In [None]:
lookupData["l_artist_url"][lookupData["l_artist_url"][2].eq(502).any(1)]

In [None]:
DMB={AllMusic = 1025971 (c94225e3-2f0c-4c6d-9115-9f268fb7c31b), Discogs = 2625 (7a157b6e-d01d-4248-9995-edb05652c5b2)}

In [None]:
artistData['artist']

In [None]:
colnames = {0: "ArtistID", 1: "NA1", 2: "NA2": 3: "NA3"}
lookupData["l_artist_artist"][lookupData["l_artist_artist"].eq(502).any(1)]

In [None]:
urlData['url'][urlData['url']["URLName"].eq("https://www.discogs.com/artist/6520")]
urlData['url'][urlData['url']["URLName"].eq("https://www.allmusic.com/artist/mn0000219203")]

In [None]:
502
07e748f1-075e-428d-85dc-ce3be434e906