# Master DB Matching Notebook

In [14]:
## Basic stuff
%load_ext autoreload
%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))

## Python Version
import sys
print("Python: {0}".format(sys.version))


################################################################################
## General Stuff
################################################################################
from ioUtils import saveJoblib, loadJoblib, saveFile, getFile
import urllib
from urllib.parse import quote
from collections import Counter
from searchUtils import findExt, findSubExt, findPatternExt, findNearest
from timeUtils import clock, elapsed
from fsUtils import moveFile, setFile, setDir, setSubDir, isFile, isDir, mkDir
from fileUtils import getFileBasics, getBasename
from listUtils import getFlatList
from time import sleep


################################################################################
## Music Stuff
################################################################################

### MultiArtist
from multiArtist import multiartist

### My Music DB
from myMusicDBMap import myMusicDBMap
from musicDBMap import musicDBMap
from matchDBArtist import matchDBArtist
from masterDBMatchClass import masterDBMatchClass

### Master DB code
from masterdb import masterdb
from mainDB import mainDB


import datetime as dt
# Timestamp of this run of the setup cell; printed so the saved output records it.
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Python: 3.7.7 (default, Mar 26 2020, 10:32:53) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
Notebook Last Run Initiated: 2020-12-04 21:20:07.205074


# Main DB

In [2]:
### Master DB code
%load_ext autoreload
%autoreload
from masterdb import masterdb
from mainDB import mainDB
# `maindb` is read as a global by the helper functions and cells further down.
maindb = mainDB(mdb=None, create=False, debug=False)
# The captured output of this cell is a per-DB summary (ID->Name, Name->ID and album counts).
maindb.loadDBDataMap()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Summary Statistics For DB: Discogs
    Using Known Artists: False
    Found 754732 ID -> Name entries
    Found 688448 Name -> ID entries
    Found 0 Albums
Summary Statistics For DB: AllMusic
    Using Known Artists: False
    Found 73906 ID -> Name entries
    Found 67335 Name -> ID entries
    Found 466400 Albums
Summary Statistics For DB: MusicBrainz
    Using Known Artists: False
    Found 137893 ID -> Name entries
    Found 115993 Name -> ID entries
    Found 0 Albums
Summary Statistics For DB: LastFM
    Using Known Artists: False
    Found 115953 ID -> Name entries
    Found 115584 Name -> ID entries
    Found 992668 Albums
Summary Statistics For DB: RockCorner
    Using Known Artists: False
    Found 1262 ID -> Name entries
    Found 1262 Name -> ID entries
    Found 14481 Albums
Summary Statistics For DB: AceBootlegs
    Using Known Artists: False
    Found 341 ID -> Name entries
    Found

In [3]:
def initializeDB(mdbmaps, dbName):
    """Seed ``mdbmaps[dbName]`` from the cross-references stored in the "Music" map.

    For every artist in the "Music" map that carries an ID for ``dbName`` and is not
    yet known (``isKnownByKey(...) is False``) in the target map, the artist is added
    under that ID, named via the global ``maindb``, and all of its cross-DB IDs are
    copied over. The target map is saved at the end.

    Parameters
    ----------
    mdbmaps : mapping of DB name -> map object; ``mdbmaps[dbName]`` is modified.
    dbName : name of the DB whose map is being initialized.
    """
    source = musicDBMap("Music", init=False, copy=False)
    target = mdbmaps[dbName]

    for sourceKey, _ in source.getArtists().items():
        record = source.getArtistDataByKey(sourceKey)
        targetID = record.getDBID(dbName)

        # Nothing to do without an ID for this DB, or when the ID is already registered.
        if targetID is None:
            continue
        if target.isKnownByKey(targetID) is not False:
            continue

        targetName = maindb.getArtistDBNameFromID(dbName, targetID)
        target.addArtistByKey(targetID, artistName=targetName, artistID=targetID)
        for otherDB, otherID in record.getDict().items():
            target.addArtistDataByKey(targetID, otherDB, otherID)

    print("Initialized DB With {0} Artists".format(len(target.getArtists())))
    target.save()
    
    
def addToInitializedDB(mdbmaps, initializedDB, primaryDB):
    """Copy artists from ``mdbmaps[primaryDB]`` into ``mdbmaps[initializedDB]``.

    Every artist of the primary map that carries an ID for ``initializedDB`` and is not
    yet known (``isKnownByKey(...) is False``) in that map is added under that ID, named
    via the global ``maindb``, together with all of its cross-DB IDs. The initialized
    map is saved at the end.

    Parameters
    ----------
    mdbmaps : mapping of DB name -> map object.
    initializedDB : name of the map that receives new artists.
    primaryDB : name of the map whose artists are scanned.
    """
    source = mdbmaps[primaryDB]
    target = mdbmaps[initializedDB]

    for sourceKey, _ in source.getArtists().items():
        record = source.getArtistDataByKey(sourceKey)
        targetID = record.getDBID(initializedDB)

        # Skip artists without an ID for the target DB or already present there.
        if targetID is None:
            continue
        if target.isKnownByKey(targetID) is not False:
            continue

        targetName = maindb.getArtistDBNameFromID(initializedDB, targetID)
        target.addArtistByKey(targetID, artistName=targetName, artistID=targetID)
        for otherDB, otherID in record.getDict().items():
            target.addArtistDataByKey(targetID, otherDB, otherID)

    print("Added To Initialized DB With {0} Artists".format(len(target.getArtists())))
    target.save()
    


# Optional bulk load of every DB map.
# NOTE(review): with loadAll = False and the inner `if False:` guard, this cell assigns
# nothing; `mdbmaps` is created by the next cell instead.
loadAll = False
if loadAll:
    mdbmaps = {}
    for db in maindb.getDBs():
        print("{0: <30}".format(db), end="")
        mdbmaps[db] = musicDBMap(db, init=False)
    #mdbmaps = {db: musicDBMap(db, init=False) for db in maindb.getDBs()}
else:
    # Disabled: explicit per-DB load of a fixed list of maps.
    if False:
        mdbmaps = {}
        mdbmaps["AllMusic"] = musicDBMap("AllMusic", init=False)
        mdbmaps["MusicBrainz"] = musicDBMap("MusicBrainz", init=False)
        mdbmaps["Discogs"] = musicDBMap("Discogs", init=False)
        mdbmaps["LastFM"] = musicDBMap("LastFM", init=False)
        mdbmaps["RockCorner"] = musicDBMap("RockCorner", init=False)
        mdbmaps["RateYourMusic"] = musicDBMap("RateYourMusic", init=False)
        mdbmaps["AceBootlegs"] = musicDBMap("AceBootlegs", init=False)
        mdbmaps["MusicStack"] = musicDBMap("MusicStack", init=False)
        mdbmaps["CDandLP"] = musicDBMap("CDandLP", init=False)

In [15]:
# Start a fresh `mdbmaps` dict holding only the AllMusic map; seed it from the "Music"
# map (initializeDB) only when the loaded map has no artists.
mdbmaps = {}
mdbmaps["AllMusic"] = musicDBMap("AllMusic", init=False, copy=False)
if len(mdbmaps["AllMusic"].getArtists()) == 0:
    initializeDB(mdbmaps, "AllMusic")

  Loaded 35289 previously matched entries


In [None]:
# Load the MusicBrainz map; when empty, seed it from the "Music" map and then from the
# AllMusic map's cross-references. Requires mdbmaps["AllMusic"] from the previous cell.
mdbmaps["MusicBrainz"] = musicDBMap("MusicBrainz", init=False, copy=False)
if len(mdbmaps["MusicBrainz"].getArtists()) == 0:
    initializeDB(mdbmaps, "MusicBrainz")
    addToInitializedDB(mdbmaps, "MusicBrainz", "AllMusic")
# NOTE(review): Discogs is constructed with init=True while every other map in this
# notebook uses init=False — confirm this is intentional.
mdbmaps["Discogs"] = musicDBMap("Discogs", init=True, copy=False)
if len(mdbmaps["Discogs"].getArtists()) == 0:
    initializeDB(mdbmaps, "Discogs")
    addToInitializedDB(mdbmaps, "Discogs", "AllMusic")

In [None]:
# Preview (and optionally apply) the Discogs entries implied by the AllMusic map.
# The original cell had an unconditional `continue` right after the print, which left
# the add code below it unreachable; the explicit flag keeps that default behavior
# (print only) while making the apply path usable.
dbName = "Discogs"
dryRun = True
for primaryKey,artistName in mdbmaps["AllMusic"].getArtists().items():
    artistData = mdbmaps["AllMusic"].getArtistDataByKey(primaryKey)

    primaryDBID = artistData.getDBID(dbName)
    if primaryDBID is None:
        continue
    if mdbmaps[dbName].isKnownByKey(primaryDBID) is not False:
        continue

    secondaryArtistName = maindb.getArtistDBNameFromID(dbName, primaryDBID)
    print("Will Add {0} to Discogs".format(secondaryArtistName))
    if dryRun:
        continue

    mdbmaps[dbName].addArtistByKey(primaryDBID, artistName=secondaryArtistName, artistID=primaryDBID)
    for db,dbID in artistData.getDict().items():
        mdbmaps[dbName].addArtistDataByKey(primaryDBID, db, dbID)

# Persist only when entries were actually added.
if not dryRun:
    print("Initialized DB With {0} Artists".format(len(mdbmaps[dbName].getArtists())))
    mdbmaps[dbName].save()


# Create Master DB Match Class

In [16]:
# Matching helper built from the main DB and the maps currently held in `mdbmaps`.
mdbmc = masterDBMatchClass(maindb, mdbmaps)

Loading Artist Names


In [20]:
# Inspect a sample of AllMusic artists to match. The echoed result has the form
# {dbName: [[artistID, {"ArtistName": ..., "ArtistAlbums": [...]}], ...]}.
mdbmc.getDataToMatch("AllMusic", maxValues=100, maxAlbums=100, minAlbums=0, sort=True, useKnown=False, dbMatches=4)

Found 0 ignores
Total                -> 73905
After Ignores        -> 73905
After DB Matches     -> 4432
After MaxAlbums      -> 4258
After MinAlbums      -> 4258
After MaxValues      -> 100


{'AllMusic': [['0000406294',
   {'ArtistName': 'Megadeth',
    'ArtistAlbums': ["Peace Sells...But Who's Buying?",
     'So Far, So Good...So What!',
     'Rust in Peace',
     'Countdown to Extinction',
     'Youthanasia',
     'Cryptic Writings',
     'Risk',
     'The World Needs a Hero',
     'Rude Awakening',
     'The System Has Failed',
     'United Abominations',
     'That One Night: Live in Buenos Aires',
     'Endgame',
     'Rust in Peace Live',
     'The Big Four: Live from Sophia, Bulgaria',
     'Th1rt3en',
     'Super Collider',
     'Countdown to Extinction: Live',
     'Dystopia',
     'Killing Is My Business... And Business Is Good!',
     'Exposure of a Dream',
     'Live in Brazil 1991',
     'Wake Up Dead',
     'Anarchy in the U.K.',
     'In My Darkest Hour',
     'Mary Jane',
     'No More Mr. Nice Guy',
     'Hangar 18 [CD Single]',
     'Holy Wars',
     'Symphony of Destruction',
     'Foreclosure of a Dream',
     'Sweating Bullets [Cassette Single]',
     

****

# Matching Code

In [None]:
from tqdm import tqdm
from multiprocessing import Pool
from functools import partial
import time

def matchDBArtistWithAlbums(item, *args, **kwargs):
    """Run the album-based matcher for one artist and return its result row.

    Parameters
    ----------
    item : sequence whose first element is the artist's primary key and whose second
        element is a dict with "ArtistName" and "ArtistAlbums".
    *args : accepted but not used.
    **kwargs : must contain 'numArtistName', 'artistNameCutoff', 'artistAlbumCutoff',
        'numArtistAlbums' and 'score'; they are forwarded to ``setThresholds``.

    Returns
    -------
    list : ``[primaryKey, artistName, artistID, matches]`` where ``artistID`` equals the
        primary key and ``matches`` is the matcher's
        ``findPotentialArtistAlbumMatches()`` result.
    """
    artistID = item[0]
    info = item[1]
    name, albums = info["ArtistName"], info["ArtistAlbums"]

    # The matcher is built on the notebook-global `maindb`.
    matcher = matchDBArtist(maindb)
    matcher.setArtistInfo(name, artistID, albums)

    thresholdArgs = {
        "matchNumArtistName": kwargs['numArtistName'],
        "matchArtistNameCutoff": kwargs['artistNameCutoff'],
        "matchArtistAlbumCutoff": kwargs['artistAlbumCutoff'],
        "matchNumArtistAlbums": kwargs['numArtistAlbums'],
        "matchScore": kwargs['score'],
    }
    matcher.setThresholds(**thresholdArgs)

    return [artistID, name, artistID, matcher.findPotentialArtistAlbumMatches()]


def multiProc(func, argument_list, num_processes):
    """Apply ``func`` to every element of ``argument_list`` in a process pool.

    Results are collected in input order (``Pool.imap``) behind a tqdm progress bar.
    The pool is used as a context manager so its worker processes are released when
    the call finishes or fails; the original never closed the pool, leaking workers
    on every call.

    Parameters
    ----------
    func : picklable callable taking one element of ``argument_list``.
    argument_list : sized iterable of inputs (``len()`` is used for the progress total).
    num_processes : number of worker processes.

    Returns
    -------
    list : one result per input, in input order.
    """
    with Pool(processes=num_processes) as pool:
        # All results are consumed inside the `with` block, before the pool is torn down.
        results = list(tqdm(pool.imap(func=func, iterable=argument_list), total=len(argument_list)))
    return results


def copyMapData(mdbmap):
    """Call ``saveCopy()`` on the given map (used by matchItAll before it modifies the map)."""
    mdbmap.saveCopy()

def saveMapData(mdbmap, result_list):
    """Write matcher results into ``mdbmap`` and save it.

    Parameters
    ----------
    mdbmap : map object providing ``addArtistByKey``, ``addArtistDataByKey`` and ``save``.
    result_list : iterable of ``[primaryKey, artistName, artistID, matches]`` rows, where
        ``matches`` maps a DB name to an object with a ``matchID`` attribute.

    Every artist is registered; a cross-DB ID is stored only where ``matchID`` is not
    None. The map is saved once, after all rows (also when ``result_list`` is empty).
    """
    for item in result_list:
        primaryKey, artistName, artistID, matches = item[0], item[1], item[2], item[3]
        mdbmap.addArtistByKey(primaryKey, artistName=artistName, artistID=artistID)
        for db, match in matches.items():
            # The original also read `match.matchScore` into an unused local; dropped.
            if match.matchID is not None:
                mdbmap.addArtistDataByKey(primaryKey, db, match.matchID)

    mdbmap.save()

In [None]:
def matchItAll(mdbmaps, thresholds, mdbmc=None, db=None, toMatch=None, num_processes=3, sleepSeconds=0):
    """Match a batch of artists in parallel and store the results in their DB map.

    Parameters
    ----------
    mdbmaps : mapping of DB name -> map object; the map for the batch's DB is backed up
        (copyMapData), updated and saved (saveMapData).
    thresholds : dict of keyword arguments forwarded to matchDBArtistWithAlbums
        (see getThresholds).
    mdbmc : match-class object; only used to fetch a batch when ``toMatch`` is None.
    db : DB name passed to ``mdbmc.getDataToMatch`` when ``toMatch`` is None.
    toMatch : ``{dbName: [[artistID, artistData], ...]}``; only the first key is processed.
    num_processes : worker processes for the pool (default 3, the original fixed value).
    sleepSeconds : optional pause after saving. The original printed a "Sleeping for 10
        seconds" message without sleeping; the message is now printed only when a pause
        is actually requested.

    Raises
    ------
    ValueError : when neither ``toMatch`` nor ``mdbmc`` is supplied.
    """
    if toMatch is None:
        if mdbmc is None:
            raise ValueError("matchItAll needs either toMatch or mdbmc (with db)")
        toMatch = mdbmc.getDataToMatch(db, maxValues=10000, maxAlbums=50000)

    if len(toMatch) == 0:
        return

    # Bind the threshold settings so the pool only has to pass each artist item.
    pfunc = partial(matchDBArtistWithAlbums, **thresholds)

    dbName = list(toMatch.keys())[0]
    copyMapData(mdbmaps[dbName])

    argument_list = toMatch[dbName]
    if len(argument_list) == 0:
        return
    print("Running imap multiprocessing for {0} artists ...".format(len(argument_list)))
    result_list = multiProc(func=pfunc, argument_list=argument_list,
                            num_processes=num_processes)

    start, cmt = clock("Saving...")
    saveMapData(mdbmaps[dbName], result_list)
    elapsed(start, cmt)

    if sleepSeconds > 0:
        print("\nSleeping for {0} seconds...\n".format(sleepSeconds))
        sleep(sleepSeconds)

In [4]:
def getThresholds(minAlbums):
    """Return the matcher threshold settings for an album-count tier.

    Parameters
    ----------
    minAlbums : lower album-count bound of the tier being matched.

    Returns
    -------
    dict : keys 'numArtistName', 'artistNameCutoff', 'artistAlbumCutoff',
        'numArtistAlbums' and 'score' (a fresh dict on every call).

    The listed tiers are returned unchanged. Other values previously raised KeyError,
    although the notebook calls this with minAlbums=5 and minAlbums=3; they now use the
    largest listed tier not exceeding ``minAlbums``, or the smallest tier (10) when
    ``minAlbums`` is below all of them.
    NOTE(review): reusing the 10-album settings for smaller tiers is an assumption —
    add explicit 5 / 3 entries if different cutoffs are wanted.
    """
    tiers = {
        1000: {'numArtistName': 1, 'artistNameCutoff': 0.95, 'artistAlbumCutoff': 0.9, 'numArtistAlbums': 9, 'score': 10.0},
        500:  {'numArtistName': 1, 'artistNameCutoff': 0.95, 'artistAlbumCutoff': 0.9, 'numArtistAlbums': 5, 'score': 5.0},
        200:  {'numArtistName': 1, 'artistNameCutoff': 0.95, 'artistAlbumCutoff': 0.9, 'numArtistAlbums': 3, 'score': 2.5},
        100:  {'numArtistName': 1, 'artistNameCutoff': 0.95, 'artistAlbumCutoff': 0.9, 'numArtistAlbums': 2, 'score': 1.5},
        50:   {'numArtistName': 2, 'artistNameCutoff': 0.95, 'artistAlbumCutoff': 0.9, 'numArtistAlbums': 2, 'score': 1.0},
        20:   {'numArtistName': 2, 'artistNameCutoff': 0.95, 'artistAlbumCutoff': 0.9, 'numArtistAlbums': 2, 'score': 1.0},
        10:   {'numArtistName': 5, 'artistNameCutoff': 0.90, 'artistAlbumCutoff': 0.9, 'numArtistAlbums': 2, 'score': 1.0},
    }

    eligible = [tier for tier in tiers if tier <= minAlbums]
    chosen = max(eligible) if eligible else min(tiers)
    return tiers[chosen]

In [None]:
# Combine the contents of both ignore files into one flat list and display it.
artistIgnores = getFlatList([getFile(x) for x in ["ignores.p", "ignores2.p"]])
artistIgnores

In [None]:
# NOTE(review): called with returnData=False, so `matchData` may not hold the data
# itself — confirm what getDBMatchData returns in that mode.
matchData = mdbmc.getDBMatchData("AllMusic", returnData=False)

In [None]:
# Manually curated AllMusic -> MusicBrainz matches, as (AllMusic artist ID, MusicBrainz
# artist URL) pairs. To record another match, append a pair here; this table replaces
# the copy-pasted call pairs and the disabled block of empty templates.
manualMusicBrainzMatches = [
    ("0000317716", "https://musicbrainz.org/artist/9ddd7abc-9e1b-471d-8031-583bc6bc8be9"),
    ("0000155453", "https://musicbrainz.org/artist/0e85eb79-1c05-44ba-827c-7b259a3d941a"),
    ("0000678420", "https://musicbrainz.org/artist/846be3c9-5f94-46ab-97b9-531335dd3658"),
    ("0000132940", "https://musicbrainz.org/artist/aeb71bd8-447d-4415-8ea1-2b7d664f67e1"),
    ("0001436021", "https://musicbrainz.org/artist/274774a7-1cde-486a-bc3d-375ec54d552d"),
    ("0000851639", "https://musicbrainz.org/artist/521df2bd-01f6-456e-9d5f-f081068819c2"),
    ("0000182286", "https://musicbrainz.org/artist/cdb34f45-c9d3-4f14-a79a-bc1da62455cc"),
    ("0000825208", "https://musicbrainz.org/artist/d5c55b61-78b8-40c9-be1b-de7517c3aebb"),
    ("0002601127", "https://musicbrainz.org/artist/7ef691ec-b82a-4c58-852a-45ae2ed6d7b6"),
    ("0000607283", "https://musicbrainz.org/artist/f19ad155-d809-4770-ab8d-7579467d9f55"),
    ("0000497617", "https://musicbrainz.org/artist/b97b14b0-d318-47aa-804f-1b6b43b1418b"),
]

for amID, url in manualMusicBrainzMatches:
    # Resolve the URL to the DB's artist ID, then attach it to the AllMusic artist.
    dbID = maindb.getArtistDBIDFromUtil("MusicBrainz", url)
    mdbmaps["AllMusic"].addArtistDataByKey(amID, "MusicBrainz", dbID)

In [None]:
# Persist the AllMusic map after the manual additions made in earlier cells.
mdbmaps["AllMusic"].save()

In [None]:
# Spot-check one of the manually matched artists.
mdbmaps["AllMusic"].getArtistDataByKey("0000497617").show()

In [None]:
def addDiscogs(mdbmaps, amID, dbID):
    """Attach Discogs ID ``dbID`` to AllMusic artist ``amID`` in ``mdbmaps["AllMusic"]``.

    The map is only modified in memory; the caller is responsible for saving it.
    """
    allMusicMap = mdbmaps["AllMusic"]
    allMusicMap.addArtistDataByKey(amID, "Discogs", dbID)

def addMusicBrainz(mdbmaps, amID, url):
    """Attach the MusicBrainz artist at ``url`` to AllMusic artist ``amID``.

    The URL is converted to a DB ID with the global ``maindb.getArtistDBIDFromUtil``.
    The map is only modified in memory; the caller is responsible for saving it.
    """
    musicBrainzID = maindb.getArtistDBIDFromUtil("MusicBrainz", url)
    allMusicMap = mdbmaps["AllMusic"]
    allMusicMap.addArtistDataByKey(amID, "MusicBrainz", musicBrainzID)

In [None]:
# More manual AllMusic -> MusicBrainz matches (in memory only; no save in this cell).
addMusicBrainz(mdbmaps, '0002897709', 'https://musicbrainz.org/artist/95bad1d4-ba1f-4060-940b-d415d66e934e')
addMusicBrainz(mdbmaps, '0000465015', 'https://musicbrainz.org/artist/7b854ae0-0ca5-4061-ba69-8d0e5a1b2288')
addMusicBrainz(mdbmaps, '0000098900', 'https://musicbrainz.org/artist/4c45495f-b9f3-4d03-9fac-d44c8a77f5de')

In [None]:
# DataFrame view of the AllMusic map (its index is iterated in the next cell).
amDF = mdbmaps["AllMusic"].getDF()

In [None]:
# NOTE(review): scratch cell — `dbs` is not defined anywhere in the visible notebook
# (NameError on a fresh kernel), `primaryKey` is unused and the loop breaks after one pass.
for primaryKey in amDF.head(1000).index:
    print(dbs)
    break

In [None]:
# NOTE(review): hard-coded absolute, user-specific path; read later via
# maindb.dbdata["Discogs"]["Artists"].getData(ifile).
ifile="/Users/tgadfort/discogs/artists-discogs/42/435642.p"

In [None]:
# NOTE(review): hard-coded absolute path; the variable is named `amDB` although the
# file lives under an artists-discogs-db directory — confirm the intended source.
amDB = getFile("/Users/tgadfort/Music/Discog/artists-discogs-db/55-DB.p")

In [None]:
# NOTE(review): unfinished scratch cell — the trailing `if` has no condition or body, so
# this cell is a SyntaxError as written. Complete or delete it.
for artistID,artistData in amDB.items():
    mediaCounts = artistData.mediaCounts.counts
    if 
    

In [None]:
# Load the single Discogs artist file named by `ifile` and display its media counts.
result = maindb.dbdata["Discogs"]["Artists"].getData(ifile)
result.mediaCounts.counts

{'numArtistName': 5,
 'artistNameCutoff': 0.9,
 'artistAlbumCutoff': 0.9,
 'numArtistAlbums': 2,
 'score': 1.0}

In [12]:
# One-off probe: match the artist name "Mr. Children" (placeholder ID "test", no albums)
# by name only, using the threshold settings of the 10-album tier.
mdbMatcher = matchDBArtist(maindb)
mdbMatcher.setArtistInfo("Mr. Children", "test", None)
kwargs = getThresholds(10)
mdbMatcher.setThresholds(matchNumArtistName=kwargs['numArtistName'], matchArtistNameCutoff=kwargs['artistNameCutoff'], 
                         matchArtistAlbumCutoff=kwargs['artistAlbumCutoff'], matchNumArtistAlbums=kwargs['numArtistAlbums'],
                         matchScore=kwargs['score'])
mcs = mdbMatcher.findPotentialArtistNameMatchesWithoutAlbums()

In [13]:
# Matched ID per DB from the probe above (None where no match was found).
{db: mc.matchID for db, mc in mcs.items()}

{'Discogs': '2758452',
 'AllMusic': '0000543495',
 'MusicBrainz': '281845262899707761273838005437849234280',
 'LastFM': None,
 'RockCorner': None,
 'AceBootlegs': None,
 'CDandLP': None,
 'RateYourMusic': None,
 'MusicStack': None}

In [None]:
# NOTE(review): `primaryKey`, `artistName` and `artistID` are not set by this cell; they
# are whatever earlier cells left in the kernel (hidden state) — confirm or define them.
mcs    = mdbMatcher.findPotentialArtistAlbumMatches()
retval = [primaryKey,artistName,artistID,mcs]

In [None]:
# Rebuild the match class from the current contents of `mdbmaps`.
mdbmc = masterDBMatchClass(maindb, mdbmaps)

In [None]:
# Master DataFrame for AllMusic; later cells read its "AllMusic" and "Discogs" columns.
df = mdbmc.getMasterDF("AllMusic")

In [None]:
# Rows that have an AllMusic value but no Discogs value.
dfNAs = df[(~df["AllMusic"].isna()) & (df["Discogs"].isna())] # & (~df["MusicBrainz"].isna())]

In [None]:
# Keep only rows with fewer than 120 albums. This overwrites `dfNAs` in place, so the
# cell narrows the already-filtered frame when re-run.
dfNAs = dfNAs[dfNAs["Albums"] < 120]

In [None]:
# Manually curated AllMusic -> Discogs matches (AllMusic ID, Discogs ID, artist name in
# the trailing comment). The line format matches the stubs printed by the generator
# cell further down, with the Discogs ID filled in by hand.
addDiscogs(mdbmaps, "0000151844", "364982")   ### Cliff Burton
addDiscogs(mdbmaps, "0000758194", "888651")   ### Beny Moré
addDiscogs(mdbmaps, "0000839043", "3226635")   ### Rolando Alejandro
addDiscogs(mdbmaps, "0000536971", "853521")   ### Guy N'Sangue
addDiscogs(mdbmaps, "0000774277", "619399")   ### Kin Vassy
addDiscogs(mdbmaps, "0002579551", "2179788")   ### Harmony Samuels
addDiscogs(mdbmaps, "0000190702", "5352577")   ### Starlite Orchestra
addDiscogs(mdbmaps, "0001665752", "208220")   ### Gary Moore
addDiscogs(mdbmaps, "0000002330", "27986")   ### Sergio Mendes
addDiscogs(mdbmaps, "0000577994", "435555")   ### Czech Philharmonic Orchestra
addDiscogs(mdbmaps, "0000820227", "433832")   ### Shalini Vijayan
addDiscogs(mdbmaps, "0002297154", "435642")   ### Felipe Tichauer
addDiscogs(mdbmaps, "0003351776", "4709198")   ### Aaquil Brown
addDiscogs(mdbmaps, "0003735010", "7241622")   ### Felipe Mejía Saldarriaga
addDiscogs(mdbmaps, "0000153555", "228742")   ### Benji Madden
addDiscogs(mdbmaps, "0000496092", "788254")   ### Morris Stoloff
addDiscogs(mdbmaps, "0000935778", "2265712")   ### Alejandro Abaroa
addDiscogs(mdbmaps, "0000229821", "398912")   ### Deanie Parker
addDiscogs(mdbmaps, "0000871744", "2307957")   ### Memo Mendez-Guiu
addDiscogs(mdbmaps, "0000089262", "236672")   ### Héctor Castillo
addDiscogs(mdbmaps, "0000608865", "1212570")   ### Brandy Norwood
addDiscogs(mdbmaps, "0002170424", "1447773")   ### Roberto Benaglio
addDiscogs(mdbmaps, "0000125199", "9660")   ### Jagz Kooner
addDiscogs(mdbmaps, "0000471765", "815550")   ### Sal Lozano
addDiscogs(mdbmaps, "0002275194", "1567977")   ### Angele Vannier

# Persist the additions.
mdbmaps["AllMusic"].save()

In [None]:
# Print paste-ready addDiscogs(...) stubs (empty Discogs ID, artist name as trailing
# comment) for up to 50 rows with exactly one DB match and fewer than 50 albums.
for idx,row in dfNAs[(dfNAs["DBMatches"] == 1) & (dfNAs["Albums"] < 50)].head(50).iterrows():
    print('addDiscogs(mdbmaps, "{0}", "")   ### {1}'.format(idx, row["Artist"]))

In [None]:
# NOTE(review): `mergeDF` is not defined anywhere in the visible notebook; this cell
# fails on a fresh kernel unless it is defined elsewhere.
mergeDF

# Match Everything

In [None]:
# DB processed by the tiered matching cells below (they all read this global `dbName`).
dbName = "MusicBrainz"

In [None]:
# NOTE(review): this uses getArtistDataByID, while the other lookups in this notebook
# use getArtistDataByKey — confirm both exist and behave the same for this ID.
mdbmaps["AllMusic"].getArtistDataByID("0000017558").show()
#mdbmaps["AllMusic"].addArtistDataByKey("0000742285", "MusicBrainz", "108136271887775575975068557696538788286")
#mdbmaps["AllMusic"].save()

In [None]:
# Add a few catch-all names to the second ignore file. Only names that are not already
# present are appended, so re-running this cell no longer stores duplicates.
newIgnores = ["Philharmonia Orchestra", "[unknown]", "Various Artists"]
ignores = getFile("ignores2.p")
ignores = ignores + [name for name in newIgnores if name not in ignores]
saveFile(idata=ignores, ifile="ignores2.p")

In [None]:
# Manually register Gustav Mahler in the MusicBrainz map under his MusicBrainz ID, with
# cross-references to MusicBrainz itself, AllMusic and Discogs, then save the map.
mdbmaps["MusicBrainz"].addArtistByKey("72876013401729229602621125692163988869", artistName="Gustav Mahler", artistID="72876013401729229602621125692163988869")
mdbmaps["MusicBrainz"].addArtistDataByKey("72876013401729229602621125692163988869", "MusicBrainz", "72876013401729229602621125692163988869")
mdbmaps["MusicBrainz"].addArtistDataByKey("72876013401729229602621125692163988869", "AllMusic", "0000017558")
mdbmaps["MusicBrainz"].addArtistDataByKey("72876013401729229602621125692163988869", "Discogs", "239236")
mdbmaps["MusicBrainz"].save()

In [None]:
# (artist ID, artist name) pairs of the current MusicBrainz batch.
# NOTE(review): `toMatch` is only assigned by cells further down, so this depends on
# running one of the tier cells first.
[(item[0],item[1]["ArtistName"]) for item in toMatch["MusicBrainz"]]

In [None]:
# Tier minAlbums=500 / maxAlbums=1000: one pass of up to 250 artists from `dbName`
# (set in an earlier cell), skipping names in the two ignore files.
maxAlbums = 1000
minAlbums = 500
for i in range(1):
    toMatch   = mdbmc.getDataToMatch(dbName, maxValues=250, maxAlbums=maxAlbums, minAlbums=minAlbums, ignores=["ignores.p", "ignores2.p"])
    # Stop as soon as nothing is left to match in this tier.
    if len(toMatch[dbName]) == 0:
        break
    matchItAll(mdbmaps, getThresholds(minAlbums), toMatch=toMatch)

In [None]:
# Tier minAlbums=200 / maxAlbums=500: one pass of up to 250 artists from `dbName`.
maxAlbums = 500
minAlbums = 200
for i in range(1):
    toMatch   = mdbmc.getDataToMatch(dbName, maxValues=250, maxAlbums=maxAlbums, minAlbums=minAlbums, ignores=["ignores.p", "ignores2.p"])
    # Stop as soon as nothing is left to match in this tier.
    if len(toMatch[dbName]) == 0:
        break
    matchItAll(mdbmaps, getThresholds(minAlbums), toMatch=toMatch)

In [None]:
# Tier minAlbums=100 / maxAlbums=200: one pass of up to 250 artists from `dbName`.
maxAlbums = 200
minAlbums = 100
for i in range(1):
    toMatch   = mdbmc.getDataToMatch(dbName, maxValues=250, maxAlbums=maxAlbums, minAlbums=minAlbums, ignores=["ignores.p", "ignores2.p"])
    # Stop as soon as nothing is left to match in this tier.
    if len(toMatch[dbName]) == 0:
        break
    matchItAll(mdbmaps, getThresholds(minAlbums), toMatch=toMatch)

In [None]:
# Tier minAlbums=50 / maxAlbums=100: one pass of up to 500 artists from `dbName`.
maxAlbums = 100
minAlbums = 50
for i in range(1):
    toMatch   = mdbmc.getDataToMatch(dbName, maxValues=500, maxAlbums=maxAlbums, minAlbums=minAlbums, ignores=["ignores.p", "ignores2.p"])
    # Stop as soon as nothing is left to match in this tier.
    if len(toMatch[dbName]) == 0:
        break
    matchItAll(mdbmaps, getThresholds(minAlbums), toMatch=toMatch)

In [None]:
# Tier minAlbums=20 / maxAlbums=50: up to 40 passes of 500 artists each from `dbName`.
maxAlbums = 50
minAlbums = 20
for i in range(40):
    toMatch   = mdbmc.getDataToMatch(dbName, maxValues=500, maxAlbums=maxAlbums, minAlbums=minAlbums, ignores=["ignores.p", "ignores2.p"])
    # Stop as soon as nothing is left to match in this tier.
    if len(toMatch[dbName]) == 0:
        break
    matchItAll(mdbmaps, getThresholds(minAlbums), toMatch=toMatch)

In [None]:
# Tier minAlbums=10 / maxAlbums=20: up to 100 passes of 500 artists each from `dbName`.
maxAlbums = 20
minAlbums = 10
for i in range(100):
    toMatch   = mdbmc.getDataToMatch(dbName, maxValues=500, maxAlbums=maxAlbums, minAlbums=minAlbums, ignores=["ignores.p", "ignores2.p"])
    # Stop as soon as nothing is left to match in this tier.
    if len(toMatch[dbName]) == 0:
        break
    matchItAll(mdbmaps, getThresholds(minAlbums), toMatch=toMatch)

In [None]:
# Tier minAlbums=5 / maxAlbums=10: up to 100 passes of 1000 artists each from `dbName`.
# NOTE(review): relies on getThresholds accepting minAlbums=5; its threshold table has
# no explicit entry for 5.
maxAlbums = 10
minAlbums = 5
for i in range(100):
    toMatch   = mdbmc.getDataToMatch(dbName, maxValues=1000, maxAlbums=maxAlbums, minAlbums=minAlbums, ignores=["ignores.p", "ignores2.p"])
    # Stop as soon as nothing is left to match in this tier.
    if len(toMatch[dbName]) == 0:
        break
    matchItAll(mdbmaps, getThresholds(minAlbums), toMatch=toMatch)

In [None]:
# Tier minAlbums=3 / maxAlbums=5: up to 10000 passes of 2500 artists each from `dbName`.
# NOTE(review): relies on getThresholds accepting minAlbums=3; its threshold table has
# no explicit entry for 3.
maxAlbums = 5
minAlbums = 3
for i in range(10000):
    toMatch   = mdbmc.getDataToMatch(dbName, maxValues=2500, maxAlbums=maxAlbums, minAlbums=minAlbums, ignores=["ignores.p", "ignores2.p"])
    # Stop as soon as nothing is left to match in this tier.
    if len(toMatch[dbName]) == 0:
        break
    matchItAll(mdbmaps, getThresholds(minAlbums), toMatch=toMatch)

In [None]:
# Inline, step-by-step selection of AllMusic artists to match. `cuts` records how many
# artists survive each filter but is not printed in this cell.
from pandas import Series, DataFrame

minAlbums = 10
maxAlbums = 1000
maxValues = 1000
sort = True
db = "AllMusic"
matchData  =  mdbmc.getDBMatchData(db)
# Album count per artist key, largest first.
salbums    = Series({primaryKey: len(artistData["ArtistAlbums"]) for primaryKey,artistData in matchData.items()}).sort_values(ascending=False)
known = mdbmaps[db].getArtists()
# One row per artist key (transposed from the {key: data} dict).
mdDF = DataFrame(matchData).T
artistIgnores = getFlatList([getFile(x) for x in ["ignores.p", "ignores2.p"]])
print("Found {0} ignores".format(len(artistIgnores)))


nAlbums    = salbums.to_dict()

# NOTE(review): `sortedKeys` is assigned here but not used again in this cell.
if sort is True:
    sortedKeys = nAlbums.keys()
else:
    sortedKeys = matchData.keys()

cuts = {"Total": salbums.shape[0]}

# Drop artists whose name is in the ignore lists.
togetSAlbums = salbums[salbums.index.isin(mdDF[~mdDF["ArtistName"].isin(artistIgnores)].index)]
cuts["After Ignores"] = togetSAlbums.shape[0]

# Drop artists already present in the saved map.
togetSAlbums = togetSAlbums[~togetSAlbums.index.isin(known.keys())]
cuts["After Known"] = togetSAlbums.shape[0]

# Album-count window: minAlbums <= count < maxAlbums.
togetSAlbums = togetSAlbums[togetSAlbums < maxAlbums]
cuts["After MaxAlbums"] = togetSAlbums.shape[0]

togetSAlbums = togetSAlbums[togetSAlbums >= minAlbums]
cuts["After MinAlbums"] = togetSAlbums.shape[0]

# Keep at most maxValues artists (largest album counts first, given the sort above).
togetSAlbums = togetSAlbums.head(maxValues)
cuts["After MaxValues"] = togetSAlbums.shape[0]

# NOTE(review): this leaves `toMatch` as a dict keyed by artist key, whereas other cells
# index it as toMatch[dbName] -> list of [key, data] pairs; confirm which shape is wanted.
sMatchData = Series(matchData)
toMatch = sMatchData[sMatchData.index.isin(togetSAlbums.index)].to_dict()


In [None]:
# Walk the AllMusic batch, sort artists whose album titles mark them as "Work" or
# "Soundtracks" entries into `ignores`, and collect the rest into the preview table
# `dfs` (first five albums each) until it holds more than 100 artists.
# Expects toMatch["AllMusic"] to be a list of [artistID, {"ArtistName", "ArtistAlbums"}].
ignores = {"Classical": {}, "Work": {}, "Soundtracks": {}}
dfs = {}

# (category, enabled, [(substring, minimum number of album titles containing it)]).
# This table replaces six copy-pasted elif branches and an `if False:` toggle.
# "Classical" stays disabled, as in the original. NOTE(review): in the original disabled
# code the "Beethoven" rule skipped the artist without recording it in `ignores`; if the
# rule is enabled here it records the artist like the other patterns — confirm intent.
ignoreRules = [
    ("Classical",   False, [("Classical", 6), ("Beethoven", 2), ("Bach", 2)]),
    ("Work",        True,  [("Work from ", 1), ("Work From ", 1), ("Work at ", 1), ("Work At ", 1)]),
    ("Soundtracks", True,  [("Movie Hits ", 1), ("Soundtrack ", 1)]),
]

for item in toMatch["AllMusic"]:
    primaryID    = item[0]
    artistName   = item[1]["ArtistName"]
    artistAlbums = item[1]["ArtistAlbums"]

    ignored = False
    for category, enabled, patterns in ignoreRules:
        if not enabled:
            continue
        # Already classified under this category.
        if ignores[category].get(primaryID) is not None:
            ignored = True
            break
        # Classified now: some pattern occurs in enough album titles.
        if any(sum(pattern in album for album in artistAlbums) >= minHits for pattern, minHits in patterns):
            ignores[category][primaryID] = artistName
            ignored = True
            break
    if ignored:
        continue

    dfs[primaryID] = {"ArtistName": artistName, "Albums": artistAlbums[:5]}
    if len(dfs) > 100:
        break

In [None]:
# Write the artist names collected in `ignores` (all categories, flattened) to ignores.p.
# NOTE(review): this replaces the file's contents with the current in-memory names —
# confirm previously saved ignores should not be merged in.
saveFile(idata=getFlatList([list(x.values()) for x in ignores.values()]), ifile="ignores.p")

In [None]:
# Reload the saved names. NOTE(review): this rebinds `ignores` from the category dict
# built above to whatever ignores.p holds (a flat list was saved in the previous cell).
ignores = getFile("ignores.p")

In [None]:
# Preview table: one row per artist with "ArtistName" and the first five "Albums".
DataFrame(dfs).T

### Match With Music DB

In [None]:
# Persist the AllMusic map.
mdbmaps["AllMusic"].save()

****
****
****

# Analyze Matched DB Data

In [18]:
# Sanity-check saved matches: compare each artist's name in the saved DB with the name
# of every cross-referenced artist in the other DBs and collect suspicious pairs.
#   * ratio >= 0.8          -> names agree, nothing recorded
#   * 0.4 <= ratio < 0.8    -> recorded in `dbRenames` (secondary name -> primary name) and printed
#   * ratio < 0.4           -> nothing recorded
#   * secondary name is None -> primary name appended to toget[db]
from difflib import SequenceMatcher

dbRenames   = {} #getFile("relDBRenames2.yaml")
# The second assignment wins: AllMusic is the DB being analyzed.
savedDBName = "MusicBrainz"
savedDBName = "AllMusic"

toget = {}

for primaryKey,artistName in mdbmaps[savedDBName].getArtists().items():
    # NOTE(review): if this lookup can return None (the secondary lookup below is checked
    # for None), the .find() on the next line would raise AttributeError — confirm.
    primaryArtistName = maindb.getArtistDBNameFromID(savedDBName, primaryKey)
    if primaryArtistName.find("Star Academy") != -1:
        continue
    artistData = mdbmaps[savedDBName].getArtistDataByKey(primaryKey)
    for db,dbID in artistData.getDict().items():
        if db in ["DatPiff", "MetalStorm"]:
            continue
        if dbID is not None:
            secondaryArtistName = maindb.getArtistDBNameFromID(db, dbID)
            # Apply a rename recorded earlier in this run before comparing.
            if dbRenames.get(secondaryArtistName) is not None:
                secondaryArtistName = dbRenames[secondaryArtistName]
            if secondaryArtistName is None:
                if toget.get(db) is None:
                    toget[db] = []
                toget[db].append(primaryArtistName)
                continue
            s = SequenceMatcher(None, primaryArtistName, secondaryArtistName)
            ratio = s.ratio()
            # Despite its name, `maxRatio` is the similarity at or above which names are accepted.
            maxRatio = 0.8
            if ratio >= maxRatio:
                continue
                
            if ratio >= maxRatio-0.4:
                if dbRenames.get(secondaryArtistName) is not None:
                    if dbRenames[secondaryArtistName] == primaryArtistName:
                        continue
                    else:
                        if secondaryArtistName.find("Star Academy") != -1:
                            del dbRenames[secondaryArtistName]
                            continue
                        # Disabled: hand-written exceptions for specific artist names.
                        if False:
                            if secondaryArtistName == "Wild Billy Childish and the Musicians of the British Empire":
                                del dbRenames[secondaryArtistName]
                                continue                            
                            if secondaryArtistName == "Wild Billy Childish & The Musicians Of The British Empire":
                                del dbRenames[secondaryArtistName]
                                continue                            
                            if secondaryArtistName == "Glenn Hughes":
                                del dbRenames[secondaryArtistName]
                                continue                          
                            if secondaryArtistName == "Formula V" or secondaryArtistName == "Formula 3":
                                del dbRenames[secondaryArtistName]
                                continue                          
                        # The same secondary name already maps to a different primary name.
                        raise ValueError("Multi Values For {0}: [{1} , {2}]".format(secondaryArtistName, primaryArtistName, dbRenames[secondaryArtistName]))
                dbRenames[secondaryArtistName] = primaryArtistName
                print("[{0: <30} {1: <4} {2: >30}] \t --> ({3}) {4} / {5} ({6})".format(primaryArtistName,round(ratio,2),secondaryArtistName,savedDBName,primaryKey,dbID,db))
                
            
print("Done.")

Done.


In [None]:
# Reset the LastFM ID to None for three AllMusic artists, then save the map.
mdbmaps["AllMusic"].addArtistDataByKey("0000046737", "LastFM", None)
mdbmaps["AllMusic"].addArtistDataByKey("0001548072", "LastFM", None)
mdbmaps["AllMusic"].addArtistDataByKey("0000050262", "LastFM", None)
mdbmaps["AllMusic"].save()

In [None]:
# Save the renames collected by the analysis cell and show how many there are.
saveFile(idata=dbRenames, ifile="relDBRenames2.yaml")
len(dbRenames)

In [8]:
# Load the two artist-name databases ("db" and "master") from disk; init=False keeps
# the stored contents instead of starting from an empty DB.
from masterArtistNameDB import masterArtistNameDB
dbManDB = masterArtistNameDB("db", init=False)
manDB   = masterArtistNameDB("master", init=False)

  Loading data from /Users/tgadfort/opt/anaconda3/envs/py37/discogs/dbArtistNameDB.p
  There are currently 3556 artist keys.
  There are currently 3769 renamed artist keys.
  Loading data from /Users/tgadfort/opt/anaconda3/envs/py37/discogs/masterArtistNameDB.p
  There are currently 885 artist keys.
  There are currently 1183 renamed artist keys.


In [None]:
# Display `toget`.
# NOTE(review): the only assignment of `toget` visible in this part of the notebook is in a
# later cell (getFile(ifile="togetMB.p")) -- confirm it is defined before this cell on a
# fresh Restart & Run All.
toget

In [16]:
# Rebuild the "db" artist-name database from scratch. Statement order matters:
#   1. snapshot the current renames to dbRenameTmp.p and read them back,
#   2. create an empty DB (init=True) and force-reload the snapshot into it,
#   3. add the renames stored in relDBRenames2.yaml,
#   4. save the DB and its renames, then reload the saved state (init=False).
saveFile(ifile="dbRenameTmp.p", idata=dbManDB.dbRenames)
primaryRenames = getFile("dbRenameTmp.p")
dbManDB = masterArtistNameDB("db", init=True)
dbManDB.forceReload(primaryRenames)
dbManDB.addRenames(getFile("relDBRenames2.yaml"))
dbManDB.save()
dbManDB.saveRenames()
dbManDB = masterArtistNameDB("db", init=False)

  --> This file is 81.1kB.
  Loading data from /Users/tgadfort/opt/anaconda3/envs/py37/discogs/dbArtistNameDB.p
  Initializing a fresh DB for db
  There are currently 0 artist keys.
  There are currently 0 renamed artist keys.
Trying to add 4 renamed artist keys
There are currently 3560 artist keys.
There are currently 3773 renamed artist keys.
There are currently 3560 artist keys.
There are currently 3773 renamed artist keys.
Saving 3560 artist keys
  --> This file is 95.7kB.
Saving 3773 renamed artist keys
  Loading data from /Users/tgadfort/opt/anaconda3/envs/py37/discogs/dbArtistNameDB.p
  There are currently 3560 artist keys.
  There are currently 3773 renamed artist keys.


In [9]:
# Flip every rename whose "older" name is already an artist key while its "best" name is
# not. The "db" name database is checked first, then the "master" one (any() stops at the
# first database that satisfies the condition).
redos = {}
dels = []
for oldername,bestname in dbRenames.items():
    shouldFlip = any(
        oldername in nameDB.artistNameDB.keys() and bestname not in nameDB.artistNameDB.keys()
        for nameDB in (dbManDB, manDB)
    )
    if not shouldFlip:
        continue

    print(oldername,bestname)
    redos[bestname] = oldername
    dels.append(oldername)
    print('\t',oldername,'\t',bestname)

print("# Renames: {0}".format(len(dbRenames)))
print("# Dels: {0}".format(len(dels)))
# Entries are removed only after the scan so the dict is never mutated while iterating.
for oldername in dels:
    del dbRenames[oldername]

print("# Redos: {0}".format(len(redos)))
dbRenames.update(redos)
print("# Renames: {0}".format(len(dbRenames)))

saveFile(idata=dbRenames, ifile="relDBRenames2.yaml")
len(dbRenames)

# Renames: 5
# Dels: 0
# Redos: 0
# Renames: 5


5

In [None]:
# Spot check: show the cross-DB data stored for one AllMusic key.
mdbmaps["AllMusic"].getArtistDataByKey("0000280681").show()

In [None]:
# Add the renames stored in relDBRenames2.yaml to the in-memory "db" name database
# (no save call in this cell).
dbManDB.addRenames(getFile("relDBRenames2.yaml"))

In [None]:
# Look the same artist up in the "master" and then the "db" name database, each under
# its own banner line.
artistToFind = "Ted Leo & the Pharmacists"
for bannerLabel, nameDB in (("Master", manDB), ("DB", dbManDB)):
    print("="*20,bannerLabel,"="*20)
    nameDB.findArtist(artistToFind)
print("="*40)

In [None]:
# Ask findNearest for up to 3 artist keys resembling the query (cutoff 0.85) and list
# the rename entries stored under each returned key.
matches = findNearest("Patti Labelle & The Bluebelles", list(dbManDB.artistNameDB.keys()), 3, cutoff=0.85)
renamesByMatch = {}
for candidate in matches:
    renamesByMatch[candidate] = dbManDB.artistNameDB[candidate]
for matchName,renameNames in renamesByMatch.items():
    print(matchName)
    for renameName in renameNames:
        print("\t",renameName)

In [None]:
# Display the entry stored in the "db" name database under this exact artist key.
dbManDB.artistNameDB["Patti LaBelle & The Bluebelles"]

In [None]:
# Display the whole "master" name database.
# NOTE(review): this dumps every entry into the cell output; consider showing a slice.
manDB.artistNameDB

In [None]:
#mdbmaps["AllMusic"].addArtistDataByKey("0001441485", "LastFM", None)
#mdbmaps["AllMusic"].addArtistDataByKey("0001520419", "LastFM", None)
#mdbmaps["AllMusic"].addArtistDataByKey("0001964534", "LastFM", None)
#mdbmaps["AllMusic"].save()

In [None]:
#mdbmaps["AllMusic"].getArtistDataByID("0001441485").show()

In [10]:
def isAscii(ele):
    """Return True when the string `ele` encodes to exactly one byte per character.

    With str.encode()'s default encoding (UTF-8) this holds only for pure-ASCII text.
    """
    encodedLength = len(ele.encode())
    return encodedLength == len(ele)

# Re-orient renames so that the ASCII spelling becomes the key: a (non-ASCII key ->
# ASCII value) pair is replaced by (ASCII value -> non-ASCII key). Keys containing curly
# quotes are skipped here.
dbRenames = getFile("relDBRenames2.yaml")
redos = {}
dels = []
for k,v in dbRenames.items():
    # Candidates need an ASCII value and a non-ASCII key.
    if not isAscii(v) or isAscii(k):
        continue
    if '’' in k or "“" in k:
        continue
    redos[v] = k
    dels.append(k)
    print('\t',k,'\t',v)

# Sizes before deleting, after deleting, and after re-inserting the flipped pairs.
print(len(dbRenames))
for k in dels:
    del dbRenames[k]

print(len(dbRenames))
dbRenames.update(redos)
print(len(dbRenames))


saveFile(idata=dbRenames, ifile="relDBRenames2.yaml")
len(dbRenames)

	 Hi‐Fi Set 	 Hi-Fi Set
	 Memo Méndez Guiú 	 Memo Mendez-Guiu
5
3
5


5

In [11]:
# Flip renames that differ only by " The " (key) versus " the " (value), so the
# lower-case spelling becomes the key and the " The " spelling the value.
dbRenames = getFile("relDBRenames2.yaml")
redos = {}
dels = []
for k,v in dbRenames.items():
    differsOnlyByThe = " the " in v and " The " in k and k == v.replace(" the ", " The ")
    if differsOnlyByThe:
        redos[v] = k
        dels.append(k)
        print('\t',k,'\t',v)

# Sizes before deleting, after deleting, and after re-inserting the flipped pairs.
print(len(dbRenames))
for k in dels:
    del dbRenames[k]

print(len(dbRenames))
dbRenames.update(redos)
print(len(dbRenames))


saveFile(idata=dbRenames, ifile="relDBRenames2.yaml")
len(dbRenames)

5
5
5


5

In [12]:
# Flip renames whose value contains a curly quote (’ or “) while the key contains none,
# so the curly-quote spelling becomes the key and the plain spelling the value.
dbRenames = getFile("relDBRenames2.yaml")
redos = {}
dels = []
for k,v in dbRenames.items():
    if '’' in v or "“" in v:
        # Both checks must look at the key. Testing "“" against the value here would
        # contradict the outer condition and block every swap triggered by “.
        if '’' not in k and "“" not in k:
            redos[v] = k
            dels.append(k)
            print('\t',k,'\t',v)

# Sizes before deleting, after deleting, and after re-inserting the flipped pairs.
print(len(dbRenames))
for k in dels:
    del dbRenames[k]

print(len(dbRenames))
dbRenames.update(redos)
print(len(dbRenames))


saveFile(idata=dbRenames, ifile="relDBRenames2.yaml")
len(dbRenames)

5
5
5


5

****
****
****
****

In [13]:
# Scan relDBRenames.yaml for (non-ASCII key -> ASCII value) pairs and collect the flipped
# pairs in `redos` and the keys in `dels`. This cell only reports them; nothing is applied
# to `dbRenames` or written back here.
dbRenames = getFile("relDBRenames.yaml")

redos = {}
dels = []
for k,v in dbRenames.items():
    if not isAscii(v) or isAscii(k):
        continue
    redos[v] = k
    dels.append(k)
    print(k,'\t',v)

Anders “Anden” Matthesen 	 Anders "Anden" Matthesen
Andrew Bird’s Bowl of Fire 	 Andrew Bird's Bowl of Fire
Aphrodite’s Child 	 Aphrodite's Child
Billy “The Kid” Emerson 	 Billy "The Kid" Emerson
Black Angel’s Death Song 	 Black Angel's Death Song
Blazin’ Squad 	 Blazin' Squad
Bomfunk MC’s 	 Bomfunk MC's
Carlos “Patato” Valdés 	 Carlos "Patato" Valdes
Cat’s Eyes 	 Cat's Eyes
Compton’s Most Wanted 	 Compton's Most Wanted
Cookin’ on 3 Burners 	 Cookin' on 3 Burners
Da’ T.R.U.T.H. 	 Da' T.R.U.T.H.
Des’ree 	 Des'ree
Dr. Buzzard’s Original Savannah Band 	 Dr. Buzzard's Original Savannah Band
Eddie “Flashin” Fowlkes 	 Eddie Flashin' Fowlkes
Elephant’s Memory 	 Elephant's Memory
Eli “Paperboy” Reed 	 Eli "Paperboy" Reed
Elliott Sharp’s Terraplane 	 Elliott Sharp's Terraplane
Elm City Girls’ Choir 	 Elm City Girls' Choir
Fool’s Gold 	 Fool's Gold
Gandalf’s Fist 	 Gandalf's Fist
Gerry O’Connor 	 Gerry O'Connor
Hustler’s Convention 	 Hustlers Convention
I’m From Barcelona 	 I'm from Barcelona
Ja

In [None]:
# Merge the two rename files into dbRenames.yaml; on duplicate keys the entry from
# relDBRenames2.yaml wins because it is unpacked last.
db1 = getFile("relDBRenames.yaml")
db2 = getFile("relDBRenames2.yaml")

for renameMap in (db1, db2):
    print(len(renameMap))
db = {**db1, **db2}
saveFile(idata=db, ifile="dbRenames.yaml")

In [None]:
# Display the number of entries in the saved dbDBRenames.yaml file.
# NOTE(review): absolute, user-specific path; this will not resolve on another machine.
len(getFile("/Users/tgadfort/opt/anaconda3/envs/py37/discogs/dbDBRenames.yaml"))

In [None]:
# Reload the "db" artist-name database from disk (init=False keeps its stored contents).
dbManDB = masterArtistNameDB("db", init=False)

In [None]:
# Reload the "db" name database, add the renames from relDBRenames2.yaml, and save both
# the database and its renames. The relDBRenames.yaml merge is left disabled.
from masterArtistNameDB import masterArtistNameDB
dbManDB = masterArtistNameDB("db", init=False)
#dbManDB.addRenames(getFile("relDBRenames.yaml"))
dbManDB.addRenames(getFile("relDBRenames2.yaml"))
dbManDB.save()
dbManDB.saveRenames()


In [None]:
# Spot check: look one artist name up in the "db" name database.
dbManDB.findArtist("Sharon Jones & the Dap-Kings")

In [None]:
# Print every artist key of the "master" name database that the "db" name database
# would rename to something else.
manDB = masterArtistNameDB("master", init=False)
artistNames = manDB.artistNameDB.keys()
for artistName in artistNames:
    if dbManDB.renamed(artistName) == artistName:
        continue
    print(artistName)

In [None]:
# Reload the "db" artist-name database from disk (init=False keeps its stored contents).
dbManDB = masterArtistNameDB("db", init=False)

In [None]:
# Re-key one artist in the Music map under a corrected spelling: read the old entry's
# per-DB IDs, remove the old name, add the new name, copy every non-None ID across, save.
mdbmap = musicDBMap("Music", init=False, copy=False)
oldSpelling = "Gioacchino Rossini"
newSpelling = "Gioachino Rossini"

# The old IDs must be fetched before the old entry is removed.
results = mdbmap.getArtistDataByName(oldSpelling)
mdbmap.removeArtistByName(oldSpelling)
mdbmap.addArtistByName(newSpelling)
for db,dbID in results.getDict().items():
    if dbID is None:
        continue
    mdbmap.addArtistDataByName(newSpelling, db, dbID)
mdbmap.save()

# Check For Bad Match

In [None]:
# Load the "db" artist-name database with the constructor's default `init` setting.
# NOTE(review): this binds `dbmanDB` (lower-case "m"), a different variable from the
# `dbManDB` used in the cells above -- confirm the two names are intentional.
from masterArtistNameDB import masterArtistNameDB
dbmanDB   = masterArtistNameDB("db")

In [None]:
from difflib import SequenceMatcher

# For every AllMusic artist, compare its AllMusic name with the (renamed) name of each
# linked DB entry. Near-identical but not equal names (ratio in [maxRatio - 0.005, 1.0))
# are printed and stored as secondary name -> AllMusic name in badMatches.yaml.
maxRatio = 0.85  # lower similarity bound for reporting, despite the name
badMatches = {}

for primaryKey,artistName in mdbmaps["AllMusic"].getArtists().items():
    primaryArtistName = maindb.getArtistDBNameFromID("AllMusic", primaryKey)
    artistData = mdbmaps["AllMusic"].getArtistDataByKey(primaryKey)
    for db,dbID in artistData.getDict().items():
        # Skip the two excluded DBs and any DB without a linked ID.
        if db in ["DatPiff", "MetalStorm"] or dbID is None:
            continue

        secondaryArtistName = maindb.getArtistDBNameFromID(db, dbID)
        secondaryArtistName = dbmanDB.renamed(secondaryArtistName)
        s = SequenceMatcher(None, str(primaryArtistName), str(secondaryArtistName))
        ratio = s.ratio()
        # Exact matches are fine; clearly different names are out of scope here.
        if ratio >= 1.0 or ratio < maxRatio - 0.005:
            continue

        print("[{0: <30} {1: <4} {2: >30}] --> (AllMusic) {3} / {4} ({5})".format(primaryArtistName,round(ratio,3),secondaryArtistName,primaryKey,dbID,db))
        badMatches[secondaryArtistName] = primaryArtistName

saveFile(idata=badMatches, ifile="badMatches.yaml")

In [None]:
# Reload the "db" artist-name database with the constructor's default `init` setting
# (binds `dbmanDB`, lower-case "m").
from masterArtistNameDB import masterArtistNameDB
dbmanDB   = masterArtistNameDB("db")

In [None]:
# Read back the near-match pairs written to badMatches.yaml.
badRenames = getFile("badMatches.yaml")

In [None]:
# Register the pairs loaded from badMatches.yaml as renames in the "db" name database
# and save both the database and its renames.
dbmanDB.addRenames(badRenames)
dbmanDB.save()
dbmanDB.saveRenames()

In [None]:
# Print every name in `dbKeys` that the "master" name database would rename, together
# with the renamed value.
# NOTE(review): `dbKeys` is not assigned in any cell visible in this part of the
# notebook -- confirm it is defined before this cell on a fresh run.
for dbArtistName in dbKeys:
    if manDB.renamed(dbArtistName) != dbArtistName:
        print(dbArtistName, manDB.renamed(dbArtistName))

# MultiArtist

In [None]:
# Open the "Music" DB map (init=False, copy=False).
mdbmap = musicDBMap("Music", init=False, copy=False)

In [None]:
# Create the multiartist helper whose getArtistNames() is used by the cells below.
mularts  = multiartist(cutoff=0.9, discdata=None, exact=False)

In [None]:
# Save the Music-map artist names for which getArtistNames() returns more than one name.
multiDelimArtistsMusic = [
    artistName
    for artistID,artistName in mdbmap.getArtists().items()
    if len(mularts.getArtistNames(artistName)) > 1
]
saveFile(idata=multiDelimArtistsMusic, ifile="../multiartist/multiDelimArtistsMusic.yaml")

In [None]:
# Save the AllMusic artist names (None entries skipped) for which getArtistNames()
# returns more than one name.
allMusicIDToName = maindb.dbdata["AllMusic"]["Disc"].getArtistIDToNameData()
multiDelimArtistsAllMusic = [
    artistName
    for artistID,artistName in allMusicIDToName.items()
    if artistName is not None and len(mularts.getArtistNames(artistName)) > 1
]
saveFile(idata=multiDelimArtistsAllMusic, ifile="../multiartist/multiDelimArtistsAllMusic.yaml")

In [None]:
# Save the MusicBrainz artist names (None entries skipped) for which getArtistNames()
# returns more than one name.
musicBrainzIDToName = maindb.dbdata["MusicBrainz"]["Disc"].getArtistIDToNameData()
multiDelimArtistsMusicBrainz = [
    artistName
    for artistID,artistName in musicBrainzIDToName.items()
    if artistName is not None and len(mularts.getArtistNames(artistName)) > 1
]
saveFile(idata=multiDelimArtistsMusicBrainz, ifile="../multiartist/multiDelimArtistsMusicBrainz.yaml")

In [None]:
# Save the RateYourMusic artist names (None entries skipped) for which getArtistNames()
# returns more than one name.
rateYourMusicIDToName = maindb.dbdata["RateYourMusic"]["Disc"].getArtistIDToNameData()
multiDelimArtistsRateYourMusic = [
    artistName
    for artistID,artistName in rateYourMusicIDToName.items()
    if artistName is not None and len(mularts.getArtistNames(artistName)) > 1
]
saveFile(idata=multiDelimArtistsRateYourMusic, ifile="../multiartist/multiDelimArtistsRateYourMusic.yaml")

In [None]:
# Save the RockCorner artist names (None entries skipped) for which getArtistNames()
# returns more than one name.
rockCornerIDToName = maindb.dbdata["RockCorner"]["Disc"].getArtistIDToNameData()
multiDelimArtistsRockCorner = [
    artistName
    for artistID,artistName in rockCornerIDToName.items()
    if artistName is not None and len(mularts.getArtistNames(artistName)) > 1
]
saveFile(idata=multiDelimArtistsRockCorner, ifile="../multiartist/multiDelimArtistsRockCorner.yaml")

In [None]:
from glob import glob
from listUtils import getFlatList

# Combine every per-DB multiDelimArtists*.yaml list into one de-duplicated, sorted list.
# The output uses a .p extension, so it is not picked up by the *.yaml pattern on re-runs.
perDBFiles = glob("../multiartist/multiDelimArtists*.yaml")
combinedNames = getFlatList([getFile(x) for x in perDBFiles])
saveFile(idata=sorted(set(combinedNames)), ifile="../multiartist/multiDelimArtists.p")

****
****
****
****

In [None]:
# Seed the secondary DB map from links already recorded in the primary DB map: for each
# primary artist holding a secondary ID that the secondary map does not know yet, compare
# the two names and, when similar enough, add the artist (and both IDs) to the secondary map.
manual      = False   # True: only print the add* calls to run by hand (forces test)
test        = False   # True: report matches without modifying or saving anything
cutoff      = 0.7     # minimum SequenceMatcher ratio (inclusive)
primaryDB   = "AceBootlegs"
secondaryDB = "Discogs"
Nmatches = 0
if manual:
    test = True

for primaryKey in mdbmaps[primaryDB].getArtists():
    artistName, artistID = primaryKey[0], primaryKey[1]
    artistData = mdbmaps[primaryDB].getArtistData(artistName, artistID)
    if artistData is None:
        continue

    matchIDs = artistData.getDict()
    secondaryID   = matchIDs[secondaryDB]
    if secondaryID is None:
        continue
    secondaryName = maindb.getArtistDBNameFromID(secondaryDB, secondaryID)
    if secondaryName is None:
        continue

    # Nothing to do when the secondary map already has this artist.
    if mdbmaps[secondaryDB].isKnown(secondaryName, secondaryID):
        continue

    s = SequenceMatcher(None, str(artistName), str(secondaryName))
    if s.ratio() < cutoff:
        continue

    print("[{0: <30} {1: <4} {2: >30}] --> {3} / {4}".format(artistName,round(s.ratio(),2),secondaryName,artistID,secondaryID))
    Nmatches += 1
    if manual is True:
        print("mdbmaps[\"{0}\"].addArtist(\"{1}\", \"{2}\")".format(secondaryDB, secondaryName, secondaryID))
        print("mdbmaps[\"{0}\"].addArtistData(\"{1}\", \"{2}\", \"{3}\", \"{4}\")".format(secondaryDB, secondaryName, secondaryID, secondaryDB, secondaryID))
        print("mdbmaps[\"{0}\"].addArtistData(\"{1}\", \"{2}\", \"{3}\", \"{4}\")".format(secondaryDB, secondaryName, secondaryID, primaryDB, artistID))
    elif test is False:
        mdbmaps[secondaryDB].addArtist(secondaryName, secondaryID)
        mdbmaps[secondaryDB].addArtistData(secondaryName, secondaryID, secondaryDB, secondaryID)
        mdbmaps[secondaryDB].addArtistData(secondaryName, secondaryID, primaryDB, artistID)


if test is False and Nmatches > 0:
    mdbmaps[secondaryDB].save()
print("Found {0} mutual matches".format(Nmatches))

In [None]:
# Manually add Alt-J to the Discogs map with its own Discogs ID and its RateYourMusic ID.
altJName      = "Alt-J"
altJDiscogsID = "2830806"
mdbmaps["Discogs"].addArtist(altJName, altJDiscogsID)
for linkedDB,linkedID in (("Discogs", altJDiscogsID), ("RateYourMusic", "791685")):
    mdbmaps["Discogs"].addArtistData(altJName, altJDiscogsID, linkedDB, linkedID)
mdbmaps["Discogs"].save()

# Match Mutual DBs

In [None]:
from difflib import SequenceMatcher
from pandas import Series, DataFrame, isna, isnull

def getDBDF(mdbmc, dbName, dbShort, keepNA=False):
    """
    Build a per-artist DataFrame for one DB, annotated with its matched-album count.

    The transposed frame of the notebook-level ``mdbmaps[dbName]`` is joined with the
    number of albums per primary key from ``mdbmc.getDBMatchData(dbName)``. The index is
    moved into columns and the first two are named "<dbShort>ArtistName" and
    "<dbShort>Key" (this assumes a two-level (name, key) index -- confirm).

    Args:
        mdbmc: object providing getDBMatchData(dbName) -> {primaryKey: albums}.
        dbName: DB name; also the column used for the NA filter.
        dbShort: short prefix used for the generated column names.
        keepNA: when False (default) rows whose ``dbName`` column is NA are dropped.

    Returns:
        DataFrame ordered as [name, key, "<dbShort>Albums", original columns...] and
        sorted by "<dbShort>Albums" descending. Row counts before and after are printed.
    """
    albumsColumn = "{0}Albums".format(dbShort)

    amdf = DataFrame(mdbmaps[dbName].getDF().T)
    print("Total Size: {0}".format(amdf.shape[0]))
    dbcols = amdf.columns

    albumCounts = {primaryKey: len(albums) for primaryKey,albums in mdbmc.getDBMatchData(dbName).items()}
    amdf = amdf.join(DataFrame(Series(albumCounts)))

    # The joined count column is last; give it its final name.
    joinedColumns = list(amdf.columns)
    joinedColumns[-1] = albumsColumn
    amdf.columns = joinedColumns

    # Move the index into regular columns and name the first two of them.
    amdf = amdf.reset_index()
    columns = list(amdf.columns)
    columns[0], columns[1] = "{0}ArtistName".format(dbShort), "{0}Key".format(dbShort)
    amdf.columns = columns

    if keepNA is False:
        amdf = amdf[amdf[dbName].notna()]

    colOrder = columns[:2] + [columns[-1]] + list(dbcols)
    amdf = amdf[colOrder].sort_values(albumsColumn, ascending=False)

    print(" Good Size: {0}".format(amdf.shape[0]))
    return amdf


def checkDBIDMatch(amdf, dbName, dbShort):
    """
    Return the index labels of rows whose "<dbShort>Key" differs from the ``dbName`` column.

    Args:
        amdf: DataFrame containing both columns.
        dbName: name of the column holding the DB's own ID.
        dbShort: short prefix of the key column ("<dbShort>Key").

    Returns:
        list of index labels of the mismatching rows; the count is also printed.
    """
    keyColumn = "{0}Key".format(dbShort)
    mismatch = amdf[keyColumn] != amdf[dbName]
    idxs = list(amdf[mismatch].index)
    print("Found {0} wrongly assigned index".format(len(idxs)))
    return idxs
 
    
#####################################################################################################################
# Fill Matched DB Data From Overlapping Data In A Previously Matched DB
#####################################################################################################################
def mutualMatch(amdf, mdbmaps, primaryInfo, secondaryInfo, debug=False, test=True, ratioCut=0.9):
    """
    Back-fill the primary DB's key into secondary-DB entries the primary DB already links to.

    For each row of ``amdf`` with a non-null secondary ID, that ID is resolved to the
    secondary map's primary key via ``getPrimaryKeyFromID``. When the two artist names
    are similar enough (SequenceMatcher ratio strictly above ``ratioCut``) and the
    secondary entry does not already hold the primary key, the link is printed and,
    unless ``test`` is True, written with ``addArtistData``.

    Args:
        amdf: DataFrame with "<short>ArtistName", "<short>Key" and a column named after
            the secondary DB (the column layout getDBDF produces).
        mdbmaps: mapping of DB name -> DB map object.
        primaryInfo: (DB name, DB short name) of the already matched DB.
        secondaryInfo: (DB name, DB short name) of the DB to update.
        debug: print a per-row comparison table when True.
        test: when True (default) only report; nothing is added or saved.
        ratioCut: exclusive lower bound on the name-similarity ratio.

    Returns:
        None. When ``test`` is not True the secondary map is saved at the end.
    """
    primaryDBName, primaryDBShort = primaryInfo[0], primaryInfo[1]
    nameColumn = "{0}ArtistName".format(primaryDBShort)
    keyColumn  = "{0}Key".format(primaryDBShort)
    secondaryDBName, secondaryDBShort = secondaryInfo[0], secondaryInfo[1]

    linkedRows = amdf[amdf[secondaryDBName].notna()][[nameColumn, keyColumn, secondaryDBName]]
    for _, row in linkedRows.iterrows():

        ## Primary DB Key --> Secondary ID
        amDBName = row[nameColumn]
        amDBKey  = row[keyColumn]
        secondaryID = row[secondaryDBName]

        ## Secondary ID --> Secondary DB Key
        primKey = mdbmaps[secondaryDBName].getPrimaryKeyFromID(secondaryID)
        if primKey is None:
            continue
        rcDBName, rcDBKey = primKey[0], primKey[1]

        score = SequenceMatcher(None, str(amDBName), str(rcDBName)).ratio()

        if debug is True:
            print("{0: <30}{1: <30}{2: <30}{3}".format("", primaryDBName, secondaryDBName, "Match"))
            print("{0: <30}{1: <30}{2: <30}{3}".format("{0} ID   --> {1} ID".format(primaryDBShort, secondaryDBShort), amDBKey, rcDBKey, round(score,2)))
            print("{0: <30}{1: <30}{2: <30}".format("{0} Name --> {1} Name".format(primaryDBShort, secondaryDBShort), amDBName, rcDBName))

        # The primary-side lookup result is not used below; the call itself is retained.
        mdbmaps[primaryDBName].getArtistData(amDBName, amDBKey)
        rcDBData = mdbmaps[secondaryDBName].getArtistData(rcDBName, rcDBKey)

        # Primary key currently stored on the secondary entry (if any).
        rcAMDBKey = rcDBData.getDBID(primaryDBName)
        if debug is True:
            print("{0: <30}{1: <30}{2: <30}".format("{0} ID   --> {1} ID".format(secondaryDBShort, primaryDBShort), str(rcAMDBKey), str(rcDBKey)))

        if score > ratioCut and rcAMDBKey != amDBKey:
            print("{0: <50}{1: <75}\t--->\t[{2}/{3}]".format("Setting {0} DBData For {1}".format(secondaryDBName,primaryDBName),"[{0: <30} {1: <4} {2: >30}]".format(amDBName,round(score,2),rcDBName),amDBKey,rcDBKey))
            # In test mode the row ends here (this also skips the debug spacer below).
            if test is True:
                continue
            mdbmaps[secondaryDBName].addArtistData(rcDBName, rcDBKey, primaryDBName, amDBKey)

        if debug:
            print("\n")

    if test is not True:
        mdbmaps[secondaryDBName].save()
    
    
    
#####################################################################################################################
# Fill Fresh DB Data From A Previously Matched DB
#####################################################################################################################
def mutualMatchFromPreviousDBMatch(amdf, mdbmaps, primaryInfo, secondaryInfo, debug=False, test=True, ratioCut=0.9):
    """
    Walk the primary-DB rows that carry a secondary-DB ID and compare artist names.

    Unlike mutualMatch, the secondary ID is resolved through the notebook-level
    ``toMatchKeys`` mapping (not a parameter) instead of getPrimaryKeyFromID.

    NOTE(review): an unconditional ``continue`` right after the debug printout makes
    everything below it in the loop unreachable, so as written this function only prints
    the comparison table (when ``debug`` is True) and never adds artist data. It looks
    like a debugging leftover -- confirm before removing it, because the unreachable
    ``addArtist`` / first ``addArtistData`` calls are not guarded by ``test``.

    Args:
        amdf: DataFrame with "<short>ArtistName", "<short>Key" and a column named after
            the secondary DB.
        mdbmaps: mapping of DB name -> DB map object.
        primaryInfo: (DB name, DB short name) of the already matched DB.
        secondaryInfo: (DB name, DB short name) of the DB to fill.
        debug: print a per-row comparison table when True.
        test: when False the secondary map is saved at the end.
        ratioCut: exclusive similarity bound (only used by the unreachable part).

    Returns:
        None.
    """

    primaryDBName  = primaryInfo[0]
    primaryDBShort = primaryInfo[1]
    primaryDBArtistName = "{0}ArtistName".format(primaryDBShort)
    primaryDBKey        = "{0}Key".format(primaryDBShort)

    secondaryDBName  = secondaryInfo[0]
    secondaryDBShort = secondaryInfo[1]



    # Keep only rows that already have a secondary ID, and only the three needed columns.
    amrcdf = amdf[~amdf[secondaryDBName].isna()][[primaryDBArtistName, primaryDBKey, secondaryDBName]]
    for i,row in amrcdf.iterrows():

        ## Primary DB Key --> Secondary ID
        amDBName = row[primaryDBArtistName]
        amDBKey  = row[primaryDBKey]
        rcDBKey  = row[secondaryDBName]

        if amDBKey is None:
            continue

        #if debug is True:
        #    print(amDBName,'\t',amDBKey,'\t',rcDBKey,'\t-->\t',end="")

        ## Secondary ID --> Secondary DB Key
        # `toMatchKeys` is read from the notebook namespace; it must exist before calling.
        primKey = toMatchKeys.get(rcDBKey)
        #if debug is True:
        #    print(primKey)
        #primKey  = mdbmaps[secondaryDBName].getPrimaryKeyFromID(rcDBKey)

        if primKey is None:
            continue
        rcDBName = primKey[0]
        rcDBKey  = primKey[1]



        s = SequenceMatcher(None, str(amDBName), str(rcDBName))

        if debug is True:
            print("{0: <30}{1: <30}{2: <30}{3}".format("", primaryDBName, secondaryDBName, "Match"))
            print("{0: <30}{1: <30}{2: <30}{3}".format("{0} ID   --> {1} ID".format(primaryDBShort, secondaryDBShort), amDBKey, rcDBKey, round(s.ratio(),2)))
            print("{0: <30}{1: <30}{2: <30}".format("{0} Name --> {1} Name".format(primaryDBShort, secondaryDBShort), amDBName, rcDBName))

        # NOTE(review): this unconditional continue makes the remainder of the loop body dead code.
        continue
        if not mdbmaps[secondaryDBName].isKnown(rcDBName, rcDBKey):
            mdbmaps[secondaryDBName].addArtist(rcDBName, rcDBKey)
            mdbmaps[secondaryDBName].addArtistData(rcDBName, rcDBKey, secondaryDBName, rcDBKey)


        amDBData = mdbmaps[primaryDBName].getArtistData(amDBName, amDBKey)
        rcDBData = mdbmaps[secondaryDBName].getArtistData(rcDBName, rcDBKey)

        amRCDBKey = rcDBKey
        rcAMDBKey = rcDBData.getDBID(primaryDBName)
        if debug is True:
            print("{0: <30}{1: <30}{2: <30}".format("{0} ID   --> {1} ID".format(secondaryDBShort, primaryDBShort), str(rcAMDBKey), str(amRCDBKey)))    

        if s.ratio() > ratioCut:
            if rcAMDBKey != amDBKey:
                print("{0: <50}{1: <75}\t--->\t[{2}/{3}]".format("Setting {0} DBData For {1}".format(secondaryDBName,primaryDBName),"[{0: <30} {1: <4} {2: >30}]".format(amDBName,round(s.ratio(),2),rcDBName),amDBKey,rcDBKey))
                if test is True:
                    continue
                mdbmaps[secondaryDBName].addArtistData(rcDBName, rcDBKey, primaryDBName, amDBKey)


        if debug:
            print("\n")

    # The save still runs when test is False, even though the loop above adds nothing.
    if test is False:
        mdbmaps[secondaryDBName].save()    
    
    

def isSame(x):
    """
    Tell whether a row's own artist name equals the matched DB's artist name.

    Reads ``x.MyArtistName`` and ``x["<dbShort>ArtistName"]``, where ``dbShort`` is a
    notebook-level variable that must be set before this is called.

    Args:
        x: row-like object (e.g. a pandas Series) holding both name fields.

    Returns:
        None when either name is missing (None/NaN/NA) or empty,
        otherwise True/False for string equality of the two names.
    """
    name  = x.MyArtistName
    match = x["{0}ArtistName".format(dbShort)]

    # Missing values are tested first: pandas.NA cannot be evaluated for truthiness,
    # so checking truthiness before isna() would raise TypeError on NA names.
    if any(isna(value) for value in (name, match)):
        return None
    # Remaining falsy values (e.g. empty strings) also count as "no name".
    if not (name and match):
        return None

    return str(name) == str(match)

def ratio(x):
    """
    Similarity ratio between a row's own artist name and the matched DB's artist name.

    Reads ``x.MyArtistName``, ``x["<dbShort>ArtistName"]`` (``dbShort`` is a
    notebook-level variable) and ``x.Same``.

    Args:
        x: row-like object (e.g. a pandas Series) with the three fields above.

    Returns:
        None when ``x.Same`` is None, -1.0 when either name is falsy, otherwise the
        SequenceMatcher ratio of the two names as strings.
    """
    name  = x.MyArtistName
    match = x["{0}ArtistName".format(dbShort)]
    if x.Same is None:
        return None
    if not all([name,match]):
        return -1.0
    return SequenceMatcher(None, str(name), str(match)).ratio()

# Fill Secondary DB With Previous Matches (No Albums Match)

In [None]:
# Transposed DataFrame view of the MusicBrainz map (a later cell searches its index).
mbdf = mdbmaps["MusicBrainz"].getDF().T

In [None]:
# Display getArtistID() for a MusicBrainz UUID (presumably the artist ID format used
# as keys in the DB maps -- confirm).
maindb.dbdata["MusicBrainz"]["Utils"].getArtistID("1660fbb6-c55e-4ad3-8342-c424162c30a6")

In [None]:
# Display getArtistID() for a second MusicBrainz UUID.
maindb.dbdata["MusicBrainz"]["Utils"].getArtistID("c685765c-b778-4ed6-9035-8f7088431765")

In [None]:
# Display getArtistID() for a third MusicBrainz UUID.
maindb.dbdata["MusicBrainz"]["Utils"].getArtistID("5bf05dd4-3ac1-4b8b-b9a1-163790d10192")

In [None]:
# Manually add The Highwomen to the MusicBrainz map with its own ID and its AllMusic ID.
highwomenName = "The Highwomen"
highwomenMBID = "286831209235815715269312378569239355331"
mdbmaps["MusicBrainz"].addArtist(highwomenName, highwomenMBID)
for linkedDB,linkedID in (("MusicBrainz", highwomenMBID), ("AllMusic", "0003859078")):
    mdbmaps["MusicBrainz"].addArtistData(highwomenName, highwomenMBID, linkedDB, linkedID)
mdbmaps["MusicBrainz"].save()

In [None]:
# Print every MusicBrainz index entry whose first element contains "Highway".
for idx in mbdf.index:
    if "Highway" in idx[0]:
        print(idx)

In [None]:
# Remove every (name, ID) pair listed in togetMB.p from the MusicBrainz map, then save.
# NOTE(review): destructive and saved immediately -- confirm the list before re-running.
toget   = getFile(ifile="togetMB.p")
for primaryKey in toget:
    mdbmaps["MusicBrainz"].removeArtist(primaryKey[0], primaryKey[1])
mdbmaps["MusicBrainz"].save()

In [None]:
# Disabled experiment (guarded by `if False`, so it never runs).
# NOTE(review): the loop body uses artistName / artistID / artistAlbums, none of which
# is taken from `item`, and `matchData` and `mdbMatcher` come from other cells --
# this would need fixing before it is re-enabled.
if False:
    results = {}
    for item in matchData[:100]:
        print(item[0])
        mdbMatcher.setArtistInfo(artistName, artistID, artistAlbums)
        #mdbMatcher.findPotentialArtistNameMatches()
        #mc = mdbMatcher.findPotentialArtistAlbumMatchesByDB('Discogs')
        mcs = mdbMatcher.findPotentialArtistAlbumMatches()
        results[(artistName,artistID)] = {db: [mc.matchID, mc.matchScore] for db, mc in mcs.items()}

In [None]:
# Transposed DataFrame view of the current `mdbmap`.
df = mdbmap.getDF().T

In [None]:
from pandas import Series

# Count the matched albums per AllMusic primary key and keep the counts as a dict
# ordered from most to fewest albums.
matchData  = mdbmc.getDBMatchData("AllMusic")
albumCounts = Series({primaryKey: len(albums) for primaryKey,albums in matchData.items()})
sortedData = albumCounts.sort_values(ascending=False).to_dict()

In [None]:
# Display the album-count dict built in the previous cell.
# NOTE(review): this prints every entry; consider showing only the first few items.
sortedData

In [None]:
# Display the raw AllMusic match data.
# NOTE(review): this prints the whole structure; consider showing a small slice instead.
matchData