In [1]:
## Basic stuff
%load_ext autoreload
%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))


################################################################################
## General Stuff
################################################################################
from timeUtils import clock, elapsed
from listUtils import getFlatList
from time import sleep
from pandas import DataFrame, Series
from ioUtils import getFile, saveFile
from searchUtils import findDirs, findNearest, findAll
from fileUtils import getDirBasics, getBaseFilename
from unicodedata import normalize
from fsUtils import moveDir, setDir, mkDir, isDir, removeDir
from pandasUtils import getRowDataByColValue, getRowData
import operator
from glob import glob
from os.path import join
from collections import Counter


################################################################################
## Mp3 Stuff
################################################################################
from mp3id import mp3ID


################################################################################
## Database Stuff
################################################################################
from dbBase import dbBase
from mainDB import mainDB
from multiArtist import multiartist
from matchAlbums import matchAlbums
from masterdb import masterdb
from masterDBMatchClass import masterDBMatchClass


################################################################################
## Music Stuff
################################################################################
from musicUtils import *
from myMusicDBMap import myMusicDBMap
from musicBase import myMusicBase
from musicDBMap import musicDBMap
from matchDBArtist import matchDBArtist
from matchMyMusic import matchMyMusic
from matchMusicName import myMusicName
from mergeDB import searchForMutualDBEntries, searchForMutualArtistDBEntries


################################################################################
## Pandas Stuff
################################################################################
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


## Python Version
import sys
print("Python: {0}".format(sys.version))

import datetime as dt
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))



Python: 3.7.7 (default, Mar 26 2020, 10:32:53) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
Notebook Last Run Initiated: 2020-11-17 19:30:51.787256


# My Music Database Map

In [2]:
from difflib import SequenceMatcher
from pandas import Series, DataFrame, isna, isnull

def getDBDF(mdbmc, dbName, dbShort, keepNA=False):
    """Build a per-database frame of artists with their matched-album counts.

    Args:
        mdbmc: Object exposing ``getDBMatchData(dbName)``; the result is iterated
            with ``.items()`` and ``len()`` is taken of each value.
        dbName: Database name; used as a key into the global ``mdbmaps`` and as
            the column checked for missing IDs.
        dbShort: Short prefix used to name the ``<dbShort>ArtistName``,
            ``<dbShort>Key`` and ``<dbShort>Albums`` columns.
        keepNA: When False (default), rows whose ``dbName`` column is NA are dropped.

    Returns:
        DataFrame sorted by ``<dbShort>Albums`` in descending order.

    NOTE(review): reads the notebook-level global ``mdbmaps`` rather than taking
    it as a parameter, so this only works after the cell that builds ``mdbmaps``.
    """
    amdf = DataFrame(mdbmaps[dbName].getDF().T)
    print("Total Size: {0}".format(amdf.shape[0]))
    # Remember the original DB-ID columns so they can be placed last in the output.
    dbcols = amdf.columns

    # One row per primary key holding the number of matched albums.
    matchData  =  mdbmc.getDBMatchData(dbName)
    mAlbDF = DataFrame(Series({primaryKey: len(albums) for primaryKey,albums in matchData.items()}))

    # The joined count column arrives last; rename it by position.
    amdf = amdf.join(mAlbDF)
    cols = list(amdf.columns)
    cols[-1] = "{0}Albums".format(dbShort)
    amdf.columns = cols

    # presumably the index has two levels (artist name, key) -- the first two
    # columns after reset_index are renamed accordingly; TODO confirm.
    amdf.reset_index(inplace=True)
    columns = list(amdf.columns)
    columns[0] = "{0}ArtistName".format(dbShort)
    columns[1] = "{0}Key".format(dbShort)
    amdf.columns = columns
    if keepNA is False:
        amdf = amdf[~amdf[dbName].isna()]

    # Final layout: name, key, album count, then the original DB-ID columns.
    colOrder = columns[:2] + [columns[-1]] + list(dbcols)
    amdf = amdf[colOrder]

    amdf = amdf.sort_values("{0}Albums".format(dbShort), ascending=False)

    print(" Good Size: {0}".format(amdf.shape[0]))
    return amdf


def checkDBIDMatch(amdf, dbName, dbShort):
    """Return the index labels of rows whose ``<dbShort>Key`` differs from the ``dbName`` column.

    The number of mismatching rows is printed before the list is returned.
    """
    keyColumn = "{0}Key".format(dbShort)
    mismatched = amdf.loc[amdf[keyColumn] != amdf[dbName]]
    idxs = list(mismatched.index)
    print("Found {0} wrongly assigned index".format(len(idxs)))
    return idxs
 
    
#####################################################################################################################
# Fill Matched DB Data From Overlapping Data In A Previously Matched DB
#####################################################################################################################
def mutualMatch(amdf, mdbmaps, primaryInfo, secondaryInfo, debug=False, test=True, ratioCut=0.9):
    """Copy primary-DB IDs onto secondary-DB artists that the primary frame already links to.

    Args:
        amdf: Frame with ``<primaryShort>ArtistName``, ``<primaryShort>Key`` and a
            column named after the secondary DB holding its IDs.
        mdbmaps: Mapping of DB name -> map object (``getPrimaryKeyFromID``,
            ``getArtistData``, ``addArtistData`` and ``save`` are used here).
        primaryInfo: ``(dbName, dbShort)`` of the already-matched database.
        secondaryInfo: ``(dbName, dbShort)`` of the database to be filled.
        debug: Print a comparison table for every candidate row.
        test: Dry run -- report what would be set, but neither add nor save.
        ratioCut: Minimum name similarity (exclusive) for an ID to be copied.
    """
    primaryDBName, primaryDBShort = primaryInfo[0], primaryInfo[1]
    secondaryDBName, secondaryDBShort = secondaryInfo[0], secondaryInfo[1]
    nameColumn = "{0}ArtistName".format(primaryDBShort)
    keyColumn  = "{0}Key".format(primaryDBShort)

    # Only rows that already carry a secondary-DB ID can be cross-checked.
    hasSecondaryID = ~amdf[secondaryDBName].isna()
    candidates = amdf[hasSecondaryID][[nameColumn, keyColumn, secondaryDBName]]

    for _, entry in candidates.iterrows():
        amDBName = entry[nameColumn]
        amDBKey  = entry[keyColumn]

        # Secondary ID --> (name, key) of the secondary DB entry
        primKey = mdbmaps[secondaryDBName].getPrimaryKeyFromID(entry[secondaryDBName])
        if primKey is None:
            continue
        rcDBName, rcDBKey = primKey[0], primKey[1]

        similarity = SequenceMatcher(None, str(amDBName), str(rcDBName)).ratio()

        if debug is True:
            print("{0: <30}{1: <30}{2: <30}{3}".format("", primaryDBName, secondaryDBName, "Match"))
            print("{0: <30}{1: <30}{2: <30}{3}".format("{0} ID   --> {1} ID".format(primaryDBShort, secondaryDBShort), amDBKey, rcDBKey, round(similarity,2)))
            print("{0: <30}{1: <30}{2: <30}".format("{0} Name --> {1} Name".format(primaryDBShort, secondaryDBShort), amDBName, rcDBName))

        # The primary lookup result is not used below; the call is kept as in the original.
        mdbmaps[primaryDBName].getArtistData(amDBName, amDBKey)
        rcDBData = mdbmaps[secondaryDBName].getArtistData(rcDBName, rcDBKey)

        # Primary ID currently recorded on the secondary entry
        rcAMDBKey = rcDBData.getDBID(primaryDBName)
        if debug is True:
            print("{0: <30}{1: <30}{2: <30}".format("{0} ID   --> {1} ID".format(secondaryDBShort, primaryDBShort), str(rcAMDBKey), str(rcDBKey)))    

        if similarity > ratioCut and rcAMDBKey != amDBKey:
            print("{0: <50}{1: <75}\t--->\t[{2}/{3}]".format("Setting {0} DBData For {1}".format(secondaryDBName,primaryDBName),"[{0: <30} {1: <4} {2: >30}]".format(amDBName,round(similarity,2),rcDBName),amDBKey,rcDBKey))
            if test is True:
                continue
            mdbmaps[secondaryDBName].addArtistData(rcDBName, rcDBKey, primaryDBName, amDBKey)

        if debug:
            print("\n")

    if test is not True:
        mdbmaps[secondaryDBName].save()
    
    
    
#####################################################################################################################
# Fill Fresh DB Data From A Previously Matched DB
#####################################################################################################################
def mutualMatchFromPreviousDBMatch(amdf, mdbmaps, primaryInfo, secondaryInfo, debug=False, test=True, ratioCut=0.9, toMatchKeys=None):
    """Seed a fresh secondary-DB map from IDs already recorded in a matched primary frame.

    Fixes relative to the earlier version:
      * a stray ``continue`` after the debug prints made all matching/saving
        logic unreachable; it has been removed;
      * the secondary-ID lookup table is now an optional argument instead of an
        undeclared global;
      * in test mode nothing is added to the secondary map (previously the
        unreachable code would have created artists before checking ``test``).

    Args:
        amdf: Frame with ``<primaryShort>ArtistName``, ``<primaryShort>Key`` and a
            column named after the secondary DB holding its IDs.
        mdbmaps: Mapping of DB name -> map object (``isKnown``, ``addArtist``,
            ``addArtistData``, ``getArtistData`` and ``save`` are used here).
        primaryInfo: ``(dbName, dbShort)`` of the already-matched database.
        secondaryInfo: ``(dbName, dbShort)`` of the database to be filled.
        debug: Print a comparison table for every candidate row.
        test: Dry run -- report what would be set, but neither add nor save.
        ratioCut: Minimum name similarity (exclusive) for an ID to be copied.
        toMatchKeys: Optional mapping of secondary ID -> ``(name, key)``. When
            omitted, a module-level ``toMatchKeys`` is used if one exists,
            otherwise ``mdbmaps[secondaryDBName].getPrimaryKeyFromID`` is used.
    """
    primaryDBName  = primaryInfo[0]
    primaryDBShort = primaryInfo[1]
    primaryDBArtistName = "{0}ArtistName".format(primaryDBShort)
    primaryDBKey        = "{0}Key".format(primaryDBShort)

    secondaryDBName  = secondaryInfo[0]
    secondaryDBShort = secondaryInfo[1]
    secondaryMap     = mdbmaps[secondaryDBName]

    if toMatchKeys is None:
        # Backward compatibility: earlier sessions relied on a notebook-level global.
        toMatchKeys = globals().get("toMatchKeys")
    if toMatchKeys is not None:
        lookupPrimaryKey = toMatchKeys.get
    else:
        lookupPrimaryKey = secondaryMap.getPrimaryKeyFromID

    amrcdf = amdf[~amdf[secondaryDBName].isna()][[primaryDBArtistName, primaryDBKey, secondaryDBName]]
    for _, row in amrcdf.iterrows():

        ## Primary DB Key --> Secondary ID
        amDBName = row[primaryDBArtistName]
        amDBKey  = row[primaryDBKey]
        if amDBKey is None:
            continue

        ## Secondary ID --> Secondary DB Key
        primKey = lookupPrimaryKey(row[secondaryDBName])
        if primKey is None:
            continue
        rcDBName = primKey[0]
        rcDBKey  = primKey[1]

        matchRatio = SequenceMatcher(None, str(amDBName), str(rcDBName)).ratio()

        if debug is True:
            print("{0: <30}{1: <30}{2: <30}{3}".format("", primaryDBName, secondaryDBName, "Match"))
            print("{0: <30}{1: <30}{2: <30}{3}".format("{0} ID   --> {1} ID".format(primaryDBShort, secondaryDBShort), amDBKey, rcDBKey, round(matchRatio,2)))
            print("{0: <30}{1: <30}{2: <30}".format("{0} Name --> {1} Name".format(primaryDBShort, secondaryDBShort), amDBName, rcDBName))

        ## Create the secondary entry if it does not exist yet (never during a dry run)
        known = secondaryMap.isKnown(rcDBName, rcDBKey)
        if not known and test is False:
            secondaryMap.addArtist(rcDBName, rcDBKey)
            secondaryMap.addArtistData(rcDBName, rcDBKey, secondaryDBName, rcDBKey)
            known = True

        ## Primary ID currently recorded on the secondary entry (None if the entry is new)
        rcAMDBKey = None
        if known:
            rcAMDBKey = secondaryMap.getArtistData(rcDBName, rcDBKey).getDBID(primaryDBName)
        if debug is True:
            print("{0: <30}{1: <30}{2: <30}".format("{0} ID   --> {1} ID".format(secondaryDBShort, primaryDBShort), str(rcAMDBKey), str(rcDBKey)))

        if matchRatio > ratioCut and rcAMDBKey != amDBKey:
            print("{0: <50}{1: <75}\t--->\t[{2}/{3}]".format("Setting {0} DBData For {1}".format(secondaryDBName,primaryDBName),"[{0: <30} {1: <4} {2: >30}]".format(amDBName,round(matchRatio,2),rcDBName),amDBKey,rcDBKey))
            if test is False:
                secondaryMap.addArtistData(rcDBName, rcDBKey, primaryDBName, amDBKey)

        if debug:
            print("\n")

    if test is False:
        secondaryMap.save()
    
    

def isSame(x):
    """Row helper: tell whether my artist name equals the matched DB artist name.

    Reads ``x.MyArtistName`` and ``x["<dbShort>ArtistName"]`` (``dbShort`` is a
    notebook-level global). Returns None when either name is falsy or NA,
    otherwise True/False from comparing the two names as strings.
    """
    mine   = x.MyArtistName
    theirs = x["{0}ArtistName".format(dbShort)]
    pair   = [mine, theirs]

    # Nothing to compare if a name is empty/None ...
    if not all(pair):
        return None
    # ... or NA (NaN is truthy, so it needs its own check).
    if any([isna(value) for value in pair]):
        return None

    return str(mine) == str(theirs)

def ratio(x):
    """Row helper: similarity ratio between my artist name and the matched DB name.

    Reads ``x.MyArtistName``, ``x["<dbShort>ArtistName"]`` (``dbShort`` is a
    notebook-level global) and ``x.Same``. Returns None when ``Same`` is None,
    -1.0 when either name is falsy, otherwise ``SequenceMatcher.ratio()`` of the
    two names as strings.
    """
    mine   = x.MyArtistName
    theirs = x["{0}ArtistName".format(dbShort)]
    if x.Same is None:
        return None
    if not all([mine, theirs]):
        return -1.0
    return SequenceMatcher(None, str(mine), str(theirs)).ratio()

# Load My Music DB

In [3]:
#mdbmap.removeArtist("Marvin Gaye & Tammi Terrell")
#mdbmap.save()
mdbmap = musicDBMap("Music", init=False)

  Loaded 4732 previously matched entries


# Load DB Data

In [None]:
# Database name -> two-letter prefix used when naming per-database columns.
dbDict = {
    "AllMusic":      "AM",
    "Discogs":       "DC",
    "MusicBrainz":   "MB",
    "LastFM":        "LM",
    "RockCorner":    "RC",
    "MusicStack":    "MS",
    "CDandLP":       "CL",
    "AceBootlegs":   "AB",
    "RateYourMusic": "RM",
}
dbDict

In [None]:
# Build `mdbmaps`: one musicDBMap per database name.
# NOTE(review): the `loadAll` branch calls `maindb.getDBs()`, but `maindb` is only
# created in a later cell ("Load Main DB and Match Class"); on a fresh kernel this
# cell fails with NameError unless that cell is run first.
loadAll = True
if loadAll:
    mdbmaps = {}
    for db in maindb.getDBs():
        # Print the DB name without a newline so the loader's own message follows on the same line.
        print("{0: <30}".format(db), end="")
        mdbmaps[db] = musicDBMap(db, init=False)
    #mdbmaps = {db: musicDBMap(db, init=False) for db in maindb.getDBs()}
else:
    # Hand-picked subset of databases (does not need `maindb`).
    mdbmaps = {}
    mdbmaps["AllMusic"]      = musicDBMap("AllMusic", init=False)
    mdbmaps["MusicBrainz"]   = musicDBMap("MusicBrainz", init=False)
    mdbmaps["RockCorner"]    = musicDBMap("RockCorner", init=False)
    mdbmaps["MusicStack"]    = musicDBMap("MusicStack", init=False)
    mdbmaps["CDandLP"]       = musicDBMap("CDandLP", init=False)
    mdbmaps["AceBootlegs"]   = musicDBMap("AceBootlegs", init=False)
    mdbmaps["RateYourMusic"] = musicDBMap("RateYourMusic", init=False)

In [None]:
mdbmaps={}
mdbmaps["AllMusic"]      = musicDBMap("AllMusic", init=False)
mdbmaps["MusicBrainz"]   = musicDBMap("MusicBrainz", init=False)

# Load Main DB and Match Class

In [None]:
maindb = mainDB(mdb=None, create=False, debug=False)
maindb.loadDBDataMap()

In [None]:
mdbmc = masterDBMatchClass(maindb, mdbmaps)

In [None]:
dbDataMatch = mdbmc.getDBMatchData("Discogs")

In [4]:
mydf = mdbmap.getDF().T

In [None]:
# Reference URLs for the external music sites (kept as comments so this cell can run):
# https://www.musicstack.com/discography/joe+satriani
# https://www.therockcorner.com
# https://www.cdandlp.com/en/def-leppard/artist/
# https://www.amoeba.com/music/cd-and-vinyl/#/genre-43

In [13]:
mydf

Unnamed: 0,Unnamed: 1,Discogs,AllMusic,MusicBrainz,AceBootlegs,RateYourMusic,LastFM,DatPiff,RockCorner,CDandLP,MusicStack,MetalStorm
"""Weird Al"" Yankovic",144448e4d28359187bdfa41653d2de95,259422,0000817315,252089069206782159204445716107588788546,,,,,,,,
'68 Comeback,7a713b0e18aceebf99af14f348092a8f,350286,0000726082,223315906270366269211184693736845870471,,,76102572316,,,7789063285768,,
'68 Comeback Special,ef325f05f85b6d7eefccc257a35e6715,4690194,0002768705,334023133475570103002327579969852187749,,,,,,,,
*NSYNC,b968a9b7ce2a374b178b80070284c7c3,69433,0000516929,59621135211322028167760795472347712673,,,67689072579,,,,,
-123 min.,8311291b86c3d52634444dfb330f42b3,2054727,0000422575,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
Hellhole,79714996a98b8157b74985d2b4bc7f87,2707104,0001273006,,,,,,,,,
The Choir of Trinity College Cambridge,24b3c0e696d44b82ff3dcf3d433aca08,,0001488098,197193132978383613585900352857213609510,,,,,,,,
The Choir of King's College Cambridge,3559bca7c05b51850a9011dc0c0cccff,,0000608992,273338754289470679440863230907769944743,,,,,,,,
Mieczysław Horszowski,5423255356443e5212414eced87a51cf,,,,,,,,,,,


In [12]:
# Dump, per database, the artists that still have no ID in that database.
# The output directory is named once here instead of being repeated in every path;
# NOTE(review): it is still an absolute, user-specific path -- consider making it configurable.
PICKLE_DIR = "/Users/tgadfort/Dropbox"
for dbNameKey in ["RateYourMusic", "CDandLP", "MusicStack"]:
    mydf[mydf[dbNameKey].isna()].to_pickle("{0}/{1}.p".format(PICKLE_DIR, dbNameKey))

In [None]:
mdbmc.getDataToMatch("Discogs")

In [None]:
# Build one match frame per loaded database, then report rows whose key and ID disagree.
getDBDFs = True
if getDBDFs:
    mdfs = {}
    for dbNameKey in mdbmaps.keys():
        mdfs[dbNameKey] = getDBDF(mdbmc, dbNameKey, dbDict[dbNameKey])
    for dbNameKey, dbFrame in mdfs.items():
        idxs = checkDBIDMatch(dbFrame, dbNameKey, dbDict[dbNameKey])
        print("")

In [None]:
#mutualMatch(amdf, mdbmaps, ("AllMusic", "AM"), ("MusicStack", "MS"), debug=False, test=True)
#mutualMatch(amdf, mdbmaps, ("AllMusic", "AM"), ("MusicBrainz", "AB"), debug=False, test=True)

# Matching Area

In [None]:
mdbmap  = musicDBMap("Music", init=False)
dbName  = "MusicBrainz"
dbShort = "MB"

In [None]:
# My side: (artist name, key, DB ID) for every artist that has an ID in `dbName`.
myamdf = DataFrame(mdbmap.getDF().T[dbName]).reset_index()
myamdf.columns = ["MyArtistName", "MyKey", "DBID"]
myamdf = myamdf[~myamdf["DBID"].isna()]

# DB side: same three fields taken from the per-database frame, ID column renamed to "DBID".
dbColumns = ["{0}ArtistName".format(dbShort), "{0}Key".format(dbShort), "{0}".format(dbName)]
amdf = mdfs[dbName][dbColumns].copy(deep=True)
cols = dbColumns[:-1] + ["DBID"]
amdf.columns = cols

In [None]:
from pandas import merge
mgdf = merge(myamdf, amdf, on=['DBID'], how='inner')
dbShort = dbDict[dbName]
print(dbName,dbShort)
#mgdf[mgdf["AMKey"].isna()]
print(mgdf.shape)

In [None]:
dbRenames = getFile("dbRenames.yaml")
dbRenames

In [None]:
# Apply the manual DB-name -> my-name renames, then score every merged row.
dbRenames = getFile("dbRenames.yaml")
dbArtistColumn = "{0}ArtistName".format(dbShort)
# Keep the original name whenever no (truthy) rename is registered for it.
mgdf[dbArtistColumn] = mgdf[dbArtistColumn].apply(lambda x: dbRenames.get(x) or x)
mgdf["Same"]  = mgdf.apply(isSame, axis=1)
mgdf["Ratio"] = mgdf.apply(ratio, axis=1)

In [None]:
# List every merged row whose names differ, as "<DB name>: <my name>" lines.
dbArtistColumn = "{0}ArtistName".format(dbShort)
isDifferent = (mgdf['Same'] == False)
inRatioWindow = (mgdf['Ratio'] > 0.0) & (mgdf['Ratio'] < 1.6)
rDF = mgdf[(isDifferent & inRatioWindow)][["MyArtistName", dbArtistColumn]]
print("Size ==> {0}".format(mgdf[isDifferent].shape[0]))
print("-"*20)
for i,row in rDF.iterrows():
    myName = row.MyArtistName
    amName = row[dbArtistColumn]
    #print("mdbmap.addArtistData(\"{0}\", mdbmap.getHash(\"{1}\"), 'MusicBrainz', None)".format(myName, myName))
    #print("mdbmap.addArtistData(\"{0}\", mdbmap.getHash(\"{1}\"), 'MusicBrainz', maindb.dbdata[\"MusicBrainz\"][\"Utils\"].getArtistID(\"\"))".format(myName, myName))
    #continue
    print("{0}: {1}".format(amName, myName))
    # Flag pairs where only my name carries the leading article.
    myHasArticle = myName.startswith("The ")
    if myHasArticle and not amName.startswith("The "):
        print("The The {0}: {1}".format(amName, myName))

In [None]:
artistName = "Project Z"
row = mgdf[mgdf["MyArtistName"] == artistName]
myKey = row.MyKey.values[0]
print("Artist --> {0}".format(artistName))
print("MyKey  --> {0}".format(myKey))
row

In [None]:
mdbmap.getArtistData("Jay-Z").show()

In [None]:
mdbmap.removeArtist("The English Beat", mdbmap.getHash("The English Beat"))
mdbmap.save()

In [None]:
maindb.dbdata["MusicBrainz"]["Utils"].getArtistID("c945ec66-a07c-3986-86a7-6db575d99740")

In [None]:
mdbmap.addArtistData(artistName, myKey, 'MusicBrainz', '327099666880863839613792835568821512284')
mdbmap.save()

In [None]:
mdbmap.removeArtist("Joe King Carrasco and The El Molino Band")
mdbmap.save()

In [None]:
mdbmap.addArtistData("Lil' Boosie", mdbmap.getHash("Lil' Boosie"), 'MusicBrainz', maindb.dbdata["MusicBrainz"]["Utils"].getArtistID("https://musicbrainz.org/artist/8ba17cf6-bec2-4ae4-9820-b1cda47adc08"))
mdbmap.save()

In [None]:
maindb.dbdata["MusicBrainz"]["Utils"].getArtistID("https://musicbrainz.org/artist/769b978f-8b3e-42cc-8a3b-9c1121c5dc0f")

In [None]:
maindb.dbdata["MusicBrainz"]["Utils"].getArtistID("769b978f-8b3e-42cc-8a3b-9c1121c5dc0f")

In [None]:
mdbmap.addArtistData("Tone-Loc", mdbmap.getHash("Tone-Loc"), 'MusicBrainz', maindb.dbdata["MusicBrainz"]["Utils"].getArtistID("https://musicbrainz.org/artist/16da4997-78e1-4837-85f6-9247d10cde1f"))
mdbmap.save()

In [None]:
mdbmap.save()

In [None]:
# NOTE(review): unfinished scratch cell. `mdf` is not defined in any cell shown in this
# notebook -- confirm which frame was intended before relying on it.
# `.items()` is required to unpack (primaryKey, dbID) pairs; iterating the dict alone yields keys only.
for primaryKey,dbID in mdf.T["AllMusic"].to_dict().items():
    artistName = primaryKey[0]
    # TODO: the original body ended with the incomplete expression `mdbmaps[]` (a SyntaxError);
    # fill in the intended per-artist lookup here.

In [None]:
### Master DB code
%load_ext autoreload
%autoreload
maindb = mainDB(mdb=None, create=False, debug=False)
maindb.loadDBDataMap()

In [None]:
mdbmap = musicDBMap("Music", init=False)

In [None]:
if False:
    mdb = myMusicDBMap(debug=False)
    mdb.getFullDBData()
    #mdb.getKnownDBData()
    maindb = mainDB(mdb=mdb, create=False, debug=True)
    dbdata = maindb.dbdata
    #maindb.setDBFull() ## Do this to recreate everything
    #maindb.setDBKnown()
    _, _ = clock("Last Run")

# Find All My Music and What's Known/Unknown

#### Find My Music

In [None]:
%load_ext autoreload
%autoreload
from musicBase import myMusicBase

def getMusicStatus(mdb):
    """Scan my music and report which artists the music DB map does not know.

    Returns a tuple ``(mmb, mmm, unknownArtists)``: the ``myMusicBase`` scanner,
    the ``matchMyMusic`` matcher built on ``mdb``, and the matcher's
    unknown-artist mapping.
    """
    musicBase = myMusicBase(debug=False)
    musicBase.findArtistAlbums()

    matcher = matchMyMusic(mdb)
    # Artist status is computed before the scanned music base is attached (order kept as-is).
    matcher.getArtistStatus()
    matcher.setMusicBase(musicBase)
    unknown = matcher.getUnknownArtists()

    _, _ = clock("Last Run")
    return musicBase, matcher, unknown

def findUnknownArtists(mmm, unknownArtists):
    """Try to match unknown artists and print a ``mdb.add(...)`` template line for each.

    Does nothing when ``unknownArtists`` is empty.
    """
    if len(unknownArtists) == 0:
        return

    mmm.matchUnknownArtists(ratioCut=0.75)
    for artist in unknownArtists:
        print("mdb.add(\"{0}\", \"{1}\", \"{2}\")".format(artist, "AllMusic", ""))

mmb, mmm, unknownArtists = getMusicStatus(mdb)
findUnknownArtists(mmm, unknownArtists)

In [None]:
#mdb.add("Mieczysław Horszowski", "Discogs", "1233607")
mdb.save()

# Find Mutual Entries

In [None]:
#mdb.rmArtistDBKey("Intro To India", "MusicBrainz")
#mdb.save()

In [None]:
#mdb.add("I Am Kloot", "AllMusic", "0000256067")
if False:
    ignores  = ["Bryan Adams", "Leslie Keith", "Patrick Swayze", "David Frizzell & Shelly West", "Matt Monroe"]
    ignores += ["Antonio Vivaldi", "Franz Liszt", "Georges Bizet", "Hector Berlioz", "Richard Wagner"]
    ignores += ["Robert Schumann", "Blyss"]   
    for artistName in ignores:
        mdb.rmArtistDBKey(artistName, "AllMusic")
    mdb.save()

In [None]:
mdb.getArtistDBData('"Weird Al" Yankovic', "LastFM")['ID']

In [None]:
#searchForMutualDBEntries
%load_ext autoreload
%autoreload
if True:
    #mdb = myMusicDBMap(debug=False)
    #mdb.getFullDBData()
    mmm = matchMyMusic(mdb)
    mmm.searchForMutualDBEntriesByDB("LastFM", cutoff=0.90, maxAdds=100, start=-1, modVal=250, maxAlbumsForSearch=500)
#mmm.getArtistStatus()
#mmm.setMusicBase(mmb)
#unknownArtists = mmm.getUnknownArtists()

_, _ = clock("Last Run")

In [None]:

mdb.save()

In [None]:
name = "Robert Schumann"
dutils = dbdata["LastFM"]["Utils"]
artistID = dutils.getArtistID(name)
print(name,' \t--> ',artistID)
mdb.add(name, "LastFM", artistID)
mdb.save()

# Try Matching Unmatched Albums

In [None]:
dR = 0.1
rC = 0.99

#for db in ['Discogs', 'AllMusic', 'MusicBrainz', 'AceBootlegs', 'RateYourMusic', 'LastFM', 'DatPiff', 'RockCorner', 'CDandLP', 'MusicStack', 'MetalStorm']:
#for db in ['Discogs', 'MusicBrainz', 'AllMusic', 'LastFM']:
for db in ['LastFM']:
    for albumType in [1]:
        print("="*140)
        mmm.matchMyMusicAlbums(db=db, albumType=albumType, ratioCut=rC, maxCut=rC+dR)
        print("\n\n")

In [None]:
mmm.mmn.moveMyMatchedMusicAlbums(show=True)

In [None]:
mmm.mmn.moveMyMatchedMusicAlbums(show=False)
mmb, mmm, unknownArtists = getMusicStatus(mdb)

***
***

# Match Albums From My Matched Artists

In [None]:
# Pick, across all candidate DB artist IDs, the one whose albums best match my unmatched albums.
# NOTE(review): `artistNameDBIDs`, `db`, `unMatchedAlbums`, `debug` and `cutoff` are not
# defined in any cell shown in this notebook; this cell relies on leftover kernel state.
resultD = {"ID": None, "Matches": 0, "Score": 0.0, "Best": None}

for artistName,artistDBIDs in artistNameDBIDs.items():
    print(artistName,artistDBIDs)
    for dbArtistID in artistDBIDs:
        print('\t',dbArtistID)
        dbArtistIDAlbums = mdb.getArtistAlbumsFromID(db, dbArtistID, flatten=True)
        print("\t\t",dbArtistIDAlbums)

        
        ### Match Albums (if possible)
        ma = matchAlbums()
        ma.match(unMatchedAlbums, dbArtistIDAlbums)
        if debug:
            print("\t\t{0: <45}{1}\t{2}\t{3}\t{4}".format(dbArtistID, len(dbArtistIDAlbums), ma.near, ma.score, ma.maxval))
        # A candidate must have at least as many near matches and at least the best score so far.
        if ma.near < resultD["Matches"]:
            continue
        if ma.score < max([resultD["Score"], cutoff]):
            continue
        resultD = {"ID": dbArtistID, "Matches": ma.near, "Score": ma.score, "Best": ma}
        if debug:
            # Report the same album count as the debug line above (was `len(artistAlbums)`, an undefined name here).
            print("\t\t{0: <45}{1}\t{2}\t{3}\t{4} <-- Match".format(dbArtistID, len(dbArtistIDAlbums), ma.near, ma.score, ma.maxval))




In [None]:
dR = 0.1
rC = 0.99
mmm.matchMyMusicAlbums(db="LastFM", albumType=1, ratioCut=rC, maxCut=rC+dR)

In [None]:
mmm.moveMyMatchedMusicAlbums(show=False)

In [None]:
from timeUtils import clock, elapsed
from listUtils import getFlatList
from musicBase import myMusicBase
from matchAlbums import matchAlbums
from ioUtils import getFile, saveFile
from fsUtils import isDir, setDir, mkDir, moveDir
from matchMusicName import myMusicName


class matchMyMusic:
    def __init__(self, mdb, debug=False):
        """Store the music DB map and create default helpers and empty result containers.

        Args:
            mdb: Music DB map object queried by the matching methods.
            debug: When truthy, methods print extra progress information.
        """
        self.debug = debug
        self.mdb   = mdb

        # Default helpers; the music base can be replaced later via setMusicBase().
        self.mmb   = myMusicBase()
        self.mmn   = myMusicName()

        # Result containers, filled by the status/matching methods.
        self.unknownArtists, self.artistAlbums, self.matchedAlbums = {}, {}, {}
        
        
    def setMusicBase(self, mmb):
        self.mmb = mmb
        
        
    def getAlbumStatus(self, force=False):
        self.artistAlbums = self.mmb.getArtistAlbums(force=force)
        

    def getArtistStatus(self):
        """Record every artist under my prime directories that the music DB map does not know.

        Unknown artists are stored in ``self.unknownArtists`` (artist name -> prime
        directories). Timing and summary counts are printed.
        """
        start, cmt = clock("Matching All Music Artists")

        #### Loop Over My Artists and Paths
        for primeDir in self.mmb.getPrimeDirectories():
            primeDirMap = self.mmb.getArtistPrimeDirMap(primeDir)
            for artistName, artistPrimeDirs in primeDirMap.items():
                if self.debug:
                    print("{0: <50}{1}".format(artistName,artistPrimeDirs))

                #### Only an explicit False from the DB map marks an artist as unknown
                if self.mdb.isKnown(artistName) is not False:
                    continue
                self.unknownArtists[artistName] = artistPrimeDirs
                if self.debug:
                    print("\tUnknown (All)     --> {0}".format(artistName))

        elapsed(start, cmt)
        print("Found {0} unknown artists".format(len(self.unknownArtists)))
        print("Found {0} total artists".format(len(self.artistAlbums)))

        
        
    def getUnknownArtists(self):
        return self.unknownArtists
    
    
    
    def getArtistNameMatchedDirs(self):
        self.artistMatchedDirs = {}
        for primeDir in self.mmb.getPrimeDirectories():
            self.artistMatchedDirs.update(self.mmb.getArtistPrimeDirMap(primeDir))
            

    
    def matchMyMusicAlbumsByArtist(self, db, artistName, albumType=None, ratioCut=0.95, maxCut=0.1):

        matchedAlbums = {}
        

        ######################################################################
        #### Get Artist Album Data
        ######################################################################
        artistAlbumsData = self.mmb.getArtistAlbumsByArtist(artistName)
    
    
        ######################################################################
        #### Get Unmatched Albums
        ######################################################################
        unMatchedAlbums = self.mmb.getUnMatchedAlbumsByArtist(artistName)
        dirval = self.mmb.getArtistMusicDir(artistName)
        if len(unMatchedAlbums) == 0:
            return matchedAlbums
            
            
        ######################################################################
        #### Loop Over Artist Name <-> Prime Map Items
        ######################################################################
        if self.mdb.isKnown(artistName) is True:
            myMusicData = self.mdb.getArtistData(artistName)
            try:
                artistID = myMusicData[db]["ID"]
            except:
                return matchedAlbums
        else:
            return matchedAlbums
            


        ######################################################################
        #### Get Database Albums
        ######################################################################
        artistDBAlbumsFromID = self.mdb.getArtistAlbumsFromID(db, artistID)

        
        ######################################################################
        #### Loop over my albums
        ######################################################################
        for myAlbumName in unMatchedAlbums:

            bestMatchVal = {"Ratio": ratioCut, "Dir": None, "Album": None}

            for mediaType, mediaTypeAlbums in artistDBAlbumsFromID.items():
                if albumType is not None:
                    if mediaType not in self.mdb.getDBAlbumTypeNames(db, albumType):
                        continue

                if self.debug:
                    print("\tMy album: {0}".format(myAlbumName))
                myFormattedAlbum = self.mmn.formatAlbum(myAlbumName, mediaType)

                ma = matchAlbums(cutoff=ratioCut)
                ma.match([myFormattedAlbum], mediaTypeAlbums)

                if ma.maxval < ratioCut or ma.maxval > ratioCut+maxCut:
                    continue
                if ma.maxval < bestMatchVal["Ratio"]:
                    continue

                bestMatch = ma.getBestMatch(myFormattedAlbum)

                bestMatchVal = {"Ratio": ma.maxval, "Dir": dirval, 
                                "Album": {"Name": bestMatch["Name"],
                                          "Code": bestMatch["Code"],
                                          "MediaType": mediaType}}
                matchedAlbums[myAlbumName] = bestMatchVal
                #print("{0: <30}{1: <15}{2: <30} --> {3}".format(artistName, db, myAlbumName, bestMatchVal["Album"]))
                #bestMatchVal["Match"].show(debug=True)
                    
        return matchedAlbums

                
    
    def matchMyMusicAlbums(self, db, albumType=1, ratioCut=0.95, maxCut=0.1):
        """Match the unmatched albums of every artist against ``db`` and save the result.

        ``self.matchedAlbums`` is reset and then filled as
        ``{db: {artistName: {myAlbumName: bestMatchVal}}}``; the mapping is written
        with ``saveFile`` to ``self.mmn.moveFilename``.

        Args:
            db: Database name to match against.
            albumType, ratioCut, maxCut: Forwarded to ``matchMyMusicAlbumsByArtist``.
        """
        self.matchedAlbums = {}

        start, cmt = clock("Checking for Albums Matches Against {0} DB".format(db))

        print("{0: <40}{1: <15}{2: <45} --> {3}".format("Artist", "Database", "Album Name", "Matched Album"))

        ######################################################################
        #### Loop Over My Artists
        ######################################################################
        for artistName in self.mmb.getArtists():
            matchedAlbums = self.matchMyMusicAlbumsByArtist(db, artistName, albumType, ratioCut, maxCut)
            if len(matchedAlbums) == 0:
                continue
            self.matchedAlbums.setdefault(db, {})[artistName] = matchedAlbums
            for myAlbumName,bestMatchVal in matchedAlbums.items():
                print("{0: <40}{1: <15}{2: <45} --> {3}".format(artistName, db, myAlbumName, bestMatchVal["Album"]))

        elapsed(start, cmt)

        saveFile(ifile=self.mmn.moveFilename, idata=self.matchedAlbums, debug=True)
        # Report the number of artists with matches for the requested DB. The old message
        # always said "discogs" and counted top-level DB keys, which can only be 0 or 1.
        print("Found {0} music <-> {1} albums maps".format(len(self.matchedAlbums.get(db, {})), db))
        

    def matchUnknownArtists(self, albumType=1, ratioCut=0.95):
        unknownArtists = self.getUnknownArtists()
        for unknownArtist in unknownArtists.keys():
            print("# ===>",unknownArtist)
            retval = self.matchUnknownArtist(unknownArtist, albumType, ratioCut)

            for db,dbdata in retval.items():
                bestMatch = {"ID": None, "Matches": 0, "Score": 0.0}
                for artistDBID,artistDBData in dbdata.items():
                    for mediaType,ma in artistDBData.items():
                        if ma.near == 0:
                            continue
                        if ma.near > bestMatch["Matches"]:
                            bestMatch = {"ID": artistDBID, "Matches": ma.near, "Score": ma.score}
                        elif ma.near == bestMatch["Matches"]:
                            if ma.score > bestMatch["Score"]:
                                bestMatch = {"ID": artistDBID, "Matches": ma.near, "Score": ma.score}

                if bestMatch["ID"] is not None:
                    print("mdb.add(\"{0}\", \"{1}\", \"{2}\")".format(unknownArtist, db, bestMatch["ID"]))
            
            
    def matchUnknownArtist(self, unknownArtist, albumType=None, ratioCut=0.95):
        """
        Match the unmatched local albums of `unknownArtist` against every
        candidate database artist ID.

        Args:
            unknownArtist: artist name passed to `self.mmb` and `self.mdb`.
            albumType: forwarded to `self.mdb.getDBAlbumTypeNames(db, albumType)`;
                only media types contained in that result are matched.
            ratioCut: forwarded to `matchAlbums(cutoff=...)`.

        Returns:
            dict: {db: {artistDBID: {mediaType: matchAlbums instance}}}.
        """
        localAlbums = self.mmb.getUnMatchedAlbumsByArtist(unknownArtist)
        candidates  = self.mdb.getArtistIDs(unknownArtist)

        results = {}
        for db, namedIDs in candidates.items():
            perID = {}
            for idList in namedIDs.values():
                for dbID in idList:
                    # A repeated ID starts from a fresh (empty) media-type map.
                    perMedia = {}
                    perID[dbID] = perMedia
                    dbAlbums = self.mdb.getArtistAlbumsFromID(db, dbID)

                    for mediaType, albums in dbAlbums.items():
                        if mediaType in self.mdb.getDBAlbumTypeNames(db, albumType):
                            matcher = matchAlbums(cutoff=ratioCut)
                            matcher.match(localAlbums, albums)
                            perMedia[mediaType] = matcher

            results[db] = perID

        return results
    
    def manuallyMatchUnknownArtist(self, unknownArtist, cutoff=0.8):
        """
        Print every candidate database ID for `unknownArtist` together with
        that ID's albums and a ready-to-paste `mdb.add(...)` line.

        Args:
            unknownArtist: artist name passed to `self.mmb` and `self.mdb`.
            cutoff: forwarded to `self.mdb.getArtistIDs(..., cutoff=cutoff)`.

        Returns:
            None; the method only prints.
        """
        localAlbums = self.mmb.getUnMatchedAlbumsByArtist(unknownArtist)
        candidates  = self.mdb.getArtistIDs(unknownArtist, cutoff=cutoff)
        divider     = "="*50

        print("Unknown Artist:   {0}".format(unknownArtist))
        try:
            print("UnMatched Albums: {0}".format(", ".join(localAlbums)))
        except:
            # The join fails when the albums are not an iterable of strings.
            print("Could not show the unMatched Albums below:")
            print("-> ",localAlbums," <-")
        print(divider)
        print(candidates)

        for db, namedIDs in candidates.items():
            print(divider)
            print("   {0}".format(db))
            for dbName, idList in namedIDs.items():
                print("      {0}".format(dbName))
                for dbID in idList:
                    byMedia = self.mdb.getArtistAlbumsFromID(db, dbID)
                    nested  = [list(titles.values()) for titles in byMedia.values()]
                    print("mdb.add(\"{0}\", \"{1}\", \"{2}\")".format(unknownArtist, db, dbID))
                    print("         {0: <45}\t{1}".format(dbID, getFlatList(nested)))
                    

                    
    def getArtistDBMatchLists(self, dbartist):
        """
        Split the databases known for `dbartist` into those that already carry
        an "ID" entry and those that do not.

        Args:
            dbartist: artist name passed to `self.mdb.getArtistData`.

        Returns:
            dict: {"Matched": [db, ...], "Unmatched": [db, ...]}, each list in
            the iteration order of the artist data.
        """
        dbArtistData = self.mdb.getArtistData(dbartist)
        retval = {"Matched": [], "Unmatched": []}
        for db, dbIDdata in dbArtistData.items():
            try:
                dbIDdata["ID"]
            except (KeyError, TypeError):
                # No "ID" key, or the entry is not subscriptable (e.g. None).
                retval["Unmatched"].append(db)
            else:
                retval["Matched"].append(db)
        return retval
    
                    
    def getMatchedArtistAlbumsFromDB(self, dbartist, merge=True):
        """
        Collect the album names of `dbartist` from every database that is
        already matched, grouped by album type 1-4.

        Args:
            dbartist: artist name passed to `self.mdb` lookups.
            merge: when exactly `True`, return `getFlatList` of the per-type
                lists; otherwise return the {albumType: [names]} dict.

        Returns:
            Flattened album names (merge is True) or a dict keyed by 1..4.
            Names are de-duplicated per album type through a set, so their
            order is not defined.

        Raises:
            ValueError: if a database reported as matched has no "ID" entry.
        """
        perDB     = self.mdb.getArtistData(dbartist)
        matchInfo = self.getArtistDBMatchLists(dbartist)
        collected = {albumType: [] for albumType in (1, 2, 3, 4)}

        for db in matchInfo["Matched"]:
            entry = perDB[db]
            try:
                dbID = entry["ID"]
            except:
                raise ValueError("This db {0} should already be known for {1}".format(db, dbartist))

            byMedia = self.mdb.getArtistAlbumsFromID(db, dbID)
            for albumType, titles in collected.items():
                for mediaType, mediaAlbums in byMedia.items():
                    if mediaType in self.mdb.getDBAlbumTypeNames(db, albumType):
                        titles.extend(mediaAlbums.values())

        deduped = {albumType: list(set(titles)) for albumType, titles in collected.items()}

        if merge is not True:
            return deduped
        return getFlatList(deduped.values())

            
    def searchForMutualDBEntries(self, cutoff=0.8, maxAdds=50, start=None):
        """
        For every artist in the map, look in the databases that are still
        unmatched for the candidate ID whose albums best agree with the
        albums already known from the matched databases, and print a
        ready-to-paste `mdb.add(...)` line for each best candidate.

        Args:
            cutoff: used both for `self.mdb.getArtistDBIDs` and `matchAlbums`.
            maxAdds: no further artist is processed once this many
                suggestions were printed (checked once per artist).
            start: when not None, artists with an index below it are skipped.

        Returns:
            None; suggestions are printed only.
        """
        dbartists = self.mdb.getArtists()
        cnts      = 0
        print("Searching for mutual DB matches for {0} artists".format(len(dbartists)))

        for ia, dbartist in enumerate(dbartists):
            if start is not None and ia < start:
                continue
            if ia % 100 == 0:
                print("## {0}/{1}".format(ia,len(dbartists)))
            if cnts >= maxAdds:
                break

            artistAlbums = self.getMatchedArtistAlbumsFromDB(dbartist, merge=True)
            dbsToSearch  = self.getArtistDBMatchLists(dbartist)

            usefulDBs         = ['Discogs', 'MusicBrainz', 'AllMusic', 'LastFM']
            usefulDBsToSearch = list(set(dbsToSearch["Unmatched"]).intersection(set(usefulDBs)))

            for db in usefulDBsToSearch:
                ############################################################
                ## Match the known albums against each candidate ID
                ############################################################
                dbMatches = {}
                candidates = self.mdb.getArtistDBIDs(dbartist, db, num=10, cutoff=cutoff, debug=False)
                for idList in candidates.values():
                    for dbID in idList:
                        candidateAlbums = self.mdb.getArtistAlbumsFromID(db, dbID)

                        byType = {albumType: [] for albumType in (1, 2, 3, 4)}
                        for albumType, titles in byType.items():
                            for mediaType, mediaAlbums in candidateAlbums.items():
                                if mediaType in self.mdb.getDBAlbumTypeNames(db, albumType):
                                    titles.extend(mediaAlbums.values())
                        byType = {albumType: list(set(titles)) for albumType, titles in byType.items()}
                        dbArtistAlbums = getFlatList(byType.values())

                        matcher = matchAlbums(cutoff=cutoff)
                        matcher.match(artistAlbums, dbArtistAlbums)
                        dbMatches[dbID] = matcher

                if not dbMatches:
                    continue

                ############################################################
                ## Best candidate: most near matches, ties broken by score
                ############################################################
                bestID, bestNear, bestScore = None, 0, 0.0
                for dbID, matcher in dbMatches.items():
                    near = matcher.near
                    if near == 0:
                        continue
                    if near > bestNear or (near == bestNear and matcher.score > bestScore):
                        bestID, bestNear, bestScore = dbID, near, matcher.score

                if bestID is not None:
                    cnts += 1
                    print("mdb.add(\"{0}\", \"{1}\", \"{2}\")".format(dbartist, db, bestID))
                        




In [None]:
# Reload edited project modules, then build the matcher around `mdb`
# (`mdb` is not assigned in this cell; it must already exist in the kernel).
%load_ext autoreload
%autoreload
mmm = matchMyMusic(mdb)

In [None]:

# Display `artistData`; it is assigned in the following cell, so on a fresh
# kernel that cell has to run first.
artistData

In [None]:
# Collect the albums of one artist from every database that has an ID for it.
#   albumTypesData     : {albumType: [album names]} merged over all databases
#   fullAlbumTypesData : {albumType: {"<db>-<mediaType>": [album names]}}
artistData     = mdb.getArtistData("Michael Jackson")
albumTypes     = [1,2,3,4]
albumTypesData = {k: [] for k in albumTypes}
fullAlbumTypesData = {k: {} for k in albumTypes}
for db,dbArtistData in artistData.items():
    try:
        artistDBID = dbArtistData['ID']
    except:
        # Databases without an 'ID' entry are skipped.
        continue
    artistDBAlbumsFromID = mdb.getArtistAlbumsFromID(db, artistDBID)

    for albumType in albumTypes:
        for mediaType, mediaTypeAlbums in artistDBAlbumsFromID.items():
            # Keep only the media types this database lists for the album type.
            if mediaType not in mdb.getDBAlbumTypeNames(db, albumType):
                continue
            albumTypesData[albumType] += list(mediaTypeAlbums.values())
            # Only the first list seen for a "<db>-<mediaType>" key is stored.
            if fullAlbumTypesData[albumType].get("{0}-{1}".format(db,mediaType)) is None:
                fullAlbumTypesData[albumType]["{0}-{1}".format(db,mediaType)] = list(mediaTypeAlbums.values())

# De-duplicate per album type (set => order not preserved), then flatten.
albumTypesData = {k: list(set(v)) for k,v in albumTypesData.items()}
dbArtistAlbums = getFlatList(albumTypesData.values())

In [None]:
# Per-database media-type names used below as the "<db>-<mediaType>" lookup keys.
studio = {"AllMusic": ["Albums"], "Discogs": ["Albums"], "MusicBrainz": ["Album"], "RateYourMusic": ["Album"], "RockCorner": ["Albums"]}

In [None]:
# Count, over the databases in `studio`, how often each upper-cased album
# name of album type 1 occurs (fullAlbumMap), and keep a per-database
# name <-> upper-cased-name mapping (dbmapping).
# NOTE(review): `albumsMap` and `invdbmapping` are created but not filled here.
albumsMap    = {}
fullAlbumMap = Counter()

dbmapping     = {}
invdbmapping  = {}
for db,mediaTypes in studio.items():
    dbmapping[db]    = {}
    invdbmapping[db] = {}

    for mediaType in mediaTypes:
        # Raises KeyError when the artist has no "<db>-<mediaType>" entry.
        dbAlbums      = fullAlbumTypesData[1]["{0}-{1}".format(db,mediaType)]
        for k in dbAlbums:
            # Both directions share one dict; a name that is already upper
            # case maps to itself.
            dbmapping[db][k.upper()] = k
            dbmapping[db][k] = k.upper()
            fullAlbumMap[k.upper()] += 1
        
#fullAlbumMap

In [None]:
# Split the upper-cased album names into `core` and `noncore` sets based on
# how many databases list them.
# NOTE(review): presumably findNearest(k, names, 2, 0.9) returns up to two
# names similar to k with ratio >= 0.9 - confirm against searchUtils.
core = set()
noncore = set()
nearest = {k: findNearest(k, list(fullAlbumMap.keys()), 2, 0.9) for k in fullAlbumMap.keys()}
for k,v in nearest.items():
    if len(v) == 2:
        # Two similar names: the one counted more often is core;
        # on a tie both are noncore.
        if fullAlbumMap[v[0]] > fullAlbumMap[v[1]]:
            core.add(v[0])
        elif fullAlbumMap[v[1]] > fullAlbumMap[v[0]]:
            core.add(v[1])
        else:
            noncore.add(v[0])
            noncore.add(v[1])
            #print(k,v,[fullAlbumMap[v2] for v2 in v])
    else:
        # Otherwise a name is core only when it was counted more than once.
        if fullAlbumMap[k] > 1:
            core.add(k)
        else:
            noncore.add(k)
            #print(k,fullAlbumMap[k])

In [None]:
# Display the album names classified as core (the stray "[" was a SyntaxError).
core

In [None]:
# Display the album names classified as noncore.
noncore

In [None]:
# Display the per-"<db>-<mediaType>" album lists of album type 1.
fullAlbumTypesData[1]

In [None]:
## Basic stuff
# Reload edited project modules, then rebuild the local music index.
%load_ext autoreload
%autoreload

from musicBase import myMusicBase
mmb = myMusicBase()
mmb.findArtistAlbums()

***
***

In [None]:
# Take the 'DiscArtist' entry of the DatPiff master slim artist DB.
# NOTE(review): `maindb` is not assigned in the visible cells - confirm it is
# defined before this cell runs.
DatPiffArtists = maindb.getDBData('DatPiff')['Disc'].getMasterSlimArtistDiscogsDB()['DiscArtist']

In [None]:
# Look up DatPiff IDs (cutoff 0.9) for every artist known to `mmb`, keep the
# artists with a non-empty result, and save them to datPiffMatch.yaml.
retval = {artist: mdb.getArtistDBIDs(db="DatPiff", artistName=artist, cutoff=0.9) for artist in mmb.getArtists()}
retval = {artist: v for artist, v in retval.items() if len(v) > 0}
saveFile(idata=retval, ifile="datPiffMatch.yaml")

In [None]:
# Register the DatPiff IDs stored in datPiffMatch.yaml with the artist map.
# For each artist only the ID list of the first matched name is used.
matches = getFile("datPiffMatch.yaml")
for artist, match in matches.items():
    values = list(match.values())
    candidateIDs = values[0] if len(values) > 0 else None
    if not candidateIDs:
        # None or empty: there is nothing to register. Report the artist and
        # move on instead of failing on len(None) / [0] before mdb.save().
        print(artist)
        continue
    if len(candidateIDs) != 1:
        # Several IDs: report the artist, then register the first ID.
        print(artist)
    print(candidateIDs[0])
    mdb.add(artist, "DatPiff", str(candidateIDs[0]))
mdb.save()

In [None]:
# Show the per-database album matches for one artist.
mmm.matchUnknownArtist('Notorious B.I.G.')

# UnMatch Artist

In [None]:

def unMatchArtist(artistName):
    """
    Move every album directory found under `<artist dir>/Match/<media type>/`
    back into the artist directory, then remove the media-type and Match
    directories (after deleting a `.DS_Store` file inside them, if present).

    The artist directory is looked up in each directory returned by
    `getMatchedDirs()`; locations where it does not exist are skipped.

    Args:
        artistName: name of the artist whose matched albums are moved back.

    Returns:
        None.
    """
    from os.path import join
    from fsUtils import removeDir, isFile, setFile, removeFile

    for musicDir in getMatchedDirs():
        dirval        = join(musicDir, getPrimeDirectory(artistName), artistName)
        if not isDir(dirval):
            continue

        matchedDir    = setDir(dirval, "Match")
        mediaTypeDirs = findDirs(matchedDir)
        for mediaTypeDir in mediaTypeDirs:
            for matchDir in findDirs(mediaTypeDir):
                albumName = getUnMatchedDirName(getDirBasics(matchDir)[-1], mediaTypeDir)

                srcDir  = matchDir
                baseDir = setDir(dirval, albumName)
                dstDir  = baseDir
                # If the destination already exists, append " [Fix-<i>]" with
                # the first index that is free. (The loop condition used to
                # be inverted, so it never ran and the move targeted the
                # existing directory.)
                i = 0
                while isDir(dstDir):
                    dstDir = "{0} [Fix-{1}]".format(baseDir, i)
                    i += 1

                moveDir(srcDir, dstDir, debug=True)

            if isDir(mediaTypeDir):
                # Delete a .DS_Store file, if any, before removing the directory.
                DS_Store = setFile(mediaTypeDir, ".DS_Store")
                if isFile(DS_Store):
                    removeFile(DS_Store, debug=True)
                removeDir(mediaTypeDir, debug=True)

        if isDir(matchedDir):
            DS_Store = setFile(matchedDir, ".DS_Store")
            if isFile(DS_Store):
                removeFile(DS_Store, debug=True)
            removeDir(matchedDir, debug=True)
            

In [None]:
# Moves directories on disk: undo the album matching for one artist.
unMatchArtist("Sweet")

In [None]:
######################################################################
#### Loop Over Prime Directories
######################################################################
# Moves directories on disk: runs unMatchArtist for every artist name found
# in every prime directory. Only the keys of the map are used.
for primeDir in getPrimeDirectories():
    artistPrimeDirMap = getArtistPrimeDirMap(primeDir)

    ######################################################################
    #### Loop Over Artist Name <-> Prime Map Items
    ######################################################################
    for artistName, artistPrimeDirs in artistPrimeDirMap.items():
        unMatchArtist(artistName)


***
***

# Merge DBs After Finding Matches

In [None]:
# Search all artists for IDs in their unmatched databases.
# NOTE: `retval` is also used by the DatPiff cells above; this overwrites it.
retval = searchForMutualDBEntries(mdb, minI=-1, cutoff=0.7, maxR=3000)

In [None]:
# Display the search result before it is written to the artist map.
retval

In [None]:
# Register every found {artist: {db: {"ID": ...}}} entry with the artist map
# and save it; nothing is saved when the result is empty.
if len(retval) > 0:
    for artistName,artistResult in retval.items():
        for db,dbResult in artistResult.items():
            mdb.add(artistName, db, dbResult["ID"])
    mdb.save()

In [None]:
# Display the stored per-database data of one artist.
mdb.getArtistData("A-Mafia")

In [None]:
# List the unknown artist names (`unknownArtists` is assigned in another cell).
for unknownArtist in unknownArtists.keys():
    print(unknownArtist)

In [None]:
# For every unknown artist with exactly one near-exact (cutoff 0.99) name
# match in AllMusic, register the first ID listed for that name.
db = "AllMusic"
for unknownArtist in unknownArtists.keys():
    artistNameDBIDs = mdb.getArtistDBIDs(unknownArtist, db, cutoff=0.99)
    if len(artistNameDBIDs) == 1:
        # Take the single entry whatever its key is: the matched database
        # name may differ slightly from `unknownArtist`, and indexing by the
        # query name would then raise KeyError.
        matchedIDs = next(iter(artistNameDBIDs.values()))
        if matchedIDs:
            mdb.add(unknownArtist, db, matchedIDs[0])
        #print(unknownArtist,'\t',artistNameDBIDs)
mdb.save()

In [None]:
# Artist and database used by the scratch matching cells below.
artistName = "Sweet"
db = "AllMusic"

In [None]:
# Fetch the artist's unmatched local albums and the candidate IDs in `db`.
artistName = "Sweet"
db = "AllMusic"
unMatchedAlbums = mmb.getArtistAlbumsByArtist(artistName).getUnmatched()
artistNameDBIDs = mdb.getArtistDBIDs(artistName, db)

In [None]:
# Scratch version of the best-match search: keep the candidate ID with at
# least as many near matches as the current best and a score not below
# max(best score, cutoff).
# NOTE(review): `debug` and `cutoff` are not assigned in this cell or in the
# setup cells directly above - confirm they are defined before running it.
resultD = {"ID": None, "Matches": 0, "Score": 0.0, "Best": None}

# The loop variable is named dbArtistName so that it no longer overwrites the
# `artistName` setting chosen in the cells above.
for dbArtistName,artistDBIDs in artistNameDBIDs.items():
    print(dbArtistName,artistDBIDs)
    for dbArtistID in artistDBIDs:
        print('\t',dbArtistID)
        dbArtistIDAlbums = mdb.getArtistAlbumsFromID(db, dbArtistID, flatten=True)
        print("\t\t",dbArtistIDAlbums)


        ### Match Albums (if possible)
        ma = matchAlbums()
        ma.match(unMatchedAlbums, dbArtistIDAlbums)
        if debug:
            print("\t\t{0: <45}{1}\t{2}\t{3}\t{4}".format(dbArtistID, len(dbArtistIDAlbums), ma.near, ma.score, ma.maxval))
        if ma.near < resultD["Matches"]:
            continue
        if ma.score < max([resultD["Score"], cutoff]):
            continue
        resultD = {"ID": dbArtistID, "Matches": ma.near, "Score": ma.score, "Best": ma}
        if debug:
            # Report the candidate's own album count, as in the line above
            # (this used to print len(artistAlbums), which is not set here).
            print("\t\t{0: <45}{1}\t{2}\t{3}\t{4} <-- Match".format(dbArtistID, len(dbArtistIDAlbums), ma.near, ma.score, ma.maxval))





In [None]:
from matchAlbums import matchAlbums

def searchForArtistDBEntries(mdb, artistName):
    """
    Search all databases for IDs matching the unmatched local albums of
    `artistName`.

    The original body stopped at an unfinished `albums = ` assignment (a
    SyntaxError for the whole cell) and hard-coded the artist "Sweet". It
    now reads the albums of `artistName` and delegates to
    `searchForArtistAlbumsDBEntries`.

    Args:
        mdb: artist map object, forwarded to searchForArtistAlbumsDBEntries.
        artistName: artist to look up.

    Returns:
        Whatever searchForArtistAlbumsDBEntries returns for these albums.

    Note:
        Reads the notebook-level `mmb` object, as the original draft did.
    """
    albums = mmb.getArtistAlbumsByArtist(artistName).getUnmatched()
    # debug is passed as a list because the search tests it with `in`.
    return searchForArtistAlbumsDBEntries(mdb, artistName, albums, debug=[None])


def searchForArtistAlbumsDBEntries(mdb, artistName, albums, dbsToMatch=None, cutoff=0.7, num=10, debug=False):
    """
    For each database in `dbsToMatch`, find the candidate artist ID whose
    albums best match `albums`.

    Args:
        mdb: artist map object providing getDBs, getArtistDBIDs and
            getArtistAlbumsFromID.
        artistName: artist name to look up in each database.
        albums: known album names to match against.
        dbsToMatch: databases to search; defaults to `mdb.getDBs()`.
        cutoff: name cutoff for getArtistDBIDs and the minimum match score.
        num: forwarded to getArtistDBIDs; also the initial "Matches"
            threshold a candidate has to reach (see the note in the body).
        debug: collection of flags ("ID", "Full") selecting extra output.
            A plain bool is accepted as well: True behaves like ["Full"] and
            False/None like no flags. The default `False` used to raise
            TypeError at the first `"Full" in debug` test.

    Returns:
        dict: {db: {"ID": best ID, "Name": None}}, or {db: None} when no
        candidate qualified.
    """
    retval     = {}

    ######################################################################
    #### Normalise Debug Flags
    ######################################################################
    if debug is None or isinstance(debug, bool):
        debugFlags = ["Full"] if debug else []
    else:
        debugFlags = debug
    showFull = "Full" in debugFlags
    showIDs  = showFull or "ID" in debugFlags


    ######################################################################
    #### Set Known Albums
    ######################################################################
    knownArtistAlbums = albums


    ######################################################################
    #### Loop Over Missing DBs
    ######################################################################
    if dbsToMatch is None:
        dbsToMatch = mdb.getDBs()
    for db in dbsToMatch:
        # NOTE(review): `num` is forwarded as the `debug` argument as well;
        # this looks unintended but is left unchanged - confirm.
        artistDBIDs = mdb.getArtistDBIDs(artistName, db, cutoff=cutoff, num=num, debug=num)
        print("{0: <20}".format(db), end="\t")
        if showFull:
            print("Found {0} possible artists in DB".format(len(artistDBIDs)))
        else:
            print("")


        ######################################################################
        #### Search For Matches in Possible IDs
        ######################################################################
        # NOTE(review): the threshold starts at `num`, so a candidate needs at
        # least `num` near matches to qualify - confirm this is intended.
        resultD = {"ID": None, "Matches": num, "Score": 0.0, "Best": None}
        for dbArtistName, dbArtistIDs in artistDBIDs.items():
            for dbArtistID in dbArtistIDs:
                dbArtistIDAlbums = mdb.getArtistAlbumsFromID(db, dbArtistID, flatten=True)

                ma = matchAlbums()
                ma.match(knownArtistAlbums, dbArtistIDAlbums)
                if showIDs:
                    print("\t\t{0: <45}{1}\t{2}\t{3}\t{4}".format(dbArtistID, len(dbArtistIDAlbums), ma.near, ma.score, ma.maxval))
                if ma.near < resultD["Matches"]:
                    continue
                if ma.score < max([resultD["Score"], cutoff]):
                    continue
                resultD = {"ID": dbArtistID, "Matches": ma.near, "Score": ma.score, "Best": ma}
                # Report the candidate's own album count (this used to read
                # `artistAlbums`, which is not defined in this function).
                print("\t\t{0: <45}{1}\t{2}\t{3}\t{4} <-- Match".format(dbArtistID, len(dbArtistIDAlbums), ma.near, ma.score, ma.maxval))


        if resultD["ID"] is not None:
            print("\t\t{0: <45}{1}\t{2} <====================================== Best Match".format(resultD["ID"], resultD["Matches"], resultD["Score"]))
            retval[db] = {'ID': resultD["ID"], 'Name': None}
            if showFull:
                print("\t\t =====>",retval[db])
        else:
            if showFull:
                print("\t\t =====> No Match")
            retval[db] = None

    return retval



def searchForMutualArtistDBEntries(mdb, artistName, num=2, cutoff=0.8, debug=[None]):
    """
    Use the albums of `artistName` from its already-matched databases to
    search the databases where the artist has no ID yet.

    Args:
        mdb: artist map object providing getArtistDataIDs and
            getArtistAlbumsFromID.
        artistName: artist to search for.
        num, cutoff, debug: forwarded to searchForArtistAlbumsDBEntries.

    Returns:
        The dict returned by searchForArtistAlbumsDBEntries.
    """
    ######################################################################
    #### Split DBs Into Known (collect albums) and Missing (to search)
    ######################################################################
    albumsPerKnownDB = []
    missingDBs       = []
    for db, artistID in mdb.getArtistDataIDs(artistName).items():
        if artistID is None:
            missingDBs.append(db)
        else:
            albumsPerKnownDB.append(mdb.getArtistAlbumsFromID(db, artistID, flatten=True))

    from listUtils import getFlatList
    uniqueAlbums = list(set(getFlatList(albumsPerKnownDB)))
    print("Searching for matches:  [{0}] using [{1}] albums collected from [{2}] dbs".format(artistName, len(uniqueAlbums), len(albumsPerKnownDB)))
    print("  Will search for matches in these DBs: {0}".format(missingDBs))


    ######################################################################
    #### Search The Missing DBs
    ######################################################################
    return searchForArtistAlbumsDBEntries(mdb, artistName, uniqueAlbums, missingDBs, cutoff, num, debug)
            
            
def searchForMutualDBEntries(mdb, num=2, cutoff=0.8, debug=[None], minI=-1, maxR=50):
    """
    Run searchForMutualArtistDBEntries over the artists of `mdb` and collect
    every database for which a new ID was found.

    Args:
        mdb: artist map object providing getArtists.
        num, cutoff, debug: forwarded to searchForMutualArtistDBEntries.
        minI: artists with an index <= minI are skipped.
        maxR: the loop stops once more than maxR new (artist, db) results
            were collected.

    Returns:
        dict: {artistName: {db: result}} holding only the non-None results.
    """
    retval   = {}
    nR       = 0
    nVisited = 0   # stays 0 for an empty artist list (the index `i` used to be unbound)

    for i, artistName in enumerate(mdb.getArtists()):
        nVisited = i + 1
        if i <= minI:
            continue
        result = searchForMutualArtistDBEntries(mdb, artistName, num, cutoff, debug)
        for db, dbval in result.items():
            if dbval is not None:
                retval.setdefault(artistName, {})[db] = dbval
                nR += 1

        if nR > maxR:
            break

    # Report the number of artists iterated over (skipped ones included)
    # rather than the last zero-based index.
    print("Found {0} new artist matches after looping over {1} artists".format(len(retval), nVisited))
    return retval

In [None]:
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import numpy as np
from random import random

def r(size=0.1):
    """Return `size` times a uniform random number drawn from [-0.5, 0.5)."""
    return size*(random() - 0.5)

def func(x, a, b):
    """Model a + b*sin(pi*x/24); used as the fit function for curve_fit."""
    phase = np.pi*x/24.0
    return a + b*np.sin(phase)

In [None]:
# Number of days covered by the generated sample.
days = 7

In [None]:
# 250*days evenly spaced points over 0..24*days, the model evaluated with
# a=1.0, b=2.0, and the same curve with Gaussian noise of scale 0.2 added.
# NOTE: no random seed is set, so `ygen` differs between runs.
x = np.linspace(0,24*days,250*days)
y = [func(t, a=1.0, b=2.0) for t in x]
ygen = y + 0.2 * np.random.normal(size=x.size)

In [None]:
# Fit `func` to the noisy sample; popt holds the fitted (a, b) and pcov
# their covariance estimate.
xdata = x
ydata = ygen
popt, pcov = curve_fit(func, xdata, ydata)
popt

In [None]:
# Plot the noise-free curve, the noisy sample and the fitted curve together.
plt.plot(x,y, 'r--', label='truth')
plt.plot(x,ygen, 'g--', label='data')
plt.xlabel('Hour of Week')
plt.ylabel('Energy Usage [arbitrary units]')
plt.plot(xdata, func(xdata, *popt), 'b-', label='fit: a=%5.2f, b=%5.2f' % tuple(popt))
plt.legend()
plt.show()

In [None]:
# Paired example values for the root-mean-square error computed below.
predicted = [4, 25,  0.75, 11]
observed  = [3, 21, -1.25, 13]

In [None]:
# Root-mean-square error between `predicted` and `observed`.
if len(predicted) != len(observed):
    raise ValueError("Must be equal lengths")
import numpy as np
residual = list(zip(predicted, observed))
N = len(residual)
np.sqrt(sum((p - o)**2 for p, o in residual)/N)

In [None]:
import numpy as np
import pandas as pd


# Example joint distribution: one row per (X, Y) pair, with its probability
# in 'pr' (the four probabilities sum to 1).
distr_table = pd.DataFrame({
    'X': [0, 0, 1, 1],
    'Y': [1, 2, 1, 2],
    'pr': [0.25, 0.25, 0.15, 0.35]
})

class CheckIndependence:
    """Covariance, correlation and independence test for a discrete joint
    distribution of two variables given as a table of (X, Y, pr) rows."""

    def __init__(self):
        self.version = 1

    def check_independence(self, distr_table: pd.DataFrame, tol: float = 1e-4):
        """
        Compute cov(X, Y), corr(X, Y) and whether X and Y are independent.

        Args:
            distr_table: DataFrame with columns 'X', 'Y' and 'pr', one row
                per (X, Y) pair with its joint probability.
            tol: X and Y are reported as independent unless the summed
                absolute difference between P(X)*P(Y) and the joint
                probability over all rows exceeds this value.

        Returns:
            dict with keys "are_independent" (bool), "cov" (float) and
            "corr" (float; NaN when X or Y has zero standard deviation, in
            which case the correlation is undefined).
        """
        pr = distr_table['pr']
        x  = distr_table['X']
        y  = distr_table['Y']

        ### Marginal probability of each row's X (resp. Y) value, row-aligned
        prX = distr_table.groupby('X')['pr'].transform('sum')
        prY = distr_table.groupby('Y')['pr'].transform('sum')

        ### Means and standard deviations
        muX  = (x * pr).sum()
        muY  = (y * pr).sum()
        sigX = np.sqrt((pr * (x - muX)**2).sum())
        sigY = np.sqrt((pr * (y - muY)**2).sum())

        ### Covariance and correlation
        cov   = (pr * (x - muX) * (y - muY)).sum()
        denom = sigX * sigY
        # Avoid dividing by zero (and the RuntimeWarning it triggers).
        corr  = cov / denom if denom > 0 else float("nan")

        ### Independence: the joint pr must equal the product of the marginals
        diff = (prX * prY - pr).abs().sum()
        indy = not (diff > tol)

        return {"are_independent": bool(indy), "cov": float(cov), "corr": float(corr)}

In [None]:
# Run the check on the example table; the result dict is the cell output.
ci = CheckIndependence()
ci.check_independence(distr_table)

In [None]:
# Display the checker instance.
ci

In [None]:
# Distinct values taken by X.
distr_table['X'].unique()

In [None]:
# Mean and standard deviation of X and of Y from their marginal
# probabilities (step-by-step version of CheckIndependence).
muX  = 0
for Xval,df in distr_table.groupby('X'):
    prX   = df['pr'].sum()
    muX  += Xval*prX
sigX = 0
for Xval,df in distr_table.groupby('X'):
    # Recompute the marginal for this group; the loop used to reuse the
    # value left over from the last group of the loop above.
    prX   = df['pr'].sum()
    sigX += prX * (Xval - muX)**2
sigX = np.sqrt(sigX)

muY  = 0
for Yval,df in distr_table.groupby('Y'):
    prY  = df['pr'].sum()
    muY += Yval*prY
sigY = 0
for Yval,df in distr_table.groupby('Y'):
    prY  = df['pr'].sum()
    sigY += prY * (Yval - muY)**2
sigY = np.sqrt(sigY)

In [None]:
# Covariance: sum over rows of pr * (X - muX) * (Y - muY).
# Uses muX and muY from the cell above.
cov = 0
for i,row in distr_table.iterrows():
    pr   = row.pr
    xval = row.X
    yval = row.Y
    cov += pr*(xval - muX)*(yval - muY)

In [None]:

# NOTE(review): `diff` is only assigned inside
# CheckIndependence.check_independence in the visible cells; at notebook
# level this presumably raises NameError on a fresh kernel - confirm.
diff

In [None]:
# Marginal probability of every X value, as {X value: summed pr}.
# NOTE: rebinds `prX`, which the moments cell above uses as a scalar.
prX = {}
for Xval,df in distr_table.groupby('X'):
    prX[Xval] = df['pr'].sum()

In [None]:
# Display the marginal probabilities of X.
prX

In [None]:
# Marginal probability P(X = 0): sum of pr over the rows with X == 0.
distr_table[distr_table['X'] == 0]['pr'].sum()