In [26]:
###########################################################################
## Basic stuff
###########################################################################
%load_ext autoreload
%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))

###########################################################################
## Utils
###########################################################################
from timeUtils import timestat
from listUtils import getFlatList
from masterDBGate import masterDBGate
from pandas import isna, notna, Series, DataFrame, concat
from uuid import uuid4

###########################################################################
## DB
###########################################################################
from masterManualEntries import masterManualEntries
from masterArtistNameDB import masterArtistNameDB
from masterArtistMerger import masterArtistMerger
from masterMultiArtistDB import masterMultiArtistDB
from masterArtistNameCorrection import masterArtistNameCorrection
from convertByteString import convertByteString
from mainDB import mainDB

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Instantiate the manual-entry, merger and rename helpers used by the rest of the notebook.
mme        = masterManualEntries()
cbs        = convertByteString()
mam        = masterArtistMerger()
mma        = masterMultiArtistDB()
manc       = masterArtistNameCorrection()
manDB      = masterArtistNameDB("main")
multimanDB = masterArtistNameDB("multi")

# Rename table: index = other (alternative) name, value = fixed name.
manDBData  = manDB.getData()

# Inverse lookup: fixed name -> list of other names that map to it.
# `.items()` replaces `.iteritems()`, which pandas deprecated in 1.5 and removed in 2.0.
manInvData = {}
for otherName, fixName in manDBData.items():
    manInvData.setdefault(fixName, []).append(otherName)

Current Time is Mon Nov 01, 2021 08:10 for Getting Manual Mergers Data From Main Pickle File
Process [Getting Manual Mergers Data From Main Pickle File] Took 0.0 Seconds
masterArtistMerger Summary:
  DB ID Entries: 7195
  DB Entries:    2033
  Artists:       684
Current Time is Mon Nov 01, 2021 08:10 for Getting Manual Renames Data From Main Pickle File
Process [Getting Manual Renames Data From Main Pickle File] Took 0.0 Seconds
masterArtistNameDB Summary:
  Entries: 3634
Current Time is Mon Nov 01, 2021 08:10 for Getting Manual Renames Data From Main Pickle File
Process [Getting Manual Renames Data From Main Pickle File] Took 0.1 Seconds
  No duplicate key/values in manual renames
  No recursive key/values in manual renames
masterArtistNameDB("main") Summary:
  Entries: 45021
  Artists: 34171
Current Time is Mon Nov 01, 2021 08:10 for Getting Manual Renames Data From Main Pickle File
Process [Getting Manual Renames Data From Main Pickle File] Took 0.0 Seconds
  No duplicate key/values

In [3]:
def _strOrNone(value):
    """Return ``str(value)``, or None when pandas treats the value as missing."""
    return str(value) if notna(value) else None

# Normalise every cell of the manual-entries table to either a string or None.
# `.items()` replaces `.iteritems()`, which pandas deprecated in 1.5 and removed in 2.0.
df = mme.getDataFrame()
df = DataFrame({col: colData.apply(_strOrNone) for col, colData in df.items()})

Current Time is Mon Nov 01, 2021 08:10 for Getting Manual Entries Data From Main Pickle File
Process [Getting Manual Entries Data From Main Pickle File] Took 1.2 Seconds


In [4]:
def _cleanArtistNames(disc):
    """Return the disc's ArtistID -> pre-merge name data after three passes.

    Each name goes through ``manc.realName`` (first element kept), then
    ``manc.clean``, then ``cbs.convert``.
    """
    preMergeNames = disc.getArtistIDToPreMergeNameData()
    realNames     = preMergeNames.apply(lambda name: manc.realName(name)[0])
    cleanedNames  = realNames.apply(manc.clean)
    return cleanedNames.apply(cbs.convert)

ts = timestat("Getting ArtistID -> Clean Name Map")
mDiscs = masterDBGate().getDiscs()
# One cleaned ArtistID -> name lookup per database.
artistIDToCleanName = {db: _cleanArtistNames(disc) for db, disc in mDiscs.items()}
ts.stop()

Current Time is Mon Nov 01, 2021 08:10 for Getting ArtistID -> Clean Name Map
Process [Getting ArtistID -> Clean Name Map] Took 41.3 Seconds


In [5]:
ts = timestat("Getting MergerID -> Name Map")
# One lookup table per database: merger ID -> merged artist name.
mergerIDToName = {db: {} for db in mDiscs.keys()}
# `.items()` replaces `.iteritems()` (removed in pandas 2.0); it is available on both
# pandas Series and plain dicts, so it works whichever of the two getData() returns.
for artistName, artistData in mam.getData().items():
    for db, dbData in artistData.items():
        mergerIDToName[db][dbData["ID"]] = artistName
ts.stop()

Current Time is Mon Nov 01, 2021 08:10 for Getting MergerID -> Name Map
Current Time is Mon Nov 01, 2021 08:10 for Getting Manual Mergers Data From Main Pickle File
Process [Getting Manual Mergers Data From Main Pickle File] Took 0.1 Seconds
Process [Getting MergerID -> Name Map] Took 0.1 Seconds


In [6]:
def getCleanArtistName(dbID, db):
    """Resolve a database ID to a ``(name, dbID, isMerger)`` tuple.

    Lookup order for string IDs:
      1. ``mergerIDToName[db]``      -> ``(mergerName, dbID, True)``
      2. ``artistIDToCleanName[db]`` -> ``(cleanName, dbID, False)``
      3. otherwise a placeholder name: ``"NotDigit"`` when the ID is not purely
         digits, ``"NotInDB"`` when it is.

    Returns None for missing values (as judged by ``isna``).

    Raises:
        ValueError: if ``dbID`` is neither a string nor a missing value.
    """
    # Handle everything that is not a string ID up front.
    if not isinstance(dbID, str):
        if isna(dbID):
            return None
        raise ValueError("Unsure how to get name for ID [{0}]/[{1}]".format(db,dbID))

    mergedName = mergerIDToName[db].get(dbID)
    if mergedName is not None:
        return (mergedName, dbID, True)

    knownName = artistIDToCleanName[db].get(dbID)
    if knownName is not None:
        return (knownName, dbID, False)

    placeholder = "NotInDB" if dbID.isdigit() else "NotDigit"
    return (placeholder, dbID, False)

ts = timestat("Joining ID To Name For {0} Entries And {1} DBs".format(df.shape[0],df.shape[1]))
# Resolve every ID column that belongs to a known database into (name, ID, isMerger) tuples.
# `.items()` replaces `.iteritems()`, which pandas deprecated in 1.5 and removed in 2.0.
dfNameData = DataFrame({db: dbDFData.apply(getCleanArtistName, db=db) for db, dbDFData in df.items() if db in mDiscs})
# Re-attach the artist name and make it the first column.
colnames   = ["ArtistName"] + list(dfNameData.columns)
dfNameData = dfNameData.join(df["ArtistName"])[colnames]
ts.stop()

Current Time is Mon Nov 01, 2021 08:10 for Joining ID To Name For 371148 Entries And 12 DBs
Process [Joining ID To Name For 371148 Entries And 12 DBs] Took 18.1 Seconds


### Fix Merger IDs

In [None]:
def fixMergerIDs(df, mam):
    """Overwrite the database IDs of merged artists with the IDs stored in ``mam``.

    A row is a candidate when, in at least one column that is also a key of the
    module-level ``artistIDToCleanName``, its ID has that column's maximum
    length. NOTE(review): presumably merger IDs are the longest IDs in a
    column -- confirm this heuristic.

    For each candidate row the merge data is looked up by ``row["ArtistName"]``
    and every ``db -> ID`` pair found is written into ``df`` in place.

    Args:
        df:  manual-entries DataFrame with an "ArtistName" column; modified in place.
        mam: object providing ``getArtistDataByName(name)``, returning a
             ``{db: {"ID": ...}}`` mapping or None.

    Returns:
        List of index labels of candidate rows for which ``mam`` had no merge
        data. (Previously this list was collected but discarded.)
    """
    def idLength(dbID):
        return len(dbID) if dbID is not None else 0

    # Collect, per known-database column, the entries whose ID has the maximum length.
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
    longestIDs = []
    for db, dbData in df.items():
        if db not in artistIDToCleanName:
            continue
        idLengths = dbData.apply(idLength)  # computed once per column
        longestIDs.append(dbData[idLengths == idLengths.max()])
    if not longestIDs:
        return []
    mergedRows = concat(longestIDs).index.drop_duplicates()

    idxs = []
    for idx, row in df.loc[mergedRows].iterrows():
        artistName = row["ArtistName"]
        mergeData  = mam.getArtistDataByName(artistName)
        print(artistName)
        if mergeData is None:
            # No merge record for this name: report it to the caller, leave the row untouched.
            idxs.append(idx)
            continue
        for db, dbMergeData in mergeData.items():
            mergeID = dbMergeData["ID"]
            print("\t{0: <16}{1}  -->  {2}".format(db, row[db], mergeID))
            df.loc[idx, db] = mergeID
    return idxs

In [None]:
#mme.saveData(manualEntries=df, local=False)

In [7]:
def isMerger(row):
    """Return True when at least one value in ``row`` is a known merger ID.

    Every value of the row is passed to ``mam.getArtistDataByMergerID``; the
    scan stops at the first value for which it returns something other than None.
    """
    return any(mam.getArtistDataByMergerID(dbID) is not None for dbID in row.values)
ts = timestat("Find Merged Artist Data")
# Boolean mask over the rows of df: True where isMerger() accepts the row.
mergedArtists = df.apply(isMerger, axis=1)
# Index labels of the rows flagged above.
mergedIDXs    = df.loc[mergedArtists].index
ts.stop()

Current Time is Mon Nov 01, 2021 08:11 for Find Merged Artist Data
Process [Find Merged Artist Data] Took 2.5 Seconds


In [None]:
# Spot-check: show the resolved name data for every row whose ArtistName is "Alice Cooper".
dfNameData.loc[dfNameData["ArtistName"] == "Alice Cooper"]

In [9]:
class artistGroup:
    """Container for one artist group: its key, names, renames, database IDs and child groups."""

    def __init__(self, key, debug=False):
        """Create an empty group.

        Args:
            key:   identifier of this group (returned by getKey()).
            debug: stored on the instance; not used by the methods of this class.
        """
        self.key   = key
        self.debug = debug
        
        ############################################################################
        # General And Diagnostic
        ############################################################################
        self.groupType  = None
        self.terminal   = True # Becomes False once an artistGroup is added via addGroup()
        self.mmeID      = None
        
        
        ############################################################################
        # Database Matches
        ############################################################################
        self.dbIDs = {}
        
        
        ############################################################################
        # Artist Group Names
        ############################################################################
        
        ### Upper-cased version of the assigned name (see setAssignedName)
        self.searchName = None
        
        ### My Choice of Group Name (very arbitrary. must be in stylized or latin names)
        self.assignedName = None
        
        ### Stylized Names (any weird way group's name is written)
        self.stylizedNames = []
        
        ### Latin Names (Ascii if possible, something readable in English)
        self.latinNames = []
        
        ### Renames (Mapping between name and one of names in stylized or latin names)
        self.dbRenames  = {}
        self.genRenames = {}
        
        ### A collection of other ArtistGroup items, keyed by their getKey() value
        self.groups = {}
        
        
    
    ################################################################################################################################
    # General
    ################################################################################################################################
    def show(self):
        """Print the key, names, database matches and renames of this group."""
        print("{0: <20}: {1}".format("Key", self.key))
        print("{0: <20}: {1}".format("Assigned Name", self.assignedName))
        print("{0: <20}: {1}".format("Search Name", self.searchName))
        print("{0: <20}: {1}".format("DB Matches", self.dbIDs))
        print("{0: <20}: {1}".format("DB Renames", self.dbRenames))
        print("{0: <20}: {1}".format("General Renames", self.genRenames))
        
        
    ################################################################################################################################
    # Getters and Setters
    ################################################################################################################################
    def getKey(self):
        """Return the key this group was created with."""
        return self.key
    
    def setDBIDs(self, dbIDs):
        """Replace the database-ID mapping."""
        self.dbIDs = dbIDs
    
    def setAssignedName(self, assignedName):
        """Set the assigned name and derive the search name as its upper-cased form."""
        self.assignedName = assignedName
        self.searchName   = assignedName.upper()
        
    def setDBRenames(self, dbRenames):
        """Replace the per-database rename mapping."""
        self.dbRenames = dbRenames
        
    def setGenRenames(self, genRenames):
        """Replace the general rename mapping."""
        self.genRenames = genRenames
        
    def addGroup(self, ag):
        """Register a child artistGroup under its key and mark this group as non-terminal.

        Objects that are not artistGroup instances are ignored.
        """
        if isinstance(ag, artistGroup):
            # Call getKey(): indexing by the bound method itself stored the child
            # under a method object, so it could never be looked up by its key.
            self.groups[ag.getKey()] = ag
            self.terminal = False

In [105]:
def createArtistGroupData(row, idx, manDB, mergedArtists, key=None):
    """Build the artistGroup for one row of resolved name data.

    Args:
        row:           row (Series) with an "ArtistName" entry plus, per database,
                       either a ``(name, ID, isMerger)`` tuple or a non-tuple value
                       (non-tuple values are skipped).
        idx:           index label of the row; stored as ``mmeID``.
        manDB:         object providing ``renamed(name)``.
        mergedArtists: Series of artist names that belong to merged rows; renames
                       are only collected for names not found in it.
        key:           key of the new group. When omitted, the module-level ``key``
                       is used, which is what the original implementation always
                       did implicitly.

    Also reads the module-level ``manInvData`` (fixed name -> other names).

    Returns:
        The populated artistGroup.

    Raises:
        NameError: if ``key`` is not passed and no module-level ``key`` exists.
    """
    if key is None:
        # Backward compatibility: existing callers bind `key` at module level
        # right before the call instead of passing it.
        try:
            key = globals()["key"]
        except KeyError:
            raise NameError("name 'key' is not defined") from None

    artistName = row["ArtistName"]

    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
    artistDBData = {db: dbData for db, dbData in row.items() if isinstance(dbData, tuple)}
    # Placeholder names produced for unknown IDs are not real database names.
    dbNames  = {db: dbData[0] for db, dbData in artistDBData.items() if dbData[0] not in ["NotInDB", "NotDigit"]}
    dbIDs    = {db: dbData[1] for db, dbData in artistDBData.items()}
    hasMergedID = any(dbData[2] is True for dbData in artistDBData.values())
    if len(dbNames) == 0:
        # Diagnostic: this row has no usable database name at all.
        print(idx,'\t',artistName)

    ag = artistGroup(key=key)
    ag.mmeID = idx
    ag.terminal = not hasMergedID
    ag.setAssignedName(artistName)

    unMerged = mergedArtists.isin([artistName]).sum() == 0
    if unMerged:
        # Per database: {database name: renamed name}, keeping only actual changes.
        dbRenames  = {db: {dbName: manDB.renamed(dbName)} for db, dbName in dbNames.items()}
        dbRenames  = {db: dbRename for db, dbRename in dbRenames.items() if list(dbRename.keys()) != list(dbRename.values())}
        # Remaining other names of this artist that no database rename already covers.
        genRenames = {rename: artistName for rename in manInvData.get(artistName, {}) if {rename: artistName} not in dbRenames.values()}
    else:
        dbRenames  = {}
        genRenames = {}
    ag.setDBRenames(dbRenames)
    ag.setGenRenames(genRenames)

    ag.setDBIDs(dbIDs)

    return ag

In [107]:
indivAGS  = {}
mergedAGS = {}
N   = dfNameData.shape[0]
ts  = timestat("Creating Artist Groups For {0} \'Artists\'".format(N))
# Artist names of the rows flagged as merged (this rebinds the earlier boolean mask of the same name).
mergedArtists = df.loc[mergedIDXs, "ArtistName"]

for nDone, (idx, row) in enumerate(dfNameData.iterrows(), start=1):
    # Progress report after 10,000 rows and then every 50,000 rows.
    if nDone == 10000 or nDone % 50000 == 0:
        ts.update(n=nDone, N=N)

    # createArtistGroupData reads the module-level `key`, so it must be bound before the call.
    key  = str(uuid4())
    data = createArtistGroupData(row, idx, manDB, mergedArtists)
    target = mergedAGS if idx in mergedIDXs else indivAGS
    target[key] = data

for label, count in (("All Artists", dfNameData.shape[0]),
                     ("Individual Artists", len(indivAGS)),
                     ("Merged Artists", len(mergedAGS))):
    print("{0: <30}{1: >6}".format(label, count))

ts.stop()

Current Time is Mon Nov 01, 2021 09:52 for Creating Artist Groups For 371148 'Artists'
10000/371148 : Process [Creating Artist Groups For 371148 'Artists'] Has Run For 3.7 Seconds.  ETA is 133.6 Seconds
aaaaaaaaXXX0008239XXX01 	 Alice Cooper
aaaaaaaaXXX0013980XXX05 	 Anima
50000/371148 : Process [Creating Artist Groups For 371148 'Artists'] Has Run For 18.5 Seconds.  ETA is 118.8 Seconds
100000/371148 : Process [Creating Artist Groups For 371148 'Artists'] Has Run For 37.5 Seconds.  ETA is 101.7 Seconds
150000/371148 : Process [Creating Artist Groups For 371148 'Artists'] Has Run For 56.5 Seconds.  ETA is 83.3 Seconds
200000/371148 : Process [Creating Artist Groups For 371148 'Artists'] Has Run For 1.3 Minutes.  ETA is 1.1 Minutes
250000/371148 : Process [Creating Artist Groups For 371148 'Artists'] Has Run For 1.6 Minutes.  ETA is 0.8 Minutes
300000/371148 : Process [Creating Artist Groups For 371148 'Artists'] Has Run For 1.9 Minutes.  ETA is 0.5 Minutes
350000/371148 : Process [Crea

In [108]:
# Summary of how many artist groups ended up in each bucket.
for label, count in (("All Artists", dfNameData.shape[0]),
                     ("Individual Artists", len(indivAGS)),
                     ("Merged Artists", len(mergedAGS))):
    print("{0: <30}{1: >6}".format(label, count))

All Artists                   371148
Individual Artists            370478
Merged Artists                   670


In [113]:
ts = timestat("Split Renames By Known DB Renames")

# Start from the full rename table on every run so the cell is re-runnable.
manDBDataRemaining   = manDBData
ags = {"Individual": indivAGS, "Merged": mergedAGS}
knownDBRenames = []
for agType,agData in ags.items():
    # Flatten every group's per-database renames into a single {name: rename} lookup.
    dbRenameData = {k: v for ag in agData.values() for dbRename in ag.dbRenames.values() for k,v in dbRename.items()}
    # dtype=object keeps an empty lookup from producing a float Series (and a pandas warning).
    manDBDataTemp      = DataFrame(manDBDataRemaining, columns=["PermReplace"]).join(Series(dbRenameData, name="dbRename", dtype=object))
    isKnownDBRename    = manDBDataTemp["dbRename"].notna()
    manDBDataRemaining = manDBDataTemp.loc[~isKnownDBRename, "PermReplace"]
    knownThisPass      = manDBDataTemp.loc[isKnownDBRename, "PermReplace"]
    knownDBRenames.append(knownThisPass)

    print("{0: <30}{1: >6}".format("Perm Renames", manDBDataTemp.shape[0]))
    print("{0: <30}{1: >6}".format("Known DB Renames", knownThisPass.shape[0]))
    print("{0: <30}{1: >6}".format("Remaining Renames", manDBDataRemaining.shape[0]))

# Keep the matches of every pass. Previously this name was overwritten per pass,
# so only the last pass's matches survived the loop.
manDBDataDBRename = concat(knownDBRenames)
ts.stop()



Current Time is Mon Nov 01, 2021 09:59 for Split Renames By Known DB Renames
Perm Renames                   45021
Known DB Renames               26976
Remaining Renames              18045
Perm Renames                   18045
Known DB Renames                   0
Remaining Renames              18045
Process [Split Renames By Known DB Renames] Took 0.7 Seconds


  


In [110]:
ts = timestat("Split Renames By Known General Renames")
# Merge the general renames of all individual groups into one lookup (later groups win on duplicate keys).
genRenameData = {rename: fixed for ag in indivAGS.values() for rename, fixed in ag.genRenames.items()}
# Left-join the remaining renames against that lookup and split on whether a match was found.
manDBDataTemp      = DataFrame(manDBDataRemaining, columns=["PermReplace"]).join(Series(genRenameData, name="genRename"))
hasGenRename       = manDBDataTemp["genRename"].notna()
manDBDataGenRename = manDBDataTemp.loc[hasGenRename, "PermReplace"]
manDBDataRemaining = manDBDataTemp.loc[~hasGenRename, "PermReplace"]

for label, count in (("(Perm-DB) Renames", manDBDataTemp.shape[0]),
                     ("Known Gen Renames", manDBDataGenRename.shape[0]),
                     ("Remaining Renames", manDBDataRemaining.shape[0])):
    print("{0: <30}{1: >6}".format(label, count))
ts.stop()

Current Time is Mon Nov 01, 2021 09:55 for Split Renames By Known General Renames
(Perm-DB) Renames              18045
Known Gen Renames               8794
Remaining Renames               9251
Process [Split Renames By Known General Renames] Took 0.1 Seconds


In [102]:
ts = timestat("Split Renames By Merged Renames")
manDBDataTemp        = manDBDataRemaining
# Renames whose target is the ArtistName of a merged row versus all the others.
mergedNames          = df.loc[mergedIDXs, "ArtistName"]
isMergedName         = manDBDataTemp.isin(mergedNames)
manDBDataMergeRename = manDBDataTemp[isMergedName]
manDBDataRemaining   = manDBDataTemp[~isMergedName]
ts.stop()

for label, count in (("(Perm-DB-Merge) Renames", manDBDataTemp.shape[0]),
                     ("Known Merge Renames", manDBDataMergeRename.shape[0]),
                     ("Not Merge Renames", manDBDataRemaining.shape[0])):
    print("{0: <30}{1: >6}".format(label, count))

Current Time is Mon Nov 01, 2021 09:34 for Split Renames By Merged Renames
Process [Split Renames By Merged Renames] Took 0.0 Seconds
(Perm-DB-Merge) Renames         9251
Known Merge Renames             2681
Not Merge Renames               6570


In [95]:
# Spot-check: which of the still-unexplained renames map to "Dave Matthews"?
manDBDataRemaining.loc[manDBDataRemaining.isin(["Dave Matthews"])]

Dave Matthews & His Orchestra      Dave Matthews
Dave Matthews And His Orchestra    Dave Matthews
Dave Matthews and His Orchestra    Dave Matthews
Dave Matthews' Big Band            Dave Matthews
Dave Matthews Band                 Dave Matthews
Name: PermReplace, dtype: object

In [104]:
# Display the renames whose target name belongs to a merged artist (pandas truncates the long Series).
manDBDataMergeRename

ABBA (Björn & Benny, Agnetha & Frida)                        ABBA
ABBA - Agnetha, Björn, Benny, Anna-Frid                      ABBA
ABBA (Björn, Benny, Agnetha & Frida)                         ABBA
Björn Benny & Agnetha Frida                                  ABBA
Aaron Lewis Of Staind                                 Aaron Lewis
                                                    ...          
Ziggy Marley And The Melody Makers                   Ziggy Marley
Ziggy Marley and The Melody Makers                   Ziggy Marley
Zoot Money's Big Roll Band                             Zoot Money
death’s dynamic shroud                     death's dynamic shroud
death's dynamic shroud.wmv                 death's dynamic shroud
Name: PermReplace, Length: 2681, dtype: object