In [None]:
%load_ext autoreload
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
from dbmaster import MasterParams, MasterPersist
from dbbase import MusicDBIDModVal, MusicDBDir, MusicDBData
from dbnote import DownloadRecord, MergeSearchArtist, ConcatRawData
from utils import FileIO, DirInfo, FileInfo, getFlatList, Timestat, TermTime, TermTimeTS, getTT
from pandas import Series, DataFrame, concat, Timestamp
from pandb import PanDBIO
from musicdb.genius import MusicDBParams, RawAPIData, MusicDBIO
from functools import partial
from os import getpid

# Shared helper instances used by the cells below.
mv = MusicDBIDModVal()  # its getModVal() is mapped over artist IDs in the download-prep cells
io = FileIO()
mpar = MusicDBParams()
dbio = MusicDBIO()  # re-created with explicit verbose/local/mkDirs options by later cells
apiio = RawAPIData()  # re-created with debug=False by later cells
db = mpar.db  # value passed as db= to the records below and interpolated into log messages

In [None]:
# One DownloadRecord per pipeline stage; the loops below use their
# isKnown() / setData() / setIndex() / setError() / save() methods.
searchArtistRecord = DownloadRecord(db=db, name="SearchArtist", rTypes=["Index", "Data"])
downloadArtistRecord = DownloadRecord(db=db, name="DownloadArtist", rTypes=["Index"])
downloadArtistSongRecord = DownloadRecord(db=db, name="DownloadArtistSong", rTypes=["Index"])
downloadAlbumRecord = DownloadRecord(db=db, name="DownloadAlbum", rTypes=["Index"])
downloadSongRecord = DownloadRecord(db=db, name="DownloadSong", rTypes=["Index"])
# Both accessors are bound to the dbio instance that exists right now;
# re-assigning dbio in a later cell does not change what they call.
allArtists = partial(dbio.rdio.getData, "SearchArtist")
knownArtists = dbio.rdio.getSummaryNameData

In [None]:
# One-off migration that renames legacy "SearchedFor*" pickle files to the
# "<db>-<name>-<type>" naming. Disabled by default; set the guard to True to run it.
if False:
    db = dbio.params.db
    dbdir = dbio.params.dir
    # Legacy record name -> current record name.
    nameMap = {
        "MasterArtists": "SearchArtist",
        "LocalArtists": "DownloadArtist",
        "LocalAlbums": "DownloadAlbum",
        "LocalArtistSongs": "DownloadArtistSong",
        "LocalSongs": "DownloadSong",
    }
    # NOTE(review): machine-specific absolute path; adjust before enabling this block elsewhere.
    dinfo = DirInfo(f"/Users/tgadfort/anaconda3/envs/post/pandb/musicdb/{db}")

    # Old file stem -> new file stem, for the index, data and error files of each record.
    cpMap = {}
    for origname, newname in nameMap.items():
        cpMap[f"{dbdir}SearchedFor{origname}"] = f"{db}-{newname}-Index"
        cpMap[f"{dbdir}SearchedFor{origname}Data"] = f"{db}-{newname}-Data"
        cpMap[f"{dbdir}SearchedFor{origname}Errors"] = f"{db}-{newname}-Error"
    assert dinfo.exists(), f"{dinfo} does not exist!"
    for srcFile, dstFile in cpMap.items():
        srcFile = dinfo.join(f"{srcFile}.p")
        dstFile = dinfo.join(f"{dstFile}.p")
        # Only files that are actually present are moved; missing ones are skipped silently.
        if srcFile.exists():
            srcFile.mvFile(dstFile)

In [None]:
##########################################################################################
# Show Summary
##########################################################################################
# Header line: the f-string already interpolates db and the process id, so no
# .format() call is needed (it would also raise if the text contained literal braces).
print(f"{db} Search Results (PID={getpid()})")
# Per-stage record summaries.
searchArtistRecord.info()
downloadArtistRecord.info()
downloadArtistSongRecord.info()
downloadAlbumRecord.info()
downloadSongRecord.info()
# Row count of the summary name data, aligned with the labels above.
print(f"  {'KnownArtist Data': <20}: {knownArtists().shape[0]}")

# Search For New Artists

In [None]:
######################################################################################################
# Explode Recent Data
######################################################################################################
def getNewData(searchArtistRecord):
    """Flatten the record's stored search results into one de-duplicated Series.

    Args:
        searchArtistRecord: Object whose getData() returns a mapping of
            per-search results (each value convertible to a Series).

    Returns:
        An empty DataFrame when the record holds no data; otherwise a Series
        with string index labels, keeping the first entry for each label.
    """
    rawResults = searchArtistRecord.getData()
    if not len(rawResults):
        return DataFrame()

    pieces = [Series(result) for result in rawResults.values()]
    combined = concat(pieces)
    # Normalise labels to str first so that e.g. 12 and "12" count as the same entry.
    combined.index = combined.index.astype(str)
    isFirstSeen = ~combined.index.duplicated()
    return combined[isFirstSeen]

def mergeNewData(searchArtistRecord):
    """Reload the search record and merge its flattened data locally.

    Args:
        searchArtistRecord: Record that is reloaded, flattened with
            getNewData(), and passed on to MergeSearchArtist.mergeLocal().
    """
    searchArtistRecord.load(verbose=False)
    flattened = getNewData(searchArtistRecord)
    MergeSearchArtist(db).mergeLocal(searchArtistRecord, flattened, test=False)

In [None]:
# Fresh I/O helpers for the search step (local mode, no directory creation).
dbio = MusicDBIO(verbose=False, local=True, mkDirs=False)
apiio = RawAPIData(debug=False)

# Candidate names come from PanDB; merge outstanding search data before
# asking the record which of them are already known.
candidateNames = PanDBIO().getUniqueArtistNames()
mergeNewData(searchArtistRecord)
alreadySearched = Series(searchArtistRecord.getIndex())
isSearched = candidateNames.map(searchArtistRecord.isKnown)
artistNamesToGet = candidateNames[~isSearched]

print(f"# {db} Search Results")
print(f"#   Available Names:     {candidateNames.shape[0]}")
print(f"#   Known Artist Names:  {alreadySearched.shape[0]}")
print(f"#   Artist Names To Get: {artistNamesToGet.shape[0]}")

# Only artistNamesToGet is needed by the search loop; drop the rest.
del alreadySearched, candidateNames, isSearched

In [None]:
# ts is updated every 25 searches and stopped at the end of the loop below;
# tt.isFinished() is polled there to decide when to stop.
ts = Timestat(f"Getting {db} ArtistIDs")
tt = getTT(skipEOD=False, vacation=True)
# Refuse to run unless the I/O layer reports local mode (plain string: nothing to interpolate).
assert dbio.rdio.isLocal, "MusicDBIO is not set for local downloads!"

def isError(artistName, nErrors, sleeptime, error=None):
    """Record a failed artist search, log it, and pause before continuing.

    Args:
        artistName: Name that was searched; flagged via searchArtistRecord.setError().
        nErrors: List of failed names; artistName is appended to it in place.
        sleeptime: Value handed to apiio.sleep() after logging.
        error: Exception or short reason shown in the log line.
    """
    searchArtistRecord.setError(index=artistName)
    failureLine = f"Search Error ==> {artistName}: {error}"
    print(failureLine)
    nErrors.append(artistName)
    apiio.sleep(sleeptime)
    
n = 0              # successful searches completed in this run
maxN = 250000000   # upper bound on searches per run
nErrors = []       # names that failed since the last successful search
searchArtistRecord.load(verbose=False)
try:
    for _idx, artistName in artistNamesToGet.items():
        if searchArtistRecord.isKnown(artistName):
            continue

        # nErrors is emptied after every success, so five entries means five failures in a row.
        if len(nErrors) >= 5:
            print("Stopping due to 5 consecutive errors")
            break

        try:
            response = apiio.getArtistSearchData(artistName=artistName)
        except Exception as error:
            isError(artistName, nErrors, 10, error)
            continue

        # Anything other than a dict is treated as a failed search.
        if not isinstance(response, dict):
            isError(artistName, nErrors, 3.5, "NotDict")
            continue

        nErrors = []
        searchArtistRecord.setData(index=artistName, data=response)
        apiio.sleep(2.5)
        n += 1

        # Every 25 successes: report progress, checkpoint the record, pause,
        # and stop if the time window has ended.
        if n % 25 == 0:
            ts.update(n=n)
            searchArtistRecord.save()
            apiio.wait(10.0)
            if tt.isFinished():
                break

        if n >= maxN:
            print(f"Breaking after {maxN} downloads...")
            break
finally:
    # Also runs when the cell is interrupted or raises, so results gathered
    # since the last checkpoint are still written out.
    ts.stop()
    searchArtistRecord.save()

# Download Artist Data

In [None]:
# Fresh I/O helpers for the artist download step (local mode, directories created).
dbio = MusicDBIO(verbose=False, local=True, mkDirs=True)
apiio = RawAPIData(debug=False)

# Every artist returned by the search step, tagged with its ModVal.
searchedArtists = DataFrame(allArtists()).rename(columns={0: "Name"})
searchedArtists['ModVal'] = searchedArtists.index.map(mv.getModVal)

# Keep only the artists whose ID is not yet in the download record's index.
alreadyDownloaded = Series(downloadArtistRecord.getIndex())
isPending = ~searchedArtists.index.isin(alreadyDownloaded.index)
pendingArtists = searchedArtists[isPending]
# One entry per ModVal, each holding that ModVal's sub-frame.
artistNamesToGet = Series({bucket: frame for bucket, frame in pendingArtists.groupby("ModVal")})

print(f"# {db} Search Results")
print(f"#   Available Names:     {searchedArtists.shape[0]}")
print(f"#   Known Artist Names:  {alreadyDownloaded.shape[0]}")
print(f"#   Artist Names To Get: {pendingArtists.shape[0]}")

# Only artistNamesToGet is needed by the download loop; drop the rest.
del pendingArtists, alreadyDownloaded, searchedArtists, isPending

In [None]:
# ts is updated every 25 downloads and stopped at the end of the loop below;
# tt.isFinished() is polled there to decide when to stop.
ts = Timestat(f"Getting {db} ArtistIDs")
tt = getTT(skipEOD=False, vacation=True)
# Refuse to run unless the I/O layer reports local mode (plain string: nothing to interpolate).
assert dbio.rdio.isLocal, "MusicDBIO is not set for local downloads!"

def isError(artistID, artistName, nErrors, sleeptime, error=None):
    """Record a failed artist download, log it, and pause before continuing.

    Args:
        artistID: ID that was requested; flagged via downloadArtistRecord.setError().
        artistName: Name shown in the log line.
        nErrors: List of failed IDs; artistID is appended to it in place.
        sleeptime: Value handed to apiio.sleep() after logging.
        error: Exception or short reason shown in the log line.
    """
    downloadArtistRecord.setError(index=artistID)
    failureLine = f"Download Error ==> {artistName}: {error}"
    print(failureLine)
    nErrors.append(artistID)
    apiio.sleep(sleeptime)
    
n = 0              # successful downloads completed in this run
maxN = 250000000   # upper bound on downloads per run
stop = False       # set inside the inner loop to end the outer loop as well
nErrors = []       # artist IDs that failed since the last successful download
try:
    for modVal, modValData in artistNamesToGet.items():
        if stop:
            break
        N = modValData.shape[0]
        for i, (artistID, artistName) in enumerate(modValData['Name'].items()):
            if downloadArtistRecord.isKnown(artistID):
                continue

            # nErrors is emptied after every success, so five entries means five failures in a row.
            if len(nErrors) >= 5:
                print("Stopping due to 5 consecutive errors")
                stop = True
                break

            # Progress prefix: ModVal | position in group | group size | downloads so far.
            print(f"{modVal: <8} | {i: <8} | {N: <8} | {n: <8} | ", end="")
            try:
                response = apiio.getArtistData(artistID=artistID, artistName=artistName)
            except Exception as error:
                isError(artistID, artistName, nErrors, 5, error)
                continue

            # Anything other than a dict is treated as a failed download.
            if not isinstance(response, dict):
                isError(artistID, artistName, nErrors, 3.5, "NotDict")
                continue

            nErrors = []
            downloadArtistRecord.setIndex(index=artistID)
            dbio.rdio.saveData("RawArtist", modVal, artistID, data=response)
            apiio.sleep(2.0)
            n += 1

            # Every 25 successes: report progress, checkpoint the record, pause,
            # and stop if the time window has ended.
            if n % 25 == 0:
                ts.update(n=n)
                downloadArtistRecord.save()
                apiio.wait(15.0)
                if tt.isFinished():
                    stop = True
                    break

            if n >= maxN:
                print(f"Breaking after {maxN} downloads...")
                stop = True
                break
finally:
    # Also runs when the cell is interrupted or raises, so index entries added
    # since the last checkpoint are still written out.
    ts.stop()
    downloadArtistRecord.save()

In [None]:
# Stand-alone save of the artist download record.
# NOTE(review): presumably kept for saving by hand after interrupting the loop cell above — confirm.
downloadArtistRecord.save()

In [None]:
# ConcatRawData helper for dType="Artist".
# NOTE(review): the three calls below are commented out, presumably so they can be
# enabled and run by hand one at a time — confirm the intended order before using them.
crd = ConcatRawData(db=dbio.db, dType="Artist")
#crd.concat()
#crd.merge(test=False)
#crd.remove(force=True)

# Download Artist Songs Data

In [None]:
# Fresh I/O helpers for the artist-song download step (local mode, no directory creation).
dbio = MusicDBIO(verbose=False, local=True, mkDirs=False)
apiio = RawAPIData(debug=False)

# Every artist returned by the search step, tagged with its ModVal.
searchedArtists = DataFrame(allArtists()).rename(columns={0: "Name"})
searchedArtists['ModVal'] = searchedArtists.index.map(mv.getModVal)

# Keep only the artists whose ID is not yet in the artist-song record's index.
alreadyDownloaded = Series(downloadArtistSongRecord.getIndex())
isPending = ~searchedArtists.index.isin(alreadyDownloaded.index)
pendingArtists = searchedArtists[isPending]
# One entry per ModVal, each holding that ModVal's sub-frame.
artistNamesToGet = Series({bucket: frame for bucket, frame in pendingArtists.groupby("ModVal")})

print(f"# {db} Search Results")
print(f"#   Available Names:     {searchedArtists.shape[0]}")
print(f"#   Known Artist Names:  {alreadyDownloaded.shape[0]}")
print(f"#   Artist Names To Get: {pendingArtists.shape[0]}")

# Only artistNamesToGet is needed by the download loop; drop the rest.
del pendingArtists, alreadyDownloaded, searchedArtists, isPending

In [None]:
# ts is updated every 25 downloads and stopped at the end of the loop below;
# tt.isFinished() is polled there to decide when to stop.
ts = Timestat(f"Getting {db} ArtistIDs")
tt = getTT(skipEOD=False, vacation=True)
# Refuse to run unless the I/O layer reports local mode (plain string: nothing to interpolate).
assert dbio.rdio.isLocal, "MusicDBIO is not set for local downloads!"

def isError(artistID, artistName, nErrors, sleeptime, error=None):
    """Record a failed artist-song download, log it, and pause before continuing.

    Args:
        artistID: ID that was requested; flagged via downloadArtistSongRecord.setError().
        artistName: Name shown in the log line.
        nErrors: List of failed IDs; artistID is appended to it in place.
        sleeptime: Value handed to apiio.sleep() after logging.
        error: Exception or short reason shown in the log line.
    """
    downloadArtistSongRecord.setError(index=artistID)
    failureLine = f"Download Error ==> {artistName}: {error}"
    print(failureLine)
    nErrors.append(artistID)
    apiio.sleep(sleeptime)
    
n = 0            # successful downloads completed in this run
maxN = 2500000   # upper bound on downloads per run
stop = False     # set inside the inner loop to end the outer loop as well
nErrors = []     # artist IDs that failed since the last successful download
try:
    for modVal, modValData in artistNamesToGet.items():
        if stop:
            break
        N = modValData.shape[0]
        for i, (artistID, artistName) in enumerate(modValData['Name'].items()):
            if downloadArtistSongRecord.isKnown(artistID):
                continue

            # nErrors is emptied after every success, so five entries means five failures in a row.
            if len(nErrors) >= 5:
                print("Stopping due to 5 consecutive errors")
                stop = True
                break

            # Progress prefix: ModVal | position in group | group size | downloads so far.
            print(f"{modVal: <8} | {i: <8} | {N: <8} | {n: <8} | ", end="")
            try:
                response = apiio.getArtistSongsData(artistID=artistID, artistName=artistName)
            except Exception as error:
                isError(artistID, artistName, nErrors, 5, error)
                continue

            # Anything other than a dict is treated as a failed download.
            if not isinstance(response, dict):
                isError(artistID, artistName, nErrors, 3.5, "NotDict")
                continue

            nErrors = []
            downloadArtistSongRecord.setIndex(index=artistID)
            dbio.rdio.saveData("RawArtistSong", modVal, artistID, data=response)
            apiio.sleep(2.0)
            n += 1

            # Every 25 successes: report progress, checkpoint the record, pause,
            # and stop if the time window has ended.
            if n % 25 == 0:
                ts.update(n=n)
                downloadArtistSongRecord.save()
                apiio.wait(15.0)
                if tt.isFinished():
                    stop = True
                    break

            if n >= maxN:
                print(f"Breaking after {maxN} downloads...")
                stop = True
                break
finally:
    # Also runs when the cell is interrupted or raises, so index entries added
    # since the last checkpoint are still written out.
    ts.stop()
    downloadArtistSongRecord.save()

In [None]:
# Stand-alone save of the artist-song download record.
# NOTE(review): presumably kept for saving by hand after interrupting the loop cell above — confirm.
downloadArtistSongRecord.save()

In [None]:
# Build one row per media item across all ModVal groups: code -> [artistID, album, url].
# NOTE(review): assumes ModVals are numbered 0-99 — confirm against MusicDBIDModVal.
mediaData = {}
for modVal in range(100):
    modValData = dbio.data.getModValData(modVal)
    modValMediaData = {}
    # items() replaces iteritems(), which pandas 2.0 removed (modValData is presumably a
    # pandas Series — confirm); it also matches the .items() calls used elsewhere in this notebook.
    for artistID, artistIDData in modValData.items():
        for mediaType, mediaTypeData in artistIDData.media.media.items():
            modValMediaData.update({code: [artistID, media.album, media.url] for code, media in mediaTypeData.items()})
    mediaData.update(modValMediaData)
    # Progress message after every tenth ModVal.
    if (modVal+1) % 10 == 0:
        print(f"ModVal = {modVal+1}")

# Keys become rows after the transpose; the three list entries become the named columns.
df = DataFrame(mediaData).T
df.columns = ["ArtistID", "Name", "Ref"]
# NOTE(review): knownMedia is not defined in any cell visible here — confirm where it is created.
knownMedia.save(data=df)

## Download Data

In [None]:
# Use the MusicDBIO / RawAPIData classes imported at the top of the notebook: no `bandcamp`
# name is imported anywhere above, so the qualified form raised NameError on a fresh kernel.
# NOTE(review): if a different source module was intended here, import it explicitly instead.
dbio = MusicDBIO(verbose=False,local=True,mkDirs=False)
apiio = RawAPIData(debug=False)