In [None]:
# Load IPython's autoreload extension.
# NOTE(review): no `%autoreload` mode is selected in this cell — confirm whether `%autoreload 2` is intended.
%load_ext autoreload
from IPython.display import display, HTML
# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
from dbmaster import MasterParams, MasterPersist
from dbbase import MusicDBIDModVal, MusicDBDir, MusicDBData
from dbnote import DownloadRecord, MergeSearchArtist, ConcatRawData
from utils import FileIO, DirInfo, FileInfo, getFlatList, Timestat, TermTime, TermTimeTS, getTT
from pandas import Series, DataFrame, concat, Timestamp
from pandb import PanDBIO
from musicdb.albumoftheyear import MusicDBParams, RawWebData, MusicDBIO
from os import getpid

# Notebook-wide helper instances, all constructed with default arguments.
# `dbio` and `webio` are re-created with explicit options in later cells.
mv = MusicDBIDModVal()
io = FileIO()
mpar = MusicDBParams()
dbio = MusicDBIO()
webio = RawWebData()
# Database identifier read from the parameters object; used in record names and log messages.
db = mpar.db

In [None]:
# One bookkeeping record per stage: artist search, artist download, album download.
# Only the search record is created with a "Data" record type in addition to "Index".
searchArtistRecord = DownloadRecord(db=db, name="SearchArtist", rTypes=["Index", "Data"])
downloadArtistRecord = DownloadRecord(db=db, name="DownloadArtist", rTypes=["Index"])
downloadAlbumRecord = DownloadRecord(db=db, name="DownloadAlbum", rTypes=["Index"])
# Stored as a callable (no parentheses): it is invoked later as knownArtists().
knownArtists = dbio.rdio.getSummaryNameData

In [None]:
# Disabled one-off migration: copies legacy "SearchedFor*" pickle files to the
# "<db>-<record>-<type>" names. Flip the guard to run it.
if False:
    from utils import FileInfo
    import shutil
    db = dbio.params.db
    dbdir = dbio.params.dir
    # Legacy record name -> current record name.
    nameMap = {
        "MasterArtists": "SearchArtist",
        "LocalArtists": "DownloadArtist",
        "LocalAlbums": "DownloadAlbum",
    }
    dinfo = DirInfo(f"/Users/tgadfort/anaconda3/envs/post/pandb/musicdb/{db}")

    # Legacy file stem -> new file stem, three entries (Index/Data/Error) per record.
    cpMap = {}
    for origname, newname in nameMap.items():
        cpMap.update({
            f"{dbdir}SearchedFor{origname}": f"{db}-{newname}-Index",
            f"{dbdir}SearchedFor{origname}Data": f"{db}-{newname}-Data",
            f"{dbdir}SearchedFor{origname}Errors": f"{db}-{newname}-Error",
        })
    assert dinfo.exists(), f"{dinfo} does not exist!"
    for srcStem, dstStem in cpMap.items():
        srcFile = dinfo.join(f"{srcStem}.p")
        dstFile = dinfo.join(f"{dstStem}.p")
        # Only copy legacy files that are actually present.
        if srcFile.exists():
            srcFile.cpFile(dstFile)

In [None]:
##########################################################################################
# Show Summary
##########################################################################################
# Header with the database name and this kernel's PID, followed by each record's own
# info() report and the row count of the known-artist summary.
# The f-string is already fully interpolated, so no further .format() call is applied
# (a second formatting pass would fail on any literal brace in the text).
print(f"{db} Search Results (PID={getpid()})")
searchArtistRecord.info()
downloadArtistRecord.info()
downloadAlbumRecord.info()
print(f"  {'KnownArtist Data': <20}: {knownArtists().shape[0]}")

# Search For New Artists

In [None]:
######################################################################################################
# Explode Recent Data
######################################################################################################
def getNewData(searchArtistRecord):
    """Flatten the record's stored search results into one DataFrame.

    Each value returned by ``searchArtistRecord.getData()`` is turned into a
    Series and the pieces are stacked. The former Series index becomes the
    ``Ref`` column and the values become the ``Name`` column. The frame is
    indexed by ``dbio.getdbid(Ref)`` with an empty index name.

    Returns an empty DataFrame when the record holds no data.
    """
    rawResults = searchArtistRecord.getData()
    if len(rawResults) == 0:
        return DataFrame()

    pieces = []
    for result in rawResults.values():
        pieces.append(Series(result))

    exploded = concat(pieces).reset_index()
    exploded = exploded.rename(columns={"index": "Ref", 0: "Name"})
    exploded.index = exploded["Ref"].map(dbio.getdbid)
    exploded.index.name = ""
    return exploded

def mergeNewData(searchArtistRecord):
    """Reload the record quietly, explode its data and hand both to MergeSearchArtist.mergeLocal.

    The merge is invoked with ``test=False``.
    """
    searchArtistRecord.load(verbose=False)
    explodedData = getNewData(searchArtistRecord)
    merger = MergeSearchArtist(db)
    merger.mergeLocal(searchArtistRecord, explodedData, test=False)

In [None]:
# Re-create the I/O helpers for a quiet, local run that does not create directories.
dbio = MusicDBIO(verbose=False, local=True, mkDirs=False)
webio = RawWebData(debug=False)

# Candidate names come from PanDBIO; fold already-fetched results in before filtering.
knownNames = PanDBIO().getUniqueArtistNames()
mergeNewData(searchArtistRecord)
searchedNames = Series(searchArtistRecord.getIndex())

# Keep only the names the search record does not know yet.
alreadySearched = knownNames.map(searchArtistRecord.isKnown)
artistNamesToGet = knownNames[~alreadySearched]

numAvailable = knownNames.shape[0]
numSearched = searchedNames.shape[0]
numToGet = artistNamesToGet.shape[0]
print(f"# {db} Search Results")
print(f"#   Available Names:     {numAvailable}")
print(f"#   Known Artist Names:  {numSearched}")
print(f"#   Artist Names To Get: {numToGet}")

# Drop the intermediates; only artistNamesToGet is needed by the search loop.
del searchedNames, knownNames, alreadySearched

In [None]:
# Timing helper for this run, labelled with the database name.
ts = Timestat(f"Getting {db} ArtistIDs")
# Run-window helper; the search loop polls tt.isFinished() to decide when to stop.
tt = getTT(skipEOD=False, vacation=True)
# Abort early unless the I/O layer reports local mode.
assert dbio.rdio.isLocal, f"MusicDBIO is not set for local downloads!"

def isError(artistName, nErrors, sleeptime, error=None):
    """Record a failed artist search and pause before the next request.

    Marks `artistName` as an error in the notebook-level `searchArtistRecord`,
    prints the failure, appends the name to `nErrors` (mutated in place) and
    then calls `webio.sleep(sleeptime)`.

    Args:
        artistName: Name that was being searched for.
        nErrors: List collecting failed names; the caller uses its length as an error counter.
        sleeptime: Value forwarded to `webio.sleep` (presumably seconds — confirm).
        error: Exception or short tag describing the failure; used only in the printed message.
    """
    searchArtistRecord.setError(index=artistName)    
    print(f"Search Error ==> {artistName}: {error}")
    nErrors.append(artistName)
    webio.sleep(sleeptime)
    

# Search loop: one web search per not-yet-known artist name, with throttling,
# periodic checkpoints and three stop conditions (error streak, time window, maxN).
n = 0                # successful searches in this run
maxN = 2500000       # hard cap on successful searches
nErrors = []         # names that failed since the last success
searchArtistRecord.load(verbose=False)
for i, (idx, artistName) in enumerate(artistNamesToGet.items()):
    # Skip names the record already knows.
    if searchArtistRecord.isKnown(artistName):
        continue

    # nErrors is reset on every success, so its length is the current failure streak.
    if len(nErrors) >= 5:
        print("Stopping due to 5 consecutive errors")
        break

    try:
        response = webio.getArtistSearchData(artistName=artistName)
    except Exception as error:
        isError(artistName, nErrors, 10, error)
        continue

    # Anything other than a dict is treated as a failed search.
    if not isinstance(response, dict):
        isError(artistName, nErrors, 5.0, "NotDict")
        continue

    # Success: rebind to a fresh list to reset the streak, store the result, throttle.
    nErrors = []
    searchArtistRecord.setData(index=artistName, data=response)
    webio.sleep(5.0)
    n += 1
        
    # Every 10 successes: report progress, checkpoint the record, wait, and check the run window.
    if n % 10 == 0:
        ts.update(n=n)
        searchArtistRecord.save()
        webio.wait(10.0)
        if tt.isFinished():
            break
    
    if n >= maxN:
        print("Breaking after {0} downloads...".format(maxN))
        break

# Final checkpoint so results gathered since the last multiple of 10 are saved too.
ts.stop()
searchArtistRecord.save()

In [None]:
# Stand-alone save of the search record (presumably for use after interrupting the search loop — confirm).
searchArtistRecord.save()

In [None]:
msr = MergeSearchArtist(db)
# Global search-artist data, fetched here only to report its shape before the copy.
tmp = msr.getGlobalSearchArtistData()
print(f"Old Global Shape: {tmp.shape}")
# NOTE(review): presumably promotes the local search-artist data into the global store — confirm in MergeSearchArtist.
msr.copyFromLocal()

In [None]:
# Fetch both views of the search-artist data for the comparison in the next cell.
# Relies on `msr` created in the previous cell.
dataLocal = msr.getLocalSearchArtistData()
dataGlobal = msr.getGlobalSearchArtistData()

In [None]:
# Stack the global rows first, then the local rows.
data = concat([dataGlobal, dataLocal])
# Keep only rows whose index value already appeared earlier in the stacked frame
# (Index.duplicated() flags every occurrence after the first).
data = data[data.index.duplicated()]
# Cell output: (rows, columns) of the repeated entries.
data.shape

# Download Artist Data

In [None]:
# Re-create the I/O helpers for a quiet, local run (directories are created here).
dbio = MusicDBIO(verbose=False, local=True, mkDirs=True)
webio = RawWebData(debug=False)

# Start from the global search results and tag every artist ID with its mod value.
msa = MergeSearchArtist(dbio.db)
knownNames = msa.getGlobalSearchArtistData()
knownNames['ModVal'] = knownNames.index.map(mv.getModVal)

# Remove artists that the download record already lists.
# NOTE(review): this compares against the *index* of Series(getIndex()) — assumes getIndex()
# returns a mapping keyed by artist ID; confirm.
downloadArtistRecord.load()
downloadedNames = Series(downloadArtistRecord.getIndex())
isDownloaded = knownNames.index.isin(downloadedNames.index)
availableNames = knownNames[~isDownloaded]

# One DataFrame of pending artists per mod value.
artistNamesToGet = Series({groupKey: groupDF for groupKey, groupDF in availableNames.groupby("ModVal")})

numKnown = knownNames.shape[0]
numDownloaded = downloadedNames.shape[0]
numAvailable = availableNames.shape[0]
print(f"# {db} Search Results")
print(f"#   Available Names:     {numKnown}")
print(f"#   Known Artist Names:  {numDownloaded}")
print(f"#   Artist Names To Get: {numAvailable}")

# Drop the intermediates; only artistNamesToGet is needed by the download loop.
del availableNames, downloadedNames, knownNames, isDownloaded

In [None]:
# Timing helper for this run, labelled with the database name.
ts = Timestat(f"Getting {db} ArtistIDs")
# Run-window helper; the download loop polls tt.isFinished() to decide when to stop.
tt = getTT(skipEOD=False, vacation=True)
# Abort early unless the I/O layer reports local mode.
assert dbio.rdio.isLocal, f"MusicDBIO is not set for local downloads!"

def isError(artistID, artistName, nErrors, sleeptime, error=None):
    """Record a failed artist download and pause before the next request.

    NOTE: this redefines the `isError` helper from the search section with a
    different signature (it takes `artistID` first).

    Marks `artistID` as an error in the notebook-level `downloadArtistRecord`,
    prints the failure using `artistName`, appends the ID to `nErrors`
    (mutated in place) and then calls `webio.sleep(sleeptime)`.

    Args:
        artistID: ID of the artist whose download failed.
        artistName: Name shown in the printed message.
        nErrors: List collecting failed IDs; the caller uses its length as an error counter.
        sleeptime: Value forwarded to `webio.sleep` (presumably seconds — confirm).
        error: Exception or short tag describing the failure; used only in the printed message.
    """
    downloadArtistRecord.setError(index=artistID)
    print(f"Download Error ==> {artistName}: {error}")
    nErrors.append(artistID)
    webio.sleep(sleeptime)
    
# Download loop: walks the pending artists one mod-value group at a time, with throttling,
# periodic checkpoints and three stop conditions (error streak, time window, maxN).
n = 0                # successful downloads in this run
maxN = 2500000       # hard cap on successful downloads
stop = False         # set by the inner loop to also leave the outer loop
nErrors = []         # artist IDs that failed since the last success
for modVal,modValData in artistNamesToGet.items():
    #modVal = groupModVal[0]
    if stop is True:
        break
    N = modValData.shape[0]
    for i,(artistID,row) in enumerate(modValData.iterrows()):
        # Skip artists the download record already knows.
        if downloadArtistRecord.isKnown(artistID):
            continue
    
        # nErrors is reset on every success, so its length is the current failure streak.
        if len(nErrors) >= 5:
            print("Stopping due to 5 consecutive errors")
            stop = True
            break
            
        artistName  = row["Name"]
        artistRef   = row["Ref"]
    
        # Progress prefix (mod value | row | group size | successes); end="" keeps later prints on the same line.
        print(f"{modVal: <8} | {i: <8} | {N: <8} | {n: <8} | ", end="")
        try:
            response = webio.getArtistData(artistID=artistID, artistName=artistName, artistRef=artistRef)
        except Exception as error:
            isError(artistID, artistName, nErrors, 5, error)
            continue
    
        # Anything other than bytes is treated as a failed download.
        if not isinstance(response, bytes):
            isError(artistID, artistName, nErrors, 3.5, "NotBytes")
            continue

        
        # Success: rebind to a fresh list to reset the streak, record the ID, store the payload, throttle.
        nErrors = []
        downloadArtistRecord.setIndex(index=artistID)
        dbio.rdio.saveData("RawArtist", modVal, artistID, data=response)
        webio.sleep(2.0)
        n += 1
            
        # Every 25 successes: report progress, checkpoint the record, wait, and check the run window.
        if n % 25 == 0:
            ts.update(n=n)
            downloadArtistRecord.save()
            webio.wait(15.0)
            if tt.isFinished():
                stop = True
                break
        
        if n >= maxN:
            print(f"Breaking after {maxN} downloads...")
            stop = True
            break

# Final checkpoint so downloads since the last multiple of 25 are recorded too.
ts.stop()
downloadArtistRecord.save()

In [None]:
# Stand-alone save of the artist download record (presumably for use after interrupting the download loop — confirm).
downloadArtistRecord.save()

In [None]:
#crd = ConcatRawData(db=dbio.db, dType="Artist")
#crd.concat()
#crd.merge(test=False)
#crd.remove(force=True)

# Download Album Data

## Create Media Data

In [None]:
# Build one table of media entries (code -> [artistID, album, url]) across mod values 0..99
# and save it through `knownMedia`.
# NOTE(review): `mio` is only assigned in a later cell and `knownMedia` is not defined anywhere
# in the visible notebook — this cell appears to rely on prior kernel state; confirm.
mediaData = {}
for modVal in range(100):
    modValData = mio.data.getModValData(modVal)
    modValMediaData = {}
    # NOTE(review): `.iteritems()` was removed from pandas objects in pandas 2.0; if modValData
    # is a pandas Series this needs `.items()` — confirm the type.
    for artistID,artistIDData in modValData.iteritems():
        for mediaType,mediaTypeData in artistIDData.media.media.items():
            modValMediaData.update({code: [artistID,media.album,media.url] for code,media in mediaTypeData.items()})
    mediaData.update(modValMediaData)
    # Progress message after every 10 mod values.
    if (modVal+1) % 10 == 0:
        print(f"ModVal = {modVal+1}")
        
# Codes become the row index after the transpose; the three list slots become named columns.
df = DataFrame(mediaData).T
df.columns = ["ArtistID", "Name", "Ref"]
knownMedia.save(data=df)

## Download Data

In [None]:
# I/O helpers for the album download section.
# NOTE(review): the name `albumoftheyear` is not imported in the visible notebook (only
# MusicDBParams, RawWebData and MusicDBIO are imported from musicdb.albumoftheyear), so this
# cell presumably depends on an import that is missing here — confirm.
mio   = albumoftheyear.MusicDBIO(verbose=False,local=True,mkDirs=False)
webio = albumoftheyear.RawWebData(debug=False)

In [None]:
useArtist = False
numMaster = 250   # maximum number of albums kept per artist

# NOTE(review): `knownMedia` and `localAlbums` are not defined in the visible notebook — confirm
# where they come from before running on a fresh kernel.
knownAlbumsData = knownMedia.get()
knownAlbumsData['IndexModVal'] = knownAlbumsData.index.map(mio.getModVal)

# Group by the column *name* (a scalar), not a one-element list: with a list key, newer pandas
# yields 1-tuples as group keys, which would later break the `{groupModVal: <8}` progress format.
# This also matches the scalar groupby("ModVal") used for the artist download.
availableNames  = concat([artistIDDF.head(numMaster) for artistID,artistIDDF in knownAlbumsData.groupby("ArtistID")])

# Drop albums that were already downloaded.
localAlbumsDict = localAlbums.get()
availableNames  = availableNames[~availableNames.index.isin(localAlbumsDict.keys())]

# One DataFrame of pending albums per mod value.
albumNamesToGet = Series({modVal: modValDF for modVal,modValDF in availableNames.groupby("IndexModVal")})

print(f"# {db} Album Search Results")
print(f"#   Available Album IDs:  {knownAlbumsData.shape[0]}")
print(f"#   Known Album IDs:      {len(localAlbumsDict)}")
print(f"#   Albums To Download:   {availableNames.shape[0]}")

# Drop the intermediates; only albumNamesToGet is needed by the download loop.
del availableNames
del localAlbumsDict
del knownAlbumsData

#   Albums To Download:   23457
#   Albums To Download:   20437
#   Albums To Download:   12137
#   Albums To Download:   4132

In [None]:
def saveAlbumData(db, localAlbumsDict, searchedForErrors):
    """Persist the album bookkeeping dicts, framed by banner lines.

    Saves `localAlbumsDict` through the notebook-level `localAlbums` object and
    `searchedForErrors` through the notebook-level `errors` object, printing
    the size of each before it is saved.

    Args:
        db: Database name, used only in the printed messages.
        localAlbumsDict: Mapping of downloaded album IDs.
        searchedForErrors: Mapping of album IDs whose download failed.
    """
    banner = "=" * 150
    print(banner)
    print(f"Saving {len(localAlbumsDict)} {db} Albums Data")
    localAlbums.save(data=localAlbumsDict)
    print(f"Saving {len(searchedForErrors)} {db} Searched For Errors")
    errors.save(data=searchedForErrors)
    print(banner)

In [None]:
# Album download loop: walks the pending albums one mod-value group at a time, with throttling,
# periodic checkpoints and three stop conditions (error streak, time window, maxN).
ts = Timestat("Getting {0} AlbumIDs".format(db))
tt = getTT(skipEOD=False)

n    = 0             # successful downloads in this run
maxN = 25000000      # hard cap on successful downloads
# NOTE(review): `localAlbums` and `errors` are not defined in the visible notebook — confirm.
localAlbumsDict     = localAlbums.get()
searchedForErrors   = errors.get()
stop = False         # set by the inner loop to also leave the outer loop
nErrors = []         # album IDs that failed since the last success
# Series.items() is used (as in the artist loops); Series.iteritems() no longer exists in pandas 2.
for groupModVal,modValData in albumNamesToGet.items():
    if stop is True:
        break
    for j,(albumID,row) in enumerate(modValData.iterrows()):
        # After 5 failures in a row: print a ready-to-paste cleanup line for every failed ID,
        # then leave both loops. (The stop/break must sit outside the print loop, otherwise
        # only the print loop is exited and downloading continues.)
        if len(nErrors) >= 5:
            for errorID in nErrors:
                print(f"del searchedForErrors['{errorID}']")
            stop = True
            break

        # Skip albums that were already downloaded or already recorded as errors.
        if any([dct.get(albumID) is not None for dct in [localAlbumsDict, searchedForErrors]]):
            continue

        albumName  = row["Name"]
        albumRef   = row["Ref"]

        # Progress prefix (mod value | row | successes); end="" keeps later prints on the same line.
        print(f"{groupModVal: <8} |{j: <8} | {n: <8} | ", end="")
        try:
            response = webio.getAlbumData(albumName=albumName, albumRef=albumRef)
        except Exception:
            # Catch Exception only, so Ctrl-C / kernel interrupt still stops the run.
            print("Error ==> {0}".format(albumName))
            searchedForErrors[albumID] = True
            nErrors.append(albumID)
            webio.sleep(10)
            continue

        # Anything other than bytes is treated as a failed download.
        if not isinstance(response,bytes):
            print("Error ==> {0}".format(albumName))
            searchedForErrors[albumID] = True
            nErrors.append(albumID)
            webio.sleep(3.5)
            continue

        # Success: reset the streak, store the payload, mark the album as downloaded, throttle.
        nErrors = []
        modVal = mio.mv.get(albumID)
        mio.data.saveRawArtistAlbumData(data=response, modval=modVal, dbID=albumID)
        localAlbumsDict[albumID] = True
        webio.sleep(4.5)
        n += 1

        # Check the run window every 5 successes.
        if n % 5 == 0:
            if tt.isFinished():
                stop = True
                break

        # Extra pause every 50 successes.
        if n % 50 == 0:
            webio.sleep(5)

        # Every 100 successes: checkpoint both dicts, re-check the run window, then wait.
        if n % 100 == 0:
            saveAlbumData(db, localAlbumsDict, searchedForErrors)
            if tt.isFinished():
                stop = True
                break
            webio.wait(10.0)

        if n >= maxN:
            print("Breaking after {0} downloads...".format(maxN))
            stop = True
            break

# Final checkpoint so downloads since the last multiple of 100 are saved too.
ts.stop()
saveAlbumData(db, localAlbumsDict, searchedForErrors)

In [None]:
from os import getpid
# Cell output: the process ID of this kernel.
getpid()

# Download Lists

## Download Starter

In [None]:
from apiutils import WebIO
from ioutils import FileIO, HTMLIO
# Helper instances for the list-download section.
# Rebinds the notebook-level `io`: FileIO now refers to the class imported from ioutils in this
# cell rather than the one imported from utils at the top of the notebook.
io  = FileIO()
hio = HTMLIO()
wio = WebIO()

In [None]:
# Entry-point pages of the site, keyed by a short label.
starter = {
    "List":     "https://www.albumoftheyear.org/lists.php",
    "Genre":    "https://www.albumoftheyear.org/genre.php",
    #starterBestAlbum = "https://www.albumoftheyear.org/ratings/6-highest-rated/2023/1"
    "Rating":   "https://www.albumoftheyear.org/ratings",
    "Discover": "https://www.albumoftheyear.org/discover",
    "Releases": "https://www.albumoftheyear.org/releases",
    "MustHear": "https://www.albumoftheyear.org/must-hear",
}

# Fetch only the pages that are not already in the saved starter file.
savename = "../../sandbox/AOTYstarter.p"
starterData = io.get(savename)
for key, url in starter.items():
    if starterData.get(key) is None:
        retval = wio.get(url)
        # Keep the page only on HTTP 200; pause after every request either way.
        if retval.code == 200:
            print(key)
            starterData[key] = retval.data
        wio.sleep(3)

print(f"Saving data to {savename}")
io.save(idata=starterData, ifile=savename)

In [None]:
useStarter = False
useSite    = True
showUnmatched = False   # set True to print hrefs that match none of the rules below

# Choose which saved pages to scan: the starter pages, or both site-data files merged
# (entries from the second file win on duplicate keys).
if useStarter is True:
    aotyData = io.get("../../sandbox/AOTYstarter.p")
elif useSite is True:
    aotyData = io.get("../../sandbox/AOTYsiteData.p") | io.get("../../sandbox/AOTYsiteData2.p")
else:
    aotyData = {}

# Ordered classification rules: (bucket, match kind, token). The FIRST matching rule wins, so
# the order is significant (e.g. "/list/" prefixes are claimed before the "lists.php" check).
refRules = [
    ("List",     "prefix",   "/list/"),
    ("Lists",    "contains", "lists.php"),
    ("Rating",   "prefix",   "/ratings/"),
    ("Artist",   "prefix",   "/artist/"),
    ("Album",    "prefix",   "/album/"),
    ("Discover", "prefix",   "/discover/"),
    ("Genre",    "prefix",   "/genre/"),
    ("MustHear", "prefix",   "/must-hear/"),
    ("Release",  "contains", "/releases/"),
    ("Spotify",  "contains", "spotify.com"),
    ("Apple",    "contains", "apple.com"),
    ("Amazon",   "contains", "amazon.com"),
    ("User",     "contains", "/user/"),
]

N  = len(aotyData)
ts = Timestat(f"Sorting {N} Site Refs")
# bucket -> {href: stripped link text}; later occurrences of an href overwrite earlier ones.
refsData = {"List": {}, "Lists": {}, "Rating": {}, "Album": {}, "Artist": {}, "Discover": {}, "Genre": {}, "MustHear": {}, "Release": {}, "Spotify": {}, "Apple": {}, "Amazon": {}, "User": {}}
for n,(key,keyData) in enumerate(aotyData.items()):
    # Progress after the first 100 pages and then every 500.
    if (n+1) % 500 == 0 or (n+1) == 100:
        ts.update(n=n+1, N=N)

    bsdata = hio.get(keyData)
    refs = bsdata.findAll("a")
    for ref in refs:
        href = ref.get('href')
        # Anchors without a string href cannot be classified.
        if not isinstance(href,str):
            continue
        for bucket, how, token in refRules:
            isMatch = href.startswith(token) if how == "prefix" else (token in href)
            if isMatch:
                refsData[bucket][href] = ref.text.strip()
                break
        else:
            # No rule matched: ignored by default, optionally printed for inspection.
            if showUnmatched:
                print(href,'\t|\t',ref.text)

ts.stop()

In [None]:
# Report how many refs each bucket holds, then save the whole refsData mapping.
# Note: this rebinds the notebook-level `savename`.
savename = "../../sandbox/AOTYsiteRefs.p"
print(f"Saving data to {savename}")
for key,keyData in refsData.items():
    print(f"  {key: <20}{len(keyData)}")
io.save(idata=refsData, ifile=savename)

In [None]:
# Five independent target dicts, each mapping ref -> link text.
siteData, artistData, albumData, userData, otherData = {}, {}, {}, {}, {}

# Prefix checks shared by both branches, in priority order.
prefixTargets = (
    ("/artist/", artistData),
    ("/album/", albumData),
    ("/user/", userData),
)

if False:
    # Disabled variant: only the "List" bucket, and every other site-relative ref counts as a site ref.
    for ref, name in refsData["List"].items():
        for prefix, target in prefixTargets:
            if ref.startswith(prefix):
                target[ref] = name
                break
        else:
            if ref.startswith("/"):
                siteData[ref] = name
            else:
                otherData[ref] = name
else:
    # Active variant: all buckets; a ref only counts as a site ref when it is
    # site-relative and contains "/list/".
    for key, keyData in refsData.items():
        for ref, name in keyData.items():
            for prefix, target in prefixTargets:
                if ref.startswith(prefix):
                    target[ref] = name
                    break
            else:
                if ref.startswith("/") and "/list/" in ref:
                    siteData[ref] = name
                else:
                    otherData[ref] = name

print(f"Found {len(artistData)} Artist Refs")
print(f"Found {len(albumData)} Album Refs")
print(f"Found {len(userData)} User Refs")
print(f"Found {len(siteData)} Site Refs")
print(f"Found {len(otherData)} Other Refs")

In [None]:
# Add the per-year list pages for 1970 through 2004 to the site refs to download.
for year in range(1970, 2005):
    siteData[f'/lists.php?y={year}'] = 'View More'

In [None]:
# Download every site ref that is in neither of the two saved download files.
# New pages are added to the second file only; the first file is read but never written here.
savename1 = "../../sandbox/AOTYsiteData.p"
siteDataDownloads1 = io.get(savename1)
print(f"Found {len(siteDataDownloads1)} Previous Downloads")
savename2 = "../../sandbox/AOTYsiteData2.p"
siteDataDownloads2 = io.get(savename2)
print(f"Found {len(siteDataDownloads2)} Previous Downloads")
N = len(siteData)
ts = Timestat(f"Downloading {N} Site Refs")
for n,(ref,name) in enumerate(siteData.items()):
    url=f"https://www.albumoftheyear.org{ref}"
    # Skip refs already present in either download file.
    if any([dct.get(ref) is not None for dct in [siteDataDownloads1,siteDataDownloads2]]):
        continue
    
    retval = wio.get(url)
    # Keep the page only on HTTP 200; pause after every request either way.
    if retval.code == 200:
        print(f"{n: <6} | {N: <6} | {ref}")
        siteDataDownloads2[ref] = retval.data
    wio.sleep(3)
    
    # Checkpoint keyed on the loop position n (not on the number of downloads), and only
    # reached for refs that were actually requested, because skipped refs `continue` above.
    if (n+1) % 25 == 0:
        ts.update(n=n+1,N=N)
        print(f"Saving {len(siteDataDownloads2)} data to {savename2}")
        io.save(idata=siteDataDownloads2, ifile=savename2)

ts.stop()
        
# Final save so pages fetched since the last checkpoint are kept.
print(f"Saving data to {savename2}")
io.save(idata=siteDataDownloads2, ifile=savename2)

In [None]:
# Load the first site-data download file on its own.
siteDataDownloads = io.get("../../sandbox/AOTYsiteData.p")

In [None]:
# NOTE(review): `savename` is not set in this cell. Going by the visible cells it was last
# assigned "../../sandbox/AOTYsiteRefs.p", while siteDataDownloads was loaded from
# AOTYsiteData.p — confirm the intended target first, since this presumably overwrites that file.
io.save(idata=siteDataDownloads, ifile=savename)

## Check For New Data

In [None]:
from lib.albumoftheyear import MusicDBID
# ID helper, reused by the following cells to map refs to album and artist IDs.
mid = MusicDBID()
# Spot check on a sample album ref that carries a sub-path and query string; the result is the cell output.
mid.getAlbumID('/album/515536-beyonce-renaissance/critic-lists/?f=all&y=2022')

In [None]:
# Album refs as a frame: one row per ref, with the link text in "List" and a derived "AlbumID".
albumRefs = Series(refsData["Album"])
df = DataFrame(albumRefs).reset_index()
df = df.rename(columns={"index": "Ref", 0: "List"})
df["AlbumID"] = df["Ref"].map(mid.getAlbumID)
# Keep the first row for every AlbumID.
isRepeatAlbum = df["AlbumID"].duplicated()
df = df[~isRepeatAlbum]

In [None]:
# Artist refs as a frame: one row per ref, with the link text in "Name" and a derived "ArtistID".
# Note: this rebinds `df`, replacing the album frame built in the previous cell.
artistRefs = Series(refsData["Artist"])
df = DataFrame(artistRefs).reset_index()
df = df.rename(columns={"index": "Ref", 0: "Name"})
df["ArtistID"] = df["Ref"].map(mid.getArtistID)
# Keep the first row for every ArtistID.
isRepeatArtist = df["ArtistID"].duplicated()
df = df[~isRepeatArtist]

In [None]:
# NOTE(review): `searchArtists` is not defined or imported anywhere in the visible notebook —
# this cell appears to rely on prior kernel state; confirm where it comes from.
artistNames = searchArtists()

In [None]:
# Index the new artist refs by ArtistID in a separate frame instead of mutating `df`.
# This keeps the cell re-runnable: dropping the "ArtistID" column from `df` in place would
# make a second run fail with a KeyError on df["ArtistID"].
newArtistNames = df.set_index("ArtistID")
# Existing names come first, so they win when an ArtistID appears in both frames.
artistNames = concat([artistNames, newArtistNames])
artistNames = artistNames[~artistNames.index.duplicated()]

In [None]:
# Persist the merged artist-name table through the `mio` data layer.
mio.data.saveSearchArtistData(data=artistNames)

# Backup

In [None]:
from utils import StoreData, backup
from numpy import array_split
sd = StoreData("AlbumOfTheYear", "Artist")
# Merge local data in two batches: range(100) split into two halves of mod values.
for modVals in array_split(range(100), 2):
    sd.mergeLocalData(modVals=modVals)
# Then run the global merge once.
sd.mergeGlobalData()

In [None]:
from time import sleep
# Wait 200 seconds, then run the global merge again (relies on `sd` from the previous cell).
sleep(200)
sd.mergeGlobalData()

In [None]:
# Move per-artist files from the external volume to the local music directory, one mod-value
# folder at a time.
# NOTE(review): only mod values 0..66 are covered here, while other cells iterate range(100) —
# confirm whether the remaining folders were handled separately.
# NOTE(review): both paths are hard-coded, machine-specific absolute paths.
for modVal in range(67):
    srcDir = DirInfo(f"/Volumes/Piggy/Discog/artists-albumoftheyear/{modVal}/artists")
    files  = [FileInfo(ifile) for ifile in srcDir.getFiles()]
    # Keep only files whose basename consists solely of digits.
    files  = [finfo for finfo in files if finfo.basename.isdigit()]
    dstDir = DirInfo(f"/Users/tgadfort/Music/Discog/artists-albumoftheyear/{modVal}/artists")
    for srcFile in files:
        dstFile = dstDir.join(srcFile.name)
        srcFile.mvFile(dstFile)