In [None]:
%load_ext autoreload
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
from dbmaster import MasterParams, MasterPersist
from dbbase import MusicDBIDModVal, MusicDBDir, MusicDBData, getModVals
from dbnote import DownloadRecord, MergeSearchArtist, KnownRecord, ConcatRawData
from utils import FileIO, DirInfo, FileInfo, getFlatList, Timestat, TermTime, TermTimeTS, getTT, getHTML, flattenLists
from pandas import Series, DataFrame, concat, Timestamp
from pandb import PanDBIO
from musicdb.petrucci import MusicDBParams, RawWebData, MusicDBIO
from os import getpid
from urllib.parse import quote

# Notebook-wide helper objects; the cells below read these names directly.
mv = MusicDBIDModVal()  # its getModVal is mapped over record indexes in the download cells
io = FileIO()
mpar = MusicDBParams()
dbio = MusicDBIO()  # NOTE(review): rebound with local=True in the download cells
webio = RawWebData()  # NOTE(review): rebound to WebIO() in the "IMSLP & Category Data" cell
db = mpar.db  # passed to every record constructor and printed in status messages

In [None]:
# Download bookkeeping: which artist / work ids have already been fetched.
downloadArtistRecord = DownloadRecord(db=db, name="DownloadArtist", rTypes=["Index"])
downloadWorkRecord = DownloadRecord(db=db, name="DownloadWork", rTypes=["Index"])
# One KnownRecord per kind of reference data stored by the cells below.
knownGenreRecord = KnownRecord(db=db, name="Genre")
knownIMSLPRecord = KnownRecord(db=db, name="IMSLP")
knownCategoryRecord = KnownRecord(db=db, name="Category")
knownComposerRecord = KnownRecord(db=db, name="Composer")
knownPeopleRecord = KnownRecord(db=db, name="People")
knownPerformerRecord = KnownRecord(db=db, name="Performer")
knownGenreTypeRecord = KnownRecord(db=db, name="GenreType")
# Bound method, deliberately not called here; call knownArtists() to get the data.
knownArtists = dbio.rdio.getSummaryNameData

In [None]:
if False:
    # Disabled one-off migration: renames the legacy "SearchedFor*" pickle files
    # to the "<db>-<name>-<kind>" naming scheme. Flip the guard to run it once.
    import shutil
    from utils import FileInfo
    db = dbio.params.db
    dbdir = dbio.params.dir
    # Legacy record name -> new record name.
    nameMap = {
        "MasterArtists": "SearchArtist",
        "LocalArtists": "DownloadArtist",
        "LocalAlbums": "DownloadAlbum",
        "LocalArtistsTracks": "DownloadArtistTrack",
        "LocalArtistsReleases": "DownloadArtistRelease",
    }
    dinfo = DirInfo(f"/Users/tgadfort/anaconda3/envs/post/pandb/musicdb/{db}")

    # Legacy file suffix -> new file kind, in the order the files are moved.
    suffixMap = (("", "Index"), ("Data", "Data"), ("Errors", "Error"))
    cpMap = {
        f"{dbdir}SearchedFor{origname}{srcSuffix}": f"{db}-{newname}-{dstSuffix}"
        for origname, newname in nameMap.items()
        for srcSuffix, dstSuffix in suffixMap
    }
    assert dinfo.exists(), f"{dinfo} does not exist!"
    for srcName, dstName in cpMap.items():
        srcFile = dinfo.join(f"{srcName}.p")
        dstFile = dinfo.join(f"{dstName}.p")
        # Only files that actually exist are moved; missing ones are skipped silently.
        if srcFile.exists():
            srcFile.mvFile(dstFile)

In [None]:
##########################################################################################
# Show Summary
##########################################################################################
# The f-string is already fully rendered; chaining .format() on it would re-parse the
# result and raise if the text ever contained a literal brace.
print(f"{db} Search Results (PID={getpid()})")
downloadArtistRecord.info()
downloadWorkRecord.info()
knownIMSLPRecord.info()
knownCategoryRecord.info()
knownGenreRecord.info()
knownComposerRecord.info()
knownPeopleRecord.info()
knownPerformerRecord.info()
knownGenreTypeRecord.info()
#print(f"  {'KnownArtist Data': <20}: {knownArtists().shape[0]}")

# Starter

## IMSLP & Category Data

In [None]:
from time import sleep

from bs4.element import Tag
from utils import HTMLIO, WebIO

# NOTE(review): this rebinds the notebook-level `webio` (a RawWebData) to a WebIO instance.
webio = WebIO()
hio = HTMLIO()
# Defined here so the cell runs on a fresh kernel (it was previously only set further down).
baseURL = dbio.params.baseURL
retval = webio.get(baseURL)
bsdata = hio.get(retval.data)

# The site navigation <select> lists every top-level wiki page as an <option value="/wiki/...">.
select = bsdata.find("select", {"id": "subnav-select"})
options = select.findAll("option") if isinstance(select, Tag) else []

optionData   = {option.get('value'): option.text.strip() for option in options if isinstance(option.get('value'),str)}
categoryData = {key: val for key,val in optionData.items() if key.startswith("/wiki/Category")}
imslpData    = {key: val for key,val in optionData.items() if key.startswith("/wiki/IMSLP")}


def fetchWikiPages(refs, pause=3):
    """Download every wiki page in `refs` and return {ref: page data}.

    refs:  iterable of site-relative references (appended to baseURL).
    pause: seconds to sleep after each request, successful or not.
    Only responses with code 200 are kept; failures are reported and skipped.
    """
    pages = {}
    for ref in refs:
        url = f"{baseURL}{ref}"
        print(f"Getting {url}  ==>  ", end="")
        retval = webio.get(url)
        if retval.code == 200:
            pages[ref] = retval.data
            print("Good")
        else:
            print("Bad")
        sleep(pause)
    return pages


imslpWikiData = fetchWikiPages(imslpData)
categoryWikiData = fetchWikiPages(categoryData)

knownCategoryRecord.load()
knownCategoryRecord.recordData['Data'] = categoryWikiData
knownCategoryRecord.save()

knownIMSLPRecord.load()
knownIMSLPRecord.recordData['Data'] = imslpWikiData
knownIMSLPRecord.save()

In [None]:
from bs4.element import Tag

# Defined here so the cell does not depend on a later cell having been run first.
baseURL = dbio.params.baseURL

knownIMSLPRecord.load()
imslpData = knownIMSLPRecord.getData()
bsdata = getHTML(imslpData['/wiki/IMSLP:View_Genres'])
wikitable = bsdata.find("table", {"class": "wikitable"})
refs = wikitable.findAll("a") if isinstance(wikitable, Tag) else []
genres = {}
for ref in refs:
    # Use the link target, not the Tag itself: the Tag renders as raw HTML inside a URL,
    # and later cells look pages up by their '/wiki/...' href.
    href = ref.get('href')
    if not isinstance(href, str):
        continue
    url = f"{baseURL}{href}"
    print(url)
    # NOTE(review): relies on the `webio` bound by an earlier cell exposing get() and sleep().
    data = webio.get(url)
    if data.code == 200:
        genres[href] = data.data
    webio.sleep(3)

# The record created at the top of the notebook is `knownGenreRecord`.
knownGenreRecord.init(force=True)
knownGenreRecord.load()
knownGenreRecord.setData(genres)
knownGenreRecord.save()

## Category

In [None]:
def getAnchors(bsdata):
    """Group external-link anchors by the first character of their text.

    bsdata: parsed document exposing findAll(); every <a> whose class matches
            "external" or "text" is considered.
    Returns {first_char: {link_text: href}}. Within a group that holds more than
    one link, the link whose text is just that single character is dropped, and
    the 'Top' link is always dropped from the "T" group. Links with empty text
    are ignored because they have no first character to group by.
    """
    anchors = {}
    for ref in bsdata.findAll("a", {"class": ["external", "text"]}):
        label = ref.text
        if not label:
            continue
        anchors.setdefault(label[0], {})[label] = ref.get('href')
    for ch, group in anchors.items():
        # pop() with a default: the entry being removed is not guaranteed to exist.
        if len(group) > 1:
            group.pop(ch, None)
        if ch == "T":
            group.pop('Top', None)
    return anchors
    
def getJText(bsdata) -> str:
    """Return the payload of the first catpagejs <script> block, with escapes decoded.

    bsdata: parsed document exposing findAll("script").
    The script must start with the fixed catpagejs prefix; that prefix and one
    trailing ";" are removed and backslash escapes (e.g. \\uXXXX) are decoded.
    Returns "" when no matching script exists.
    """
    start = "if(typeof catpagejs=='undefined')catpagejs={};$.extend(catpagejs,"
    for script in bsdata.findAll("script"):
        text = script.text
        if not text.startswith(start):
            continue
        jText = text.replace(start, "")
        jText = jText[:-1] if jText.endswith(";") else jText
        # 'unicode-escape' decodes bytes as Latin-1, so encode with Latin-1 (escaping
        # anything outside it) instead of UTF-8; otherwise characters that are already
        # non-ASCII in the page come back as mojibake.
        return jText.encode('latin-1', 'backslashreplace').decode('unicode-escape')
    return ""

def getChunk(text, pos):
    """Extract the next "[...]" span of `text` at or after index `pos`.

    Returns (chunk, index_of_closing_bracket), or (None, -1) when either bracket
    is missing. Two quote substitutions are applied to the extracted span:
    space+double-quote becomes space+single-quote, and two consecutive double
    quotes become single-quote+double-quote.
    """
    start = text.find("[", pos)
    if start == -1:
        return None, -1
    end = text.find("]", start)
    if end == -1:
        return None, -1
    raw = text[start:end + 1]
    # presumably rewrites quotes nested inside an entry so it stays parseable — TODO confirm
    chunk = raw.replace(" \"", " '")
    chunk = chunk.replace("\"\"", "'\"")
    return chunk, end

In [None]:
# Reload the stored category pages; the Composers / People / Performers cells index
# categoryData by '/wiki/Category:...' reference.
knownCategoryRecord.load()
categoryData = knownCategoryRecord.getData()

### Composers

In [None]:
import ast
from urllib.parse import quote
from utils import flattenLists

bsdata = getHTML(categoryData['/wiki/Category:Composers'])
jText = getJText(bsdata)
pos = 0
chunks = []
text = jText
while pos != -1:
    chunk,pos = getChunk(text, pos)
    if not isinstance(chunk, str):
        break
    try:
        # The chunk is text downloaded from the web: parse it as a literal only.
        # ast.literal_eval accepts the list/string literals expected here but, unlike
        # eval, cannot execute code embedded in the page.
        chunk = ast.literal_eval(chunk)
    except Exception as error:
        raise ValueError(f"Could not eval chunk! ({error})") from error
    chunks.append(chunk)

baseURL = dbio.params.baseURL
composers = {}
names = flattenLists(chunks)
for name in names:
    # Category URLs use underscores for spaces; everything else is percent-quoted.
    qname = quote(name.replace(" ", "_"))
    url = f"{baseURL}/wiki/Category:{qname}"
    # print(f"{name}  |  {url}")
    composers[url] = name
    
knownComposerRecord.load()
knownComposerRecord.setData(composers)
knownComposerRecord.save()

### All People

In [None]:
def fixName(name):
    """Strip a leading '["' and a trailing '"]' from `name`, when present."""
    return name.removeprefix('["').removesuffix('"]')

# Parse the People category page into {category URL: name} and store it.
bsdata = getHTML(categoryData['/wiki/Category:People'])
jText = getJText(bsdata)

text = jText
chunks = []
pos = 0
while pos != -1:
    chunk, pos = getChunk(text, pos)
    if not isinstance(chunk, str):
        break
    # Each chunk is one bracketed list; split it on the '","' separators between names.
    chunks.append([fixName(part) for part in chunk.split("\",\"")])

baseURL = dbio.params.baseURL
people = {}
names = flattenLists(chunks)
for name in names:
    # Category URLs use underscores for spaces; everything else is percent-quoted.
    slug = quote(name.replace(" ", "_"))
    people[f"{baseURL}/wiki/Category:{slug}"] = name

knownPeopleRecord.load()
knownPeopleRecord.setData(people)
knownPeopleRecord.save()

### Performers

In [None]:
# NOTE(review): the next cell starts with these same two statements, so this cell
# looks redundant — confirm before removing.
bsdata = getHTML(categoryData['/wiki/Category:Performers'])
jText = getJText(bsdata)

In [None]:
def fixName(name):
    """Return `name` without a leading '["' and without a trailing '"]'."""
    opening, closing = '["', '"]'
    if name.startswith(opening):
        name = name[len(opening):]
    if name.endswith(closing):
        name = name[:-len(closing)]
    return name

# Parse the Performers category page into {category URL: name} and store it.
bsdata = getHTML(categoryData['/wiki/Category:Performers'])
jText = getJText(bsdata)

text = jText
chunks = []
pos = 0
while pos != -1:
    chunk, pos = getChunk(text, pos)
    if not isinstance(chunk, str):
        break
    # Each chunk is one bracketed list; split it on the '","' separators between names.
    chunks.append([fixName(part) for part in chunk.split("\",\"")])

baseURL = dbio.params.baseURL
performers = {}
names = flattenLists(chunks)
for name in names:
    # Category URLs use underscores for spaces; everything else is percent-quoted.
    slug = quote(name.replace(" ", "_"))
    performers[f"{baseURL}/wiki/Category:{slug}"] = name

knownPerformerRecord.load()
knownPerformerRecord.setData(performers)
knownPerformerRecord.save()

## IMSLP

In [None]:
# Reload the stored IMSLP pages from the record.
knownIMSLPRecord.load()
imslpData = knownIMSLPRecord.getData()

### Genres

In [None]:
# The record created at the top of the notebook is `knownGenreRecord`;
# `knownGenrePRecord` is never defined and raised NameError here.
knownGenreRecord.load()
genreData = knownGenreRecord.getData()

In [None]:
def _hrefTextMap(node, attrs):
    """Return {href: link text} for every <a> under `node` that matches `attrs`."""
    return {ref.get('href'): ref.text for ref in node.findAll("a", attrs)}


# Attribute filter shared by every genre listing except the style category page.
externalLink = {"class": ["external", "text"]}

bsdata = getHTML(genreData['/wiki/IMSLP:View_Genres/Work_Types'])
workTypes = _hrefTextMap(bsdata, externalLink)

# Instrumentation is split into one wikitable per instrument family; the table's
# first <th> is used as the family name.
instrumentTypes = {}
bsdata = getHTML(genreData['/wiki/IMSLP:View_Genres/Instrumentation'])
for table in bsdata.findAll("table", {"class": "wikitable"}):
    header = table.find("th")
    if header is None:
        # A table without a header cell has no family name to file its links under.
        continue
    instrument = header.text.replace("\n", "").strip()
    instrumentTypes[instrument] = _hrefTextMap(table, externalLink)

bsdata = getHTML(genreData['/wiki/IMSLP:View_Genres/Featured_Instruments'])
featuredInstruments = _hrefTextMap(bsdata, externalLink)

bsdata = getHTML(genreData['/wiki/IMSLP:View_Genres/Languages'])
languages = _hrefTextMap(bsdata, externalLink)

bsdata = getHTML(genreData['/wiki/Category:Browse_by_work%27s_style'])
styles = _hrefTextMap(bsdata, {"class": "categorysubcatlink"})

genreTypeData = {"WorkTypes": workTypes, "InstrumentTypes": instrumentTypes, "FeaturedInstruments": featuredInstruments,
                 "Languages": languages, "Styles": styles}

knownGenreTypeRecord.init(force=True)
knownGenreTypeRecord.load()
knownGenreTypeRecord.setData(genreTypeData)
knownGenreTypeRecord.save()

# Download People Data

In [None]:
# Rebind the I/O helpers for a local download run (replaces the notebook-level dbio/webio).
dbio = MusicDBIO(verbose=False,local=True,mkDirs=False)
webio = RawWebData(debug=False)
knownPeopleRecord.load(verbose=False)
# One row per stored person: the record's keys become "URL", its values "Name".
artistNames = DataFrame(Series(knownPeopleRecord.getData())).reset_index().rename(columns={"index": "URL", 0: "Name"})
# Re-index by whatever id dbio.getdbid derives from each URL.
artistNames.index = artistNames["URL"].map(dbio.getdbid)
artistNames.index.name = ""
artistNames['IndexModVal'] = artistNames.index.map(mv.getModVal)
downloadArtistRecord.load(verbose=False)
# The "skip ids already downloaded" filter is commented out, so every name is queued.
#availableNames = artistNames[~artistNames.index.map(downloadArtistRecord.isKnown)]
availableNames = artistNames
# One DataFrame per IndexModVal group.
# NOTE(review): grouping by a one-element list makes the keys 1-tuples on recent pandas — confirm.
artistNamesToGet = Series({modVal: modValDF for modVal,modValDF in availableNames.groupby(["IndexModVal"])})

print(f"# {db} Download Results")
print(f"#   Available Names:     {artistNames.shape[0]}")
print(f"#   Known Artist Names:  {len(downloadArtistRecord.getIndex())}")
print(f"#   Artist Names To Get: {availableNames.shape[0]}")

# Only artistNamesToGet is needed by the download cell.
del availableNames
del artistNames

In [None]:
# Progress timer and run-window helper for the artist download loop below.
ts = Timestat(f"Getting {db} Artists")
tt = getTT(skipEOD=False, vacation=True)  # tt.isFinished() is polled to stop the loop
# Refuse to run unless dbio was built for local downloads.
assert dbio.rdio.isLocal, f"MusicDBIO is not set for local downloads!"

def isError(artistName, nErrors, sleeptime, error=None):
    """Record a failed artist download and pause before the next request.

    artistName: key marked as an error in downloadArtistRecord (the download loop
                passes the artist id here).
    nErrors:    list of consecutive failures; appended to in place.
    sleeptime:  value handed to webio.sleep().
    error:      exception or short reason, only used in the printed message.
    """
    downloadArtistRecord.setError(index=artistName)    
    print(f"Search Error ==> {artistName}: {error}")
    nErrors.append(artistName)
    webio.sleep(sleeptime)

stop = False
n = 0
maxN = 1  # stop after this many successful downloads
# Groups whose mod value is at or below this threshold are skipped.
# NOTE(review): looks like a manual resume point — confirm it is still wanted.
skipModValsUpTo = 73
nErrors = []
downloadArtistRecord.load(verbose=False)
for groupModVal, df in artistNamesToGet.items():
    # The group key is a 1-tuple when the frame was grouped by a list (recent pandas)
    # and a scalar otherwise; unwrap it before comparing or printing it.
    modVal = groupModVal[0] if isinstance(groupModVal, tuple) else groupModVal
    if modVal <= skipModValsUpTo:
        continue
    if stop is True:
        break
    N = df.shape[0]
    for i, (artistID, row) in enumerate(df.iterrows()):
        #if downloadArtistRecord.isKnown(artistID):
        #    continue

        if len(nErrors) >= 5:
            print("Stopping due to 5 consecutive errors")
            stop = True
            break

        artistName = row["Name"]
        artistURL = row["URL"]
        print(f"{modVal: <8} | {i: <8} | {N: <8} | {n: <8} | ", end="")

        try:
            response = webio.getArtistData(artistID=artistID, artistName=artistName, artistURL=artistURL)
        except Exception as error:
            isError(artistID, nErrors, 10, error)
            continue

        if not isinstance(response, bytes):
            isError(artistID, nErrors, 5.0, "NotBytes")
            continue

        # A success resets the consecutive-error streak.
        nErrors = []
        downloadArtistRecord.setIndex(index=artistID)
        dbio.rdio.saveData("RawArtist", modVal, artistID, data=response)
        webio.sleep(4)
        n += 1

        # Every 25 downloads: checkpoint the record and check the run window.
        if n % 25 == 0:
            ts.update(n=n)
            downloadArtistRecord.save()
            webio.wait(10.0)
            if tt.isFinished():
                stop = True
                break

        if n >= maxN:
            print("Breaking after {0} downloads...".format(maxN))
            stop = True
            break

ts.stop()
downloadArtistRecord.save()

In [None]:
# NOTE(review): `mio` and `knownMedia` are not defined by any earlier cell of this
# notebook (`mio` is only assigned in the next code cell) — confirm this cell belongs here.
mediaData = {}
for modVal in range(100):
    modValData = mio.data.getModValData(modVal)
    modValMediaData = {}
    # NOTE(review): `.iteritems()` does not exist on Python 3 dicts and was removed from
    # pandas in 2.0 — verify the type returned by getModValData.
    for artistID,artistIDData in modValData.iteritems():
        for mediaType,mediaTypeData in artistIDData.media.media.items():
            # media code -> [artist id, album name, url]
            modValMediaData.update({code: [artistID,media.album,media.url] for code,media in mediaTypeData.items()})
    mediaData.update(modValMediaData)
    # Progress message every 10 mod values.
    if (modVal+1) % 10 == 0:
        print(f"ModVal = {modVal+1}")
        
# One row per media code, columns in the order of the lists built above.
df = DataFrame(mediaData).T
df.columns = ["ArtistID", "Name", "Ref"]
knownMedia.save(data=df)

## Download Data

In [None]:
# NOTE(review): `bandcamp` is never imported in this notebook, so this cell raises
# NameError on a fresh kernel — confirm whether it was carried over from another notebook.
# Also rebinds the notebook-level `webio`.
mio   = bandcamp.MusicDBIO(verbose=False,local=True,mkDirs=False)
webio = bandcamp.RawWebData(debug=False)

In [None]:
# NOTE(review): `knownMedia` and `localAlbums` are not defined anywhere in this
# notebook — confirm where they are expected to come from.
useArtist = False  # not read by any cell in this notebook
numMaster = 250    # maximum number of albums kept per artist

knownAlbumsData = knownMedia.get()
knownAlbumsData['IndexModVal'] = knownAlbumsData.index.map(mio.getModVal)
# Keep at most numMaster rows per ArtistID.
availableNames  = concat([artistIDDF.head(numMaster) for artistID,artistIDDF in knownAlbumsData.groupby(["ArtistID"])])
localAlbumsDict = localAlbums.get()
# Drop albums whose id is already a key of the local-albums dict.
availableNames  = availableNames[~availableNames.index.isin(localAlbumsDict.keys())]
# One DataFrame per IndexModVal group.
albumNamesToGet = Series({modVal: modValDF for modVal,modValDF in availableNames.groupby(["IndexModVal"])})

print(f"# {db} Album Search Results")
print(f"#   Available Album IDs:  {knownAlbumsData.shape[0]}")
print(f"#   Known Album IDs:      {len(localAlbumsDict)}")
print(f"#   Albums To Download:   {availableNames.shape[0]}")

# Only albumNamesToGet is needed by the download cell.
del availableNames
del localAlbumsDict
del knownAlbumsData

# Counts pasted from earlier runs (presumably kept as a progress log):
#   Albums To Download:   23457
#   Albums To Download:   20437
#   Albums To Download:   12137
#   Albums To Download:   4132

In [None]:
def saveAlbumData(db, localAlbumsDict, searchedForErrors):
    """Persist the downloaded-album dict and the error dict, reporting their sizes.

    db:                label printed in the progress messages.
    localAlbumsDict:   saved through the notebook-level `localAlbums` object.
    searchedForErrors: saved through the notebook-level `errors` object.
    """
    rule = "=" * 150
    print(rule)
    print(f"Saving {len(localAlbumsDict)} {db} Albums Data")
    localAlbums.save(data=localAlbumsDict)
    print(f"Saving {len(searchedForErrors)} {db} Searched For Errors")
    errors.save(data=searchedForErrors)
    print(rule)

In [None]:
# NOTE(review): `localAlbums`, `errors` and `mio` are not defined by earlier cells that
# survive a fresh-kernel run — confirm where they are expected to come from.
ts = Timestat("Getting {0} AlbumIDs".format(db))
tt = getTT(skipEOD=False)

n    = 0
maxN = 25000000
localAlbumsDict     = localAlbums.get()
searchedForErrors   = errors.get()
stop = False
nErrors = []
# Series.items() replaces iteritems(), which pandas removed in 2.0.
for groupModVal,modValData in albumNamesToGet.items():
    if stop is True:
        break
    # The group key is a 1-tuple when the frame was grouped by a list (recent pandas);
    # unwrap it so the width format spec below keeps working.
    groupLabel = groupModVal[0] if isinstance(groupModVal, tuple) else groupModVal
    for j,(albumID,row) in enumerate(modValData.iterrows()):
        if len(nErrors) >= 5:
            # Print a ready-to-paste cleanup line for every id in the failing streak,
            # then leave the row loop (the outer loop stops on `stop`).
            for errorID in nErrors:
                print(f"del searchedForErrors['{errorID}']")
            stop=True
            break
        # Skip ids that were already downloaded or already failed.
        if any([dct.get(albumID) is not None for dct in [localAlbumsDict, searchedForErrors]]):
            continue

        artistID   = row["ArtistID"]
        albumName  = row["Name"]
        albumRef  = row["Ref"]

        print(f"{groupLabel: <8} |{j: <8} | {n: <8} | ", end="")
        try:
            response = webio.getAlbumData(albumName=albumName, albumRef=albumRef)
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C still interrupts the run.
            print("Error ==> {0}".format(albumName))
            searchedForErrors[albumID] = True
            nErrors.append(albumID)
            webio.sleep(10)
            continue

        if not isinstance(response,bytes):
            print("Error ==> {0}".format(albumName))
            searchedForErrors[albumID] = True
            nErrors.append(albumID)
            webio.sleep(3.5)
            continue

        # A success resets the consecutive-error streak.
        nErrors = []
        modVal=mio.mv.get(albumID)
        mio.data.saveRawArtistAlbumData(data=response, modval=modVal, dbID=albumID)
        localAlbumsDict[albumID] = True
        webio.sleep(4.5)
        n += 1

        if n % 5 == 0:
            if tt.isFinished():
                stop=True
                break

        if n % 50 == 0:
            webio.sleep(5)

        # Every 100 downloads: checkpoint both dicts to disk.
        if n % 100 == 0:
            saveAlbumData(db, localAlbumsDict, searchedForErrors)
            if tt.isFinished():
                stop=True
                break
            webio.wait(10.0)

        if n >= maxN:
            print("Breaking after {0} downloads...".format(maxN))
            stop=True
            break

ts.stop()
if True: saveAlbumData(db, localAlbumsDict, searchedForErrors)

In [None]:
# Display the kernel's process id (getpid is already imported by the top import cell).
from os import getpid
getpid()

# Download Works

In [None]:
def getWorksData():
    """Build the table of works to download.

    Concatenates the 'name', 'dbid' and 'url' columns of the "ModValArtistMedia"
    data for every value returned by getModVals(), indexes the result by 'dbid',
    percent-quotes the last path segment of each url, and drops the 'dbid' column.
    Returns the resulting DataFrame.
    """
    from urllib.parse import quote

    def quoteURL(url):
        # Quote only the final path segment; everything before the last "/" is kept as-is.
        *head, tail = url.split("/")
        return "/".join(["/".join(head), quote(tail)])

    # Local instance; it does not touch the notebook-level dbio.
    dbio = MusicDBIO()
    columns = ['name', 'dbid', 'url']
    frames = []
    for modVal in getModVals():
        frames.append(dbio.rdio.getData("ModValArtistMedia", modVal)[columns])
    worksData = concat(frames)
    worksData.index = worksData['dbid']
    worksData['url'] = worksData['url'].map(quoteURL)
    return worksData.drop('dbid', axis=1)
    
# Rebind the I/O helpers for a local download run.
dbio = MusicDBIO(verbose=False,local=True,mkDirs=False)
webio = RawWebData(debug=False)
worksData = getWorksData()
worksData['IndexModVal'] = worksData.index.map(mv.getModVal)
# Load the record that is queried on the next line: the work record, not the artist
# record, decides which works are already known.
downloadWorkRecord.load(verbose=False)
availableNames = worksData[~worksData.index.map(downloadWorkRecord.isKnown)]
# One DataFrame per IndexModVal group (the name is reused from the artist section).
artistNamesToGet = Series({modVal: modValDF for modVal,modValDF in availableNames.groupby(["IndexModVal"])})

print(f"# {db} Download Results")
print(f"#   Available Names:     {worksData.shape[0]}")
print(f"#   Known Artist Names:  {len(downloadWorkRecord.getIndex())}")
print(f"#   Artist Names To Get: {availableNames.shape[0]}")

# Only artistNamesToGet is needed by the download cell.
del availableNames
del worksData

In [None]:
# Progress timer and run-window helper for the works download loop below.
ts = Timestat(f"Getting {db} Works")
tt = getTT(skipEOD=False, vacation=True)  # tt.isFinished() is polled to stop the loop
# Refuse to run unless dbio was built for local downloads.
assert dbio.rdio.isLocal, f"MusicDBIO is not set for local downloads!"

# NOTE: redefines the artist-section isError; from here on errors go to downloadWorkRecord.
def isError(workName, nErrors, sleeptime, error=None):
    """Record a failed work download and pause before the next request.

    workName:  key marked as an error in downloadWorkRecord (the download loop
               passes the work id here).
    nErrors:   list of consecutive failures; appended to in place.
    sleeptime: value handed to webio.sleep().
    error:     exception or short reason, only used in the printed message.
    """
    downloadWorkRecord.setError(index=workName)    
    print(f"Search Error ==> {workName}: {error}")
    nErrors.append(workName)
    webio.sleep(sleeptime)

stop = False
n = 0  # number of successful downloads in this run
maxN = 25000000
nErrors = []  # ids of the current streak of consecutive failures
downloadWorkRecord.load(verbose=False)
for groupModVal, df in artistNamesToGet.items():
    # NOTE(review): indexing assumes the group key is a tuple (grouping by a list on
    # recent pandas) — confirm against the installed pandas version.
    modVal = groupModVal[0]
    if stop is True:
        break
    N = df.shape[0]
    for i, (workID, row) in enumerate(df.iterrows()):
        # Skip works recorded as known since the work list was built.
        if downloadWorkRecord.isKnown(workID):
            continue
    
        if len(nErrors) >= 10:
            print("Stopping due to 10 consecutive errors")
            stop = True
            break

        workName = row["name"]
        workURL = row["url"]
        # Progress prefix: mod value | row in group | group size | downloads so far.
        print(f"{modVal: <8} | {i: <8} | {N: <8} | {n: <8} | ", end="")

        try:
            response = webio.getWorkData(workName=workName, workURL=workURL)
        except Exception as error:
            isError(workID, nErrors, 10, error)
            continue
    
        # Anything other than raw bytes is treated as a failed download.
        if not isinstance(response, bytes):
            isError(workID, nErrors, 5.0, "NotBytes")
            continue
    
        # A success resets the consecutive-error streak.
        nErrors = []
        downloadWorkRecord.setIndex(index=workID)
        dbio.rdio.saveData("RawWork", modVal, workID, data=response)
        webio.sleep(4)
        n += 1
            
        # Every 25 downloads: checkpoint the record and check the run window.
        if n % 25 == 0:
            ts.update(n=n)
            downloadWorkRecord.save()
            webio.wait(10.0)
            if tt.isFinished():
                stop = True
                break
        
        if n >= maxN:
            print(f"Breaking after {maxN} downloads...")
            stop = True
            break

ts.stop()
# Final checkpoint for downloads made since the last multiple of 25.
downloadWorkRecord.save()

In [None]:
# Manual checkpoint, e.g. after interrupting the download loop above.
downloadWorkRecord.save()

In [None]:
# Helper for the raw "Work" data of this database; used by the next cell.
crd = ConcatRawData(db=dbio.db, dType="Work")

In [None]:
#crd.concat()
#crd.merge()
# NOTE(review): remove(force=True) runs while concat()/merge() are commented out —
# presumably this deletes raw data, so confirm it has been merged before running.
crd.remove(force=True)