In [None]:
# Load the autoreload extension.
# NOTE(review): no `%autoreload <mode>` line follows in this cell — confirm reloading is actually enabled.
%load_ext autoreload
# Inject CSS so the notebook container uses the full browser width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
from dbmaster import MasterParams, MasterPersist
from dbbase import MusicDBIDModVal, MusicDBDir, MusicDBData
from dbnote import DownloadRecord, KnownRecord, ConcatRawData
from utils import FileIO, DirInfo, FileInfo, getFlatList, Timestat, TermTime, TermTimeTS, getTT
from pandas import Series, DataFrame, concat, Timestamp
from pandb import PanDBIO
from musicdb.qobuz import MusicDBParams, RawWebData, MusicDBIO
from os import getpid

# Shared helper objects, all built with default arguments.
# Several of these names (dbio, webio, io) are rebound by later cells.
mv = MusicDBIDModVal()
io = FileIO()
mpar = MusicDBParams()
dbio = MusicDBIO()
webio = RawWebData()
# Database identifier taken from the parameter object; passed as `db=` to the records below.
db = mpar.db

In [None]:
# Download bookkeeping records for this database; `rTypes` lists the record types each one is built with.
searchArtistRecord = DownloadRecord(db=db, name="SearchArtist", rTypes=["Index", "Data"])
downloadGenreRecord = DownloadRecord(db=db, name="DownloadGenre", rTypes=["Index", "Data"])
downloadNonTermGenreRecord = DownloadRecord(db=db, name="DownloadNonTermGenre", rTypes=["Index", "Data"])
downloadArtistRecord = DownloadRecord(db=db, name="DownloadArtist", rTypes=["Index"])
downloadAlbumRecord = DownloadRecord(db=db, name="DownloadAlbum", rTypes=["Index"])
# Registries of known genres and albums.
# NOTE(review): the genre registry is named "Genre" while the album one is "KnownAlbum" — confirm the naming difference is intentional.
knownGenreRecord = KnownRecord(db=db, name="Genre")
knownAlbumRecord = KnownRecord(db=db, name="KnownAlbum")

In [None]:
##########################################################################################
# Show Summary
##########################################################################################
# The f-string already interpolates `db`; chaining `.format(db)` onto it would only re-parse
# the rendered text and raise if that text happened to contain literal braces.
print(f"{db} Search Results (PID={getpid()})")
# Each record prints its own summary via info().
searchArtistRecord.info()
downloadGenreRecord.info()
downloadNonTermGenreRecord.info()
downloadArtistRecord.info()
downloadAlbumRecord.info()
knownGenreRecord.info()
#print(f"  {'KnownArtist Data': <20}: {knownArtists().shape[0]}")

In [None]:
# Fetch the stored "SearchArtistGroup" data; as the cell's last expression it is rendered as output.
dbio.rdio.getData("SearchArtistGroup")

# Starter Data

In [None]:
from utils import WebIO, getHTML, FileIO
# NOTE(review): this rebinds `webio`, replacing the RawWebData instance from the setup cell with a WebIO.
webio = WebIO()
# Qobuz genre index page; the next cell parses its hierarchical genre list.
url = "https://www.qobuz.com/us-en/genres/download-streaming-albums"
data = webio.get(url)

In [None]:
# Parse the genre index page: every <li> inside the "hierarchical-list" <ul> is one genre.
bsdata = getHTML(data.data)
ul = bsdata.find("ul", {"class": "hierarchical-list"})
lis = ul.findAll("li")
genres = []        # [level, name, ref] triples in page order
knownGenres = {}   # ref -> [name, level]; this is what gets stored in knownGenreRecord
levels = {1: None, 2: None, 3: None, 4: None}   # most recent genre name seen at each depth
prev = []

for li in lis:
    # Raw class attribute of the <li>; kept unmodified because getTerminalGenres reads the
    # depth from the stored value as int(level[0][-1]).
    level = li.get('class')
    atag = li.find('a')
    ref = atag.get('href')
    name = atag.text
    # Track the latest name per depth only after `name` is bound, and key the dict by the
    # numeric depth: the raw class value is indexed like a list downstream, so it cannot be
    # used as a dict key itself.
    depth = level[0][-1] if level else ""
    if depth.isdigit() and int(depth) in levels:
        levels[int(depth)] = name
    genres.append([level, name, ref])
    knownGenres[ref] = [name, level]

In [None]:
# Store the parsed ref -> [name, level] mapping in the known-genre record and save it.
knownGenreRecord.setData(knownGenres)
knownGenreRecord.save()

# Download Numbered Pages

In [None]:
def getTerminalGenres(isTerm=True):
    """Select terminal (leaf) or non-terminal (parent) genres from the known-genre record.

    The record is reloaded and walked in stored order. Each entry maps
    ref -> (name, level), and the depth is read as int(level[0][-1]).
    A genre is a parent when the entry right after it is deeper; otherwise
    it is terminal. The last entry has no successor and is therefore terminal.

    Args:
        isTerm: True returns terminal genres, False returns non-terminal ones.

    Returns:
        List of [ref, name, depth] lists in stored order. Empty data yields [].
    """
    knownGenreRecord.load(verbose=False)
    data = knownGenreRecord.getData()
    term = []
    # None (rather than a fake [None, None, 0] entry) marks "no previous genre yet", so the
    # placeholder can never leak into the result.
    prev = None
    for ref, (name, level) in data.items():
        level = int(level[0][-1])
        if prev is not None:
            if isTerm is True and level <= prev[2]:
                # Next genre is not deeper, so the previous one has no children.
                term.append(prev)
            elif isTerm is False and level > prev[2]:
                # Next genre is deeper, so the previous one is its parent.
                term.append(prev)
        prev = [ref, name, level]
    # The final genre has nothing after it: it is terminal and must not be reported as a parent.
    if prev is not None and isTerm is not False:
        term.append(prev)
    return term

In [None]:
def _firstUnknownPages(record, genres):
    """Map (genreName, page) -> ref for the first page of each genre that `record` does not know.

    A genre whose (genreName, None) key is already known is skipped entirely; the download
    loops write that key once a genre's last page has been fetched. Pages 1..9999 are probed.
    """
    toGet = {}
    for (ref, genre, level) in genres:
        if record.isKnown((genre, None)):
            continue
        for page in range(1, 10000):
            genreKey = (genre, page)
            if not record.isKnown(genreKey):
                toGet[genreKey] = ref
                break
    return toGet


dbio = MusicDBIO(verbose=False,local=True,mkDirs=False)
webio = RawWebData(debug=False)
downloadGenreRecord.load()
downloadNonTermGenreRecord.load()
# Same scan for both genre kinds, each against its own download record.
genreNamesToGet = Series(_firstUnknownPages(downloadGenreRecord, getTerminalGenres(isTerm=True)))
nonTermGenreNamesToGet = Series(_firstUnknownPages(downloadNonTermGenreRecord, getTerminalGenres(isTerm=False)))

# NOTE(review): the labels say "Artist Names" but the counts are genre pages; the first pair
# of lines is the terminal-genre record, the second pair the non-terminal one.
print(f"# {db} Search Results (PID={getpid()})")
print(f"#   Known Artist Names:  {downloadGenreRecord.numKnown()}")
print(f"#   Artist Names To Get: {genreNamesToGet.shape[0]}")
print(f"#   Known Artist Names:  {downloadNonTermGenreRecord.numKnown()}")
print(f"#   Artist Names To Get: {nonTermGenreNamesToGet.shape[0]}")

### Term Genre

In [None]:
# Progress timer for this session.
# NOTE(review): the label says "ArtistIDs" although this cell downloads genre pages.
ts = Timestat(f"Getting {db} ArtistIDs")
# `tt.isFinished()` is polled by the download loop below to decide when to stop.
tt = getTT(skipEOD=True, vacation=True)
# Refuse to run unless the raw-data IO reports local mode.
assert dbio.rdio.isLocal, f"MusicDBIO is not set for local downloads!"

def isError(genreName, page, nErrors, sleeptime, error=None):
    """Register a failed terminal-genre page download, then back off.

    Flags the (genreName, page) key as an error in downloadGenreRecord, prints
    the failure, appends genreName to nErrors (the caller's list is mutated in
    place) and sleeps for `sleeptime` through webio. Returns None.
    """
    downloadGenreRecord.setError(index=(genreName, page))
    print(f"Search Error ==> {genreName}: {error}")
    nErrors.append(genreName)
    webio.sleep(sleeptime)
    
n = 0                     # pages downloaded in this session
maxN = 250000000          # session-wide download cap
maxPagesPerGenre = 500    # upper bound on pages fetched per genre in one session
nErrors = []              # genres that failed since the last successful download
stop = False              # set when the whole session (not just one genre) must end
for (genreName, page), genreRef in genreNamesToGet.items():
    if stop is True or tt.isFinished():
        break

    genreURL = None
    pageNum = None
    for _ in range(maxPagesPerGenre):
        page = page if pageNum is None else pageNum
        genreKey = (genreName, page)
        if downloadGenreRecord.isKnown(genreKey):
            # The next URL only comes from a fresh response, so nothing here can advance
            # past an already-downloaded page; leave this genre instead of spinning.
            break

        if len(nErrors) >= 5:
            print("Stopping due to 5 consecutive errors")
            stop = True   # end the session, otherwise every remaining genre repeats this
            break

        try:
            response = webio.getGenreData(genreRef, genreURL, page)
        except Exception as error:
            isError(genreName, page, nErrors, 10, error)
            break

        if not isinstance(response, dict):
            isError(genreName, page, nErrors, 3.5, "NotDict")
            break

        media = response["Media"]
        nextRef = response["NextRef"]
        pageNum = response["Page"]
        genreURL = f"{webio.baseURL}{nextRef}" if isinstance(nextRef, str) else None

        nErrors = []   # a successful download resets the consecutive-error count
        downloadGenreRecord.setData(index=genreKey, data=media)
        if pageNum is None or len(media) < 21:
            # No next page, or fewer than 21 items: treat as the genre's last page and
            # write the (genreName, None) key that marks the genre as complete.
            downloadGenreRecord.setData(index=(genreName, None), data=[])
            break

        webio.sleep(7.5)
        n += 1

        if n % 5 == 0:
            if tt.isFinished():
                break
            webio.sleep(1.0)

        if n % 15 == 0:
            # Periodic checkpoint so an interrupted session keeps its progress.
            ts.update(n=n)
            downloadGenreRecord.save()
            webio.wait(20.0)
            if tt.isFinished():
                break

        if n >= maxN:
            print(f"Breaking after {maxN} downloads...")
            stop = True   # without this the outer loop would keep fetching one page per genre
            break

ts.stop()
downloadGenreRecord.save()

### NonTerm Genre

In [None]:
# Progress timer for this session.
# NOTE(review): the label says "ArtistIDs" although this cell downloads genre pages.
ts = Timestat(f"Getting {db} ArtistIDs")
# `tt.isFinished()` is polled by the download loop below to decide when to stop.
tt = getTT(skipEOD=False, vacation=True)
# Refuse to run unless the raw-data IO reports local mode.
assert dbio.rdio.isLocal, f"MusicDBIO is not set for local downloads!"

def isError(genreName, page, nErrors, sleeptime, error=None):
    """Register a failed non-terminal-genre page download, then back off.

    Flags the (genreName, page) key as an error in downloadNonTermGenreRecord,
    prints the failure, appends genreName to nErrors (the caller's list is
    mutated in place) and sleeps for `sleeptime` through webio. Returns None.
    """
    downloadNonTermGenreRecord.setError(index=(genreName, page))
    print(f"Search Error ==> {genreName}: {error}")
    nErrors.append(genreName)
    webio.sleep(sleeptime)
    
n = 0                      # pages downloaded in this session
maxN = 2500000             # session-wide download cap
maxPagesPerGenre = 1000    # upper bound on pages fetched per genre in one session
nErrors = []               # genres that failed since the last successful download
stop = False               # set when the whole session (not just one genre) must end
for (genreName, page), genreRef in nonTermGenreNamesToGet.items():
    if stop is True or tt.isFinished():
        break

    genreURL = None
    pageNum = None
    for _ in range(maxPagesPerGenre):
        page = page if pageNum is None else pageNum
        genreKey = (genreName, page)
        if downloadNonTermGenreRecord.isKnown(genreKey):
            # The next URL only comes from a fresh response, so nothing here can advance
            # past an already-downloaded page; leave this genre instead of spinning.
            break

        if len(nErrors) >= 5:
            print("Stopping due to 5 consecutive errors")
            stop = True   # end the session, otherwise every remaining genre repeats this
            break

        try:
            response = webio.getGenreData(genreRef, genreURL, page)
        except Exception as error:
            isError(genreName, page, nErrors, 10, error)
            break

        if not isinstance(response, dict):
            isError(genreName, page, nErrors, 3.5, "NotDict")
            break

        media = response["Media"]
        nextRef = response["NextRef"]
        pageNum = response["Page"]
        genreURL = f"{webio.baseURL}{nextRef}" if isinstance(nextRef, str) else None

        nErrors = []   # a successful download resets the consecutive-error count
        downloadNonTermGenreRecord.setData(index=genreKey, data=media)
        if pageNum is None or len(media) < 21:
            # No next page, or fewer than 21 items: treat as the genre's last page and
            # write the (genreName, None) key that marks the genre as complete.
            downloadNonTermGenreRecord.setData(index=(genreName, None), data=[])
            break

        webio.sleep(7.5)
        n += 1

        if n % 5 == 0:
            if tt.isFinished():
                break
            webio.sleep(1.0)

        if n % 15 == 0:
            # Periodic checkpoint: save the record this loop is filling (the non-terminal
            # one), not the terminal-genre record.
            ts.update(n=n)
            downloadNonTermGenreRecord.save()
            webio.wait(20.0)
            if tt.isFinished():
                break

        if n >= maxN:
            print(f"Breaking after {maxN} downloads...")
            stop = True   # without this the outer loop would keep fetching one page per genre
            break

ts.stop()
downloadNonTermGenreRecord.save()

## Create Music DB From Numbered Pages

In [None]:
from dbraw import getTimestamp, getYear
from musicdb.qobuz import MusicDBID
# Accumulator: artistID -> {"ArtistName", "ArtistRef", "Media": {albumID: albumData}}.
artistData = {}
mid = MusicDBID()
# Manual toggle selecting which download record feeds this cell (False = non-terminal genres).
isTerm = False
if isTerm is True:
    downloadGenreRecord.load()
    data = downloadGenreRecord.getData()
else:
    downloadNonTermGenreRecord.load()
    data = downloadNonTermGenreRecord.getData()
    
# Each record entry is iterated as a list of album items keyed by the download key.
for i, (genreKey, genreKeyData) in enumerate(data.items()):
    for item in genreKeyData:
        # 'Artist' and 'Album' are indexed as [name, ref]; IDs are derived from the refs.
        artist = item['Artist']
        artistName = artist[0]
        artistRef = artist[1]
        artistID = mid.getArtistID(artistRef)
        
        album = item['Album']
        albumName = album[0]
        albumRef = album[1]
        albumID = mid.getAlbumID(albumRef)
        
        label = item['Label']
        genre = item['Genre']
        release = item['Release']
        # NOTE(review): releaseTS and cover are computed but not stored in albumData below.
        releaseTS = getTimestamp(release)
        year = getYear(release)
        cover = item['Cover']

        #print(albumID,'\t',albumRef,'\t',release,'\t',year)

        albumData = {"AlbumID": albumID, "AlbumName": albumName, "AlbumRef": albumRef, "Label": label,
                     "Genre": genre, "Release": release, "Year": year}
        # NOTE(review): artistInfo is never read; the same fields are written inline just below.
        artistInfo = {"ArtistName": artistName, "ArtistRef": artistRef}

        # First sighting of an artist creates its entry; a repeated albumID overwrites the earlier album data.
        if artistData.get(artistID) is None:
            artistData[artistID] = {"ArtistName": artistName, "ArtistRef": artistRef, "Media": {}}
        artistData[artistID]["Media"][albumID] = albumData

    # Progress line every 250 download keys: "<key parts joined by '-'>" and the artist count so far.
    if i % 250 == 0:
        key = "-".join([str(x) for x in genreKey])
        print(f"{key: <35}{len(artistData)}")
print(f"{' ': <35}{len(artistData)}")

# One row per artistID with columns ArtistName, ArtistRef and Media.
df = DataFrame({artistID: Series(artistIDData) for artistID, artistIDData in artistData.items()}).T

In [None]:
# Rebind the IO helper (mod=True, no directory creation) and load the stored "SearchArtistGroup" data.
dbio = MusicDBIO(mod=True, mkDirs=False)
groupData = dbio.rdio.getData("SearchArtistGroup")

In [None]:
# Append the rows of `df` whose index is not yet in `groupData`.
# Re-running adds nothing further, because the mask then excludes every row of `df`.
groupData = concat([groupData, df[~df.index.isin(groupData.index)]])

In [None]:
# Fold the freshly built media of every artist present in both tables into groupData.
tmp = df[df.index.isin(groupData.index)]
for artistID, artistIDData in tmp.iterrows():
    newData = artistIDData['Media']
    oldData = groupData.loc[artistID, "Media"]
    print(f"{artistID: <15}{len(newData): <6}{len(oldData): <6} | ", end="")
    # Dict union with the stored media on the right, so the stored entry wins on a shared key.
    allData = newData | oldData
    groupData.at[artistID, "Media"] = allData
    print(f"{len(allData): <6}")

In [None]:
# Write the merged table back under the "SearchArtistGroup" key.
dbio.rdio.saveData("SearchArtistGroup", data=groupData)

In [None]:
# Display the (rows, columns) shape of the merged table.
groupData.shape



# Download Albums

In [None]:
knownAlbumRecord.load()
# Stack the 'name' and 'url' columns of "ModValArtistMedia" for mod values 0..99 into one frame.
knownAlbums = concat([dbio.rdio.getData("ModValArtistMedia", modVal)[['name', 'url']] for modVal in range(100)])
def getAID(x):
    """Return the artist ID embedded in a media index string.

    `x` is split on '-' and the second field is returned without its first
    three characters. Raises IndexError when `x` contains no '-'.
    NOTE(review): assumes index values look like "<a>-<3 chars><artistID>-..." —
    confirm against the ModValArtistMedia index.
    """
    fields = x.split('-')
    artistField = fields[1]
    return artistField[3:]
def getAlbumID(x):
    """Return the album ID embedded in a media index string.

    `x` is split on '-' and the third field is returned without its first
    three characters. Raises IndexError when `x` has fewer than three fields.
    NOTE(review): assumes index values look like "<a>-<b>-<3 chars><albumID>..." —
    confirm against the ModValArtistMedia index.
    """
    fields = x.split('-')
    albumField = fields[2]
    return albumField[3:]
# Derive ArtistID from the original index first; the next line replaces that index with the album ID.
knownAlbums["ArtistID"] = knownAlbums.index.map(getAID)
knownAlbums.index = knownAlbums.index.map(getAlbumID)
# Transposing before to_dict() gives {albumID: {column: value}}.
knownAlbumRecord.setData(data=knownAlbums.T.to_dict())
knownAlbumRecord.save()

In [None]:
dbio = MusicDBIO(local=True, mkDirs=False)
webio = RawWebData(debug=False)
# One row per known album, indexed by album ID.
knownNames = DataFrame(knownAlbumRecord.getData()).T
knownNames["IndexModVal"] = knownNames.index.map(dbio.mv.getModVal)
# Keep only albums the download record does not know yet.
availableNames = knownNames[~knownNames.index.map(downloadAlbumRecord.isKnown)]
# Grouping by a one-element list: the download loop indexes each group key with [0].
# NOTE(review): whether the key is a 1-tuple or a scalar depends on the pandas version — confirm.
albumNamesToGet = Series({modVal: modValDF for modVal,modValDF in availableNames.groupby(["IndexModVal"])})

# NOTE(review): the labels say "Artist Names" but the counts are albums.
print(f"# {db} Search Results (PID={getpid()})")
print(f"#   Available Names:     {knownNames.shape[0]}")
print(f"#   Known Artist Names:  {downloadAlbumRecord.numKnown()}")
print(f"#   Artist Names To Get: {availableNames.shape[0]}")

# Drop the intermediate frames; only albumNamesToGet is used by the download cell.
del availableNames
del knownNames

In [None]:
# Progress timer for this session.
# NOTE(review): the label says "Artists" although this cell downloads albums.
ts = Timestat(f"Getting {db} Artists")
# `tt.isFinished()` is polled by the download loop below to decide when to stop.
tt = getTT(skipEOD=False, vacation=True)
# Refuse to run unless the raw-data IO reports local mode.
assert dbio.rdio.isLocal, f"MusicDBIO is not set for local downloads!"

def isError(albumName, albumID, nErrors, sleeptime, error=None):
    """Register a failed album download, then back off.

    Flags albumID as an error in downloadAlbumRecord, prints the failure,
    appends albumName to nErrors (the caller's list is mutated in place) and
    sleeps for `sleeptime` through webio. Returns None.
    """
    downloadAlbumRecord.setError(index=albumID)
    message = f"Search Error ==> {albumName}: {error}"
    print(message)
    nErrors.append(albumName)
    webio.sleep(sleeptime)
    

stop = False          # set when the whole session must end
n = 0                 # albums downloaded in this session
maxN = 250000000      # session-wide download cap
nErrors = []          # albums that failed since the last successful download
downloadAlbumRecord.load(verbose=False)
# The per-group frame is named modValDF so the notebook-level `df` is not overwritten.
for groupModVal, modValDF in albumNamesToGet.items():
    if stop is True:
        break
    # The groupby key is a 1-tuple or a bare scalar depending on the pandas version; accept both.
    modVal = groupModVal[0] if isinstance(groupModVal, tuple) else groupModVal
    N = modValDF.shape[0]
    for i, (albumID, row) in enumerate(modValDF.iterrows()):
        if downloadAlbumRecord.isKnown(albumID):
            continue

        if len(nErrors) >= 5:
            print("Stopping due to 5 consecutive errors")
            stop = True
            break

        albumName = row["name"]
        albumRef = row["url"]
        # Progress prefix: mod value | row position | rows in group | downloads so far.
        print(f"{modVal: <3} | {i: <5} | {N: <5} | {n: <5} | ", end="")

        try:
            response = webio.getAlbumData(albumName=albumName, albumRef=albumRef)
        except Exception as error:
            isError(albumName, albumID, nErrors, 10, error)
            continue

        if not isinstance(response, bytes):
            isError(albumName, albumID, nErrors, 5.0, "NotBytes")
            continue

        nErrors = []   # a successful download resets the consecutive-error count
        # Save the payload first and mark the album afterwards, so a failed write cannot
        # leave the album flagged as downloaded.
        dbio.rdio.saveData("RawAlbum", modVal, albumID, data=response)
        downloadAlbumRecord.setIndex(index=albumID)
        webio.sleep(4.5)
        n += 1

        if n % 25 == 0:
            # Periodic checkpoint so an interrupted session keeps its progress.
            ts.update(n=n)
            downloadAlbumRecord.save()
            webio.wait(10.0)
            if tt.isFinished():
                stop = True
                break

        if n >= maxN:
            print(f"Breaking after {maxN} downloads...")
            stop = True
            break

ts.stop()
downloadAlbumRecord.save()

In [None]:
# Stand-alone save of the album download record (the download cell only saves at its checkpoints and at its end).
downloadAlbumRecord.save()

In [None]:
dbio = MusicDBIO(local=True, mkDirs=False)
# For each mod value 0..99: concat, merge, then remove, using a fresh ConcatRawData each time.
# NOTE(review): remove() is called with force=True right after merge — confirm the merge result is verified before raw data is removed.
for modVal in range(100):
    crd = ConcatRawData(db=dbio.db, dType="Album")
    crd.concat(modVal=modVal)
    crd.merge(modVal=modVal, test=False)
    crd.remove(modVal=modVal, force=True)

In [None]:
from musicdb import PoolIO
from time import sleep

# Run the concat and merge steps through a PoolIO built with nProcs=5; the metaprod/sumprod steps are disabled.
pio = PoolIO("Qobuz", nProcs=5)
pio.concat()
pio.merge()
#pio.metaprod()
#pio.sumprod()

In [None]:


https://www.qobuz.com/us-en/album/whats-going-on-marvin-gaye/0060253780989

## Parse

In [None]:
# Rebind the IO helper in verbose mode and run the parse step for the "Artist" key.
dbio = MusicDBIO(verbose=True)
dbio.pdio.parse(key="Artist")

In [None]:
from utils import FileIO
io = FileIO()
# Persist the artistData dict built in the "Create Music DB" cell; the path is relative to the working directory.
# NOTE(review): the ".p" suffix suggests a pickle file — confirm the format FileIO writes.
io.save(idata=artistData, ifile="qobuz-artistData.p")

## Make Raw Artist Data 

In [None]:
from utils import FileIO
io = FileIO()
# Reload the saved artist data. This rebinds `data`, which earlier cells used for a web response and for record contents.
data = io.get("qobuz-artistData.p")


In [None]:
# Expand each entry of `data` into a row (one column per key of the entry). This rebinds `df`.
df = Series(data).apply(Series)

In [None]:
# Display the frame. NOTE(review): this renders the whole frame; df.head() would keep the output small.
df

In [None]:
# Try makeArtistData on the first artist only (the loop breaks after one iteration).
# NOTE(review): `rdio` is not assigned anywhere in the visible notebook; other cells reach it as `dbio.rdio` — confirm which object is meant.
for artistID, artistIDData in data.items():
    rawio = rdio.makeArtistData(artistID, artistIDData)
    break

In [None]:

# NOTE(review): `up` is not assigned anywhere in the visible notebook; this cell fails on a fresh kernel unless it is defined elsewhere.
up.path

In [None]:
# Scratch cell: build the "newest first" listing URL for the first terminal genre.
term = getTerminalGenres()
ref = term[0][0]
baseURL = dbio.params.baseURL
url = f"{baseURL}{ref}?ssf%5BsortBy%5D=main_catalog_date_desc"
print(url)
#       https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums/page/1?ssf%5BsortBy%5D=main_catalog_date_desc
#print("https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums?ssf%5BsortBy%5D=main_catalog_date_desc")
#https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums?ssf%5BsortBy%5D=main_catalog_date_desc
#https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums/?ssf%5BsortBy%5D=main_catalog_date_desc
# Two hand-typed copies of the page-3 URL, compared below; the comparison is the cell's displayed result.
url1="https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums/page/3?ssf%5BsortBy%5D=main_catalog_date_desc"
url2='https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums/page/3?ssf%5BsortBy%5D=main_catalog_date_desc'
url1 == url2
#url="https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums/page/2?ssf%5BsortBy%5D=main_catalog_date_desc"
#genreData = webio.get(url)

In [None]:
# Build the page-3 URL for the genre ref chosen above and fetch it.
url = webio.getGenrePageURL(genreRef=ref, genreURL=None, page=3)
genreData = webio.get(url)

In [None]:
import difflib
# Character-level diff of the two URL strings; ndiff on two str arguments compares them character by character.
a = url1
b = url2
for i,s in enumerate(difflib.ndiff(a, b)):
    print(i,'\t',s)

In [None]:
# Display the raw payload of the fetched genre page.
genreData.data

In [None]:
from utils import getHTML

# Genre listing URLs kept for reference; as bare text they are not valid Python, so they are comments:
# https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums?ssf%5BsortBy%5D=main_catalog_date_desc
# https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums/page/2?ssf%5BsortBy%5D=main_catalog_date_desc
# Parse the page fetched into `genreData`; no `gData` is assigned anywhere in the notebook.
bsdata = getHTML(genreData.data)

In [None]:
# Look for the rel="next" anchor inside the "product__header" div and display it.
# NOTE(review): pageDiv is not checked for None before .find() — confirm the div is always present.
pageDiv = bsdata.find("div", {"class": "product__header"})
nextRef = pageDiv.find("a", {"rel": "next"})
nextRef

In [None]:
from dbraw import isTag, getTagText
def getPageData(bsdata):
    """Extract one record per product <li> from a parsed genre listing page.

    Args:
        bsdata: parsed page; must offer find(name, attrs) like the object
            returned by getHTML.

    Returns:
        List of dicts with keys "Artist", "Album" and "Label" (each a
        [text, href] list), plus "Genre", "Release" and "Cover". Missing
        elements give a None href/cover and whatever getTagText returns for a
        missing tag. Returns [] when the page has no "product__wrapper" list.
    """
    def childTag(parent, name, attrs=None):
        # Look up a child element only when the parent itself was found.
        if not isTag(parent):
            return None
        return parent.find(name) if attrs is None else parent.find(name, attrs)

    def anchorInfo(parent):
        # (stripped link text, href) of the first <a> under parent.
        anchor = childTag(parent, "a")
        href = anchor.get('href') if isTag(anchor) else None
        return getTagText(anchor).strip(), href

    retval = []

    wrapper = bsdata.find("ul", {"class": "product__wrapper"})
    if not isTag(wrapper):
        # No product list on this page: nothing to extract.
        return retval

    for li in wrapper.findAll("li"):
        coverDiv = li.find("div", {"class": "product__cover"})
        cover = coverDiv.get('data-src') if isTag(coverDiv) else None

        dataDiv = li.find("div", {"class": "product__data"})
        genre = getTagText(childTag(dataDiv, "p", {"class": "product__data--genre"})).strip()
        release = getTagText(childTag(dataDiv, "p", {"class": "product__data--release"})).strip()

        containerTitle, containerRef = anchorInfo(li.find("div", {"class": "product__container"}))
        artistName, artistRef = anchorInfo(li.find("p", {"class": "product__artist"}))
        labelName, labelRef = anchorInfo(li.find("p", {"class": "product__infos"}))

        # "Cover" is included because the table-building cell reads item['Cover'].
        record = {"Artist": [artistName, artistRef], "Album": [containerTitle, containerRef],
                  "Label": [labelName, labelRef], "Genre": genre, "Release": release,
                  "Cover": cover}

        retval.append(record)

    return retval

In [None]:
# Run the extractor on the page currently bound to `bsdata`.
retval = getPageData(bsdata)

In [None]:
# Display the extracted records.
retval

In [None]:
from utils import getHTML
# NOTE(review): `data` is rebound several times earlier (web response, record contents, saved artist data);
# this cell needs it to be a web response with a `.data` attribute.
bsdata = getHTML(data.data)
from utils import FileIO
io = FileIO()
# Save the raw payload to "qobuz.genres.p", relative to the working directory.
io.save(idata=data.data, ifile="qobuz.genres.p")

In [None]:
# Fetch a sample search-results page (query "Boris"). This rebinds `url` and `data`.
url = "https://www.qobuz.com/us-en/search?q=Boris"
data = webio.get(url)

In [None]:
# Scratch re-parse of the hierarchical genre list from whatever page `bsdata` currently holds.
ul = bsdata.find("ul", {"class": "hierarchical-list"})
lis = ul.findAll("li")
genres = []
# NOTE(review): `levels` is initialised here but never read or written in this cell.
levels = {"level1": None, "level2": None, "level3": None}
for li in lis:
    # Collect [class attribute, link text, href] for every list item.
    level = li.get('class')
    atag = li.find('a')
    ref = atag.get('href')
    name = atag.text
    genres.append([level, name, ref])