In [None]:
%load_ext autoreload
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
from dbmaster import MasterParams, MasterPersist
from dbbase import MusicDBIDModVal, MusicDBDir, MusicDBData
from dbnote import DownloadRecord, KnownRecord
from utils import FileIO, DirInfo, FileInfo, getFlatList, Timestat, TermTime, TermTimeTS, getTT
from pandas import Series, DataFrame, concat, Timestamp
from pandb import PanDBIO
from musicdb.yesasia import MusicDBParams, RawWebData, MusicDBIO
from os import getpid

# Shared helper objects used by the cells below.
mv = MusicDBIDModVal()
io = FileIO()
mpar = MusicDBParams()
dbio = MusicDBIO()
# NOTE(review): `webio` is rebound further down (to WebIO() and later to
# RawWebData(debug=False)); which object it refers to depends on the cells run.
webio = RawWebData()
# `db` is passed as `db=` to the record objects and printed in summary headers.
db = mpar.db

In [None]:
# Bookkeeping objects for this database: two download records and the table of
# known links that the "Known Links" section fills in and saves.
searchArtistRecord = DownloadRecord(db=db, name="SearchArtist", rTypes=["Index", "Data"])
downloadArtistRecord = DownloadRecord(db=db, name="DownloadArtist", rTypes=["Index"])
knownLinks = KnownRecord(db=db, name="Links")

In [None]:
##########################################################################################
# Show Summary
##########################################################################################
# Header with the database name and this kernel's process ID, then info() on
# each record. The f-string is already fully formatted, so no extra .format()
# call is applied (it would re-interpret any braces in the rendered text).
print(f"{db} Search Results (PID={getpid()})")
searchArtistRecord.info()
downloadArtistRecord.info()
knownLinks.info()
#print(f"  {'KnownArtist Data': <20}: {knownArtists().shape[0]}")

# Starter Data

## Known Links

In [None]:
from utils import WebIO, getHTML, FileIO
from dbraw import getTagText


def _collectCountryLinks(sectionDiv):
    """Return {country: {href: link text}} for every filter panel in one section.

    Each `li.filterPanel` holds a <b> tag with the country name and nested <li>
    items with one anchor each. Items without an anchor are skipped instead of
    raising AttributeError.
    """
    countryLinks = {}
    for panel in sectionDiv.findAll("li", {"class": "filterPanel"}):
        country = getTagText(panel.find('b'))
        anchors = [page.find('a') for page in panel.findAll("li")]
        refs = {anchor.get('href'): anchor.text for anchor in anchors if anchor is not None}
        countryLinks[country] = refs
        print(country, '\t', refs)
    return countryLinks


# Download the music landing page and parse it.
webio = WebIO()
musicURL = "https://www.yesasia.com/us/en/music-concerts-videos.html"
data = webio.get(musicURL)
if data.code != 200:
    # Fail before knownLinks is re-initialised below, so a failed download
    # cannot replace the stored links with an empty table.
    raise RuntimeError(f"Bad Code [{data.code}] for {musicURL}")
bsdata = getHTML(data.data)

# The two sections are parsed the same way; entries from the second section
# replace entries from the first when the country name is the same.
links = {}
musicDiv = bsdata.find("div", {"id": "idb"})
editorDiv = bsdata.find("div", {"id": "id19"})
for sectionID, sectionDiv in (("idb", musicDiv), ("id19", editorDiv)):
    if sectionDiv is None:
        raise ValueError(f"Could not find <div id='{sectionID}'> in {musicURL}")
    links.update(_collectCountryLinks(sectionDiv))

# Reset the stored record and persist the freshly scraped table.
knownLinks.init(force=True)
knownLinks.load()
knownLinks.setData(links)
knownLinks.save()

## Individual Artists

In [None]:
# Load the stored link table and rebind `links` to its contents.
knownLinks.load(verbose=False)
links = knownLinks.getData()

In [None]:
# List only the per-country links whose label is one of the artist categories.
artistCategories = ["Female Singers", "Male Singers", "Groups"]
for country, countryLinks in links.items():
    for url, name in countryLinks.items():
        if name not in artistCategories:
            continue
        print(f"{country: <20}{name: <20}{url}")

In [None]:
from utils import WebIO, getHTML, FileIO
from dbraw import getTagText, isTag, isBS4

def getCharRefs(bsdata):
    """Return [(letter, url), ...] from the page's character index.

    Looks up <div id="charIndex"> and keeps the anchors whose text is
    alphabetic and whose href is a string starting with "http".

    Returns an empty list when the index div is missing. Anchors without an
    href are skipped rather than raising AttributeError on `None.startswith`.
    """
    charIdxTag = bsdata.find("div", {"id": "charIndex"})
    if charIdxTag is None:
        return []

    charIdxRefs = []
    for anchor in charIdxTag.findAll("a"):
        ch = anchor.text
        href = anchor.get('href')
        if ch.isalpha() and isinstance(href, str) and href.startswith("http"):
            charIdxRefs.append((ch, href))
    return charIdxRefs

def getPageData(bsdata, artistData):
    """Append the artists on `bsdata` and on every following "Next" page.

    Parameters:
        bsdata: parsed listing page (must support find/findAll).
        artistData: list extended in place with (artist text, href) tuples.

    Returns None. Paging stops when there is no paging span, no anchor titled
    "Next" (or it has no href), a response code other than 200, or a page that
    fails the isBS4 check.

    Pages are followed in a loop rather than by recursion, so a long listing
    cannot hit Python's recursion limit. Relies on the notebook-level `webio`,
    `getHTML`, `isTag` and `isBS4`.
    """
    def getArtists(page):
        # A page without the artist list contributes nothing instead of crashing.
        artistListDiv = page.find("div", {"id": "artistList"})
        if not isTag(artistListDiv):
            return []
        artistRefTags = [li.find('a') for li in artistListDiv.findAll("li")]
        return [(ref.text, ref.get('href')) for ref in artistRefTags if isTag(ref)]

    while True:
        print(f"Curr = {len(artistData): <5} | ", end="")
        newArtists = getArtists(bsdata)
        print(f"Found = {len(newArtists): <5} | ", end="")
        artistData += newArtists
        print(f"Total = {len(artistData): <5}")

        pagingSpan = bsdata.find("span", {"class": "paging"})
        if not isTag(pagingSpan):
            return

        # Only the first anchor titled "Next" is followed.
        nextTag = next((a for a in pagingSpan.findAll("a") if a.get('title') == "Next"), None)
        if nextTag is None:
            return
        nextURL = nextTag.get('href')
        if not nextURL:
            return

        print(f"{' ': <4}{nextURL: <100}", end="")
        print(".", end="")
        # Pause before and after each request.
        webio.sleep(7.5)
        data = webio.get(nextURL)
        webio.sleep(7.5)
        print(".\t", end="")
        if data.code != 200:
            print(f"Bad Code [{data.code}] [{data}]")
            return
        bsdata = getHTML(data.data)
        if not isBS4(bsdata):
            print("Not BS4")
            return

In [None]:
# Crawl one artist listing: the start page first, then every page linked from
# its character index. Results accumulate in `artistData`.
webio = WebIO()
url="https://www.yesasia.com/us/japanese-female-singers/0-0-0-bpt.297_alb.4-en/list.html"
artistData = []


def _fetchListingPage(label, pageURL, sleepBefore=True):
    """Download and parse one listing page; return the parsed page or None.

    Prints the same progress line as before: label, URL, and a dot before and
    after the request. Returns None (after printing the reason) when the
    response code is not 200 or the parsed result fails the isBS4 check.
    """
    print(f"{label: <4}{pageURL: <100}", end="")
    print(".", end="")
    if sleepBefore:
        webio.sleep(7.5)
    response = webio.get(pageURL)
    webio.sleep(7.5)
    print(".\t", end="")
    if response.code != 200:
        print(f"Bad Code [{response.code}] [{response}]")
        return None
    parsed = getHTML(response.data)
    if not isBS4(parsed):
        print("Not BS4")
        return None
    return parsed


# The start page is fetched without the leading pause. Failing here aborts the
# cell with an explicit error instead of a deliberate ZeroDivisionError.
bsdata = _fetchListingPage('A', url, sleepBefore=False)
if bsdata is None:
    raise RuntimeError(f"Could not load the start page: {url}")

charRefs = getCharRefs(bsdata)
getPageData(bsdata, artistData)

# Stop at the first page that cannot be loaded, keeping what was collected.
for ch, charURL in charRefs:
    bsdata = _fetchListingPage(ch, charURL)
    if bsdata is None:
        break
    getPageData(bsdata, artistData)

In [None]:
from dbraw import isTag

In [None]:
# Scratch check: length of a sample listing URL (the crawl cells print URLs
# left-aligned in a 100-character column).
len("https://www.yesasia.com/us/0-0-0-ann.6_vm.32_bt.297_anit.3_bpt.297_alb.4-en/list.html")

In [None]:


# Display the character-index links collected by the crawl cell.
# (`charIdxRefs` is only a local inside getCharRefs(); the notebook-level
# result of that call is `charRefs`.)
charRefs

In [None]:
# Build a search URL from an artist name and compare it with a known-good one.
name = "Rolling St"
baseURL = dbio.params.baseURL

# Path slug: lower-cased with dashes; query term: original case with pluses.
dname = name.lower().replace(" ", "-")
sname = name.replace(" ", "+")

url = f"{baseURL}/us/search/{dname}/0-0-0-q.{sname}_bpt.48-en/list.html"
# Reference URLs copied from the site:
#https://www.yesasia.com/us/search/dave-matthews/0-0-0-q.Dave+Matthews_bpt.48-en/list.html
#https://www.yesasia.com/us/search/rolling-stones/0-0-0-q.Rolling+Stones_bpt.48-en/list.html
test = "https://www.yesasia.com/us/search/rolling-st/0-0-0-q.Rolling+St_bpt.48-en/list.html"
test == url

In [None]:

# Download the genre index page; the next cell parses `data.data`.
url = "https://www.qobuz.com/us-en/genres/download-streaming-albums"
data = webio.get(url)

In [None]:
# Parse the hierarchical genre list into
#   genres      : [[level, name, href], ...] in page order
#   knownGenres : {href: [name, level]}
# `level` is the raw `class` attribute of each <li>; getTerminalGenres() reads
# it as level[0][-1], i.e. it is expected to be a list such as ["level1"].
bsdata = getHTML(data.data)
ul = bsdata.find("ul", {"class": "hierarchical-list"})
lis = ul.findAll("li")
genres = []
knownGenres = {}

# The former `levels[level] = name` bookkeeping was removed: it ran before
# `name` was assigned for the current item, used the class list as a dict key,
# and its result was never read.
for li in lis:
    level = li.get('class')
    atag = li.find('a')
    if atag is None:
        # An item without a link has no href to key on; skip it.
        continue
    ref = atag.get('href')
    name = atag.text
    genres.append([level, name, ref])
    knownGenres[ref] = [name, level]

In [None]:
# Store the parsed genre table and write it out.
# NOTE(review): `knownGenreRecord` is not created anywhere in the visible
# cells — confirm it is defined before this cell on a fresh kernel.
knownGenreRecord.setData(knownGenres)
knownGenreRecord.save()

# Download Numbered Pages

In [None]:
def getTerminalGenres():
    """Return the leaf genres of the stored genre hierarchy.

    Reads {ref: (name, level)} from `knownGenreRecord`, where `level` is a
    sequence whose first element ends in the depth digit (e.g. ["level2"]).
    Entries are assumed to be in page order (parent before its children) —
    TODO confirm. An entry is a leaf when the entry after it is not deeper;
    the last entry is always a leaf.

    Returns a list of [ref, name, level] with `level` as an int, or an empty
    list when no genres are stored (no placeholder entry is emitted).
    """
    knownGenreRecord.load(verbose=False)
    data = knownGenreRecord.getData()

    term = []
    prev = None
    for ref, (name, level) in data.items():
        # Only the last character is used, so depths above 9 are not supported.
        level = int(level[0][-1])
        if prev is not None and level <= prev[2]:
            term.append(prev)
        prev = [ref, name, level]
    if prev is not None:
        term.append(prev)
    return term

In [None]:
# Work out, for each terminal genre, the first page that has not been recorded.
dbio = MusicDBIO(verbose=False,local=True,mkDirs=False)
webio = RawWebData(debug=False)
downloadGenreRecord.load()

pendingPages = {}
for ref, genre, level in getTerminalGenres():
    # A (genre, None) key marks a genre that is already complete.
    if downloadGenreRecord.isKnown((genre, None)):
        continue
    firstMissing = next(
        (page for page in range(1,10000) if not downloadGenreRecord.isKnown((genre, page))),
        None,
    )
    if firstMissing is not None:
        pendingPages[(genre, firstMissing)] = ref
genreNamesToGet = Series(pendingPages)
#knownNames = Series({(genre, 1): ref })
#genreNamesToGet = knownNames[~knownNames.index.map(downloadGenreRecord.isKnown)]

print(f"# {db} Search Results")
print(f"#   Known Artist Names:  {downloadGenreRecord.numKnown()}")
print(f"#   Artist Names To Get: {genreNamesToGet.shape[0]}")

In [None]:
# Timer and time-window object used by the download loop.
ts = Timestat(f"Getting {db} ArtistIDs")
tt = getTT(skipEOD=True, vacation=True)
# Refuse to start unless the I/O layer reports local mode.
assert dbio.rdio.isLocal, "MusicDBIO is not set for local downloads!"

def isError(genreName, page, nErrors, sleeptime, error=None):
    """Record a failed genre-page download and pause.

    Parameters:
        genreName, page: together form the record key (genreName, page).
        nErrors: list of recent failures; the genre name is appended in place.
        sleeptime: value passed to webio.sleep() before returning.
        error: exception or short reason shown in the printed message.

    Marks the key as an error in `downloadGenreRecord`, prints the message and
    sleeps. Relies on the notebook-level `downloadGenreRecord` and `webio`.
    """
    genreKey = (genreName, page)
    downloadGenreRecord.setError(index=genreKey)
    print(f"Search Error ==> {genreName}: {error}")
    # Append the failing genre (the undefined name `artistName` raised NameError).
    nErrors.append(genreName)
    webio.sleep(sleeptime)
    
# Download genre pages, starting at each genre's first missing page and
# following the page number returned with every response.
n = 0
maxN = 250000000
nErrors = []
stopAll = False   # set when the whole run (not just the current genre) must end
for (genreName, page), genreRef in genreNamesToGet.items():
    if stopAll or tt.isFinished():
        break

    genreURL = None
    pageNum = None
    last = False
    while last is False:
        page = page if pageNum is None else pageNum
        genreKey = (genreName, page)
        if downloadGenreRecord.isKnown(genreKey):
            # Nothing changes between iterations here, so `continue` would
            # spin forever; move on to the next genre instead.
            break

        if len(nErrors) >= 5:
            print("Stopping due to 5 consecutive errors")
            stopAll = True
            break

        try:
            response = webio.getGenreData(genreRef, genreURL, page)
        except Exception as error:
            # Retry the same page; the error counter above bounds the retries.
            isError(genreName, page, nErrors, 10, error)
            continue

        if not isinstance(response, dict):
            isError(genreName, page, nErrors, 3.5, "NotDict")
            continue

        media = response["Media"]
        nextRef = response["NextRef"]
        pageNum = response["Page"]
        genreURL = f"{webio.baseURL}{nextRef}" if isinstance(nextRef, str) else None
        if len(media) == 0:
            last = True

        # A successful download resets the consecutive-error counter.
        nErrors = []
        downloadGenreRecord.setData(index=genreKey, data=media)
        if pageNum is None or len(media) < 21:
            # Fewer than 21 items (or no page number) is treated as the last
            # page: store an empty (genre, None) entry as the "complete" marker.
            last = True
            genreKey = (genreName, None)
            media = []
            downloadGenreRecord.setData(index=genreKey, data=media)
            break

        webio.sleep(6.5)
        n += 1

        if n % 5 == 0:
            if tt.isFinished():
                last = True
                break
            webio.sleep(1.0)

        if n % 15 == 0:
            # Periodic checkpoint: update the timer and save the record.
            ts.update(n=n)
            downloadGenreRecord.save()
            # NOTE(review): every other pause uses webio.sleep(); confirm that
            # webio.wait() exists and is intended here.
            webio.wait(10.0)
            if tt.isFinished():
                last = True
                break

        if n >= maxN:
            print(f"Breaking after {maxN} downloads...")
            last = True
            # End the whole run, not only the current genre.
            stopAll = True
            break

ts.stop()
downloadGenreRecord.save()

In [None]:
# Manual save of the download record (the loop cell also saves when it ends).
downloadGenreRecord.save()

In [None]:
# Display the full contents of the download record as the cell output.
# NOTE(review): this dumps everything stored; consider showing a summary.
downloadGenreRecord.getData()

In [None]:
# Scratch cell: build the sorted listing URL for the first terminal genre and
# compare two hand-copied page URLs.
term = getTerminalGenres()
# NOTE(review): raises IndexError when no terminal genres are stored.
ref = term[0][0]
baseURL = dbio.params.baseURL
url = f"{baseURL}{ref}?ssf%5BsortBy%5D=main_catalog_date_desc"
print(url)
#       https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums/page/1?ssf%5BsortBy%5D=main_catalog_date_desc
#print("https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums?ssf%5BsortBy%5D=main_catalog_date_desc")
#https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums?ssf%5BsortBy%5D=main_catalog_date_desc
#https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums/?ssf%5BsortBy%5D=main_catalog_date_desc
url1="https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums/page/3?ssf%5BsortBy%5D=main_catalog_date_desc"
url2='https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums/page/3?ssf%5BsortBy%5D=main_catalog_date_desc'
# The comparison is the last expression, so its result is the cell output.
url1 == url2
#url="https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums/page/2?ssf%5BsortBy%5D=main_catalog_date_desc"
#genreData = webio.get(url)

In [None]:
# Ask the web helper for the page-3 URL of genre `ref` and download it.
url = webio.getGenrePageURL(genreRef=ref, genreURL=None, page=3)
genreData = webio.get(url)

In [None]:
import difflib

# Character-by-character diff of the two hand-copied URLs, one entry per line.
a, b = url1, url2
delta = difflib.ndiff(a, b)
for position, entry in enumerate(delta):
    print(position, '\t', entry)

In [None]:
# Display the raw payload of the downloaded genre page.
genreData.data

In [None]:
from utils import getHTML

# Example listing URLs (first page, then a numbered page). They were pasted as
# bare text, which is a syntax error, so they are kept as comments:
#   https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums?ssf%5BsortBy%5D=main_catalog_date_desc
#   https://www.qobuz.com/us-en/genre/pop-inde/download-streaming-albums/page/2?ssf%5BsortBy%5D=main_catalog_date_desc

# Parse the response fetched into `genreData` (`gData` is not defined anywhere
# in the visible cells).
bsdata = getHTML(genreData.data)

In [None]:
# Look up the rel="next" anchor inside the product header and display it.
# NOTE(review): raises AttributeError when the product__header div is missing.
pageDiv = bsdata.find("div", {"class": "product__header"})
nextRef = pageDiv.find("a", {"rel": "next"})
nextRef

In [None]:
from dbraw import isTag, getTagText
def getPageData(bsdata):
    """Extract one record per product from a parsed genre listing page.

    Each <li> under <ul class="product__wrapper"> becomes
        {"Artist": [name, href], "Album": [title, href],
         "Label": [name, href], "Genre": text, "Release": text}
    where a missing link yields None for the href.

    Returns an empty list when the page has no product wrapper.

    NOTE(review): this notebook also defines getPageData(bsdata, artistData)
    earlier; whichever definition ran last is the one in effect.
    NOTE(review): getTagText() is called on possibly-missing tags and the
    result is stripped — presumably it returns a string for None; confirm.
    """
    def textAndRef(parent, tagName, className):
        # (stripped anchor text, href) for the first <a> inside the named child.
        holder = parent.find(tagName, {"class": className})
        anchor = holder.find("a") if isTag(holder) else None
        href = anchor.get('href') if isTag(anchor) else None
        return getTagText(anchor).strip(), href

    def childText(parent, className):
        # Stripped text of the <p class=className> inside `parent`, if present.
        tag = parent.find("p", {"class": className}) if isTag(parent) else None
        return getTagText(tag).strip()

    wrapper = bsdata.find("ul", {"class": "product__wrapper"})
    if not isTag(wrapper):
        return []

    retval = []
    for li in wrapper.findAll("li"):
        dataDiv = li.find("div", {"class": "product__data"})
        genre = childText(dataDiv, "product__data--genre")
        release = childText(dataDiv, "product__data--release")

        containerTitle, containerRef = textAndRef(li, "div", "product__container")
        artistName, artistRef = textAndRef(li, "p", "product__artist")
        labelName, labelRef = textAndRef(li, "p", "product__infos")

        retval.append({"Artist": [artistName, artistRef], "Album": [containerTitle, containerRef],
                       "Label": [labelName, labelRef], "Genre": genre, "Release": release})

    return retval

In [None]:
# Parse the current page with the single-argument getPageData defined above.
retval = getPageData(bsdata)

In [None]:
# Display the parsed records.
retval

In [None]:
# Re-parse the most recent `data` response and save its raw payload to a file.
# NOTE(review): `data` is whatever response was fetched last; the file name
# suggests the genre index page — confirm before re-running out of order.
from utils import getHTML
bsdata = getHTML(data.data)
from utils import FileIO
io = FileIO()
io.save(idata=data.data, ifile="qobuz.genres.p")

In [None]:
# Download a sample search results page; this rebinds `data`.
url = "https://www.qobuz.com/us-en/search?q=Boris"
data = webio.get(url)

In [None]:
# Rebuild `genres` as [class attribute, link text, href] for every item of the
# hierarchical list in the current `bsdata`.
# NOTE(review): `bsdata` is not re-parsed here, so this uses whatever page was
# parsed last; it raises AttributeError if that page has no such list.
ul = bsdata.find("ul", {"class": "hierarchical-list"})
lis = ul.findAll("li")
genres = []
# `levels` is initialised but not read or updated in this cell.
levels = {"level1": None, "level2": None, "level3": None}
for li in lis:
    level = li.get('class')
    atag = li.find('a')
    ref = atag.get('href')
    name = atag.text
    genres.append([level, name, ref])