In [None]:
%load_ext autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100`% !important; }</style>"))
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [None]:
from apiutils import WebIO
from ioutils import FileIO, HTMLIO
from fileutils import FileInfo, DirInfo
from master import MasterParams, MusicDBPermDir
from pandas import Series, DataFrame, concat
from sys import prefix
from listUtils import getFlatList
from musicdb import PanDBIO
# Instantiate the shared helper objects that later cells refer to by these names.
mp    = MasterParams(verbose=True)
io    = FileIO()          # rebound to a fresh FileIO() again in a later cell
wio   = WebIO()
hio   = HTMLIO()          # hio.get(...) is called on raw HTML text further down
mdbpd = MusicDBPermDir()  # used in the next cell to resolve the permanent DB path

In [None]:
from lib import classicalarchives
# Create the ClassicalArchives I/O objects and look up where permanent data is stored.
mio   = classicalarchives.MusicDBIO(verbose=True, mkDirs=True)
webio = classicalarchives.RawWebData()  # webio.wait(...) is called between download batches below
db    = mio.db                          # used below as a label and via db.lower() in file names
permDBDir = mdbpd.getDBPermPath(db)
print("Saving Perminant {0} DB Data To {1}".format(db, permDBDir.str))

In [None]:
from base import MusicDBDir, MusicDBData
# Handles on the bookkeeping data kept in the permanent DB directory. Each MusicDBData
# object is read with .get() and written with .save(data=...) in the cells below.
permDir = MusicDBDir(permDBDir)
localComposers     = MusicDBData(path=permDir, fname="{0}SearchedForLocalComposers".format(db.lower()))
localPerformers    = MusicDBData(path=permDir, fname="{0}SearchedForLocalPerformers".format(db.lower()))
# Summary-name lookup is disabled; an empty dict stands in so the summary cell still works.
knownArtists       = {} #mio.data.getSummaryNameData()
# Search tables; later cells read their index plus "Name" and "Ref" columns.
searchComposers    = mio.data.getSearchComposersData()
searchPerformers   = mio.data.getSearchPerformersData()
# Shared record of IDs whose download failed (used by both download loops).
errors             = MusicDBData(path=permDir, fname="{0}SearchedForErrors".format(db.lower()))

In [None]:
##########################################################################################
# Show Summary
# Prints the number of entries currently held by each data handle loaded above.
##########################################################################################
print("{0} Search Results".format(db))
print("   Local Composers:           {0}".format(len(localComposers.get())))
print("   Local Performers:          {0}".format(len(localPerformers.get())))
print("   Errors:                    {0}".format(len(errors.get())))
print("   Search Composers:          {0}".format(len(searchComposers)))
print("   Search Performers:         {0}".format(len(searchPerformers)))
# knownArtists is the empty placeholder dict defined above, so this prints 0.
print("   Known Summary IDs:         {0}".format(len(knownArtists)))

In [None]:
import osascript
def getScript(url, savename, dtime):
    """Build the AppleScript that loads a page in Safari and saves its source.

    The generated script activates Safari, points document 1 at ``url``, waits
    ``dtime`` seconds, reads the document source and writes it as UTF-8 to
    ``savename``.

    Args:
        url: Address to load. Embedded in an AppleScript string literal.
        savename: POSIX path of the output file. Embedded in an AppleScript
            string literal.
        dtime: Number of seconds to wait after setting the URL.

    Returns:
        The AppleScript source as a ``str``.
    """
    def _quote(value):
        # AppleScript string literals use backslash escapes. Escape backslashes
        # first, then double quotes, so a stray quote in a URL or path cannot
        # terminate the literal early and corrupt the script.
        return str(value).replace("\\", "\\\\").replace('"', '\\"')

    # `set eof ... to 0` truncates the file before writing; without it, writing
    # over an existing longer file leaves the old trailing content in place.
    dscript = '''
tell application "Safari"
activate
set URL of document 1 to "{0}"
delay {2}
set myString to source of document 1
end tell
set newFile to POSIX file "{1}"
set fileRef to open for access newFile with write permission
set eof of fileRef to 0
write myString to fileRef as «class utf8»
close access fileRef
'''.format(_quote(url), _quote(savename), dtime)

    return dscript

# Download ModVal Data

## Download Via OSA

In [None]:
# Fetch the a-z index pages for composers and performers by driving Safari (see getScript)
# and write each page's source to a local HTML file.
#url = f"https://www.classicalarchives.com/performers/{ch}.html"
from string import ascii_lowercase
aTypes = ["composers", "performers"]
for aType in aTypes:
    for ch in ascii_lowercase:
        url = f"https://www.classicalarchives.com/{aType}/{ch}.html"
        savename = f"/Users/tgadfort/Documents/code310/pandb/note/classicalarchives/{aType}_{ch}.html"
        # Skip index pages that already exist on disk.
        if FileInfo(savename).exists():
            continue
        print(f"{url: <60} ==> {savename}")
        dscript  = getScript(url, savename, dtime=15)
        # NOTE(review): code/out/err are never inspected, so a failed run goes unnoticed here.
        code,out,err = osascript.run(dscript)

## Parse OSA Data

In [None]:
# Parse the saved index pages into one table per artist type.
# saveData maps "composers"/"performers" to a DataFrame indexed by mdbid.get(ref)
# with "Name" and "Ref" columns.
from lib.classicalarchives import MusicDBID
mdbid = MusicDBID()
aTypes = ["composers", "performers"]
saveData = {}
for aType in aTypes:
    artistData = {}
    files  = DirInfo("/Users/tgadfort/Documents/code310/pandb/note/classicalarchives").glob(f"{aType}_*.html")
    for ifile in files:
        fData = {}
        # Read through a context manager so the file handle is closed deterministically.
        # NOTE(review): the pages are written as UTF-8 by getScript but decoded here as
        # latin-1 -- confirm this is intentional, since non-ASCII names may be garbled.
        with open(ifile, encoding="latin-1") as htmlFile:
            bsdata = hio.get(htmlFile.read())
        listingDiv = bsdata.find("div", {"class": "listing"})
        if listingDiv:
            # Every link inside the listing div becomes one entry keyed by its ID.
            refs = listingDiv.findAll("a")
            fData.update({mdbid.get(ref): {"Name": ref.text, "Ref": ref.get('href')} for ref in refs})
        print(len(fData),'\t',ifile)
        artistData.update(fData)
    # Transpose so each artist ID is a row, then keep only rows whose Ref is a string.
    artistData = DataFrame(artistData).T
    artistData = artistData[artistData["Ref"].apply(lambda ref: isinstance(ref,str))]
    saveData[aType] = artistData

In [None]:
# Persist the tables built by the parsing cell above (requires saveData from that cell).
mio.data.saveSearchComposersData(data=saveData['composers'])
mio.data.saveSearchPerformersData(data=saveData['performers'])

# Download Composer Data

In [None]:
# Rebind mio with verbose output off, local=True and directory creation disabled.
mio   = classicalarchives.MusicDBIO(verbose=False,local=True,mkDirs=False)

In [None]:
# Work out which composers still need downloading: every search entry whose index is
# not already a key of the local-composers record.
useSearchData = True
# NOTE(review): composerNamesToGet is only defined when useSearchData is True; the
# download loop below needs it.
if useSearchData is True:
    composerNames      = searchComposers #.sort_values(by="Num", ascending=False)
    localComposersDict = localComposers.get()
    # presumably a pandas DataFrame, in which case sample(frac=1) returns all rows in random order
    composerNamesToGet = composerNames[~composerNames.index.isin(localComposersDict.keys())].sample(frac=1)

    print("# {0} Search Results".format(db))
    print("#   Available Names:      {0}".format(len(composerNames)))
    print("#   Known Artist Names:   {0}".format(len(localComposersDict)))
    print("#   Artist Names To Get:  {0}".format(len(composerNamesToGet)))

In [None]:
# Disabled (`if False`): would record composers as local WITHOUT downloading anything,
# stopping once the record holds exactly 500 entries, and then save the record.
if False:
    localComposersDict  = localComposers.get()
    for i,(composerID,row) in enumerate(composerNamesToGet.iterrows()):
        composerName = row["Name"]
        composerRef  = row["Ref"]
        localComposersDict[composerID] = composerName
        if len(localComposersDict) == 500:
            break

    print("Saving {0} {1} Composers Data".format(len(localComposersDict), db))
    localComposers.save(data=localComposersDict)

In [None]:
# Download one page per composer in composerNamesToGet via Safari, recording successes in
# the local-composers record and failures in the shared errors record. Both records are
# saved every 10 downloads and once more when the loop ends.
from timeutils import Timestat, TermTime
import random

ts = Timestat("Getting {0} composerIDs".format(db))
tt = TermTime("tomorrow", "9:50am")  # tt.isFinished() is polled below to stop the loop
#tt = TermTime("today", "7:00pm")
maxN = 5000000  # upper bound on the number of download attempts in this run

n  = 0  # download attempts made so far in this run
localComposersDict  = localComposers.get()
searchedForErrors   = errors.get()
N = composerNamesToGet.shape[0]

for i,(composerID,row) in enumerate(composerNamesToGet.iterrows()):
    composerName = row["Name"]
    composerRef  = row["Ref"]
    # Skip IDs already recorded as downloaded.
    if localComposersDict.get(composerID) is not None:
        continue
    #if searchedForErrors.get(composerID) is not None:
    #    continue
        
    url = f"https://www.classicalarchives.com{composerRef}"
    savename = f"/Users/tgadfort/Desktop/ClassicalArchives/Composer/{composerID}.html"
    # NOTE(review): the pre-download existence check is commented out here (it is active in
    # the performer loop), so a file left by an earlier run makes the success check below
    # pass even if this download fails -- confirm this is intended.
    #if FileInfo(savename).exists():
    #    continue
    print(f"{i+1: >5}/{N: <10}{url: <60} ==> ", end="")
    # Wait 7-12 seconds (randomised) for the page to load before reading its source.
    dscript  = getScript(url, savename, dtime=7+random.randint(0,5))
    # code/out/err are not inspected; success is judged by the output file existing.
    code,out,err = osascript.run(dscript)
    if FileInfo(savename).exists():
        print(f"{composerID}")
        localComposersDict[composerID] = composerName
    else:
        searchedForErrors[composerID] = composerName
        print(f"Error in download.")
        
    n += 1
        
    # Checkpoint: save progress, pause, and check the stop conditions. The time limit is
    # therefore only evaluated on these checkpoints, not after every download.
    if n % 10 == 0 or n >= maxN:
        print("="*150)
        ts.update(n=n)
        print("Saving {0} {1} Composers Data".format(len(localComposersDict), db))
        localComposers.save(data=localComposersDict)
        if len(searchedForErrors) > 0:
            errors.save(data=searchedForErrors)
        print("="*150)
        webio.wait(5.0)
        if tt.isFinished() or n >= maxN:
            break
    
# Final save so downloads made since the last checkpoint are not lost.
ts.stop()
print("Saving {0} {1} Composers Data".format(len(localComposersDict), db))
localComposers.save(data=localComposersDict)
if len(searchedForErrors) > 0:
    print("Saving {0} {1} Errors".format(len(searchedForErrors), db))
    errors.save(data=searchedForErrors)

# Download Performer Data

In [None]:
# Rebind mio with verbose output off, local=True and directory creation disabled
# (same call as at the start of the composer section).
mio   = classicalarchives.MusicDBIO(verbose=False,local=True,mkDirs=False)

In [None]:
# Work out which performers still need downloading: every search entry whose index is
# not already a key of the local-performers record.
useSearchData = True

# NOTE(review): performerNamesToGet is only defined when useSearchData is True; the
# download loop below needs it.
if useSearchData is True:
    performerNames      = searchPerformers #.sort_values(by="Num", ascending=False)
    localPerformersDict = localPerformers.get()
    # presumably a pandas DataFrame, in which case sample(frac=1) returns all rows in random order
    performerNamesToGet = performerNames[~performerNames.index.isin(localPerformersDict.keys())].sample(frac=1)

    print("# {0} Search Results".format(db))
    print("#   Available Names:      {0}".format(len(performerNames)))
    print("#   Known Artist Names:   {0}".format(len(localPerformersDict)))
    print("#   Artist Names To Get:  {0}".format(len(performerNamesToGet)))
    
# Counts noted by hand (apparently from earlier runs of this cell):
#   Artist Names To Get:  18518
#   Artist Names To Get:  9042

In [None]:
# Download one page per performer in performerNamesToGet via Safari, recording successes
# in the local-performers record and failures in the shared errors record. Both records
# are saved every 8 downloads and once more when the loop ends.
from timeutils import Timestat, TermTime
import random

ts = Timestat("Getting {0} performerIDs".format(db))
#tt = TermTime("tomorrow", "9:50am")
tt = TermTime("today", "10:00pm")  # tt.isFinished() is polled below to stop the loop
maxN = 50000  # upper bound on the number of download attempts in this run

n  = 0  # download attempts made so far in this run
localPerformersDict = localPerformers.get()
searchedForErrors   = errors.get()
N = performerNamesToGet.shape[0]

for i,(performerID,row) in enumerate(performerNamesToGet.iterrows()):
    performerName = row["Name"]
    performerRef  = row["Ref"]
    # Skip IDs already recorded as downloaded.
    if localPerformersDict.get(performerID) is not None:
        continue
    #if searchedForErrors.get(performerID) is not None:
    #    continue
        
    url = f"https://www.classicalarchives.com{performerRef}"
    savename = f"/Users/tgadfort/Desktop/ClassicalArchives/Performer/{performerID}.html"
    # Skip pages already on disk. NOTE(review): such IDs are skipped without being added
    # to localPerformersDict, so they stay in the to-do list on later runs -- confirm.
    if FileInfo(savename).exists():
        continue
    print(f"{i+1: >5}/{N: <10}{url: <60} ==> ", end="")
    # Wait 7-12 seconds (randomised) for the page to load before reading its source.
    dscript  = getScript(url, savename, dtime=7+random.randint(0,5))
    # code/out/err are not inspected; success is judged by the output file existing.
    code,out,err = osascript.run(dscript)
    if FileInfo(savename).exists():
        print(f"{performerID}")
        localPerformersDict[performerID] = performerName
    else:
        searchedForErrors[performerID] = performerName
        print(f"Error in download.")
        
    n += 1
        
    # Checkpoint: save progress, pause, and check the stop conditions. The time limit is
    # therefore only evaluated on these checkpoints, not after every download.
    if n % 8 == 0 or n >= maxN:
        print("="*150)
        ts.update(n=n)
        print("Saving {0} {1} performers Data".format(len(localPerformersDict), db))
        localPerformers.save(data=localPerformersDict)
        if len(searchedForErrors) > 0:
            errors.save(data=searchedForErrors)
        print("="*150)
        webio.wait(4.0)
        if tt.isFinished() or n >= maxN:
            break
    
# Final save so downloads made since the last checkpoint are not lost.
ts.stop()
print("Saving {0} {1} performers Data".format(len(localPerformersDict), db))
localPerformers.save(data=localPerformersDict)
if len(searchedForErrors) > 0:
    print("Saving {0} {1} Errors".format(len(searchedForErrors), db))
    errors.save(data=searchedForErrors)

In [None]:
# Manual re-save of the performer record (e.g. after interrupting the loop above).
localPerformers.save(data=localPerformersDict)

In [None]:
from lib.classicalarchives import moveLocalFiles, removeLocalFiles
# Run the project's moveLocalFiles() helper; removeLocalFiles() is imported but its
# call is left commented out.
moveLocalFiles()
#removeLocalFiles()
#localPerformers.save(data=localPerformersDict)

In [None]:
# Parse composer data for modVal=1 (force=True), then merge that modVal's data.
# NOTE(review): only modVal 1 is processed here -- confirm whether other values are needed.
mio.prd.parseComposerData(modVal=1, force=True)
mio.prd.mergeModValData(modVal=1)

# Download & Parse

In [None]:
# List the HTML files currently sitting in the local download directory for aTypeDir.
# DirInfo and FileIO are also imported in an earlier cell; `io` is rebound here.
from fileutils import DirInfo
from ioutils import FileIO
aTypeDir = "Composer"
mioLocal  = DirInfo(f"/Users/tgadfort/Desktop/ClassicalArchives/{aTypeDir}")
io        = FileIO()
print("  ==> Finding Files in {0}: ".format(mioLocal.str), end="")
# The pattern matches both .htm and .html files.
files = list(mioLocal.glob("*.htm*"))
print("  ==> Found {0} Files".format(len(files)))

In [None]:
# One-off download of a single composer page (ID hard-coded below).
# Relies on `random`, `osascript` and getScript from earlier cells.
composerID="23701"
composerRef=f"/composer/{composerID}.html"
url = f"https://www.classicalarchives.com{composerRef}"
savename = f"/Users/tgadfort/Desktop/ClassicalArchives/Composer/{composerID}.html"
print(f"{url: <60} ==> ", end="")
# Shorter wait than the bulk loops: 3-8 seconds.
dscript  = getScript(url, savename, dtime=3+random.randint(0,5))
code,out,err = osascript.run(dscript)
# The path is printed whether or not the download succeeded.
print(f"{savename}")

In [None]:
# Load the summary name data (the same call is commented out where knownArtists is set).
names = mio.data.getSummaryNameData()

In [None]:
# Show up to 30 rows of `names` whose index matches the basename of a local file.
names[names.index.isin([FileInfo(ifile).basename for ifile in files])].head(30)

In [None]:
# Read one downloaded composer page into memory for inspection.
ifile = '/Users/tgadfort/Desktop/ClassicalArchives/Composer/23701.html'
# Use a context manager so the file handle is closed as soon as the read finishes.
# NOTE(review): getScript writes pages as UTF-8; decoding as ASCII raises
# UnicodeDecodeError on any non-ASCII character -- confirm the encoding is intended.
with open(ifile, encoding="ascii") as htmlFile:
    data  = htmlFile.read()

In [None]:
# Display the raw page text read in the previous cell.
data

In [None]:

        # Fragment (apparently pasted from a method body): for each local file, work out
        # the destination filename from the file's ID.
        # NOTE(review): `mioGlobal` is not defined in the visible notebook, and dstFile is
        # computed but never used here.
        files = list(mioLocal.glob("*.htm*"))
        ts = Timestat("Moving {0} Local Files To Global Directories".format(len(files)))
        for n,ifile in enumerate(files):
            # Report progress every 25 files.
            if (n+1) % 25 == 0:
                ts.update(n=n+1,N=len(files))
            dbID    = FileInfo(ifile).basename
            modVal  = mioGlobal.getModVal(dbID)
            # Look the getter up by name with getattr instead of eval-ing a built string:
            # same method is called, without executing dynamically assembled code.
            dstFile = FileInfo(getattr(mioGlobal.data, f"getRaw{aTypeDir}Filename")(modVal,dbID))

In [None]:
from utils import PoolIO
# Run the PoolIO steps for ClassicalArchives in order; the parse step is commented out.
pio = PoolIO("ClassicalArchives")
#pio.parse(force=True)
pio.merge()
pio.meta()
pio.sum()
pio.search()

In [None]:
# Multiple pages
#https://www.classicalarchives.com/artist/6138.html

In [None]:
# Total the per-media-type counts across the modVal data sets 0..99.
from collections import Counter
cntr = Counter()
for modVal in range(100):
    data = mio.data.getModValData(modVal)
    # .items() works for both dicts and pandas objects; .iteritems() does not exist on
    # Python 3 dicts and was removed from pandas in 2.0.
    for k,v in data.items():
        for name,vals in v.mediaCounts.counts.items():
            cntr[name] += vals

In [None]:
# Media types ordered from most to least frequent.
cntr.most_common()

In [None]:
from lib.classicalarchives import RawDBData
# Run the raw-data parser on a single stored performer file for inspection.
rdbData = RawDBData(debug=False)
retval = rdbData.getPerformerData('/Volumes/Piggy/Discog/artists-classicalarchives/4/performer/105604.p')

In [None]:
# Show the parsed performer record from the previous cell.
retval.show()

In [None]:
# Inspect the first entry under the "Performances" media key of the parsed record.
retval.media.media["Performances"][0].get()

In [None]:
# Load the same stored performer file via io.get and pass it through hio.get.
# Rebinds `bsdata`, which the index-page parsing cell also uses.
bsdata = hio.get(io.get('/Volumes/Piggy/Discog/artists-classicalarchives/4/performer/105604.p'))

In [None]:
# Display the parsed document loaded in the previous cell.
bsdata

In [None]:
# NOTE(review): in the visible notebook jsonData is only assigned in a later cell, and
# there it is a list, which cannot be indexed by 'albums' -- this cell appears to depend
# on out-of-order execution or on code not shown.
jsonData['albums']

In [None]:
# For each line of jsData keep the text after the last " = " (the whole stripped line if
# there is none). NOTE(review): jsData is not defined anywhere in the visible notebook.
jsonLines = [line.strip().split(" = ")[-1] for line in jsData.split("\n")]

In [None]:
# Display the extracted right-hand sides.
jsonLines

In [None]:

# Decode each extracted line as JSON after dropping its last three characters.
# `json` is imported here because no other visible cell imports it; the previous bare
# `except:` silently turned that NameError into an empty result.
import json

try:
    jsonData = [json.loads(jsonLine[:-3]) for jsonLine in jsonLines]
except ValueError as err:
    # json.JSONDecodeError is a ValueError: keep the best-effort fallback for malformed
    # input, but report it, and let unrelated errors (e.g. NameError) surface.
    print("Could not decode jsonLines: {0}".format(err))
    jsonData = []