# Find XSD files to upload, then "load them up"
Read XML schemas for AMBench 2022 and upload those to a CDCS instance (short: CDCS), properly taking care of dependencies.
This version uses Guillume's code to do the latter. Uses Kevin's code to support uploading new versions of the schemas.
* all *.xsd files inside the 'XSD' folder
* read xml content
* infer included schemas, load those first 
* define template name ('title')
* check whether the template already exists in CDCS. IF SO:
  * Check whether the templates to be removed actually have existing XML documents loaded.<br/>
  * These will have to be migrated to the new version of the template after those have been loaded.<br/>
  * BUT we must preserve the &lt;pid&gt;s, hence the XML documents must first be downloaded, then deleted on CDCS, then uploaded again.<br/>
  * AND we must ensure they are still valid with respect to the new schemas!

In [None]:
# import lxml
import lxml.etree as ET
import pandas
import os
from pathlib import Path
import xmlschema
import getpass
from cdcs import CDCS
import requests
import json
# import xml.dom.minidom
import glob
import uuid
import sys
import importlib
import SciServer.Authentication as sauth

In [None]:
import myconfig

In [None]:
importlib.reload(myconfig)
from myconfig import *

In [None]:
sys.path.insert(0, CONFIG.pyUTILS_path)
import ambench.cdcs_utils
from ambench.mapping import new_mapper

In [None]:
importlib.reload(ambench.cdcs_utils)
from ambench.cdcs_utils import *

In [None]:
# Short name of the CDCS instance: the first dot-separated label of the third
# "/"-separated piece of the URL (the host, assuming a scheme://host/... URL).
AMBENCH = CONFIG.AMBENCH_URL.split("/")[2].split(".")[0]
EXCEL_FILE = CONFIG.SAMPLES_EXCEL_FILE

# Credentials are taken from the config when it provides them; otherwise the
# user is prompted. ``except Exception`` (instead of a bare ``except``) keeps
# the fallback but no longer swallows KeyboardInterrupt / SystemExit.
try:
    USER = CONFIG.USER
except Exception:
    USER = input('username: ')
try:
    PASS = CONFIG.PASS
except Exception:
    # getpass does not echo the password into the notebook output.
    PASS = getpass.getpass('enter password ')
AUTH = (USER, PASS)


In [None]:
# Current working directory of the notebook, kept both as a plain string
# and as a pathlib.Path.
ROOT = os.getcwd()
path = Path(ROOT)

# File name of the XSD file that defines the root element of the template.
TEMPLATE_SCHEMA = "AMDocs.xsd"

# -2: define functions

In [None]:
def clearTemps():
    """Remove all ``*.xml`` files from the VALID_XML and INVALID_XML folders.

    For each folder (VALID_XML first, then INVALID_XML) the number of removed
    files is printed. Only files directly inside the folder are matched.
    """

    def _purge(folder):
        # Delete the XML files of one folder and report how many there were.
        xml_files = glob.glob(os.path.join(folder, "*.xml"))
        for xml_file in xml_files:
            os.remove(xml_file)
        print("removed", len(xml_files), "files from", folder)

    _purge(VALID_XML)
    _purge(INVALID_XML)

# -1: get credentials

In [None]:
# Look up the SciServer user that belongs to the current session token and
# keep its user name; it is used further down to build the scratch folder path.
SCISERVER_USER=sauth.getKeystoneUserWithToken(sauth.getToken()).userName

# 0. define parameters
input
* url to ambench CDCS instance
* folder with XSD files to be uploaded/updated
* file name of the XSD file defining the root element (AMDocs.xsd)
* name of the CDCS template

objects
* pycdcs CDCS instance
* list of ids of all versions of the template
* id of the current version
* xmlschema instance for the schema

In [None]:
# Per-user scratch area for the XML documents downloaded from CDCS, with one
# sub-folder for documents that validate against the new schema and one for
# those that do not.
XML_WORKSPACE = (
    f"/home/idies/workspace/Temporary/{SCISERVER_USER}/scratch/AMBENCH/XML_TEMP"
)
VALID_XML = f"{XML_WORKSPACE}/VALID"
INVALID_XML = f"{XML_WORKSPACE}/INVALID"

# Create both folders (including missing parents); existing folders are fine.
os.makedirs(VALID_XML, exist_ok=True)
os.makedirs(INVALID_XML, exist_ok=True)
print(VALID_XML, INVALID_XML)

In [None]:
# Optional prefix put in front of the configured template title.
TITLE_PREFIX = ''
TEMPLATE = f'{TITLE_PREFIX}{CONFIG.TEMPLATE}'

# Use TEMPLATE_SCHEMA rather than repeating the "AMDocs.xsd" literal, so the
# schema used for local validation is always the same root file that is
# passed to loadSchema() for the upload.
xsd_filename = f'{CONFIG.XSD}/{TEMPLATE_SCHEMA}'

# Construct the schema without building it, then build explicitly.
SCHEMA = xmlschema.XMLSchema(xsd_filename, build=False)
SCHEMA.build()
# Notebook display: validity status of the built schema.
SCHEMA.validity

In [None]:
# Connect to the CDCS instance, make sure the template exists, and record the
# id of its current version (CURRENT) plus all of its versions
# (TEMPLATE_VERSIONS).
try:
    ambench2022 = AMBench2022(TEMPLATE, CONFIG.AMBENCH_URL, auth=AUTH)
    if ambench2022.template is None:
        print("Template",TEMPLATE,"does not yet exists, trying to create it now")
        # The XSD folder lives on CONFIG (as in every other loadSchema call in
        # this notebook); the bare name ``XSD`` used here before is not
        # defined anywhere in the notebook itself.
        ambench2022.loadSchema(CONFIG.XSD, TITLE_PREFIX, TEMPLATE_SCHEMA)
    tversionsms = ambench2022.get_template_managers(title=TEMPLATE)
    # NOTE(review): tversionsms looks like a pandas DataFrame, so the explicit
    # len() check is kept instead of relying on truthiness - confirm.
    if len(tversionsms) > 0:
        CURRENT = tversionsms['current'][0]
    else:
        CURRENT = None
    TEMPLATE_VERSIONS = ambench2022.get_templates(title=TEMPLATE, current=False)
except Exception as e:
    # Show the message in the cell output, then re-raise the same exception
    # with its original traceback.
    print(e)
    raise

In [None]:
# Notebook display: the 'current' value of the first template manager found
# for TEMPLATE, or None when no template manager was returned.
CURRENT

# 1. Check all loaded XML docs
For the current version of the template!

In [None]:
# Retrieve the records stored for the template held by the ambench2022 client.
# NOTE(review): the result looks like a pandas DataFrame (it is used with
# .head() and .itertuples()) - confirm against ambench.cdcs_utils.
AMDocs=ambench2022.get_records(template=ambench2022.template)
print(len(AMDocs))
# Notebook display: preview of the first three records.
AMDocs.head(3)

for all versions

# 2. check validity of retrieved XML docs wrt new schema

In [None]:
# Empty the VALID_XML / INVALID_XML folders so that XML files written by an
# earlier run are not mixed with the ones written by the validation below.
clearTemps()

In [None]:
# Validate every downloaded record against the new schema. Each document is
# written to the VALID_XML or INVALID_XML folder, and ids / errors are kept
# for the later steps.
valid_ids = []    # ids of the records that validate
valids = []       # the complete record tuples of those records
invalid_ids = []  # ids of the records that fail validation
invalids = {}     # record title -> first validation error

for t in AMDocs.itertuples():
    # Single validation pass: iter_errors() yields errors lazily, so its
    # first item both decides validity and is the error to report. This
    # replaces the former is_valid() + validate() double validation.
    first_error = next(SCHEMA.iter_errors(t.xml_content), None)
    is_valid = first_error is None

    # File name is the record title, with ".xml" appended when missing.
    # NOTE(review): titles are used as file names unchanged; equal titles
    # overwrite each other - confirm titles are unique and path-safe.
    fname = t.title
    if not fname.endswith(".xml"):
        fname = fname + ".xml"

    if is_valid:
        valid_ids.append(t.id)
        valids.append(t)
        target_dir = VALID_XML
    else:
        invalid_ids.append(t.id)
        invalids[t.title] = first_error
        target_dir = INVALID_XML

    # Explicit UTF-8 so the written bytes do not depend on the locale.
    with open(f"{target_dir}/{fname}", "w", encoding="utf-8") as f:
        f.write(t.xml_content)

print(len(valid_ids),"VALID")
print(len(invalid_ids),"INVALID")

# 3. Deal with invalid XML docs
For now keep them in CDCS, they will be linked to the old version of the schema, hence less visible.
Eventually we can keep them and rerun the XML creation from the "raw" metadata (Excel file), just making sure the pid is set to the correct value.

In [None]:
# Print a WARNING marker when at least one record failed validation; the
# count itself is reported in every case.
if invalid_ids:
    print("WARNING")
print(len(invalid_ids), "INVALID FILES WERE FOUND")

# 5. Upload new schema

In [None]:
# Load the schemas into CDCS (presumably creating a new version of the
# template - confirm against AMBench2022.loadSchema).
ambench2022.loadSchema(CONFIG.XSD,TITLE_PREFIX,TEMPLATE_SCHEMA)
# Arguments: folder with the XSD files, title prefix, file name of the root XSD.

## determine new CURRENT

In [None]:
# Remember the version that was current before the upload, then reconnect and
# determine which version is current now.
OLD_CURRENT = CURRENT
try:
    # Use TEMPLATE (TITLE_PREFIX + configured title) everywhere, as the first
    # connect cell and the message below do; CONFIG.TEMPLATE alone would look
    # up a different title as soon as TITLE_PREFIX is not empty.
    ambench2022 = AMBench2022(TEMPLATE, CONFIG.AMBENCH_URL, auth=AUTH)
    if ambench2022.template is None:
        print("Template",TEMPLATE,"does not yet exists, trying to create it now")
        ambench2022.loadSchema(CONFIG.XSD, TITLE_PREFIX, TEMPLATE_SCHEMA)
    tversionsms = ambench2022.get_template_managers(title=TEMPLATE)
    # NOTE(review): tversionsms looks like a pandas DataFrame, so the explicit
    # len() check is kept instead of relying on truthiness - confirm.
    if len(tversionsms) > 0:
        CURRENT = tversionsms['current'][0]
    else:
        CURRENT = None
    TEMPLATE_VERSIONS = ambench2022.get_templates(title=TEMPLATE, current=False)
    print("new current:",CURRENT,"old current:",OLD_CURRENT)
except Exception as e:
    # Show the message in the cell output, then re-raise the same exception
    # with its original traceback.
    print(e)
    raise

# 6. generate pyxb classes from new schema
requires
<pre>
%pip install pyxb
</pre>
Do this in terminal

# 7. Deal with valid XML docs: migrate them to new template

In [None]:
# The current template version determined above is the migration target.
template_id = CURRENT
if template_id is None:
    print("ERROR, no CURRENT template_id detected")
else:
    # Ask CDCS to move all records that validated to the new template version.
    r = ambench2022.migrate(template_id, valid_ids)
    # Status codes from 200 up to (not including) 400 count as success.
    if 200 <= r.status_code < 400:
        print("Migration succeeded")
    else:
        print("PROBLEM:",r.content)