# WIBARAB FeatureDB Validation

* Install & set up dependencies
* extract schematron from RNG and transform to XSLT
* run schematron-xslt on files

In [1]:
import io
import os
import requests
import pathlib
import re
import sys
import json
import pandas as pd

from pathlib import Path
from urllib.parse import urlsplit
from saxonpy import PySaxonProcessor, PyXdmValue
from zipfile import ZipFile
from lxml import isoschematron, etree

In [2]:
tmpDir = "tmp"
libDir = "lib"
os.makedirs(tmpDir, exist_ok=True)


# the root of the git repository
dataHome = "../.."

# rng schema
rngSchema = dataHome + "/803_RNG_Schematron/featuredb.rng"
# the path to the annotated TEI transcription files
manannot = dataHome + "/010_manannot"
# the path to the feature documents 
features = manannot + "/features"

proc = PySaxonProcessor(license=False)
# SaxonC 1.2.1 Python has many known bugs but isn't maintained anymore
# Many of the documented API specs are not working
print(proc.version)
proc.set_cwd(os.path.dirname(os.path.abspath('')))
print(proc.cwd)

Saxon/C 1.2.1 running with Saxon-HE 9.9.1.5C from Saxonica
/home/dschopper/data/WIBARAB/featuredb/080_scripts_generic


In [3]:
def downloadAndStore(url, force=False):   
    #  filename of the file to be downloaded
    fn = os.path.basename(url)
    # fn w/o extension
    basename = os.path.splitext(fn)[0]
    # extension 
    ext = os.path.splitext(fn)[1]
    dlFilePath = tmpDir + "/" + fn
    if not os.path.exists(dlFilePath) and not force == True:
        payload = requests.get(url).content
        open(dlFilePath, 'wb').write(payload)
    return dlFilePath

In [4]:
def downloadAndUnzip(url):    
    #  filename of the file to be downloaded
    fn = os.path.basename(url)
    # fn w/o extension
    basename = os.path.splitext(fn)[0]
    # extension 
    ext = os.path.splitext(fn)[1]
    
    if ext != ".zip":
        return "not a zip archive"
    else:
        zipFilePath = downloadAndStore(url)
        # the path where the content should be extracted to
        targetPath = libDir + "/" + basename
        
        
        payload = requests.get(url).content
        open(zipFilePath, 'wb').write(payload)
        ZipFile(zipFilePath).extractall(path=targetPath)
    
    return targetPath

In [5]:
def setupSchXSLT():
    # install schematron
    schDLURL = "https://github.com/schxslt/schxslt/releases/download/v1.9.5/schxslt-1.9.5-xslt-only.zip"
    schHome = downloadAndUnzip(schDLURL)
    schCompiler = schHome + "/schxslt-1.9.5/2.0/compile-for-svrl.xsl"
    if os.path.exists(schCompiler):
        return schCompiler
    else: 
        print("error: something went wrong, cannot locate file '" + schCompiler + "'")

In [6]:
def transform(s, xsl, o, parameters=[]):
    proc.set_configuration_property("xi", "on")
    saxon = proc.new_xslt_processor()
    saxon.set_source(file_name=os.path.abspath(s))
    for i in parameters:
        saxon.set_parameter(name=i, value=proc.make_string_value(parameters[i]))
    saxon.compile_stylesheet(stylesheet_file=os.path.abspath(xsl))
    saxon.set_output_file(os.path.abspath(o))
    saxon.transform_to_string()
    if saxon.exception_occurred():
        #print(getmembers(saxon))
        saxon.get_error_message(0)
        for i in range(saxon.exception_count()-1):
            print(saxon.get_error_message(0))
        print(os.path.abspath(s)+" - "+os.path.abspath(xsl)+" -> "+os.path.abspath(o)+" failed")
    return o

In [14]:
# extract a schematron document embedded in an rng schema
def extractSchematron(rng):
    print(FORCE_SCHEMATRON_EXTRACTION)
    rng2sch = setupRNG2Sch()
    sch = tmpDir + "/" + os.path.basename(rng) + ".sch"
    if not os.path.exists(sch):
        transform(rng, rng2sch, sch)
    return sch

In [8]:
# compile a schematron document to an XSLT stylesheet
def compileSchematron(sch):
    outputPath = tmpDir + "/" + os.path.basename(sch) + ".xsl"
    schCompiler = setupSchXSLT()
    
    transform(sch, schCompiler, outputPath)
    if os.path.exists(outputPath):
        return outputPath
    else: 
        print("error: something went wrong, cannot locate file '" + outputPath + "'")

In [9]:
# prepare rng2sch styelsheet
# returns path to xsl
def setupRNG2Sch():
    RNG2SchtrDL = "https://raw.githubusercontent.com/Schematron/schematron/master/trunk/converters/code/ToSchematron/ExtractSchFromRNG.xsl"
    rng2sch = downloadAndStore(RNG2SchtrDL)
    # tweak XSLT 
    with open(rng2sch) as inputfile:
        lines = inputfile.read()
    lines = lines.replace( 'http://www.ascc.net/xml/schematron','http://purl.oclc.org/dsdl/schematron/')
    lines = lines.replace( '<sch:schema','<sch:schema queryBinding="xslt2"')
    
    with open(rng2sch, 'w') as file:
        file.writelines(lines)
    newPath = libDir+"/"+os.path.basename(rng2sch)
    os.replace(rng2sch, newPath)
    if os.path.exists(newPath):
        return newPath
    else:
        print("error: something went wrong, cannot locate file '" + newPath + "'")

In [10]:
# validate document doc against schematron schema sch
def schValidate(sch, path):
    errs = []
    out = tmpDir + "/validationReports/ " + os.path.basename(path)
    xsl = compileSchematron(sch)
    transform(path, xsl, out)
    report = etree.parse(out)
    successfulReport = report.findall("{http://purl.oclc.org/dsdl/svrl}successful-report")
    failedAssert = report.findall("{http://purl.oclc.org/dsdl/svrl}failed-assert")

    for s in successfulReport + failedAssert:
        location = s.attrib['location'].replace('Q{http://www.tei-c.org/ns/1.0}','').replace('Q{https://wibarab.acdh.oeaw.ac.at/langDesc}','')
        msg = s.find("{http://purl.oclc.org/dsdl/svrl}text").text
        errObj = {
            "message":  msg,
            "line" : s.sourceline,
            "source": path,
            "location": location,
            "stage": "schematron",
            "exceptionType": str(s.tag).replace("{http://purl.oclc.org/dsdl/svrl}","")
        }
        errs.append(errObj)
    return errs
    

In [11]:
def validate(path, rngSchema):
    validationErrors = []
    sch = extractSchematron(rngSchema)
    try:
        doc = etree.parse(path)
        revisionDesc = doc.xpath("/tei:TEI/tei:teiHeader/tei:revisionDesc", namespaces={"tei":"http://www.tei-c.org/ns/1.0"})[0]
        status = revisionDesc.attrib['status']
        if status.lower() == "draft":
            print("Ignoring draft "+path)
            return 
        # relaxng validation
        relaxng_doc = etree.parse(rngSchema)
        relaxng = etree.RelaxNG(relaxng_doc)
        relaxng.assertValid(doc)
        
        # schematron validation
        schErrs = schValidate(sch, path)
        if lem(schErrs) >= 1:
            validationErrors = validationErrors + schErrs
    
    except etree.DocumentInvalid as e:
        for error in e.error_log:
            location = "n/a" if error.path is None else error.path
            valErrObj = {
                "message": error.message, 
                "line": error.line, 
                "source": path, 
                "location": location,
                "stage" : "relaxng", 
                "exceptionType": type(e).__name__
            }
            validationErrors.append(valErrObj)
        schErrs = schValidate(sch, path)
        if len(schErrs) >= 1:
            validationErrors = validationErrors + schErrs
        
        
    except etree.XMLSyntaxError as e:
        valErrObj = {
            "message": str(e), 
            "line": e.lineno,
            "source": path, 
            "location": "n/a",
            "stage" : "parsing", 
            "exceptionType": type(e).__name__
    }
        validationErrors.append(valErrObj)
    return validationErrors

In [12]:
validationErrors = []

In [13]:
for i in os.scandir(features):
    if i.name.endswith('.xml') and i.is_file():
        filename=os.path.basename(i)
        filepath=features + "/" + filename
        print("validating " + filepath)
        errs = validate(filepath, rngSchema)
        if type(errs) is list:
            validationErrors = validationErrors + errs
            print(f"{len(errs)} found / {len(validationErrors)} in total")

validating ../../010_manannot/features/features_pron_sg_p2_c.xml


NameError: name 'FORCE_SCHEMATRON_EXTRACTION' is not defined

In [None]:
df = pd.DataFrame(data=validationErrors).T
df = df.transpose()
df

In [None]:
print(f"cound {len(validationErrors)} validation errors")

In [None]:
def make_clickable(source, line):
    link = source.replace('../../','https://github.com/wibarab/featuredb/blob/main/')
    return f'<a href="{link}#L{line}">{source}</a>'

df['link'] = df.apply(lambda x: make_clickable(x['source'], x['line']), axis=1)

In [None]:
errorReport = "tmp/validationReport.html"
with open(errorReport, 'w') as f:
    f.write(df.to_html(render_links=True, escape=False))

In [None]:
type(None) is list