# WIBARAB FeatureDB Validation

* Install & set up dependencies
* extract schematron from RNG and transform to XSLT
* run schematron-xslt on files

In [1]:
import io
import os
import requests
import pathlib
import re
import sys
import json
import pandas as pd
import linecache as lc

from pathlib import Path
from urllib.parse import urlsplit
from saxonpy import PySaxonProcessor, PyXdmValue
from zipfile import ZipFile
from lxml import isoschematron, etree

In [2]:
tmpDir = "tmp"
libDir = "lib"
os.makedirs(tmpDir, exist_ok=True)
os.makedirs(libDir, exist_ok=True)

# the root of the git repository
dataHome = "../.."

# rng schema
rngSchema = dataHome + "/803_RNG_Schematron/featuredb.rng"
# the path to the annotated TEI transcription files
manannot = dataHome + "/010_manannot"
# the path to the feature documents 
features = manannot + "/features"

proc = PySaxonProcessor(license=False)
# SaxonC 1.2.1 Python has many known bugs but isn't maintained anymore
# Many of the documented API specs are not working
print(proc.version)
proc.set_cwd(os.path.dirname(os.path.abspath('')))
print(proc.cwd)

Saxon/C 1.2.1 running with Saxon-HE 9.9.1.5C from Saxonica
/home/dschopper/data/WIBARAB/featuredb/080_scripts_generic


In [3]:
def downloadAndStore(url, force=False):   
    #  filename of the file to be downloaded
    fn = os.path.basename(url)
    # fn w/o extension
    basename = os.path.splitext(fn)[0]
    # extension 
    ext = os.path.splitext(fn)[1]
    dlFilePath = tmpDir + "/" + fn
    if not os.path.exists(dlFilePath) and not force == True:
        payload = requests.get(url).content
        open(dlFilePath, 'wb').write(payload)
    return dlFilePath

In [4]:
def downloadAndUnzip(url):    
    #  filename of the file to be downloaded
    fn = os.path.basename(url)
    # fn w/o extension
    basename = os.path.splitext(fn)[0]
    # extension 
    ext = os.path.splitext(fn)[1]
    
    if ext != ".zip":
        return "not a zip archive"
    else:
        zipFilePath = downloadAndStore(url)
        # the path where the content should be extracted to
        targetPath = libDir + "/" + basename
        
        
        payload = requests.get(url).content
        open(zipFilePath, 'wb').write(payload)
        ZipFile(zipFilePath).extractall(path=targetPath)
    
    return targetPath

In [5]:
def setupSchXSLT():
    # install schematron
    schDLURL = "https://github.com/schxslt/schxslt/releases/download/v1.9.5/schxslt-1.9.5-xslt-only.zip"
    schHome = downloadAndUnzip(schDLURL)
    schCompiler = schHome + "/schxslt-1.9.5/2.0/pipeline-for-svrl.xsl"
    if os.path.exists(schCompiler):
        return schCompiler
    else: 
        print("error: something went wrong, cannot locate file '" + schCompiler + "'")

In [6]:
def transform(s, xsl, o, parameters=[]):
    proc.set_configuration_property("xi", "on")
    saxon = proc.new_xslt_processor()
    saxon.set_source(file_name=os.path.abspath(s))
    for i in parameters:
        saxon.set_parameter(name=i, value=proc.make_string_value(parameters[i]))
    saxon.compile_stylesheet(stylesheet_file=os.path.abspath(xsl))
    saxon.set_output_file(os.path.abspath(o))
    saxon.transform_to_string()
    if saxon.exception_occurred():
        saxon.get_error_message(0)
        for i in range(saxon.exception_count()-1):
            print(saxon.get_error_message(0))
        print(os.path.abspath(s)+" - "+os.path.abspath(xsl)+" -> "+os.path.abspath(o)+" failed")
    return o

In [7]:
# extract a schematron document embedded in an rng schema
def extractSchematron(rng):
    rng2sch = setupRNG2Sch()
    sch = tmpDir + "/" + os.path.basename(rng) + ".sch"
    if not os.path.exists(sch):
        transform(rng, rng2sch, sch)
    return sch

In [8]:
# compile a schematron document to an XSLT stylesheet
def compileSchematron(sch):
    outputPath = tmpDir + "/" + os.path.basename(sch) + ".xsl"
    schCompiler = setupSchXSLT()
    
    transform(sch, schCompiler, outputPath)
    if os.path.exists(outputPath):
        return outputPath
    else: 
        print("error: something went wrong, cannot locate file '" + outputPath + "'")

In [9]:
# prepare rng2sch styelsheet
# returns path to xsl
def setupRNG2Sch():
    RNG2SchtrDL = "https://raw.githubusercontent.com/Schematron/schematron/master/trunk/converters/code/ToSchematron/ExtractSchFromRNG.xsl"
    rng2sch = downloadAndStore(RNG2SchtrDL)
    # tweak XSLT 
    with open(rng2sch) as inputfile:
        lines = inputfile.read()
    lines = lines.replace( 'http://www.ascc.net/xml/schematron','http://purl.oclc.org/dsdl/schematron/')
    lines = lines.replace( '<sch:schema','<sch:schema queryBinding="xslt2"')
    
    with open(rng2sch, 'w') as file:
        file.writelines(lines)
    newPath = libDir+"/"+os.path.basename(rng2sch)
    os.replace(rng2sch, newPath)
    if os.path.exists(newPath):
        return newPath
    else:
        print("error: something went wrong, cannot locate file '" + newPath + "'")

In [10]:
# validate document doc against schematron schema sch
def schValidate(sch, path):
    errs = []
    out = tmpDir + "/validationReports/ " + os.path.basename(path)
    xsl = compileSchematron(sch)
    try:
        transform(path, xsl, out)
    except e:
        valErrObj = {
                "message": e.message, 
                "line": error.line, 
                "source": path, 
                "location": location,
                "stage" : "relaxng", 
                "exceptionType": type(e).__name__
            }
        fvoInfo = fvoByLinenumber(path, error.line)
        if fvoInfo:
            valErrObj["fvoID"] = fvoInfo['id']
            valErrObj["fvoResp"] = fvoInfo['resp']
        return valErrObj
    report = etree.parse(out)
    successfulReport = report.findall("{http://purl.oclc.org/dsdl/svrl}successful-report")
    failedAssert = report.findall("{http://purl.oclc.org/dsdl/svrl}failed-assert")

    for s in successfulReport + failedAssert:
        location = s.attrib['location'].replace('Q{http://www.tei-c.org/ns/1.0}','').replace('Q{https://wibarab.acdh.oeaw.ac.at/langDesc}','')
        msg = s.find("{http://purl.oclc.org/dsdl/svrl}text").text
        errObj = {
            "type" : "error",
            "message":  msg,
            "line" : s.sourceline,
            "source": path,
            "location": location,
            "stage": "schematron",
            "exceptionType": str(s.tag).replace("{http://purl.oclc.org/dsdl/svrl}","")
        }
        fvoInfo = fvoByLinenumber(path, s.sourceline)
        if fvoInfo:
            errObj["fvoID"] = fvoInfo['id']
            errObj["fvoResp"] = fvoInfo['resp']
        errs.append(errObj)
    return errs
    

In [11]:
def fvoByLinenumber(document, linenumber):
    pResp = re.compile('resp=[\'"](.+?)[\'"]')
    pID = re.compile('xml:id=[\'"](.+?)[\'"]')
    with open(document, "r") as file:
        fvos = []
        index = 1
        for line in file.readlines():
            if "featureValueObservation" in line and "resp" in line:
                fvos.append({'line':index, 'fvo':line, 'resp': pResp.search(line).group(1), 'id': pID.search(line).group(1)})
            index += 1
        file.close()
    previousFvos = list(filter(lambda object: object['line'] < linenumber, fvos))
    if len(previousFvos) == 0:
        print("could not determine fvo for line "+str(linenumber)+" in document "+document)
    else:
        return previousFvos[-1]


In [12]:
def validate(path, rngSchema):
    validationErrors = []
    sch = extractSchematron(rngSchema)
    try:
        doc = etree.parse(path)
        revisionDesc = doc.xpath("/tei:TEI/tei:teiHeader/tei:revisionDesc", namespaces={"tei":"http://www.tei-c.org/ns/1.0"})[0]
        status = revisionDesc.attrib['status']
        if status.lower() == "draft":
            print("Ignoring draft "+path)
            return {
                "source" : path,
                "type" : "ignored",
                "status": status
            }
        # relaxng validation
        relaxng_doc = etree.parse(rngSchema)
        relaxng = etree.RelaxNG(relaxng_doc)
        relaxng.assertValid(doc)
        
        # schematron validation
        schErrs = schValidate(sch, path)
        if len(schErrs) >= 1:
            validationErrors = validationErrors + schErrs
    
    except etree.DocumentInvalid as e:
        for error in e.error_log:
            location = "n/a" if error.path is None else error.path
            valErrObj = {
                "type" : "error",
                "message": error.message, 
                "line": error.line, 
                "source": path, 
                "location": location,
                "stage" : "relaxng", 
                "exceptionType": type(e).__name__
            }
            fvoInfo = fvoByLinenumber(path, error.line)
            if fvoInfo:
                valErrObj["fvoID"] = fvoInfo['id']
                valErrObj["fvoResp"] = fvoInfo['resp']
            validationErrors.append(valErrObj)
        schErrs = schValidate(sch, path)
        if len(schErrs) >= 1:
            validationErrors = validationErrors + schErrs
        
        
    except etree.XMLSyntaxError as e:
        valErrObj = {
            "type" : "error",
            "message": str(e), 
            "line": e.lineno,
            "source": path, 
            "location": "n/a",
            "stage" : "parsing", 
            "exceptionType": type(e).__name__
        }
        fvoInfo = fvoByLinenumber(path, e.lineno)
        if fvoInfo:
            valErrObj["fvoID"] = fvoInfo['id']
            valErrObj["fvoResp"] = fvoInfo['resp']
        validationErrors.append(valErrObj)
    return validationErrors

In [13]:
validationErrors = []
ignoredFiles = []
type(ignoredFiles)

list

In [14]:
for i in os.scandir(features):
    if i.name.endswith('.xml') and i.is_file():
        filename=os.path.basename(i)
        filepath=features + "/" + filename
        print("validating " + filepath)
        results = validate(filepath, rngSchema)
        if type(results) is list:
            res_errs = filter(lambda x: x['type'] == "error", results)
            res_ignored = filter(lambda x: x['type'] == "ignored", results)
            validationErrors = validationErrors + list(res_errs)
            print(f"{len(list(res_errs))} found / {len(validationErrors)} in total")
        elif type(results) is dict:
            if results['type'] == "ignored":
                ignoredFiles.append(results)
        else:
            print("unknown result type")
            print(results)
            

validating ../../010_manannot/features/features_pron_sg_p2_c.xml
Ignoring draft ../../010_manannot/features/features_pron_sg_p2_c.xml
validating ../../010_manannot/features/feature_bound_pronoun_2ps_sg_f_after_vowel.xml
Ignoring draft ../../010_manannot/features/feature_bound_pronoun_2ps_sg_f_after_vowel.xml
validating ../../010_manannot/features/feature_raising_a_stressed_open_.xml
0 found / 1 in total
validating ../../010_manannot/features/feature_bound_pronoun_2ps_pl_m.xml
could not determine fvo for line 84 in document ../../010_manannot/features/feature_bound_pronoun_2ps_pl_m.xml
0 found / 2 in total
validating ../../010_manannot/features/feature_bound_pronoun_2ps_sg_c_after_vowel.xml
Ignoring draft ../../010_manannot/features/feature_bound_pronoun_2ps_sg_c_after_vowel.xml
validating ../../010_manannot/features/features_pron_sg_p1_c.xml
Ignoring draft ../../010_manannot/features/features_pron_sg_p1_c.xml
validating ../../010_manannot/features/feature_bound_pronoun_3ps_sg_m_after_v

Error evaluating (fn:resolve-uri(...)) in xsl:variable/@select on line 1345 column 208 of featuredb.rng.sch.xsl:
  FORG0002: Base URI {{docPath}} is invalid: Illegal character in path at index 0: {docPath}
  In template rule with match="element(Q{https://wibarab.acdh.oeaw.ac.at/langDesc}featureValueObservation)/attribute(Q{}resp)" on line 1305 of featuredb.rng.sch.xsl
     invoked by xsl:apply-templates at file:///home/dschopper/data/WIBARAB/featuredb/080_scripts_generic/validation/tmp/featuredb.rng.sch.xsl#83
  In template rule with match="*" on line 82 of featuredb.rng.sch.xsl
     invoked by xsl:apply-templates at file:///home/dschopper/data/WIBARAB/featuredb/080_scripts_generic/validation/tmp/featuredb.rng.sch.xsl#84
  In template rule with match="*" on line 82 of featuredb.rng.sch.xsl
     invoked by xsl:next-match at file:///home/dschopper/data/WIBARAB/featuredb/080_scripts_generic/validation/tmp/featuredb.rng.sch.xsl#1789
  In template rule with match="element(Q{http://www.tei-c

Ignoring draft ../../010_manannot/features/feature_apophonic_passive.xml
validating ../../010_manannot/features/features_pron_pl_p3_c.xml
Ignoring draft ../../010_manannot/features/features_pron_pl_p3_c.xml
validating ../../010_manannot/features/features_pron_pl_p2_m.xml
Ignoring draft ../../010_manannot/features/features_pron_pl_p2_m.xml
validating ../../010_manannot/features/feature_raising_a_pretonic_closed.xml
could not determine fvo for line 59 in document ../../010_manannot/features/feature_raising_a_pretonic_closed.xml
0 found / 20 in total
validating ../../010_manannot/features/feature_IMP_2sg_m_3w.xml
Ignoring draft ../../010_manannot/features/feature_IMP_2sg_m_3w.xml
validating ../../010_manannot/features/features_pron_pl_p2_f.xml
Ignoring draft ../../010_manannot/features/features_pron_pl_p2_f.xml
validating ../../010_manannot/features/features_pron_pl_p2c.xml
Ignoring draft ../../010_manannot/features/features_pron_pl_p2c.xml
validating ../../010_manannot/features/feature_b

In [15]:
df_err = pd.DataFrame(data=validationErrors).T
df_err = df_err.transpose()
df_err 

Unnamed: 0,type,message,line,source,location,stage,exceptionType,fvoID,fvoResp
0,error,"ID ct_0017 already defined, line 302, column 5...",302,../../010_manannot/features/feature_raising_a_...,,parsing,XMLSyntaxError,fv_0032,dmp:AID
1,error,xml:id : attribute value bp_p2m_ku(w) is not a...,84,../../010_manannot/features/feature_bound_pron...,,parsing,XMLSyntaxError,,
2,error,xml:id : attribute value bp_s3mv_long vowel is...,60,../../010_manannot/features/feature_bound_pron...,,parsing,XMLSyntaxError,,
3,error,"ID fv_0098 already defined, line 667, column 9...",667,../../010_manannot/features/feature_PFV_3sg_f.xml,,parsing,XMLSyntaxError,fv_0098,dmp:AT
4,error,"ID fv_0272 already defined, line 2161, column ...",2161,../../010_manannot/features/feature_bound_pron...,,parsing,XMLSyntaxError,fv_0272,dmp:TD
5,error,xml:id : attribute value bp_s3mc_u(h) is not a...,84,../../010_manannot/features/feature_bound_pron...,,parsing,XMLSyntaxError,,
6,error,Invalid attribute schemaLocation for element TEI,8,../../010_manannot/features/features_q.xml,/*,relaxng,DocumentInvalid,,
7,error,"the target ""../../010_manannot/vicav_biblio_te...",311,../../010_manannot/features/features_q.xml,/TEI[1]/text[1]/body[1]/div[3]/featureValueObs...,schematron,failed-assert,fv_0071,dmp:AID
8,error,"the target ""../../010_manannot/vicav_biblio_te...",702,../../010_manannot/features/features_q.xml,/TEI[1]/text[1]/body[1]/div[3]/featureValueObs...,schematron,failed-assert,fv_0057,dmp:AID
9,error,"the target ""../../010_manannot/vicav_biblio_te...",711,../../010_manannot/features/features_q.xml,/TEI[1]/text[1]/body[1]/div[3]/featureValueObs...,schematron,failed-assert,fv_0062,dmp:AID


In [16]:
print(f"cound {len(validationErrors)} validation errors")

cound 37 validation errors


In [17]:
df_ignored = pd.DataFrame(data=ignoredFiles).T
df_ignored = df_ignored.transpose()
df_ignored

Unnamed: 0,source,type,status
0,../../010_manannot/features/features_pron_sg_p...,ignored,draft
1,../../010_manannot/features/feature_bound_pron...,ignored,draft
2,../../010_manannot/features/feature_bound_pron...,ignored,draft
3,../../010_manannot/features/features_pron_sg_p...,ignored,draft
4,../../010_manannot/features/features_pron_pl_p...,ignored,draft
5,../../010_manannot/features/feature_IPFV_3sg_m...,ignored,draft
6,../../010_manannot/features/feature_JAA_3sg_mf...,ignored,draft
7,../../010_manannot/features/feature_bound_pron...,ignored,draft
8,../../010_manannot/features/feature_IPFV_2sg_f...,ignored,draft
9,../../010_manannot/features/features_pron_pl_p...,ignored,draft


In [30]:
def make_clickable(source, line=None):
    link = source.replace('../../','https://github.com/wibarab/featuredb/blob/main/')
    if line:
        return f'<a href="{link}#L{line}">{source}</a>'
    else:
        return f'<a href="{link}">{source}</a>'

In [31]:
errorReport = "tmp/validationReport.html"
df_err['link'] = df_err.apply(lambda x: make_clickable(x['source'], x['line']), axis=1)
with open(errorReport, 'w') as f:
    f.write(df_err.to_html(render_links=True, escape=False))

In [33]:
ignoredReport = "tmp/ignoredFiles.html"
df_ignored['link'] = df_ignored.apply(lambda x: make_clickable(x['source']), axis=1)

with open(ignoredReport, 'w') as f:
    f.write(df_ignored.to_html(render_links=True, escape=False))