# WIBARAB FeatureDB Statistics

In [1]:
import io
import os
import requests
import pathlib
import re
import sys
import json
import pandas as pd
import linecache as lc

from pathlib import Path
from urllib.parse import urlsplit
import saxonche
from zipfile import ZipFile
from lxml import isoschematron, etree

In [2]:
# Scratch directory: downloads and the generated HTML report end up here.
tmpDir = "tmp"
os.makedirs(tmpDir, exist_ok=True)
# Directory into which downloaded zip archives are extracted.
libDir = "lib"
os.makedirs(libDir, exist_ok=True)

# Namespace prefixes used by the XPath expressions in this notebook.
nss = dict(
    tei="http://www.tei-c.org/ns/1.0",
    wib="https://wibarab.acdh.oeaw.ac.at/langDesc",
)

# the root of the git repository (relative path)
dataHome = "../.."
# the path to the annotated TEI transcription files
manannot = f"{dataHome}/010_manannot"
# the path to the feature documents
features = f"{manannot}/features"


# Print the Saxon version, then set the processor's working directory to the
# parent of the current working directory and print it.
# Original author's caveat: SaxonC 1.2.1 Python has many known bugs but isn't
# maintained anymore; many of the documented API specs are not working.
# NOTE(review): the recorded output of this cell reports a 12.x release, so
# confirm whether the caveat above still applies.
with saxonche.PySaxonProcessor(license=False) as proc:
    print(proc.version)
    proc.set_cwd(os.path.dirname(os.path.abspath('')))
    print(proc.cwd)

SaxonC-HE 12.3 from Saxonica
/home/dschopper/data/WIBARAB/featuredb/080_scripts_generic


In [3]:
def downloadAndStore(url, force=False):
    """Download ``url`` into ``tmpDir`` and return the path of the local copy.

    The local file is named after the last path segment of ``url``. A copy
    that already exists on disk is reused unless ``force`` is true, in which
    case the file is downloaded again and overwritten.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    force : bool, default False
        Download even if a local copy already exists.

    Returns
    -------
    str
        ``tmpDir + "/" + <file name>``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; nothing is written then.
    """
    # filename of the file to be downloaded
    fn = os.path.basename(url)
    dlFilePath = tmpDir + "/" + fn
    if force or not os.path.exists(dlFilePath):
        response = requests.get(url)
        # Fail loudly rather than storing an HTTP error page under the target
        # name, where it would be picked up as a valid cached download.
        response.raise_for_status()
        with open(dlFilePath, 'wb') as fh:
            fh.write(response.content)
    return dlFilePath

In [4]:
def downloadAndUnzip(url):
    """Download the zip archive at ``url`` and extract it below ``libDir``.

    The archive is fetched through ``downloadAndStore`` and unpacked into
    ``libDir + "/" + <archive name without extension>``.

    Parameters
    ----------
    url : str
        Address of a ``.zip`` file (the extension is matched case-insensitively).

    Returns
    -------
    str
        The extraction directory, or the literal string ``"not a zip archive"``
        when the URL does not end in ``.zip``. Callers must check for that
        sentinel before using the result as a path.
    """
    # filename of the file to be downloaded, split into base name and extension
    fn = os.path.basename(url)
    basename, ext = os.path.splitext(fn)

    if ext.lower() != ".zip":
        return "not a zip archive"

    # downloadAndStore fetches the archive; a second request for the same URL
    # here would only overwrite the file it just wrote.
    zipFilePath = downloadAndStore(url)
    # the path where the content should be extracted to
    targetPath = libDir + "/" + basename
    with ZipFile(zipFilePath) as archive:
        archive.extractall(path=targetPath)

    return targetPath

In [5]:
def transform(s, xsl, o, parameters=None):
    """Apply the XSLT stylesheet ``xsl`` to the file ``s`` and write the result to ``o``.

    Parameters
    ----------
    s : str
        Path of the source document.
    xsl : str
        Path of the stylesheet.
    o : str
        Path of the output file.
    parameters : mapping of str to str, optional
        Stylesheet parameters; each value is passed to Saxon as a string
        value. ``None`` (the default) or an empty container means no parameters.

    Returns
    -------
    str or None
        ``o`` if the output file exists after the run, otherwise ``None``
        (a message is printed in that case).
        NOTE(review): an output file left over from an earlier run also counts
        as success here - confirm whether that is intended.
    """
    source_path = os.path.abspath(s)
    stylesheet_path = os.path.abspath(xsl)
    output_path = os.path.abspath(o)
    # processor keeps files open on Windows and in doing so prevents moving or copying them
    with saxonche.PySaxonProcessor(license=False) as proc:
        # presumably switches on XInclude processing - confirm against the Saxon configuration docs
        proc.set_configuration_property("xi", "on")
        saxon = proc.new_xslt30_processor()
        for name, value in (parameters or {}).items():
            saxon.set_parameter(name=name, value=proc.make_string_value(value))
        executable = saxon.compile_stylesheet(stylesheet_file=stylesheet_path)
        executable.apply_templates_returning_file(source_file=source_path, output_file=output_path)
        if executable.exception_occurred:
            # NOTE(review): the message is read from the processor, not from the
            # executable - confirm this is the right accessor for the SaxonC version in use
            print(saxon.get_error_message())
            print(source_path+" - "+stylesheet_path+" -> "+output_path+" failed")
        if os.path.exists(output_path):
            return o
        else:
            print("there was an error transforming "+s+" with stylesheet "+xsl)

In [6]:
def docStatus(doc):
    """Return the revision status and the date of the first change of a parsed TEI document.

    Parameters
    ----------
    doc
        A parsed document offering ``xpath(expr, namespaces=...)``, e.g. the
        result of ``etree.parse``.

    Returns
    -------
    dict
        ``{"status": ..., "lastUpdate": ...}`` where ``status`` is the
        ``status`` attribute of ``tei:revisionDesc`` and ``lastUpdate`` is the
        ``when`` attribute of the first ``tei:change`` inside it. A value is
        ``None`` when the element or attribute it comes from is missing.
    """
    revisionDescs = doc.xpath("/tei:TEI/tei:teiHeader/tei:revisionDesc", namespaces=nss)
    if not revisionDescs:
        return {"status": None, "lastUpdate": None}
    revisionDesc = revisionDescs[0]
    # The leading "." keeps the search inside revisionDesc; a bare "//" would
    # start at the document root and could match tei:change elements elsewhere.
    changes = revisionDesc.xpath(".//tei:change", namespaces=nss)
    return {
        "status": revisionDesc.get('status'),
        "lastUpdate": changes[0].get('when') if changes else None
    }

In [7]:
def getFVOInfo(fvo):
    """Collect the key facts about one feature value observation element.

    Parameters
    ----------
    fvo
        An element offering ``get``, ``xpath`` and ``sourceline``.

    Returns
    -------
    dict
        Keys ``id`` (the ``xml:id`` attribute), ``fvoResp`` (``resp``
        attribute), ``status`` (``status`` attribute), ``featureValue`` (the
        ``ref`` of the first ``tei:name`` child typed ``featureValue``) and
        ``sourceline``. Attributes that are absent yield ``None``.

    Raises
    ------
    IndexError
        If the element has no ``tei:name[@type = 'featureValue']`` child
        carrying a ``ref`` attribute.
    """
    info = {}
    info["id"] = fvo.get("{http://www.w3.org/XML/1998/namespace}id")
    info["fvoResp"] = fvo.get('resp')
    info["status"] = fvo.get('status')
    featureValueRefs = fvo.xpath("tei:name[@type = 'featureValue']/@ref", namespaces=nss)
    info["featureValue"] = featureValueRefs[0]
    info["sourceline"] = fvo.sourceline
    return info

In [8]:
def getInfoAboutFeatureDoc(path):
    """Parse the feature document at ``path`` and summarise it.

    Parameters
    ----------
    path : str
        Location of the XML file.

    Returns
    -------
    dict
        For a parseable document: ``path``, ``number_of_fvos``, ``fvos`` (one
        dict per ``wib:featureValueObservation``, sorted by status, each
        extended with ``path``) plus the keys returned by ``docStatus``.
        For a document with XML syntax errors: ``status`` set to ``"error"``,
        ``path``, ``msg`` (the parser message), ``number_of_fvos`` set to
        ``"n/a"``, ``lastUpdate`` set to ``None`` and an empty ``fvos`` list,
        so both kinds of record carry the keys the report relies on.
    """
    try:
        doc = etree.parse(path)
    except etree.XMLSyntaxError as e:
        return {
            "status" : "error",
            "path": path,
            "msg" : e.msg,
            "number_of_fvos": "n/a",
            "lastUpdate": None,
            "fvos": []
        }

    status = docStatus(doc)

    fvos_unsorted = [
        { **getFVOInfo(f), "path": path }
        for f in doc.xpath("//wib:featureValueObservation", namespaces=nss)
    ]
    # An observation without a status attribute has status None, which cannot
    # be compared with strings; treat it as the empty string for ordering.
    fvos = sorted(fvos_unsorted, key=lambda d: d['status'] or "")

    docInfo = {
        "path": path,
        "number_of_fvos" : len(fvos),
        "fvos": fvos
    }
    return { **docInfo, **status }

In [9]:
# One summary dict per feature document; filled by the cell that scans the feature directory.
featureDocs = list()

In [10]:
# Start from an empty list every time so that re-running this cell does not
# append every document a second time.
featureDocs = []
# The context manager closes the directory iterator; sorting by name makes the
# processing order independent of the file system.
with os.scandir(features) as entries:
    for i in sorted(entries, key=lambda entry: entry.name):
        if i.name.endswith('.xml') and i.is_file():
            filepath = features + "/" + i.name
            featureDocs.append(getInfoAboutFeatureDoc(filepath))

In [15]:
def make_clickable(source, line=None, aText=None):
    """Build an HTML anchor pointing at ``source`` in the GitHub repository.

    Every ``../../`` in ``source`` is replaced by the repository's blob URL.

    Parameters
    ----------
    source : str
        Path of the file, used as link text when ``aText`` is ``None``.
    line : optional
        Line to jump to; appended as ``#L<line>`` when truthy.
    aText : str, optional
        Link text.

    Returns
    -------
    str
        The ``<a href="...">...</a>`` markup.
    """
    href = source.replace('../../','https://github.com/wibarab/featuredb/blob/main/')
    label = source if aText is None else aText
    if line:
        href = f'{href}#L{line}'
    return f'<a href="{href}">{label}</a>'

In [16]:
def make_list(fvos):
    """Render feature value observations as an HTML ``<ul>`` list.

    Each item shows status, responsible person and feature value followed by a
    link (built with ``make_clickable``) to the observation's source line.

    Parameters
    ----------
    fvos : iterable of dict, None or float
        Dicts with the keys ``status``, ``fvoResp``, ``featureValue``,
        ``path``, ``sourceline`` and ``id``. ``None`` or a float (the NaN that
        pandas puts into cells without a value) yields an empty list.

    Returns
    -------
    str
        The list markup; status, responsible person or feature value that are
        ``None`` are shown as ``n/a``.
    """
    if fvos is None or isinstance(fvos, float):
        return "<ul></ul>"
    items = []
    for fvo in fvos:
        labels = [
            str(fvo[key]) if fvo[key] is not None else "n/a"
            for key in ('status', 'fvoResp', 'featureValue')
        ]
        # an id of None makes make_clickable fall back to the path as link text
        link = make_clickable(fvo['path'], line=fvo['sourceline'], aText=fvo['id'])
        items.append("<li>" + " - ".join(labels) + " - " + link + " </li>")
    return "<ul>" + "".join(items) + "</ul>"

In [17]:
# Build the report table: one row per feature document, oldest update first.
if len(featureDocs) > 0:
    df_fD = (
        pd.DataFrame(data=featureDocs)
        # reindex selects the report columns in order and, unlike plain
        # column selection, tolerates a column that no record provides
        .reindex(columns=['path', 'status', 'lastUpdate', 'number_of_fvos', 'fvos'])
        # rows without a lastUpdate are placed last
        .sort_values('lastUpdate')
    )
    # A bare `df_fD` at this point would not be displayed, because the notebook
    # only echoes an expression that is the last top-level statement of a cell.
else:
    print("no featureDocs")

In [18]:
    # Write the per-document statistics as an HTML table into the scratch directory.
    statsReport = tmpDir + "/statsReport.html"
    if len(featureDocs) > 0:
        # Turn each path into a link, but leave values that are already anchors
        # untouched so that re-running this cell does not nest links.
        df_fD['path'] = df_fD['path'].map(lambda p: p if p.startswith('<a ') else make_clickable(p))
        with open(statsReport, 'w', encoding="utf-8") as f:
            # escape=False keeps the generated anchor and list markup intact
            f.write(df_fD.to_html(render_links=True, formatters={'fvos': make_list}, escape=False))