# WIBARAB FeatureDB Statistics

In [1]:
import io
import os
import requests
import pathlib
import re
import sys
import json
import pandas as pd
import linecache as lc

from pathlib import Path
from urllib.parse import urlsplit
import saxonche
from zipfile import ZipFile
from lxml import isoschematron, etree

In [2]:
tmpDir = "tmp"
libDir = "lib"
outputPath = tmpDir
os.makedirs(tmpDir, exist_ok=True)
os.makedirs(libDir, exist_ok=True)
os.makedirs(outputPath, exist_ok=True)
nss = {"tei":"http://www.tei-c.org/ns/1.0", "wib":"https://wibarab.acdh.oeaw.ac.at/langDesc"}
# the root of the git repository
dataHome = "../.."

# the path to the annotated TEI transcription files
manannot = dataHome + "/010_manannot"
# the path to the feature documents 
features = manannot + "/features"


with saxonche.PySaxonProcessor(license=False) as proc:
# SaxonC 1.2.1 Python has many known bugs but isn't maintained anymore
# Many of the documented API specs are not working
    print(proc.version)
    proc.set_cwd(os.path.dirname(os.path.abspath('')))
    print(proc.cwd)

SaxonC-HE 12.3 from Saxonica
/home/dschopper/data/WIBARAB/featuredb/080_scripts_generic


In [3]:
def transform(s, xsl, o, parameters=[]):
    # processor keeps files open on Windows and in doing so prevents moving or copying them
    with saxonche.PySaxonProcessor(license=False) as proc:
        proc.set_configuration_property("xi", "on")
        saxon = proc.new_xslt30_processor()
        for i in parameters:
            saxon.set_parameter(name=i, value=proc.make_string_value(parameters[i]))
        exec = saxon.compile_stylesheet(stylesheet_file=os.path.abspath(xsl))
        exec.apply_templates_returning_file(source_file=os.path.abspath(s), output_file=os.path.abspath(o))
        if exec.exception_occurred:
            exec.get_error_message
            #for i in range(saxon.exception_count()-1):
            print(saxon.get_error_message())
            print(os.path.abspath(s)+" - "+os.path.abspath(xsl)+" -> "+os.path.abspath(o)+" failed")
        if os.path.exists(os.path.abspath(o)):
            return o
        else: 
            print("there was an error transforming "+s+" with stylesheet "+xsl)

In [4]:
def docStatus(doc):
    """returns the status of the document at path; if the document can't be parsed, it returns a dict with the error"""
    revisionDesc = doc.xpath("/tei:TEI/tei:teiHeader/tei:revisionDesc", namespaces=nss)[0]
    status = revisionDesc.attrib['status']
    lastUpdate = revisionDesc.xpath("//tei:change", namespaces=nss)[0].get('when')
    return {
        "status" : status,
        "lastUpdate": lastUpdate
    }

In [5]:
def getFVOInfo(fvo):
    fvoID = fvo.get("{http://www.w3.org/XML/1998/namespace}id")
    fvoResp = fvo.get('resp')
    fvoStatus = fvo.get('status')
    fvRef = fvo.xpath("tei:name[@type = 'featureValue']/@ref", namespaces=nss)[0]
    return {
        "id": fvoID,
        "fvoResp" : fvoResp,
        "status": fvoStatus,
        "featureValue": fvRef,
        "sourceline": fvo.sourceline
    }

In [6]:
def getInfoAboutFeatureDoc(path):
    filename = os.path.basename(path)
    try:
        doc = etree.parse(path)
        revisionDesc = doc.xpath("/tei:TEI/tei:teiHeader/tei:revisionDesc", namespaces=nss)[0]
        status = docStatus(doc)

        fvos_unsorted = []
        for f in doc.xpath("//wib:featureValueObservation", namespaces=nss): 
            fvoInfo = getFVOInfo(f)
            fileInfo = {"path" : path}
            fvo = { **fvoInfo, **fileInfo }
            fvos_unsorted.append( fvo )
        fvos = sorted(fvos_unsorted, key=lambda d: d['status']) 
        
        docInfo = {
            "path": path,
            "filename": filename,
            "number_of_fvos" : len(fvos),
            "fvos": fvos
        }
        return { **docInfo, **status }
        
    except etree.XMLSyntaxError as e:
        valErrObj = {
            "status" : "error",
             "filename": filename,
            "path": path
         #   "number_of_fvos": 
        }
        return valErrObj

In [7]:
featureDocs = []

In [8]:
for i in os.scandir(features):
    if i.name.endswith('.xml') and i.is_file():
        filename = os.path.basename(i)
        filepath = features + "/" + filename
        info = getInfoAboutFeatureDoc(filepath)
        featureDocs.append(info)

In [9]:
def make_clickable(source, line=None, aText=None):
    link = source.replace('../../','https://github.com/wibarab/featuredb/blob/main/')
    if line:
        return f'<a href="{link}#L{line}">{aText if aText is not None else source}</a>'
    else:
        return f'<a href="{link}">{aText if aText is not None else source}</a>'

In [10]:
def make_ul(fvos):
    out = "<ul>"
    for i in fvos:
        out = out +"<li>" + i['status'] + " - " + i['fvoResp'] + " - " + i['featureValue']+ " - " + make_clickable(i['path'], line=i['sourceline'], aText = i['id']) + " </li>"
    out = out + "</ul>"
    return out

## Create a summary report for all fvo_docs

In [11]:
if len(featureDocs) > 0:
    df_fD = pd.DataFrame(data=featureDocs)
    df_fD['reportPath'] = df_fD['filename'].apply(lambda x: x.replace('.xml','_fvoReport.html'))
    # re-order columns
    df_fD = df_fD[['path', 'reportPath', 'status', 'lastUpdate', 'number_of_fvos']]#.reindex(sorted(df_fD.columns), axis=1)
    df_fD = df_fD.sort_values('lastUpdate')
    
    df_fD['path'] = df_fD.apply(lambda x: make_clickable(x['path']), axis=1)
    df_fD['reportPath'] = df_fD.apply(lambda x: make_clickable(x['reportPath'], aText="Report"), axis=1)
    df_fD.loc['Total number of fvos']= df_fD['number_of_fvos'].sum(skipna=True,axis=0)
    
    statsReport = outputPath + "/statsReport.html"
    
    with open(statsReport, 'w', encoding="utf-8") as f:
        f.write(df_fD.to_html(render_links=False, formatters={'number_of_fvos': lambda x: int(x), 'fvos': lambda x: make_ul(x) }, escape=False))

In [12]:
df_fD

Unnamed: 0,path,reportPath,status,lastUpdate,number_of_fvos
0,"<a href=""https://github.com/wibarab/featuredb/...","<a href=""features_pron_sg_p2_c_fvoReport.html""...",draft,2022-08-17,46.0
38,"<a href=""https://github.com/wibarab/featuredb/...","<a href=""feature_bound_pronoun_2ps_sg_f_after_...",draft,2022-08-17,278.0
39,"<a href=""https://github.com/wibarab/featuredb/...","<a href=""features_pron_pl_p3_m_fvoReport.html""...",draft,2022-08-17,215.0
40,"<a href=""https://github.com/wibarab/featuredb/...","<a href=""features_pron_sg_p2_f_fvoReport.html""...",draft,2022-08-17,268.0
42,"<a href=""https://github.com/wibarab/featuredb/...","<a href=""features_pron_sg_p3_f_fvoReport.html""...",draft,2022-08-17,323.0
...,...,...,...,...,...
56,"<a href=""https://github.com/wibarab/featuredb/...","<a href=""features_ghayn_fvoReport.html"">Report...",error,,
62,"<a href=""https://github.com/wibarab/featuredb/...","<a href=""features_k_fvoReport.html"">Report</a>",error,,
64,"<a href=""https://github.com/wibarab/featuredb/...","<a href=""feature_IMP_2sg_m_3weak_fvoReport.htm...",error,,
67,"<a href=""https://github.com/wibarab/featuredb/...","<a href=""feature_PFV_3sg_f_3wy_fvoReport.html""...",error,,


## Create a report for each fvo_doc

In [14]:
if len(featureDocs) > 0:
    # create one report per fvo_doc
    for i in featureDocs:
        if 'fvos' in i.keys():
            df_fvos = pd.DataFrame(data=i['fvos'])
            df_fvos.set_index("id")
            df_fvos['path'] = df_fvos.apply(lambda x: make_clickable(x['path']+"#L"+str(x['sourceline'])), axis=1)
            #df_fvos['path'] = df_fvos.apply(lambda x: make_clickable(x['path']), axis=1)
            #df_fvos.set_index("filename")
            filename = i['filename']
            fvoDocReport = outputPath + "/" + filename.replace('.xml','_fvoReport.html')
            with open(fvoDocReport, 'w', encoding="utf-8") as f:
                f.write(df_fvos.to_html(render_links=True, escape=False))