# Fix broken biblid fields  

The current Zotero export has several problems (cf. [issue #31](https://github.com/wibarab/featuredb/issues/31)):

* duplicate biblIDs
* ill-formed biblIDs (blanks, leading integers, all sorts of special characters)
* structural errors in the TEI export

1. run validation on export
2. regenerate IDs for ill-formed or repeated IDs
3. update fvo docs in case they referenced repeated IDs 
4. write new ids to a copy of the Zotero TEI export
5. push new ids from TEI export back to Zotero

In [1]:
import requests
import json
import logging
import os
import re
import xml
import xml.etree.ElementTree as ET
from urllib.parse import urlparse, parse_qs, urlencode
import asyncio
import aiohttp
from random import randrange
from datetime import datetime
# this module is needed to make asyncio.run work inside the notebook as well as in the generated python script
import nest_asyncio
from random import random
nest_asyncio.apply()
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
#logging.basicConfig(level=logging.DEBUG)
import saxonche

from lxml import isoschematron, etree


In [2]:
# Prefix -> namespace URI map handed to every findall() call in this notebook.
# The empty prefix maps unprefixed element names to the TEI namespace.
xmlns = {
    "tei": "http://www.tei-c.org/ns/1.0",
    "xml": "http://www.w3.org/XML/1998/namespace",
    "": "http://www.tei-c.org/ns/1.0",
    "wib": "https://wibarab.acdh.oeaw.ac.at/langDesc",
}
# Register the prefixes with ElementTree, in the order they are listed above.
for prefix, uri in xmlns.items():
    ET.register_namespace(prefix, uri)

In [3]:
# Authorization header built from the API_TOKEN environment variable
# (a KeyError is raised right here when the variable is not set).
request_headers = {'Authorization': 'Bearer ' + os.environ['API_TOKEN']}
group_id = "2165756"
# Optional settings: an unset or empty environment variable selects the default.
limit_downloads_to = int(os.environ['LIMIT_DOWNLOADS_TO']) if os.environ.get('LIMIT_DOWNLOADS_TO') else None
# On GitHub, using more than one connection to api.zotero.org was broken when this environment variable was introduced
conn_limit = int(os.environ['MAX_CONNECTIONS']) if os.environ.get('MAX_CONNECTIONS') else 4
total_timeout = int(os.environ['TIMEOUT']) if os.environ.get('TIMEOUT') else 5  # seconds
logging.info(
    "limit_downloads_to = " + str(limit_downloads_to)
    + ", conn_limit = " + str(conn_limit)
    + ', total_timeout = ' + str(total_timeout) + 's'
)

2023-10-05 23:32:50,501 - limit_downloads_to = None, conn_limit = 4, total_timeout = 5s


## Validate current Zotero Export

In [4]:
# Root of the git repository, relative to the directory this notebook runs in.
dataHome = "../.."

# RelaxNG schema used by validate()
rngSchema = f"{dataHome}/803_RNG_Schematron/tei_all.rng"

# Directory holding the feature documents that reference the bibliography
pathToFvoDocuments = f"{dataHome}/010_manannot/features"

# The Zotero TEI export and the stylesheet that repairs its structure
pathToZoteroExport = f"{dataHome}/010_manannot/vicav_biblio_tei_zotero.xml"
pathToFixZoteroXSL = f"{dataHome}/082_scripts_xsl/fix_zotero_TEI_export.xsl"


# Start a Saxon processor once to report its version and its working directory.
with saxonche.PySaxonProcessor(license=False) as proc:
    # Author's note: SaxonC 1.2.1 Python has many known bugs but isn't maintained anymore;
    # many of the documented API specs are not working.
    print(proc.version)
    # os.path.abspath('') is the current working directory, so Saxon's cwd
    # is set to the parent of the directory this notebook runs in.
    proc.set_cwd(os.path.dirname(os.path.abspath('')))
    print(proc.cwd)


SaxonC-HE 12.3 from Saxonica
/home/dschopper/data/WIBARAB/featuredb/080_scripts_generic


In [5]:
def getXMLIDs(path):
    """Collect the xml:id attribute values that occur in a file.

    The file is scanned line by line with a regular expression; it is not
    parsed as XML.

    Args:
        path: path to the file to scan. It is read as UTF-8, the encoding
            this notebook uses when it writes XML files.

    Returns:
        A list with one single-entry dict ``{xml_id: line_index}`` per
        occurrence, in document order. ``line_index`` is zero-based.
    """
    # compiled once here instead of once per line
    pID = re.compile('xml:id=[\'"](.+?)[\'"]')
    ids = []
    # explicit encoding: the platform default is not UTF-8 everywhere
    with open(path, "r", encoding="utf-8") as f:
        for index, line in enumerate(f):
            # finditer (not search) so that several IDs on one line are all reported
            ids.extend({match.group(1): index} for match in pID.finditer(line))
    return ids

In [6]:
def transform(s, xsl, o, parameters=None):
    """Apply an XSLT stylesheet to a file with Saxon and write the result to a file.

    Args:
        s: path to the source document.
        xsl: path to the stylesheet.
        o: path of the output file; callers in this notebook also pass the
            source path here to transform a file in place.
        parameters: optional mapping of stylesheet parameter names to values;
            each value is passed to Saxon as a string value.

    Returns:
        ``o`` when the output file exists after the transformation,
        otherwise ``None`` (after printing a message).
    """
    # None instead of a mutable default object that would be shared between calls
    if parameters is None:
        parameters = {}
    # processor keeps files open on Windows and in doing so prevents moving or copying them
    with saxonche.PySaxonProcessor(license=False) as proc:
        proc.set_configuration_property("xi", "on")
        saxon = proc.new_xslt30_processor()
        for name in parameters:
            saxon.set_parameter(name=name, value=proc.make_string_value(parameters[name]))
        # called 'executable' so that the builtin exec() is not shadowed
        executable = saxon.compile_stylesheet(stylesheet_file=os.path.abspath(xsl))
        executable.apply_templates_returning_file(source_file=os.path.abspath(s), output_file=os.path.abspath(o))
        if executable.exception_occurred:
            # (a bare, uncalled reference to get_error_message stood here; it had no effect)
            #for i in range(saxon.exception_count()-1):
            print(saxon.get_error_message())
            print(os.path.abspath(s)+" - "+os.path.abspath(xsl)+" -> "+os.path.abspath(o)+" failed")
        # NOTE(review): for in-place transformations (o == s) the output file
        # exists even when the transformation failed, so this check cannot
        # detect the failure.
        if os.path.exists(os.path.abspath(o)):
            return o
        print("there was an error transforming "+s+" with stylesheet "+xsl)
        return None


In [7]:
def validate(path, rngSchema):
    """Check the document at *path* against the RelaxNG schema at *rngSchema*.

    Returns a list of dicts, one per problem. A parsing problem produces a
    single entry with stage "parsing"; schema violations produce one entry
    each with stage "relaxng". An empty list means no problem was recorded.
    """
    problems = []

    try:
        document = etree.parse(path)
        schema = etree.RelaxNG(etree.parse(rngSchema))
        schema.assertValid(document)

    except etree.XMLSyntaxError as syntaxError:
        # a syntax error ends validation: only this one problem is reported
        problems.append({
            "type": "error",
            "message": str(syntaxError),
            "line": syntaxError.lineno,
            "source": path,
            "location": "n/a",
            "stage": "parsing",
            "exceptionType": type(syntaxError).__name__,
        })

    except etree.DocumentInvalid as invalid:
        for logEntry in invalid.error_log:
            # we ignore rng errors about @schemaLocation since
            # that is needed for validation in the TEI-enricher
            if logEntry.message == "Invalid attribute schemaLocation for element TEI":
                continue
            record = {
                "type": "error",
                "message": logEntry.message,
                "line": logEntry.line,
                "source": path,
                "location": logEntry.path if logEntry.path is not None else "n/a",
                "stage": "relaxng",
                "exceptionType": type(invalid).__name__,
            }
            # DEBUG
            print(record)
            problems.append(record)

    return problems

In [8]:
def sanitize(string):
    """Return *string* with every character that is not an ASCII letter (a-z, A-Z) removed."""
    nonLetters = re.compile(r'[^a-zA-Z]')
    return nonLetters.sub('', string)

def sanitizeDate(string):
    """Return only the digits 0-9 contained in *string*, in their original order."""
    nonDigits = re.compile(r'[^0-9]')
    return nonDigits.sub('', string)


def constructID(entry):
    """Build a candidate xml:id for a Zotero item.

    The ID has the shape ``<names>_<date>_<number>``: the sanitized last
    names (or single-field names) of all creators of type "author" joined
    by "-", the digits of the item's date, and a random four-digit number.

    Args:
        entry: the item as a dict; its "data" dict is read for "creators"
            and "date".

    Returns:
        The ID as a string. Without any usable author name it starts with
        "_"; it never starts with "-" or a digit.
    """
    data = entry["data"]
    lastNames = []
    # items without creators simply contribute no names
    for c in data.get("creators", []):
        if c.get("creatorType") != "author":
            continue
        rawName = c["lastName"] if "lastName" in c else c.get("name", "")
        # strip all non-word characters
        name = sanitize(rawName)
        # A name without any ASCII letter sanitizes to "". Keeping it would
        # produce a leading or doubled "-", and an ID starting with "-" is
        # again ill-formed.
        if name:
            lastNames.append(name)
    date = sanitizeDate(data.get("date", ""))
    if date == "":
        # show items without a usable date so that they can be checked manually
        print(entry)
    # NOTE(review): the step of 4 limits the suffix to 1000, 1004, ..., 9996 —
    # confirm whether "four digits" was meant instead.
    ranNum = randrange(1000, 9999, 4)
    return "-".join(lastNames) + "_" + date + "_" + str(ranNum)
    

In [9]:
def regenerateID(zoteroID, previousAttempts=None):
    """Generate a new xml:id for a Zotero item that is not yet used in the export.

    The item is fetched once from the Zotero Web API. Candidate IDs are then
    built with constructID() until one is found that occurs neither in the
    Zotero export on disk nor in *previousAttempts*.

    Args:
        zoteroID: key of the item within the Zotero group.
        previousAttempts: optional iterable of IDs that must not be returned.

    Returns:
        The new ID as a string.

    Raises:
        requests.HTTPError: if the item could not be fetched.
        RuntimeError: if no unused ID was found after a bounded number of tries.
    """
    maxAttempts = 100

    # fetch JSON entry from API
    entryURL = f"https://api.zotero.org/groups/{group_id}/items/{zoteroID}"
    # NOTE(review): request_headers is not sent here; confirm whether the
    # group can be read without authentication.
    # The timeout keeps a stalled connection from blocking the whole run.
    response = requests.get(entryURL, timeout=total_timeout)
    if not response:
        # a requests Response is falsy for 4xx/5xx status codes
        print("an error occured fetching "+entryURL)
        print(response.text)
        print(response.status_code)
        # fail loudly: continuing without an ID would write a broken export
        response.raise_for_status()
    entry = json.loads(response.text)

    # getXMLIDs() returns single-entry dicts {id: line}; flatten them into a
    # set of IDs so that the membership test below actually compares IDs
    takenIDs = {xmlID for found in getXMLIDs(pathToZoteroExport) for xmlID in found}
    if previousAttempts:
        takenIDs.update(previousAttempts)

    for _ in range(maxAttempts):
        newID = constructID(entry)
        print("regenerating ID for "+zoteroID+": "+newID)
        # if the ID does not exist yet, return it; otherwise try another one
        if newID not in takenIDs:
            return newID
    raise RuntimeError("could not generate an unused ID for "+zoteroID)

In [10]:
def getByCorresp(ZoteroExport, corresp):
    """Return all tei:biblStruct elements in *ZoteroExport* whose @corresp equals *corresp*."""
    query = f".//tei:biblStruct[@corresp='{corresp}']"
    matches = ZoteroExport.findall(query, xmlns)
    return matches

In [11]:
def fixZoteroTEIExport(pathToZoteroExport):
    """Run the fix-up stylesheet (pathToFixZoteroXSL) on the Zotero export.

    The result is written over the input file; the path of that file is returned.
    """
    # the output path equals the input path (a separate ".tmp" target is not used)
    transform(s=pathToZoteroExport, xsl=pathToFixZoteroXSL, o=pathToZoteroExport)
    return pathToZoteroExport

In [12]:
def getFvoByBiblID(biblid):
    """Find the tei:bibl elements in the feature documents that point to *biblid*.

    Every regular ``.xml`` file directly inside pathToFvoDocuments is
    searched for ``tei:bibl`` elements whose @corresp is ``zot:<biblid>``.

    Returns a list of ``(path to document, list of matching elements)``
    tuples; documents without a match are left out.
    """
    hits = []
    for entry in os.scandir(pathToFvoDocuments):
        if not (entry.name.endswith('.xml') and entry.is_file()):
            continue
        documentPath = pathToFvoDocuments + "/" + os.path.basename(entry)
        # NOTE(review): an lxml parser is handed to the stdlib ElementTree.parse
        # here; rewriteIDRef() and setBiblStatus() call getroottree() on the
        # returned elements, so they rely on getting lxml elements back.
        recoveringParser = etree.XMLParser(recover=True)
        document = ET.parse(entry, parser=recoveringParser)
        found = document.findall('.//tei:bibl[@corresp="zot:'+biblid+'"]', xmlns)
        if len(found) > 0:
            hits.append((documentPath, found))
    return hits

In [13]:
def getBiblEntryByID(biblid):
    """Return every biblStruct in the Zotero export on disk whose xml:id equals *biblid*.

    The export is parsed anew on each call.
    """
    exportTree = ET.parse(pathToZoteroExport)
    query = './/biblStruct[@xml:id = "'+biblid+'"]'
    return exportTree.findall(query, xmlns)

In [14]:
def rewriteIDRef(biblRefs, oldID, newID):
    """Point references from ``zot:<oldID>`` to ``zot:<newID>`` and save the documents.

    biblRefs is a list of tuples: position 0 is the path to a document,
    position 1 the list of referencing elements of that document. After the
    update each document is written back to its path and passed to fixPIs().
    """
    for biblRef in biblRefs:
        targetPath, elements = biblRef[0], biblRef[1]
        for element in elements:
            if element.attrib["corresp"] != "zot:"+oldID:
                continue
            element.attrib["corresp"] = "zot:"+newID
        # all elements of one tuple belong to the same document
        tree = elements[0].getroottree()
        tree.write(targetPath, encoding="utf-8", xml_declaration=True)
        fixPIs(targetPath)

In [15]:
def setBiblStatus(biblRefs, status, comment):
    """Flag referencing elements with @status and an explanatory XML comment.

    biblRefs is a list of ``(path to document, list of elements)`` tuples.
    An element that already carries exactly this status is left alone;
    every other element gets the status and *comment* as its first child
    node. Each document is then written back and passed to fixPIs().
    """
    for biblRef in biblRefs:
        targetPath, elements = biblRef[0], biblRef[1]
        for element in elements:
            alreadyFlagged = "status" in element.keys() and element.get("status") == status
            if alreadyFlagged:
                continue
            element.set("status", status)
            element.insert(0, etree.Comment(comment))
        tree = elements[0].getroottree()
        tree.write(targetPath, encoding="utf-8", xml_declaration=True)
        fixPIs(targetPath)

In [16]:
def _fixErrorsPass(pathToZoteroExport, vErrs):
    """Repair the xml:id problems listed in *vErrs* once and write the export back to disk."""
    # one error per line: a later error on the same line replaces an earlier one
    vErrsByLine = {vErr["line"]: vErr for vErr in vErrs}

    # recover=True so that the export can be loaded despite the broken IDs
    parser = etree.XMLParser(recover=True)
    ZoteroExport = etree.parse(pathToZoteroExport, parser)

    # compiled once instead of once per erroneous line
    pCorresp = re.compile('corresp=[\'"](.+?)[\'"]')
    pID = re.compile('xml:id=[\'"](.+?)[\'"]')

    with open(pathToZoteroExport, "r", encoding="utf-8") as f:
        # error line numbers are 1-based
        for lineNo, line in enumerate(f, start=1):
            if lineNo not in vErrsByLine:
                continue
            e = vErrsByLine[lineNo]
            message = e["message"]
            isIllFormedID = "xml:id : attribute value" in message and "is not an NCName" in message
            isRepeatedID = "ID" in message and "already defined" in message
            if not (isIllFormedID or isRepeatedID):
                print("unhandled error")
                print(e)
                continue

            correspMatch = pCorresp.search(line)
            idMatch = pID.search(line)
            if correspMatch is None or idMatch is None:
                # report the line instead of failing on a missing match
                print("could not identify @corresp or @xml:id in line")
                print(line)
                continue
            corresp = correspMatch.group(1)
            oldID = idMatch.group(1)

            bibl = getByCorresp(ZoteroExport, corresp)
            if not bibl:
                print("could not find a biblStruct with @corresp "+corresp)
                continue

            # the last segment of the @corresp value is used as the Zotero item key
            zoteroKey = corresp.split("/")[-1]
            newID = regenerateID(zoteroKey)

            # update entry with new id
            bibl[0].attrib["{http://www.w3.org/XML/1998/namespace}id"] = newID

            # update references to new ID
            existingReferences = getFvoByBiblID(oldID)
            if not existingReferences:
                continue
            if isIllFormedID:
                print("found "+str(len(existingReferences))+" references to ID "+oldID+". Replacing with "+newID+".")
                rewriteIDRef(existingReferences, oldID, newID)
                setBiblStatus(existingReferences, "CHECKME", "broken ID has been fixed in Zotero export, please revise")
            else:
                # these are ambiguous references which need to be checked again;
                # list the potential targets in the comment
                targets = [
                    url
                    for url in (candidate.get("corresp") for candidate in getBiblEntryByID(oldID))
                    if url is not None
                ]
                rewriteIDRef(existingReferences, oldID, newID)
                setBiblStatus(existingReferences, "CHECKME", "ambiguous ID reference. Could be either "+" or ".join(targets))

    ZoteroExport.write(pathToZoteroExport, encoding="utf-8", xml_declaration=True)


def fixErrors(pathToZoteroExport):
    """Fix ill-formed and repeated xml:ids in the TEI export from Zotero.

    The export is validated, the reported ID problems are repaired (new IDs
    are generated and references in the feature documents are rewritten and
    flagged "CHECKME"), and the export is validated again. This repeats
    until validation reports no errors or a pass leaves the list of errors
    unchanged. Several passes are usually needed because validate() reports
    only one error when the document fails to parse.

    Args:
        pathToZoteroExport: path to the export; the file is modified in place.
    """
    vErrs = validate(pathToZoteroExport, rngSchema)
    # A loop instead of recursion: one stack frame per fixed error would run
    # into the recursion limit on an export with many broken IDs. The
    # validation result of one pass is reused as the input of the next one.
    while vErrs:
        _fixErrorsPass(pathToZoteroExport, vErrs)
        vErrsAfterFix = validate(pathToZoteroExport, rngSchema)
        if vErrsAfterFix == vErrs:
            print("fixErrors was not successful, please check messages. Aborting")
            return
        vErrs = vErrsAfterFix

In [17]:
def flagAmbiguousBiblRefs(pathToZoteroExport):
    """Flag references to bibliography entries whose @n ends in a lowercase letter.

    For every tei:biblStruct in the export whose @n attribute ends in a-z,
    all tei:bibl elements in the feature documents that reference the entry's
    xml:id get the status "CHECKME".

    Args:
        pathToZoteroExport: path to the Zotero TEI export.
    """
    # compiled once instead of once per entry
    endsWithLowercaseLetter = re.compile(r"[a-z]$")
    zoteroExport = ET.parse(pathToZoteroExport)
    for bibl in zoteroExport.findall(".//tei:biblStruct", xmlns):
        n = bibl.get("n")
        biblID = bibl.get("{http://www.w3.org/XML/1998/namespace}id")
        # entries without @n or without xml:id can neither match nor be
        # referenced, so they are skipped instead of raising a KeyError
        if n is None or biblID is None:
            continue
        if not endsWithLowercaseLetter.search(n):
            continue
        existingReferences = getFvoByBiblID(biblID)
        if existingReferences:
            setBiblStatus(existingReferences, "CHECKME", "potentially ambiguous references")

In [18]:
# The Zotero export as it is on disk when this cell runs; recoverOldIDRefs()
# reads this module-level tree to resolve references.
zoteroExport=ET.parse(pathToZoteroExport)
# An earlier version of the export, looked up relative to the current working
# directory; recoverOldIDRefs() uses it to map IDs of the previous version.
# NOTE(review): the suffix of the file name looks like a commit hash — confirm.
pathToZoteroExportOLD="vicav_biblio_tei_zotero_7bd84e37.xml"
zoteroExportOLD=ET.parse(pathToZoteroExportOLD)
def _findReplacementID(IDRef):
    """Return the current xml:id that the unresolvable reference *IDRef* maps to, or None."""
    xmlID = "{http://www.w3.org/XML/1998/namespace}id"

    # try to see if the reference points to @n
    targetN = zoteroExport.findall('.//tei:biblStruct[@n="'+IDRef+'"]', xmlns)
    if len(targetN) == 1:
        return targetN[0].attrib[xmlID]
    if len(targetN) > 1:
        print("ERROR: several biblStruct[@n='"+IDRef+"']")
        return None

    # try to find idref in previous version of Zotero export
    targetOLD = zoteroExportOLD.findall('.//tei:biblStruct[@xml:id="'+IDRef+'"]', xmlns)
    if len(targetOLD) > 1:
        print("ERROR: "+IDRef+" points to several targets")
        return None
    if not targetOLD:
        print("unable to recover broken reference")
        return None

    # if the id is located in the old export, match it with the new
    # one via the Zotero URL
    targetOLDCorresp = targetOLD[0].get("corresp")
    print(targetOLD)
    if targetOLDCorresp is None:
        print("unable to recover broken reference")
        return None
    targetNewViaCorresp = zoteroExport.findall('.//tei:biblStruct[@corresp="'+targetOLDCorresp+'"]', xmlns)
    if len(targetNewViaCorresp) > 1:
        print("ERROR: "+targetOLDCorresp+": several instances in "+pathToZoteroExport)
        return None
    if len(targetNewViaCorresp) == 1:
        return targetNewViaCorresp[0].attrib[xmlID]
    # previously this case passed without any message
    print("unable to recover broken reference")
    return None


def recoverOldIDRefs(pathToDocument):
    """Repair bibl/@corresp references in a document that no longer resolve.

    Every ``zot:`` reference of a ``wib:featureValueObservation/tei:bibl``
    element is looked up in the module-level ``zoteroExport``. A reference
    that is not found there is matched against the @n attributes of the
    export and, failing that, against the IDs of ``zoteroExportOLD`` (whose
    entry is then located in the current export via its @corresp). If at
    least one reference was repaired, a change entry is added and the
    document is written back and passed to fixPIs().

    Args:
        pathToDocument: path to the document; it is modified in place.
    """
    doc = etree.parse(pathToDocument)
    changed = False

    for bibl in doc.findall(".//wib:featureValueObservation/tei:bibl", xmlns):
        corresp = bibl.get("corresp")
        # bibl elements without @corresp hold no reference to check
        if corresp is None or not corresp.startswith("zot:"):
            continue
        # strip the prefix only; replace() would also remove a later "zot:"
        IDRef = corresp[len("zot:"):]
        if IDRef == "":
            continue
        if zoteroExport.findall('.//tei:biblStruct[@xml:id="'+IDRef+'"]', xmlns):
            continue

        # target is not resolvable
        print("trying to recover broken reference "+IDRef)
        targetNewID = _findReplacementID(IDRef)
        if targetNewID is not None:
            bibl.attrib["corresp"] = "zot:"+targetNewID
            changed = True

    if changed:
        # Written once per document, and to the path that was passed in. The
        # previous version wrote after every repaired reference and to the
        # module-level name 'filepath' set by the calling loop.
        addChange(doc, "dmp:DS", "fixed broken bibl/@corresp references")
        doc.write(pathToDocument, encoding="utf-8", xml_declaration=True)
        fixPIs(pathToDocument)
                        

In [19]:
def addChange(doc, who, msg):
    """Record a change in the first tei:revisionDesc of *doc*.

    A ``tei:change`` element with @who, today's date in @when and *msg* as
    text is appended, unless an identical entry already exists as a direct
    child of that revisionDesc.

    Args:
        doc: the parsed document (lxml tree).
        who: value for the @who attribute.
        msg: text of the change entry.

    Raises:
        IndexError: if the document has no tei:revisionDesc.
    """
    dateString = datetime.now().strftime("%Y-%m-%d")
    revisionDescs = doc.findall('.//tei:revisionDesc', xmlns)
    if not revisionDescs:
        raise IndexError("document has no tei:revisionDesc to record the change in")
    revisionDesc = revisionDescs[0]

    # Compared in Python rather than through a path expression assembled from
    # the arguments, which broke as soon as msg or who contained a quote.
    for change in revisionDesc.findall('./tei:change', xmlns):
        if (change.get("who") == who
                and change.get("when") == dateString
                and "".join(change.itertext()) == msg):
            return

    # Built through the API instead of parsing concatenated markup, so that
    # characters such as "<" or "&" in msg are escaped on serialization. The
    # new element uses the namespace declarations of its parent.
    change = etree.SubElement(revisionDesc, "{http://www.tei-c.org/ns/1.0}change")
    change.set("who", who)
    change.set("when", dateString)
    change.text = msg

In [20]:
def fixPIs(pathToDoc):
    """Run the fixPIs.xsl stylesheet on the document at *pathToDoc*, in place."""
    # the stylesheet is located via dataHome, like the other repository paths
    transform(pathToDoc, dataHome + "/082_scripts_xsl/fixPIs.xsl", pathToDoc)

In [None]:
fixErrors(pathToZoteroExport)
flagAmbiguousBiblRefs(pathToZoteroExport)
# The module-level tree was parsed before fixErrors() rewrote the export on
# disk. Re-read it so that recoverOldIDRefs() resolves references against the
# regenerated IDs instead of the broken ones.
zoteroExport = ET.parse(pathToZoteroExport)
# recover broken bibl/@corresp references which might point to an old ID or to @n attribute
for pathToFvoDoc in os.scandir(pathToFvoDocuments):
    if pathToFvoDoc.name.endswith('.xml') and pathToFvoDoc.is_file():
        filename = os.path.basename(pathToFvoDoc)
        # keep the name 'filepath': recoverOldIDRefs() may read it as a global
        filepath = pathToFvoDocuments + "/" + filename
        print("===== "+filepath+" =====")
        recoverOldIDRefs(filepath)