# Script to automate the export and manipulation of the VICAV-library

## Import Package eTree to parse XML Files


In [315]:
import requests
import json
import logging
import os
import re
import xml
import xml.etree.ElementTree as ET
from lxml import isoschematron, etree
from urllib.parse import urlparse, parse_qs, urlencode
import asyncio
import aiohttp
# this module is needed to make asyncio.run work inside the notebook as well as in the generated python script
import nest_asyncio
from random import random
import saxonche
import itertools 
from datetime import datetime

nest_asyncio.apply()
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
#logging.basicConfig(level=logging.DEBUG)

## Define name-space for xml-parsing


In [316]:
xmlns = {"tei": "http://www.tei-c.org/ns/1.0", "xml":"http://www.w3.org/XML/1998/namespace", "": "http://www.tei-c.org/ns/1.0"}
for key in xmlns:
    ET.register_namespace(key, xmlns[key])

## Access to the VICAV Zotero library

- Use API_TOKEN from environment to access Zotero
- Set the Zotero group id for VICAV here


In [317]:
#request_headers = {'Authorization': 'Bearer ' + os.environ['API_TOKEN']}
request_headers = {'Authorization': 'Bearer ' + 'NiddJVb3nSL8UdrxnqTlc6FP'}
group_id = "2165756"
limit_downloads_to = int(os.environ['LIMIT_DOWNLOADS_TO']) if 'LIMIT_DOWNLOADS_TO' in os.environ and os.environ['LIMIT_DOWNLOADS_TO'] else None
# On GitHub more than one connections to api.zotero.org was broken when this environment variable was introduced
conn_limit=int(os.environ['MAX_CONNECTIONS']) if 'MAX_CONNECTIONS' in os.environ and os.environ['MAX_CONNECTIONS'] else 4 
# total_timeout=int(os.environ['TIMEOUT']) if 'TIMEOUT' in os.environ and os.environ['TIMEOUT'] else 5 #s
total_timeout=100
logging.info("limit_downloads_to = " + str(limit_downloads_to) + ", conn_limit = " + str(conn_limit) + ', total_timeout = ' + str(total_timeout) + 's')

2024-08-12 17:36:40,895 - limit_downloads_to = None, conn_limit = 4, total_timeout = 100s


## Read all items in the library

Load items from Zotero group library

    Args:
        group_id (str): ID of a Zotero group
        limit (int): number of items to retrieve from library, maximum is 100.
        start (int): item number to start with


In [318]:
async def get_items(session, group_id:str,limit:int,start:int,itemType = None,format = None):
    request_url = "https://api.zotero.org/groups/" + group_id + "/items/" + "?limit=" + str(limit) + "&start=" + str(start) + ("&itemType="+itemType if itemType is not None else "") + ("&format="+format if format is not None else "")
    retries = 2
    while retries > 0:
        try:
            async with session.get(url=request_url, headers=request_headers) as response:
                if response.status == 200:
                    if format == "tei":
                        parsed = ET.fromstring(await response.read())
                    else:
                        parsed = json.loads(await response.text())
                    response_headers = response.headers
                    logging.info("Got "+request_url + (" Backoff: " + response.headers["Backoff"] if "Backoff" in response.headers else ""))
                    return parsed, response_headers
        except Exception as e:
            retries = retries - 1
            await asyncio.sleep(3 + random() + 0.5)
            logging.info("Retrying after " + type(e).__name__ + (": " + str(e) if str(e) else ""))

Get total number of items in group library

    Args:
        group_id (str): ID of a Zotero group

    Returns:
        int: number of items in the library


In [319]:
def total_number_items(group_id) -> int:
    request_url = "https://api.zotero.org/groups/" + group_id + "/items/"
    response = requests.get(request_url, headers=request_headers)
    
    return int(response.headers["Total-Results"])

Get headers of Zotero-Api-Calls

    Args:
        group_id (str): ID of a Zotero group


In [320]:
def get_headers(group_id):
    request_url = "https://api.zotero.org/groups/" + group_id + "/items/"
    response = requests.get(request_url, headers=request_headers)
    
    return response.headers

Get links from headers

    Args:
        headers: http-headers of a response

    Returns:
        dict


In [321]:
def get_links_from_headers(headers) -> dict:
    link_list = headers["Link"].split(",")
    links = {}
    for link_item in link_list:
        #print(link_item)
        link_type = link_item.split('; rel="')[1].replace('"','').strip()
        link_value = link_item.split('; rel="')[0].replace("<","").replace(">","").strip()
        links[link_type] = link_value
    
    return links

## Get all items of a collection/group lib

- Generate all links with `for start in range(limit,last,limit)`.
- Then download in parallel.


In [322]:
async def fetch(request_url, session, format = None):
    await asyncio.sleep(1 * random() + 0.5)
    retries = 2
    while retries > 0:
        try:
            async with session.get(request_url, headers=request_headers) as response:
                if format == "tei":
                    content = ET.fromstring(await response.read())
                else:
                    content = json.loads(await response.text())
                logging.info("Got "+request_url + (" Backoff: " + response.headers["Backoff"] if "Backoff" in response.headers else ""))
                return {"status": response.status, "data": content}
        except Exception as e:
            retries = retries - 1
            await asyncio.sleep(3 + random() + 0.5)
            logging.info("Retrying after " + type(e).__name__ + (": " + str(e) if str(e) else ""))

async def fetch_batch(url_list, format = None):
    conn = aiohttp.TCPConnector(limit=conn_limit)
    timeout = aiohttp.ClientTimeout(total=total_timeout)
    async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:
        responses = await asyncio.gather(*[fetch(url, session, format) for url in url_list])
    return responses

async def get_all_items(session, group_id, itemType = None, format = None):
    logging.info("Getting all items" + 
                 ((" of type " + itemType) if itemType is not None else "") + 
                 ((" formatted as " + format) if format is not None else "") + " now.")

    # settings to be used in the function to get the items (limit is max 100 per single request)
    limit=100
    start=0
    urls = []
    
    # get the first 200 items to start with
    first_round=await get_items(session, group_id,limit,start,itemType,format)
    allitems = first_round[0]
    
    # get the next link from the headers
    next_url = get_links_from_headers(first_round[1])["next"]
    last_url = get_links_from_headers(first_round[1])["last"]
    next_url_parsed = urlparse(next_url)
    parsed_qs = parse_qs(next_url_parsed.query)
    last_qs = parse_qs(urlparse(last_url).query)
    last_start = limit_downloads_to if limit_downloads_to is not None and format is not None else int(last_qs["start"][0])
    for start in range(limit, last_start+1, limit):
        parsed_qs["start"] = [start]
        parsed = next_url_parsed._replace(query=urlencode(parsed_qs, doseq=True))
        urls.append(parsed.geturl())
    i = 0
    while len(urls[i:i+conn_limit]) > 0:
        for response in await fetch_batch(urls[i:i+conn_limit], format):
            if isinstance(allitems, ET.Element) and isinstance(response["data"], ET.Element):
                for child in response["data"]:
                    allitems.append(child)
            else:
                allitems = allitems + response["data"]   
        i = i + conn_limit
    
    return allitems

Store all items of a group library in a json file

    Args:
        group_id (str): ID of a Zotero group
        filename (str): name of the export file including file-extension

    Returns:
        bool: True if successful


In [323]:
def export_all_items_to_file(group_id,filename) ->bool: 
    allitems = get_all_items(group_id)
    with open(filename,"w") as f:
        json.dump(allitems, f)
    return True

Store export in a file and get all item ids

The export contains also the note items. These are child items of some other item in this export. They have a parent reference.

There are also attachment items. These are child items of some other item in this export. Most of them have a parent reference but some don't have a parent item (anymore?)


In [324]:
json_file = "export_grouplib.json"
item_ids = []
note_ids = []
attachment_ids = []
async def get_generic_items(session):
    if os.path.isfile(json_file):
        logging.info("Grouplib export json already exists. Delete to fetch again (time consuming).")
        with open(json_file, 'r') as f:
            all_items = json.load(f)    
    else:
        all_items = await get_all_items(session, group_id)
    return all_items

async def get_export_json():
    conn = aiohttp.TCPConnector(limit=conn_limit)
    timeout = aiohttp.ClientTimeout(total=total_timeout)
    async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:
        all_items = await get_generic_items(session)
    # all_items = test5
    with open(json_file,"w") as f:
        json.dump(all_items, f)
        logging.info("Exported json.")
    
    for item in all_items:
        item_id = item["key"]
        item_type = item["data"]["itemType"]
        if item_type == 'note':
            note_ids.append(item_id)
        elif item_type == 'attachment':
            attachment_ids.append(item_id)
        else:
            item_ids.append(item_id)
    return all_items
all_items = asyncio.run(get_export_json())
all_items_map = {data["key"]:data for data in all_items}
all_notes_map = {data["data"]["parentItem"]:data for data in [all_items_map[id] for id in note_ids]}
all_attachments_map = {data["data"]["parentItem"] if "parentItem" in data["data"] else "ZZZZZZZZ":data for data in [all_items_map[id] for id in attachment_ids]}

2024-08-12 17:36:40,958 - Grouplib export json already exists. Delete to fetch again (time consuming).
2024-08-12 17:36:41,931 - Exported json.


## replace xml:id with biblid from extra-field

Most (but currently not all) Zotero items should have a canonical biblid assigned. This function gets the value from data/extra and
tries to extract the canonical biblid


In [325]:
def is_valid_xml_id(id):
    # https://stackoverflow.com/questions/55038323/how-to-write-a-regex-expression-to-check-a-valid-xml-element-ncname-in-javascrip
    p = re.compile(r"^[a-zA-Z_][\w.-]*$")
    matches = p.match(id)
    if matches:
        return True
    else:
        return False

In [326]:
malformedBiblIDs=[]
def get_biblid_from_extra(item):
    if "extra" in item["data"]:
        if item["data"]["extra"] != "":
            if "(biblid:" in item["data"]["extra"]:
                biblid=item["data"]["extra"].split(":")[1].replace(")","")
                if is_valid_xml_id(biblid):
                    return biblid
                else:
                    msg=item["key"] + " malformed biblid: " + item["data"]["extra"]
                    logging.info(msg)
                    malformedBiblIDs.append(msg)
                    return None
                    
            else:
                msg=item["key"] + " malformed biblid: " + item["data"]["extra"]
                logging.info(msg)
                malformedBiblIDs.append(msg)
                return None
        else:
            msg=item["key"] + " no biblid"
            logging.info(msg)
            malformedBiblIDs.append(msg)
            return None

In [327]:
biblid_by_zuid={item["key"]: get_biblid_from_extra(item)
               for item in all_items}

2024-08-12 17:36:41,992 - SE4ZYJKK no biblid
2024-08-12 17:36:41,992 - 7RJJASIF no biblid
2024-08-12 17:36:41,993 - 7JB4ZVXN malformed biblid: (biblid_Diez_2026_8302)


## Get the geo data and replace tags with geo refs

geo data is in `../../vicav_biblio/vicav_geodata.xml`


In [328]:
geo_data = ET.parse("../../010_manannot/vicav_geodata.xml")
geo_parent_map = {c:p for p in geo_data.iter( ) for c in p}
place_by_name = {placeName.text: 
                 {"type": geo_parent_map[placeName].get("type"),
                  "geo": geo_parent_map[placeName].find("./tei:location/tei:geo[@decls='#dd']",xmlns).text,
                  "el": geo_parent_map[placeName]}
                for placeName in geo_data.findall(".//tei:listPlace/tei:place/tei:placeName", xmlns)}

# Get the current mapping for @n to Zoteros unique ID

Zotero suggests readable @xml:id, we use them for @n, but those are not unique between runs or  
take into account that there may be more than one works by one author in a year.


In [329]:
get_zotero_unique_id = re.compile(r'https?://zotero.org/groups/[\d]+/items/(?P<zuid>[A-Z0-9]+)')
current_bibl_data = ET.parse("../../010_manannot/vicav_biblio_tei_zotero.xml")
n_by_zuid = {get_zotero_unique_id.match(bibStr.get("corresp")).groupdict()["zuid"]: bibStr.get("n")
                 for bibStr in current_bibl_data.findall(".//tei:biblStruct", xmlns)}
zuid_by_n = {bibStr.get("n"): get_zotero_unique_id.match(bibStr.get("corresp")).groupdict()["zuid"]
                 for bibStr in current_bibl_data.findall(".//tei:biblStruct", xmlns)}
duplicate_xmlid = {}

## Get all TEIs from Zotero

man nimmt die Liste mit den IDs der entries, baut für jeden entry die URL nach dem Muster  
https://api.zotero.org/groups/2165756/items/944KQVKQ?format=tei  
man lädt das mit GET requesst  
dann aus dem response den body und parsed das mit ET from string, nimmt daraus das  
`<biblStruct>` Element;  
baut eine gemeinsame `<listBibl>` und fügt das geparste Element ein,  
dann dumpt man den ganzen Element-Tree

### Retrieves TEI of an item generated by Zotero

Resolves place names to geo coordinates using the place by name dict created above.

### Keeping the xml:ids stable

The code uses the Zotero unique ids to look up the @xml:id in the current bibliography and change it to that if it is needed.  
Additionally if the id of the entry just downloaded does not match the known Zotero unique id a new unique @xml:id is generated appending b-z.  
This code is not tested very much for corner cases. It should be replaced by getting the canonical biblid from the downloaded data.


In [330]:
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
tag_parser = re.compile(r'(?P<geo_type>[^:]+):(?P<geo_name>[^[]+)(\[(?P<long>[\d.,]+) +(?P<lat>[\d.,]+)])?')

def create_geo_tag(tags_el, tag):
    #starts with reg: geo: diaGroup: -> lookup, get geo location and create elemnt
    # uses PEP 634, requires python 3.10+
    m = tag_parser.match(tag)
    if m is not None:
        match m.groupdict():
            case {"geo_type": "reg" | "geo" | "diaGroup", "geo_name": geo_name, "long": long, "lat": lat}:
                tag_note_el = ET.SubElement(tags_el, "note", type="tag")
                geo_name = geo_name.rstrip()
                ET.SubElement(tag_note_el, "name", type=m.groupdict()["geo_type"]).text = geo_name
                if geo_name in place_by_name:
                    ET.SubElement(tag_note_el, "geo").text = place_by_name[geo_name]["geo"]
                else:
                    ET.SubElement(tag_note_el, "note", type="missing_geo_data")
                return tag_note_el
    if tag in place_by_name:
        tag_note_el = ET.SubElement(tags_el, "note", type="tag", subtype="unmarked_geo")
        ET.SubElement(tag_note_el, "name", type=place_by_name[tag]["type"]).text = tag
        ET.SubElement(tag_note_el, "geo").text = place_by_name[tag]["geo"]
        return tag_note_el
    ret = ET.SubElement(tags_el, "note", type="tag")
    ret.text = tag
    return ret

def extend_item_tei(bibl):
    try:
        zuid = get_zotero_unique_id.match(bibl.get("corresp")).groupdict()["zuid"]
        logging.debug("Zoterio unique ID: " + zuid)
        zotero_xmlid = bibl.get("{http://www.w3.org/XML/1998/namespace}id")
        logging.debug("zotero @xml:id: " + zotero_xmlid)        
        bibl.set("n", zotero_xmlid)
        if zuid in n_by_zuid and zotero_xmlid != n_by_zuid[zuid]:
            bibl.set("n", n_by_zuid[zuid])
            logging.info("Changed @n for item " + zuid + " from "+ zotero_xmlid + " to " + n_by_zuid[zuid] + ".")
        elif zotero_xmlid in zuid_by_n and zuid_by_n[zotero_xmlid] != zuid:
            initial_xmlid = zotero_xmlid
            if not zotero_xmlid in duplicate_xmlid:
                duplicate_xmlid[zotero_xmlid] = zotero_xmlid + "b"
            else:
                duplicate_xmlid[zotero_xmlid] = duplicate_xmlid[zotero_xmlid][:-1] + chr(ord(duplicate_xmlid[zotero_xmlid][-1]) + 1)
            zotero_xmlid = duplicate_xmlid[zotero_xmlid]
            if zotero_xmlid in zuid_by_n and zuid_by_n[zotero_xmlid] != zuid:
                logging.error("genereated @n for item " + zuid + " is in use!")
            bibl.set("n", zotero_xmlid)            
            logging.info("Changed @n for item " + zuid + " from "+ initial_xmlid +
                         " to " + zotero_xmlid + " (duplicate of " + zuid_by_n[initial_xmlid] + ")")
        if zuid in biblid_by_zuid and biblid_by_zuid[zuid] is not None:
            bibl.set("{http://www.w3.org/XML/1998/namespace}id", biblid_by_zuid[zuid])
        if zuid in all_notes_map:
            note_for_zuid = all_notes_map[zuid]["data"]["note"].replace("&", "&amp;")
            parsed_note = ET.fromstring("<note>"+note_for_zuid+"</note>")
            bibl.append(parsed_note)
        tags = all_items_map[zuid]["data"]["tags"]
        if len(tags) > 0:
            tags_el = ET.SubElement(bibl, "note", type="tags")
            for o in tags:
                create_geo_tag(tags_el, o["tag"])

    except xml.etree.ElementTree.ParseError:
        logging.info("XML parser error in notes for item id "+zuid+"\n"+note_for_zuid)
    if bibl is None:
        logging.debug("No biblStruct in item " + zuid)
    logging.debug("Extended TEI for " + zuid)

async def get_item_tei(group_id,item_id,session):
    request_url = "https://api.zotero.org/groups/" + group_id + "/items/" + item_id + "?format=tei"
    bibl = None
    note_for_item_id = ""
    response_text = ""
    try:
        async with session.get(url=request_url, headers=request_headers) as response:
            response_text = await response.text()
        list_bibl = ET.fromstring(response_text)
        bibl = list_bibl.find("tei:biblStruct",xmlns)
        logging.debug("XML: " + ET.tostring(bibl, encoding="unicode"))
        if bibl is None:
            logging.info("There is no biblStruct in the response for item " + item_id +":\n"+response_text)
            return bibl
        extend_item_tei(bibl)
    except asyncio.TimeoutError:
        logging.info("Timeout fetching " + item_id)
    if bibl is None:
        logging.debug("No biblStruct in item " + item_id)
    logging.info("Fetched TEI for " + item_id)
    return bibl

In [331]:
async def get_item_tei_test():
    conn = aiohttp.TCPConnector(limit=conn_limit)
    timeout = aiohttp.ClientTimeout(total=total_timeout)
    async with aiohttp.ClientSession(connector=conn) as session:
        #test = await get_item_tei(group_id,"944KQVKQ",session)
        #test = await get_item_tei(group_id,"DXNCFAMR",session)
        #test = await get_item_tei(group_id,"6QNLQCG9",session)
        #test = await get_item_tei(group_id,"944KQVKQ",session)
        #test = await get_item_tei(group_id,"6BGIGGVN",session)
        test = await get_item_tei(group_id,"3BUJBNPK",session)
    ET.indent(test)
    ET.dump(test)
asyncio.run(get_item_tei_test())

2024-08-12 17:36:44,653 - Changed @n for item 3BUJBNPK from Woidich2007 to Woidich2007a.
2024-08-12 17:36:44,654 - Fetched TEI for 3BUJBNPK


<biblStruct xmlns="http://www.tei-c.org/ns/1.0" type="journalArticle" xml:id="woidich_2007_0001" corresp="http://zotero.org/groups/2165756/items/3BUJBNPK" n="Woidich2007a">
  <analytic>
    <title level="a">Everything you always wanted to know about 'āl, yi'ūl "to say" in Egyptian Arabic</title>
    <author>
      <forename>Manfred</forename>
      <surname>Woidich</surname>
    </author>
  </analytic>
  <monogr>
    <imprint>
      <biblScope unit="page">675-700</biblScope>
      <date>2007</date>
    </imprint>
  </monogr>
  <note type="tags">
    <note type="tag">
      <name type="reg">Egypt</name>
      <geo>27.000000 30.000000</geo>
    </note>
    <note type="tag">
      <name type="reg">Egypt-Sudan</name>
      <geo>22.316667 30.100000</geo>
    </note>
  </note>
</biblStruct>


In [332]:
template = ET.parse("listbibl_template.xml")
sourceDesc = template.find(".//tei:sourceDesc/tei:p", xmlns)
now=datetime.now()
dateTimeString="{:%Y-%m-%d %H:%M:%S}".format(now)
sourceDesc.text = sourceDesc.text.replace("{dateTime}", dateTimeString)
list_bibl = template.find("tei:text/tei:body/tei:listBibl",xmlns)

### Load template containing a listBibl-element that will be filled with the retrieved biblStruct elements


## Get the TEI

- For each item-id get the TEI and append it to list-bibl
- Save the resulting XML
- Use the same method as for the JSON for the whole group lib
- Then download in parallel.

We need to consider https://www.zotero.org/support/dev/web_api/v3/basics#rate_limiting

TODO: Maybe only get the all the top items in the group with https://api.zotero.org/groups/2165756/items/top?format=tei&limit=100&sort=creator


In [333]:
async def async_download():
    conn = aiohttp.TCPConnector(limit=conn_limit)
    timeout = aiohttp.ClientTimeout(total=total_timeout)
    async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:
        all_items = await get_all_items(session, group_id, format = "tei")
    return all_items

all_items = asyncio.run(async_download())
for bibl in all_items:
    extend_item_tei(bibl)
all_items[:] = sorted(all_items, key=lambda child: child.get("{http://www.w3.org/XML/1998/namespace}id"))
for item in all_items:
    list_bibl.append(item)

pathToZoteroExport='../../010_manannot/vicav_biblio_tei_zotero.xml'
with open(pathToZoteroExport, 'wb') as f:
    ET.indent(template)
    template.write(f, encoding='utf-8',xml_declaration=True)
    logging.info("TEI export done.")

2024-08-12 17:36:44,673 - Getting all items formatted as tei now.
2024-08-12 17:36:47,706 - Got https://api.zotero.org/groups/2165756/items/?limit=100&start=0&format=tei
2024-08-12 17:36:50,579 - Got https://api.zotero.org/groups/2165756/items/?format=tei&limit=100&start=200
2024-08-12 17:36:50,887 - Got https://api.zotero.org/groups/2165756/items/?format=tei&limit=100&start=300
2024-08-12 17:36:51,544 - Got https://api.zotero.org/groups/2165756/items/?format=tei&limit=100&start=400
2024-08-12 17:36:53,357 - Got https://api.zotero.org/groups/2165756/items/?format=tei&limit=100&start=100
2024-08-12 17:36:56,141 - Got https://api.zotero.org/groups/2165756/items/?format=tei&limit=100&start=700
2024-08-12 17:36:56,637 - Got https://api.zotero.org/groups/2165756/items/?format=tei&limit=100&start=500
2024-08-12 17:36:56,802 - Got https://api.zotero.org/groups/2165756/items/?format=tei&limit=100&start=800
2024-08-12 17:36:57,028 - Got https://api.zotero.org/groups/2165756/items/?format=tei&li

## Fix TEI export

Zotero's TEI serialization has some structural errors (elements in wrong position etc.) which we fix witha small XSLT transformation.


In [334]:
with saxonche.PySaxonProcessor(license=False) as proc:
# SaxonC 1.2.1 Python has many known bugs but isn't maintained anymore
# Many of the documented API specs are not working
    print(proc.version)
    proc.set_cwd(os.path.dirname(os.path.abspath('')))
    print(proc.cwd)

def transform(source, xsl, output, parameters=[]):
    try:
        with saxonche.PySaxonProcessor(license=False) as proc:
            proc.set_configuration_property("xi", "on")
            saxon = proc.new_xslt30_processor()
            for param in parameters:
                saxon.set_parameter(name=param, value=proc.make_string_value(parameters[param]))
            exec = saxon.compile_stylesheet(stylesheet_file=os.path.abspath(xsl))
            exec.apply_templates_returning_file(source_file=os.path.abspath(source), output_file=os.path.abspath(output))
            if exec.exception_occurred:
                print(saxon.get_error_message())
                print(f"Transformation failed: {source} with stylesheet {xsl} -> {output}")
            if os.path.exists(os.path.abspath(output)):
                return output
            else:
                print(f"Error transforming {source} with stylesheet {xsl}")
    except Exception as e:
        print(f"An error occurred: {e}")

def applyTransformation(pathToZoteroExport, xslPath, pathToOutput=None):
    if pathToOutput is None:
        pathToOutput = pathToZoteroExport + ".tmp"
    transform(pathToZoteroExport, xslPath, pathToOutput)
    return pathToOutput

# fix Zotero export
pathToFixZoteroXSL="../../082_scripts_xsl/fix_zotero_TEI_export.xsl"
applyTransformation(pathToZoteroExport, pathToFixZoteroXSL, pathToZoteroExport)

# add decade of data collection
pathToDOCXSL = "../../082_scripts_xsl/tei_2_tei_zotero_doc.xsl"
applyTransformation(pathToZoteroExport, pathToDOCXSL, pathToZoteroExport)

SaxonC-HE 12.4.2 from Saxonica
c:\Users\ksramo\Documents\WIBARAB_GEO\featuredb\080_scripts_generic


'../../010_manannot/vicav_biblio_tei_zotero.xml'

In [335]:
def validate(path, rngSchema):
    """Validate a document against the rngSchema. Returns a list of dicts of which each one represents a validation (or parsing) error."""
    validationErrors = []
    
    try:
        doc = etree.parse(path)
    
        # relaxng validation
        relaxng_doc = etree.parse(rngSchema)
        relaxng = etree.RelaxNG(relaxng_doc)
        relaxng.assertValid(doc)
        
    
    except etree.XMLSyntaxError as e:
        valErrObj = {
            "type" : "error",
            "message": str(e), 
            "line": e.lineno,
            "source": path, 
            "location": "n/a",
            "stage" : "parsing", 
            "exceptionType": type(e).__name__
        }
        
        validationErrors.append(valErrObj)     
        return validationErrors
        
    except etree.DocumentInvalid as e:
        for error in e.error_log:
            # we ignore rng errors about @schemaLocation since 
            # that is needed for validation in the TEI-enricher
            if error.message != "Invalid attribute schemaLocation for element TEI":
                location = "n/a" if error.path is None else error.path
                valErrObj = {
                    "type" : "error",
                    "message": error.message, 
                    "line": error.line, 
                    "source": path, 
                    "location": location,
                    "stage" : "relaxng", 
                    "exceptionType": type(e).__name__
                }
                # DEBUG
                print(valErrObj)
                validationErrors.append(valErrObj)        
    
    return validationErrors

## report duplicate xml:ids

Since bibl-ids are entered manually, they might contain duplicate. We report them here.


In [336]:
print (pathToZoteroExport)
zoteroExport = ET.parse(pathToZoteroExport)
ids=[]
bibls=zoteroExport.findall('.//biblStruct', xmlns)
for bibl in bibls:
    id = bibl.get("{http://www.w3.org/XML/1998/namespace}id")
    url = bibl.get("corresp")
    ids.append((id,url))

ids.sort(key=lambda x: x[0])
groups=[]
for key, group in itertools.groupby(ids, lambda x: x[0]):
    groups.append((key, [i[1] for i in list(group)]))

for i in groups:
    if len(i[1]) > 1:
        print(i[0]+": "+str(len(i[1]))+" entries with this id: "+", ".join(i[1]))

../../010_manannot/vicav_biblio_tei_zotero.xml
leitner_2021_8011: 2 entries with this id: http://zotero.org/groups/2165756/items/AJTU7MKV, http://zotero.org/groups/2165756/items/73V9CH52
ould_mohamed_baba_2004_0000: 2 entries with this id: http://zotero.org/groups/2165756/items/N79IC9JK, http://zotero.org/groups/2165756/items/HELVL6RU


## Validate Zotero Export against TEI all


In [337]:
tei_all = "../../803_RNG_Schematron/tei_all.rng"
errors=validate(pathToZoteroExport, tei_all)
if not errors:
    print("Zotero export is valid!")
else:
    errors

## Slow but validating code

The following code fetches items Zotero unique id by Zotero unique id. This is much slower but could potentially find missing items.
