# ELAN to TEI conversion 

**Author:** Daniel Schopper    
**Description:** This notebook automates the ELAN to TEI conversion in the WIBARAB Project. It is based on the same process in the SHAWI Project.
**Last Change:** 2023-10-10     
**History:**    
* 2023-10-10: Initial setup
* 2023-10-12: updated to SaxonC-HE (Omar Siam)

In [1]:
import io
import sharepy
import os
import logging
import requests
import pathlib
#import filetype – not used
from pathlib import Path
from urllib.parse import urlsplit
from lxml import isoschematron, etree
import saxonche
from zipfile import ZipFile
import subprocess
from datetime import datetime
# from inspect import getmembers, signature
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

## Configuration

In [2]:
# the host name of the Sharepoint installation (no scheme, no path)
sp_baseURL = "oeawacat.sharepoint.com"

# the sharepoint username + password are taken from the environment;
# a missing variable raises KeyError right here
sp_username = os.environ['SP_USERNAME']
# NOTE: this chained assignment also binds the password to a second global name, `pwd`
sp_pwd = pwd = os.environ['SP_PWD']

# the name of the Sharepoint Site
sp_siteName = "ACDH-CH_p_WIBARAB_BedoinTypeArabicNomadicSedentaryPeopleMidd"

# the path to the Excel file, relative to the Sharepoint site (already URL-encoded)
sp_pathToRecordingsXLSX = "Shared%20Documents/Fieldwork%20data%20+%20analysis/WIBARAB_Recordings.xlsx"


# the name of the local directory where downloaded data will be stored
dataDir = "data"

# the name of the local directory where downloaded libraries and other auxiliary code will be stored
libDir = "lib"

# the root of the git repository, relative to the notebook's working directory
dataHomeDir = "../.."

# path to project-specific stylesheets
pathToStylesheetsDir = dataHomeDir+"/082_scripts_xsl"

# the path to the ELAN transcription files
pathToELANDir = dataHomeDir+"/122_elan"

# the path to the non-annotated TEI transcription files
pathToTEIDir = dataHomeDir+"/103_tei_w"

# the path to the annotated TEI transcription files
pathToAnnotatedTEIDir = dataHomeDir+"/010_manannot"


# the path to the NoSkE verticals
noSkEVertDir = dataHomeDir+"/130_vert_plain"

# the path to the tei Corpus document produced by this script
pathToTeiCorpus = pathToTEIDir+"/wibarabCorpus.xml"


# the path to the audio files
# NOTE: despite the placeholder value, this variable is still read when the
# "pathToRecordings" stylesheet parameter for table2corpus.xsl is built
pathToRecordingsDir = "THIS_IS_NOT_USED"#"/mnt/univie_orientalistik/SHAWI/Recordings"

In [3]:
def deleteOutputOfPreviousRuns(pathToDir):
    """Delete the ``.xml`` and ``.txt`` files located directly inside ``pathToDir``.

    As a safety net only paths starting with ``..`` are accepted; any other
    path is rejected with an error log entry and nothing is deleted.
    Subdirectories (and their contents) are never touched, whatever their name.

    :param pathToDir: directory (string) whose previous output should be removed
    :return: None
    """
    logging.info("removing output of previous runs in "+pathToDir)
    if not pathToDir.startswith(".."):
        logging.error(pathToDir +" is not a relative path. Aborting deletion.")
        return
    if not os.path.isdir(pathToDir):
        # nothing has been produced yet (e.g. fresh checkout) - do not crash in os.scandir
        logging.warning(pathToDir+" does not exist. Nothing to delete.")
        return
    with os.scandir(pathToDir) as it:
        for entry in it:
            # the is_file() test must guard BOTH extensions: without the grouping a
            # directory called "something.txt" would be handed to os.remove()
            if entry.is_file() and entry.name.endswith(('.xml', '.txt')):
                os.remove(entry.path)

In [4]:
# Log the Saxon version and point the processor's working directory at the
# parent of the current working directory.
# NOTE(review): this processor is discarded when the with-block ends - presumably the
# cwd set here does not carry over to processors created later; confirm.
with saxonche.PySaxonProcessor(license=False) as proc:
    logging.info(proc.version)
    proc.set_cwd(os.path.dirname(os.path.abspath('')))
    logging.info(proc.cwd)


#set up directories
logging.info("** setting up directories **")

# remove data from previous runs
deleteOutputOfPreviousRuns(pathToTEIDir)
deleteOutputOfPreviousRuns(noSkEVertDir)

# create the local download directories unless they already exist
for i in [dataDir,libDir]: 
    if os.path.exists(i):
        logging.info("skipped existing directory '"+i+"'")
    else:
        os.mkdir(i)
        logging.info("created directory '"+i+"'")
        
        
# define which steps should be skipped. 
# Names checked further down in this notebook: "downloadExcelSheet", "xlsx2teitable", "runTEICorpo"

SKIP_PROCESSING = []#["runTEICorpo"]

2023-10-23 17:31:38,992 - SaxonC-HE 12.3 from Saxonica
2023-10-23 17:31:38,995 - /home/dschopper/data/WIBARAB/corpus/080_scripts_generic
2023-10-23 17:31:39,002 - ** setting up directories **
2023-10-23 17:31:39,004 - removing output of previous runs in ../../103_tei_w
2023-10-23 17:31:39,007 - removing output of previous runs in ../../130_vert_plain
2023-10-23 17:31:39,009 - skipped existing directory 'data'
2023-10-23 17:31:39,010 - skipped existing directory 'lib'


## Setup

### Step 1: get the latest release of the TEI Stylesheets 

In [5]:
# Setup

# fetch the TEI Stylesheets    
def installFromGithub(libraryName):
    """Download and unpack the latest GitHub release of ``libraryName`` into ``libDir``.

    The release is stored in ``<libDir>/<libraryName>/<tag>``. If that directory
    already exists the download is skipped. The directory is only created after
    the first release asset has been downloaded successfully and recognised as a
    zip archive, so that a failed download is not mistaken for an installed
    release on the next run.

    If the environment variable GITHUB_TOKEN is set, it is sent as bearer token.

    :param libraryName: repository in the form "owner/name", e.g. "TEIC/Stylesheets"
    :return: path to the release directory, or the integer 1 if anything went wrong
    """
    from zipfile import BadZipFile  # only ZipFile is imported at the top of the notebook

    auth = {}
    if 'GITHUB_TOKEN' in os.environ:
        auth = {"Authorization": "Bearer "+os.environ['GITHUB_TOKEN']}
    headers = {"Accept" : "application/vnd.github.v3+json"}
    repo = libraryName
    logging.info("** Fetching library "+repo+" **")
    libBasePath = libDir+"/"+repo

    # First we check which tag name the latest release has
    r = requests.get("https://api.github.com/repos/"+repo+"/releases/latest", headers={**headers, **auth})
    if r.status_code != 200:
        logging.error("An error occured fetching the latest release. Maybe there isn't any release? ")
        logging.error(r.content)
        return 1
    release = r.json()
    tag = release["tag_name"]

    # we check whether we have the latest version already
    # by checking if the respective path is already installed
    libReleasePath = libBasePath+"/"+tag
    if os.path.exists(libReleasePath):
        logging.info("We have already the latest version ("+tag+"). Exiting")
        logging.info("")
        return libReleasePath

    assets = release.get("assets") or []
    if not assets:
        logging.error("Release "+tag+" of "+repo+" has no downloadable assets.")
        return 1
    url = assets[0]["browser_download_url"]
    download = requests.get(url, headers=auth)
    if download.status_code != 200:
        logging.error("Downloading "+url+" failed with HTTP status "+str(download.status_code))
        return 1
    payload = download.content

    # validate the archive in memory BEFORE anything is written to disk
    try:
        archive = ZipFile(io.BytesIO(payload))
    except BadZipFile:
        logging.error("The file downloaded from "+url+" is not a zip archive.")
        return 1

    zipfilename = os.path.basename(url)
    os.makedirs(libReleasePath, exist_ok=True)
    zipfilePath = libReleasePath +"/"+zipfilename
    with open(zipfilePath, 'wb') as zipfileOnDisk:
        zipfileOnDisk.write(payload)
    with archive:
        archive.extractall(path=libReleasePath)
    logging.info("Downloaded latest version ("+tag+") to "+libReleasePath)
    logging.info("")
    return libReleasePath


# install (or reuse) the latest releases of the TEI Guidelines and the TEI Stylesheets;
# each call returns the local release directory, or the integer 1 on failure
pathToTEIGuidelines=installFromGithub("TEIC/TEI")
pathToTEIStylesheets=installFromGithub("TEIC/Stylesheets")


2023-10-23 17:31:39,037 - ** Fetching library TEIC/TEI **
2023-10-23 17:31:39,383 - We have already the latest version (P5_Release_4.6.0). Exiting
2023-10-23 17:31:39,384 - 
2023-10-23 17:31:39,389 - ** Fetching library TEIC/Stylesheets **
2023-10-23 17:31:39,727 - We have already the latest version (v7.55.0). Exiting
2023-10-23 17:31:39,728 - 


### Step 2: Download the latest version of the Excel Sheet

In [6]:
# TODO will need to add credentials if this is run in non-interactive mode
def downloadFromSP(sp_filepath, force=False):
    """Fetch a file from the project's Sharepoint site into ``dataDir``.

    :param sp_filepath: path of the file relative to the Sharepoint site
    :param force: download even if a local copy with the same name exists
    :return: local path of the (possibly pre-existing) file
    """
    url = f"https://{sp_baseURL}/sites/{sp_siteName}/{sp_filepath}"
    logging.info(f"attempting to download file from '{url}'")
    downloadPath = dataDir+"/"+os.path.basename(sp_filepath)
    haveLocalCopy = os.path.exists(downloadPath)
    if force or not haveLocalCopy:
        # credentials come from the configuration cell
        session = sharepy.connect(sp_baseURL, username=sp_username, password=sp_pwd)
        session.getfile(url, filename=downloadPath)
    else:
        logging.info(f"skipping existing file {downloadPath}")
    return downloadPath


# the spreadsheet is re-downloaded on every run unless "downloadExcelSheet" is listed in SKIP_PROCESSING
pathToExcelSheet = downloadFromSP(sp_pathToRecordingsXLSX, force="downloadExcelSheet" not in SKIP_PROCESSING)
logging.info(pathToExcelSheet)

2023-10-23 17:31:39,744 - attempting to download file from 'https://oeawacat.sharepoint.com/sites/ACDH-CH_p_WIBARAB_BedoinTypeArabicNomadicSedentaryPeopleMidd/Shared%20Documents/Fieldwork%20data%20+%20analysis/WIBARAB_Recordings.xlsx'
2023-10-23 17:31:41,605 - data/WIBARAB_Recordings.xlsx


## Step 2: transform xlsx to TEI table

In [7]:
def transform(s, xsl, o, parameters=None):
    """Apply the XSLT stylesheet ``xsl`` to the document ``s`` and write the result to ``o``.

    A fresh Saxon processor (with XInclude processing switched on) is created
    for every call. Errors are logged, never raised.

    :param s: path to the source document
    :param xsl: path to the stylesheet
    :param o: path of the output file
    :param parameters: optional dict mapping stylesheet parameter names to string values
    :return: ``o`` if the transformation ran without a reported error and the
             output file exists afterwards, otherwise None
    """
    source = os.path.abspath(s)
    stylesheet = os.path.abspath(xsl)
    output = os.path.abspath(o)
    failed = False
    # processor keeps files open on Windows and in doing so prevents moving or copying them
    with saxonche.PySaxonProcessor(license=False) as proc:
        proc.set_configuration_property("xi", "on")
        saxon = proc.new_xslt30_processor()
        for name, value in dict(parameters or {}).items():
            saxon.set_parameter(name=name, value=proc.make_string_value(value))
        try:
            executable = saxon.compile_stylesheet(stylesheet_file=stylesheet)
            executable.set_global_context_item(file_name=source)
            # From the docs saxonc.html#PyXsltExecutable-set_initial_match_selection
            # This method does not set the global context item for the transformation;
            # if that is required, it can be done separately using the set_global_context_item method.
            executable.apply_templates_returning_file(source_file=source, output_file=output)
        except saxonche.PySaxonApiError as e:
            failed = True
            logging.error(str(e))
            logging.error(source+" - "+stylesheet+" -> "+output+" failed")
        if proc.exception_occurred:
            failed = True
            logging.error(proc.get_error_message())
            logging.error(source+" - "+stylesheet+" -> "+output+" failed")
    # the mere existence of the output file proves nothing: it may be left over from an
    # earlier run, or be the input itself when a document is transformed in place
    if failed or not os.path.exists(output):
        logging.error("there was an error transforming "+s+" with stylesheet "+xsl)
        return None
    return o

In [8]:
def xlsx2teitable(xlsx, output):
    """Convert an Excel workbook to a TEI table using the TEIC Stylesheets.

    The workbook (a zip archive) is unpacked into a directory next to it, named
    like the workbook without its file extension. The archive's ``_rels/.rels``
    file is then transformed with ``xlsx/xlsxtotei.xsl``.

    :param xlsx: path to the .xlsx file
    :param output: path of the TEI document to be written
    :return: whatever ``transform`` returns for this step (``output`` on success)
    """
    # first, extract contents of XLSX document to a temp directory;
    # only the file extension is stripped, the rest of the path is left alone
    unzipPath = os.path.splitext(xlsx)[0]
    if unzipPath == xlsx:
        # no extension to strip - the directory must not collide with the file itself
        unzipPath = xlsx+"_unzipped"
    os.makedirs(unzipPath, exist_ok=True)
    with ZipFile(xlsx) as workbook:
        workbook.extractall(path=unzipPath)

    # then transform the .rels file using the TEIC Stylesheets
    pathToXlsxtoteiXSL=pathToTEIStylesheets+"/xml/tei/stylesheet/xlsx/xlsxtotei.xsl"

    unzipURI = pathlib.Path(os.path.abspath(unzipPath)).as_uri()
    params = {
        "inputDir" : unzipURI,
        "workDir" : unzipURI
    }

    return transform(
        s = unzipPath+"/_rels/.rels",
        xsl = pathToXlsxtoteiXSL,
        o = output,
        parameters=params
    )

In [9]:
# the TEI table is written next to the downloaded spreadsheet
pathToTEItable=pathToExcelSheet.replace(".xlsx",".xml")

if "xlsx2teitable" not in SKIP_PROCESSING:
    xlsx2teitable(xlsx=pathToExcelSheet, output=pathToTEItable)
    # notice inserted right after the XML declaration of the generated file
    debugstring="""<!-- 
   THIS FILE IS INCLUDED IN THE GIT REPOSITORY ONLY FOR DEBUGGING PURPOSES. 
   
   The source of this file is constantly being edited at 
   https://oeawacat.sharepoint.com/sites/ACDH-CH_p_ShawiTypeArabicDialects_Shawi/_layouts/15/Doc.aspx?sourcedoc={F01FF43B-2409-4E31-A5BF-653E0559B160}&file=SHAWI%20Recordings.xlsx&action=default&mobileredirect=true&cid=f7311564-c2b6-4b08-9a52-468547688408
   So this copy is most probably already outdated.
   
  To update it, you can either run https://gitlab.com/acdh-oeaw/shawibarab/shawi-data/-/blob/main/080_scripts_generic/080_01_ELAN2TEI/ELAN2TEI.ipyn
   *OR*  
   1) download the Excel file manually from Sharepoint
   2) and tranform it to TEI using oxgarage.tei-c.org/ 
   
-->
    """
    # NOTE(review): the notice above still points to the SHAWI spreadsheet and repository - confirm and update
    xmlDeclaration = '<?xml version="1.0" encoding="UTF-8"?>'
    # context managers make sure the file is closed even if reading or writing fails
    with open(pathToTEItable,mode="r",encoding="UTF8") as f:
        src = f.read()
    if xmlDeclaration not in src:
        logging.warning("no XML declaration found in "+pathToTEItable+"; the debugging notice was not inserted")
    new = src.replace(xmlDeclaration, xmlDeclaration+'\n'+debugstring)
    with open(pathToTEItable, mode="wt",encoding="UTF8") as f:
        f.write(new)

    logging.info(pathToTEItable)

2023-10-23 17:31:42,053 - data/WIBARAB_Recordings.xml


## Step 3: transform TEI table to corpus header

In [10]:
# project-specific stylesheet that turns the TEI table into the corpus document
pathToTeitableToCorpusXSL=pathToStylesheetsDir+"/table2corpus.xsl"
# stylesheet parameters: the recordings directory as a file URI and the Sharepoint path of the spreadsheet
params = {
    "pathToRecordings" : pathlib.Path(os.path.abspath(pathToRecordingsDir)).as_uri(),
    "sp_pathToRecordingsXLSX": sp_pathToRecordingsXLSX
}
# NOTE(review): transform() as defined in this notebook catches PySaxonApiError itself,
# so this handler presumably never fires - confirm
try:
    transform(pathToTEItable, pathToTeitableToCorpusXSL, pathToTeiCorpus, params)
except saxonche.PySaxonApiError as e:
    logging.error("an error occured: " + str(e) + "\n" + pathToTEItable + ": " + pathToTeitableToCorpusXSL + " -> " + pathToTeiCorpus)
logging.info(pathToTeiCorpus)

2023-10-23 17:31:42,188 - ../../103_tei_w/wibarabCorpus.xml


## Step 4: Run TEICorpo

In [11]:
def installFromUrl(url, force=False):
    """Download the file at ``url`` into ``libDir`` unless it is already there.

    The local file name is the last path segment of the URL (the query string
    is ignored). No network request is made when the cached copy is used.

    :param url: URL of the file to download
    :param force: download even if a local copy exists
    :return: local path of the file
    :raises requests.HTTPError: if the server answers with an error status
    """
    filename = os.path.basename(urlsplit(url).path)
    downloadpath = libDir+"/"+filename
    if os.path.exists(downloadpath) and not force:
        logging.info("skipping download")
        return downloadpath
    r = requests.get(url)
    # never cache an HTML error page under the name of the library
    r.raise_for_status()
    with open(downloadpath, 'wb') as target:
        target.write(r.content)
    logging.info("file "+downloadpath+" downloaded")
    return downloadpath

# TODO check for filetype and automatically extract zip files
# so this can be re-used for other installations (the original comment was cut off here - confirm intent)

# TEICorpo (pinned to a specific commit) and commons-io 2.11.0
installFromUrl("https://github.com/christopheparisse/teicorpo/blob/e06a01ad5cb4c3aef631b3749ce59b5eb6f5ea11/teicorpo.jar?raw=true")
installFromUrl("https://repo1.maven.org/maven2/commons-io/commons-io/2.11.0/commons-io-2.11.0.jar")
# wildcard over everything in libDir; passed to `java -cp` when TEICorpo is run
pathToTeiCorpo=libDir+"/*"
logging.info(pathToTeiCorpo)

2023-10-23 17:31:43,112 - skipping download
2023-10-23 17:31:43,199 - skipping download
2023-10-23 17:31:43,201 - lib/*


Collect all ELAN documents from pathToELANDir

In [12]:
# list of per-document info dicts; replaced further down by the result of processDir(pathToELANDir)
ELANDocs = []

In [13]:
def processDir(pathToDir):
    """Recursively collect all ELAN documents (``*.eaf``) below ``pathToDir``.

    :param pathToDir: directory to scan
    :return: list with one dict per ELAN document. ``filepath``, ``filename`` and
             ``basename`` describe the .eaf file. ``TEI_annotated`` is the absolute
             path to a same-named .xml file in ``pathToAnnotatedTEIDir``, or False if
             there is none. ``tmpDir``, ``filepath_tmp_TEI`` and ``TEI`` are
             initialised to False.
    """
    logging.info("processing "+pathToDir)
    collected = []
    for entry in os.scandir(pathToDir):
        name = os.path.basename(entry)
        if entry.is_dir():
            # descend into subdirectories
            collected += processDir(pathToDir+"/"+name)
            continue
        if not name.endswith(".eaf"):
            continue

        stem = Path(entry).stem
        # check whether there is already a manually annotated TEI version of this ELAN document
        annotatedPath = os.path.abspath(pathToAnnotatedTEIDir+"/"+stem+".xml")
        collected.append({
            "filepath" : os.path.abspath(entry), # path to the ELAN document
            "filename" : name,
            "basename" : stem,
            "TEI_annotated" : annotatedPath if os.path.exists(annotatedPath) else False,
            "tmpDir" : False,  # path to temporary output files (e.g. output of TEICorpo)
            "filepath_tmp_TEI" : False, # path to the output of TEICorpo
            "TEI" : False # path to the TEI representation of the ELAN document with metadata from the spreadsheet
        })
    return collected

In [14]:
# one info dict per .eaf file found (recursively) below pathToELANDir
ELANDocs=processDir(pathToELANDir)

2023-10-23 17:31:43,258 - processing ../../122_elan
2023-10-23 17:31:43,260 - processing ../../122_elan/AID


In [15]:
# log the absolute path of every ELAN document that will be processed
for d in ELANDocs:
    logging.info(d["filepath"])

2023-10-23 17:31:43,274 - /home/dschopper/data/WIBARAB/corpus/122_elan/AID/03_NZ_M.73_Karantina_historyKarantina.eaf
2023-10-23 17:31:43,276 - /home/dschopper/data/WIBARAB/corpus/122_elan/AID/15_NZ_M.73_Karantina_ProphetstoryValues.eaf
2023-10-23 17:31:43,278 - /home/dschopper/data/WIBARAB/corpus/122_elan/AID/09_NZ_M.73_Karantina_creditSon.picturesWedding.eaf
2023-10-23 17:31:43,279 - /home/dschopper/data/WIBARAB/corpus/122_elan/AID/05_NZ_M.73_Karantina_shyuxa.eaf
2023-10-23 17:31:43,280 - /home/dschopper/data/WIBARAB/corpus/122_elan/AID/18_NZ_M.73_Karantina_tamerStory.eaf
2023-10-23 17:31:43,281 - /home/dschopper/data/WIBARAB/corpus/122_elan/AID/17_NZ_M.73_Karantina_shadirStory.eaf
2023-10-23 17:31:43,282 - /home/dschopper/data/WIBARAB/corpus/122_elan/AID/14_NZ_M.73_Karantina_Beirutbefore.eaf
2023-10-23 17:31:43,283 - /home/dschopper/data/WIBARAB/corpus/122_elan/AID/04_NZ_M.73_Karantina_Pictures.eaf
2023-10-23 17:31:43,284 - /home/dschopper/data/WIBARAB/corpus/122_elan/AID/01_NZ_M.73_

In [16]:
def runTEICorpo(docs=None):
    """Convert every ELAN document in ``docs`` to TEI by running TEICorpo (Java).

    A temporary directory named after the current date and time (minute
    resolution) is created below ``pathToTEIDir``. TEICorpo writes its output
    there as ``ELAN_<basename>.xml``. Each document's info dict is updated in place:

    * ``filepath_tmp_TEI``: absolute path to the TEICorpo output
    * ``tmpDir``: the temporary directory
    * ``TEI``: absolute path of the final TEI document in ``pathToTEIDir``

    :param docs: list of document info dicts as produced by ``processDir``;
                 None (the default) is treated as an empty list
    :return: None
    """
    if docs is None:
        docs = []
    runtime = datetime.now().strftime("%Y-%m-%d_%H-%M")
    tmpDir = pathToTEIDir+"/"+runtime
    os.makedirs(tmpDir, exist_ok=True)
    for doc in docs:
        pathToInput = doc["filepath"]
        filenameTEI = doc["basename"]+".xml"
        pathToOutput = tmpDir+"/"+"ELAN_"+filenameTEI
        doc["filepath_tmp_TEI"] = os.path.abspath(pathToOutput)
        doc["tmpDir"] = tmpDir
        doc["TEI"] = os.path.abspath(pathToTEIDir + "/" + filenameTEI)
        res = subprocess.run(["java", "-cp", pathToTeiCorpo, "-Dfile.encoding=UTF-8", "fr.ortolang.teicorpo.TeiCorpo", "-from","elan", "-to","tei", "-o",pathToOutput, pathToInput], capture_output=True, encoding="UTF-8")
        print(res.stdout)
        print(res.stderr)
        print(pathToOutput)
        if res.returncode != 0:
            # do not let a failed conversion pass unnoticed in the wall of output
            logging.error("TEICorpo exited with code "+str(res.returncode)+" for "+pathToInput)

Run TEICorpo on all ELANDocs, writing the paths of the TEI output back into each document's entry.

In [None]:
# skipped when "runTEICorpo" is listed in SKIP_PROCESSING; in that case the
# path entries of the ELANDocs dicts keep their initial value False
if not "runTEICorpo" in SKIP_PROCESSING:
    runTEICorpo(docs=ELANDocs)

TeiCorpo (version 1.4.44) 14/12/2021 08:07 Version TEI_CORPO: 0.9.1


../../103_tei_w/2023-10-23_17-31/ELAN_03_NZ_M.73_Karantina_historyKarantina.xml
TeiCorpo (version 1.4.44) 14/12/2021 08:07 Version TEI_CORPO: 0.9.1


../../103_tei_w/2023-10-23_17-31/ELAN_15_NZ_M.73_Karantina_ProphetstoryValues.xml
TeiCorpo (version 1.4.44) 14/12/2021 08:07 Version TEI_CORPO: 0.9.1


../../103_tei_w/2023-10-23_17-31/ELAN_09_NZ_M.73_Karantina_creditSon.picturesWedding.xml
TeiCorpo (version 1.4.44) 14/12/2021 08:07 Version TEI_CORPO: 0.9.1


../../103_tei_w/2023-10-23_17-31/ELAN_05_NZ_M.73_Karantina_shyuxa.xml
TeiCorpo (version 1.4.44) 14/12/2021 08:07 Version TEI_CORPO: 0.9.1


../../103_tei_w/2023-10-23_17-31/ELAN_18_NZ_M.73_Karantina_tamerStory.xml
TeiCorpo (version 1.4.44) 14/12/2021 08:07 Version TEI_CORPO: 0.9.1


../../103_tei_w/2023-10-23_17-31/ELAN_17_NZ_M.73_Karantina_shadirStory.xml
TeiCorpo (version 1.4.44) 14/12/2021 08:07 Version TEI_CORPO: 0.9.1


../../103_tei_w/2023-10-23_17-31/ELAN_14

## Step 5: Merge metadata and TEICorpo Output

In [None]:
def mergeMetadata(docInfo, p):
    """Replace the teiHeader of the TEICorpo output with the metadata from the corpus document.

    The work is done by ``mergeHeaderAndTranscription.xsl``, which receives ``p``
    as stylesheet parameters. The result is written to
    ``<tmpDir>/<basename>_00_metaMerged.xml`` and that path is stored in
    ``docInfo["filepath_tmp_00_mergedMetadata"]``.

    :param docInfo: info dict of the document; reads ``filepath_tmp_TEI``, ``tmpDir`` and ``basename``
    :param p: stylesheet parameters; must contain ``pathToCorpusDoc``
    :return: path to the merged document, or False if the transformation failed
    """
    # TODO The matching logic should be revised, it's too messy
    # probably move to the jupyter-notebook instead of having it here.

    # <xsl:variable name="corpusDoc" select="doc($pathToCorpusDoc)" as="document-node()"/>
    # <xsl:variable name="IDcandidates" select="$corpusDoc//*:title"/>
    # <xsl:variable name="pathSegs" select="tokenize(base-uri($input),'/')"/>
    # <xsl:variable name="recordingID" select="$IDcandidates[some $x in $pathSegs satisfies contains(lower-case($x), lower-case(.))]"/>

    pathToTmpTEI=docInfo["filepath_tmp_TEI"]
    pathToMergedTEI=docInfo["tmpDir"]+"/"+docInfo["basename"]+"_00_metaMerged.xml"
    pathToMergeXSL=pathToStylesheetsDir+"/mergeHeaderAndTranscription.xsl"

    logging.info("trying to inject metadata from "+p["pathToCorpusDoc"]+" into "+pathToTmpTEI)

    try:
        merged = transform(s=pathToTmpTEI, xsl=pathToMergeXSL, o=pathToMergedTEI, parameters=p)
    except saxonche.PySaxonApiError as e:
        # only names that exist in this function may be used here
        logging.error("an error occured: " + str(e) + "\n" + pathToTmpTEI + ": " + pathToMergeXSL + " -> " + pathToMergedTEI)
        merged = None

    docInfo["filepath_tmp_00_mergedMetadata"]=pathToMergedTEI
    if not merged:
        # report the failure to the caller instead of handing out a path to a file that was not written
        logging.error("merging metadata into "+pathToTmpTEI+" did not produce "+pathToMergedTEI)
        return False
    return pathToMergedTEI

## Step 6: Post-process merged TEI document prior to tokenization

In [None]:
def postProcessMergedTEI(docObject, pathToInput):
    """Run ``postprocessTEICorpoOutput.xsl`` on the merged document, overwriting it in place.

    :param docObject: info dict of the document; receives the key ``filepath_tmp_TEImergedMetadata``
    :param pathToInput: path to the merged TEI document (input and output at once)
    :return: ``pathToInput``, or None if that file does not exist
    """
    target = pathToInput
    logging.info("running post-metadata-merge processing on "+target)

    # guard clause: nothing to do without an input file
    if not os.path.exists(os.path.abspath(target)):
        logging.error("file "+target+" does not exist.")
        return None

    stylesheet = pathToStylesheetsDir+"/postprocessTEICorpoOutput.xsl"
    try:
        transform(target, stylesheet, target, {})
    except saxonche.PySaxonApiError as e:
        logging.error("an error occured: " + str(e) + "\n" + target + ": " + stylesheet + " -> " + target)

    docObject["filepath_tmp_TEImergedMetadata"]=target
    return target

## Step 7: Tokenization of unannotated texts

Run a local copy of [xsl-tokenizer](https://github.com/acdh-oeaw/xsl-tokenizer)

The merged TEI document is tokenized for further manual annotation.

### Step 7.0: (Re-)generate tokenizer stylesheets (optional)

Regenerate the XSLs used in the following steps.
This cannot be done with saxonpy (XIncludes are not resolved); use the Saxon command line instead:
```bash
java -jar Saxon-HE-9.9.1-8.jar -s:profile.xml -xi:on -xsl:xsl/make_xsl.xsl
```

For all the ELAN files converted to TEI:

### Step 7.1: Remove new lines

Remove new lines and store to intermediate document:

In [None]:
def removeNL(docObject, pathToInput):
    """Tokenizer step 0: remove new lines from the document using ``rmNl.xsl``.

    The result is written to ``<tmpDir>/<basename>_01_nlRmd.xml``. That path is
    stored in ``docObject["filepath_tmp_t0_rmnl"]`` in any case.

    :param docObject: info dict of the document; reads ``tmpDir`` and ``basename``
    :param pathToInput: path to the document to process
    :return: path to the output document, or False if the transformation failed
    """
    s = pathToInput
    o = docObject["tmpDir"]+'/'+docObject["basename"]+"_01_nlRmd.xml"
    logging.info("removing new lines from "+s)
    result = transform(s = s, xsl = "./tokenizer/xsl/rmNl.xsl", o = o)
    docObject["filepath_tmp_t0_rmnl"]=o
    if not result:
        # let the caller's "if not ..." check actually see the failure
        logging.error("tokenizing step 0 / removing newlines from "+s+" did not produce "+o)
        return False
    return o

### Step 7.2: create w tags


In [None]:
def tokenize(docInfo, pathToInput):
    """Tokenizer step 1: tokenize the document using ``wrapper_toks.xsl``.

    The result is written to ``<tmpDir>/<basename>_02_toks.xml``. That path is
    stored in ``docInfo["filepath_tmp_t1_w"]`` in any case.

    :param docInfo: info dict of the document; reads ``tmpDir`` and ``basename``
    :param pathToInput: path to the document to process
    :return: path to the output document, or False if the transformation failed
    """
    s = pathToInput
    logging.info("tokenizing "+s)
    o = docInfo["tmpDir"]+'/'+docInfo["basename"]+"_02_toks.xml"
    result = transform(s = s, xsl = "./tokenizer/wrapper_toks.xsl", o = o)
    docInfo["filepath_tmp_t1_w"]=o
    if not result:
        # let the caller's "if not ..." check actually see the failure
        logging.error("tokenizing step 1 / tokenization of "+s+" did not produce "+o)
        return False
    return o

### Step 7.3: Add part attributes to w tags

Add Part-Attributes and explicit token links:

In [None]:
def addP(docInfo, pathToInput):
    """Tokenizer step 2: run ``wrapper_addP.xsl`` (adds @part on <w>) on the tokenized document.

    The result is written to ``<tmpDir>/<basename>_03_tokenized.xml``. That path
    is stored in ``docInfo["filepath_tmp_t2_part"]`` in any case.

    :param docInfo: info dict of the document; reads ``tmpDir`` and ``basename``
    :param pathToInput: path to the document to process
    :return: path to the output document, or False if the transformation failed
    """
    s = pathToInput
    logging.info("adding @part on <w>")
    o = docInfo["tmpDir"]+'/'+docInfo["basename"]+"_03_tokenized.xml"
    result = transform(s = s, xsl = "./tokenizer/wrapper_addP.xsl", o = o)
    docInfo["filepath_tmp_t2_part"]=o
    if not result:
        # let the caller's "if not ..." check actually see the failure
        logging.error("tokenizing step 2 / adding w/@part to "+s+" did not produce "+o)
        return False
    return o

### Step 7.4: apply project-specific post-processing

Do some post tokenization processing specific to the Shawi project.

In [None]:
def postProcess(docInfo, pathToInput):
    """Tokenizer step 3: apply the post-tokenization stylesheet ``postTokenization/1.xsl``.

    The result is written to ``<tmpDir>/<basename>_04_posttok.xml``. That path is
    stored in ``docInfo["filepath_tmp_t3_post"]`` in any case.

    :param docInfo: info dict of the document; reads ``tmpDir`` and ``basename``
    :param pathToInput: path to the document to process
    :return: path to the output document, or False if the transformation failed
    """
    s = pathToInput
    logging.info("applying post-tokenization processing to "+s)
    o = docInfo["tmpDir"]+'/'+docInfo["basename"]+"_04_posttok.xml"
    result = transform(s = s, xsl = "./tokenizer/postTokenization/1.xsl", o = o)
    docInfo["filepath_tmp_t3_post"]=o
    if not result:
        # let the caller's "if not ..." check actually see the failure
        logging.error("post-tokenization processing of "+s+" did not produce "+o)
        return False
    return o

### Step 7.5: move token namespace from xtoks to TEI

**--> This step creates the files which data curators will copy to `010_manannot` and annotate using the TEI enricher**

In [None]:
def createTEIForAnnotation(docInfo, pathToInput):
    """Create the TEI document for manual annotation using ``custom_xtoks2tei.xsl``.

    The stylesheet is run with ``preserve-ws`` set to "false". The result is
    written to ``docInfo["TEI"]``.

    :param docInfo: info dict of the document; reads ``TEI``
    :param pathToInput: path to the post-processed, tokenized document
    :return: path to the TEI document, or False if the transformation failed
    """
    s = pathToInput
    o = docInfo["TEI"]
    logging.info("creating TEI document for annotation from "+s)
    result = transform(s = s, xsl = "./tokenizer/custom_xtoks2tei.xsl", o = o, parameters = {"preserve-ws": "false"})
    if not result:
        # let the caller's "if not ..." check actually see the failure
        logging.error("creating the TEI document for annotation from "+s+" did not produce "+str(o))
        return False
    return o

## Step 7: Create NoSke input

We create verticals from the unannotated texts and attach the token annotations from `010_manannot` to them.

### Step 7.1 Create XML vertical from tokenized XML documents

We take the tokenized XML document (before its tokens are moved to the TEI namespace) and create an XML vertical from it:

In [None]:
def createXMLVert(docInfo, pathToInput):
    """Create an XML vertical from the tokenized document using ``custom_xtoks2vert.xsl``.

    :param docInfo: info dict of the document; reads ``tmpDir`` and ``basename``
    :param pathToInput: path to the post-processed, tokenized document
    :return: path of the XML vertical, ``<tmpDir>/<basename>_05_vert.xml``
    """
    logging.info("creating XML vertical from "+pathToInput)
    verticalName = docInfo["basename"]+"_05_vert.xml"
    verticalPath = docInfo["tmpDir"]+'/'+verticalName
    transform(
        s = pathToInput,
        xsl = "./tokenizer/custom_xtoks2vert.xsl",
        o = verticalPath
    )
    return verticalPath

### Step 7.2: attach manual annotations to the XML vertical

In [None]:
def attachAnnotationsToXMLVert(docInfo, pathToXMLVertical):
    """Try to add existing annotations to the newly converted document, if they exist.

    If ``docInfo["TEI_annotated"]`` points to an existing file, ``copyAnaToVert.xsl``
    is applied to the XML vertical with that file passed as ``path_to_annotated_doc``.
    The result is written to ``<tmpDir>/<basename>_05_vert_annot.xml``.

    :param docInfo: info dict of the document; reads ``TEI_annotated``, ``tmpDir`` and ``basename``
    :param pathToXMLVertical: path to the XML vertical
    :return: path to the annotated XML vertical, or None if there are no
             annotations or the transformation failed
    """
    annotatedTEI = docInfo["TEI_annotated"]
    if not (annotatedTEI and os.path.exists(os.path.abspath(annotatedTEI))):
        logging.info("No previous annotations found for "+docInfo["basename"])
        return None

    o = docInfo["tmpDir"] + "/" + docInfo["basename"] + "_05_vert_annot.xml"
    result = transform(
        s = pathToXMLVertical,
        xsl = pathToStylesheetsDir+"/copyAnaToVert.xsl",
        o = o,
        parameters = {
            "path_to_annotated_doc": pathlib.Path(os.path.abspath(annotatedTEI)).as_uri()
        }
    )
    if not result:
        logging.error("attaching annotations to "+str(pathToXMLVertical)+" did not produce "+o)
        return None
    # hand the path back - otherwise the caller cannot use the annotated vertical
    return o

### Step 7.3 convert XML vertical to text vertical

Create a vertical for NoSkE

In [None]:
def createNoSkEVert(docInfo, pathToInput):
    """Convert an XML vertical to a text vertical using ``wrapper_vert2txt.xsl``.

    :param docInfo: info dict of the document; reads ``basename``
    :param pathToInput: path to the XML vertical
    :return: path of the text vertical, ``<noSkEVertDir>/<basename>.txt``
    """
    textVertical = noSkEVertDir + "/" + docInfo["basename"] + ".txt"
    transform(
        s = pathToInput,
        xsl = "./tokenizer/wrapper_vert2txt.xsl",
        o = textVertical
    )
    return textVertical

## Run Steps 6- 8

In [None]:
# the corpus document is handed to the merge stylesheet as a file URI
mergeParam = { "pathToCorpusDoc": pathlib.Path(os.path.abspath(pathToTeiCorpus)).as_uri() }

# Run every document through the pipeline. As soon as a step reports a failure
# (falsy return value) the error is logged and the loop moves on to the next document.
for doc in ELANDocs:
    # str() because both entries are still False if runTEICorpo has not been run
    logging.info("\n\n*** processing "+doc["basename"]+': '+str(doc["filepath_tmp_TEI"])+" -> "+str(doc["TEI"]))

    if not doc["filepath_tmp_TEI"]:
        logging.error("no TEICorpo output registered for "+doc["basename"]+" (was runTEICorpo skipped?). Skipping.")
        continue

    mdMerged = mergeMetadata(doc, mergeParam)
    if not mdMerged:
        logging.error("mergeMetadata did not return expected value. Expected path to merged tmp TEI. returned value: "+str(mdMerged))
        continue

    mdMergedPostProcessed = postProcessMergedTEI(doc, mdMerged)
    if not mdMergedPostProcessed:
        logging.error("mdMergedPostProcessed did not return expected value. Expected path, got "+str(mdMergedPostProcessed))
        continue

    nlRmved = removeNL(doc, mdMergedPostProcessed)
    if not nlRmved:
        logging.error("removeNL did not return expected value. Expected path, got "+str(nlRmved))
        continue

    tokenized = tokenize(doc, nlRmved)
    if not tokenized:
        logging.error("tokenize did not return expected value. Expected path, got "+str(tokenized))
        continue

    pAdded = addP(doc, tokenized)
    if not pAdded:
        logging.error("addP did not return expected value. Expected path, got "+str(pAdded))
        continue

    tokenizedPostProcessed = postProcess(doc, pAdded)
    if not tokenizedPostProcessed:
        logging.error("postProcess did not return expected value. Expected path, got "+str(tokenizedPostProcessed))
        continue

    # the TEI document for annotation and the verticals are independent of each other,
    # so a failure here does not stop the creation of the verticals
    teiForAnnotation = createTEIForAnnotation(doc, tokenizedPostProcessed)
    if not teiForAnnotation:
        logging.error("createTEIForAnnotation did not return expected value. Expected path, got "+str(teiForAnnotation))

    xmlVert = createXMLVert(doc, tokenizedPostProcessed)
    if not xmlVert:
        logging.error("createXMLVert did not return expected value. Expected path, got "+str(xmlVert))
        continue

    # prefer the vertical with annotations attached; fall back to the plain one
    annotationsAttached = attachAnnotationsToXMLVert(doc, xmlVert)
    if annotationsAttached:
        createNoSkEVert(doc, annotationsAttached)
    else:
        createNoSkEVert(doc, xmlVert)

    logging.info(doc["basename"]+": done.")
            

## Replace TEI elements with x-includes in corpus document