# ELAN to TEI conversion 

**Author:** Daniel Schopper    
**Description:** This notebook automates the ELAN to TEI conversion in the WIBARAB Project. It is based on the same process in the SHAWI Project.  
**Last Change:** 2023-10-12     
**History:**    
* 2023-10-10: Initial setup
* 2023-10-12: updated to SaxonC-HE (Omar Siam)

In [1]:
import io
import sharepy
import os
import logging
import requests
import pathlib
#import filetype – not used
from pathlib import Path
from urllib.parse import urlsplit
import saxonche
from zipfile import ZipFile
import subprocess
from datetime import datetime
# from inspect import getmembers, signature
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

## Configuration

In [2]:
# --- SharePoint access -------------------------------------------------------

# host name of the SharePoint installation
sp_baseURL = "oeawacat.sharepoint.com"

# user name and password come from the environment;
# a KeyError is raised right here if one of the two variables is not set
sp_username = os.environ["SP_USERNAME"]
sp_pwd = os.environ["SP_PWD"]
# second name bound to the same password value
pwd = sp_pwd

# name of the SharePoint site
sp_siteName = "ACDH-CH_p_WIBARAB_BedoinTypeArabicNomadicSedentaryPeopleMidd"

# URL-encoded path of the recordings spreadsheet below the site
sp_pathToRecordingsXLSX = "Shared%20Documents/Fieldwork%20data%20+%20analysis/WIBARAB_Recordings.xlsx"


# --- local working directories -----------------------------------------------

# downloaded data is stored here
dataDir = "data"

# downloaded libraries and other auxiliary code are stored here
libDir = "lib"


# --- layout of the git repository --------------------------------------------

# root of the git repository
dataHomeDir = "../.."

# project-specific stylesheets
pathToStylesheetsDir = f"{dataHomeDir}/082_scripts_xsl"

# ELAN transcription files
pathToELANDir = f"{dataHomeDir}/122_elan"

# TEI transcription files without annotations
pathToTEIDir = f"{dataHomeDir}/103_tei_w"

# TEI transcription files with annotations
pathToAnnotatedTEIDir = f"{dataHomeDir}/010_manannot"

# NoSkE verticals
noSkEVertDir = f"{dataHomeDir}/130_vert_plain"

# teiCorpus document written by this notebook
pathToTeiCorpus = f"{pathToTEIDir}/shawiCorpus.xml"

# audio files: placeholder value only; it is converted to a file URI and handed
# to table2corpus.xsl as the "pathToRecordings" parameter
pathToRecordingsDir = "THIS_IS_NOT_USED"#"/mnt/univie_orientalistik/SHAWI/Recordings"

In [3]:
# Log which Saxon build is in use and the working directory set on the processor.
with saxonche.PySaxonProcessor(license=False) as proc:
    logging.info(proc.version)
    proc.set_cwd(os.path.dirname(os.path.abspath('')))
    logging.info(proc.cwd)


# Make sure the local working directories exist.
logging.info("** setting up directories **")
for localDir in (dataDir, libDir):
    if not os.path.exists(localDir):
        os.mkdir(localDir)
        logging.info(f"created directory '{localDir}'")
        continue
    logging.info(f"skipped existing directory '{localDir}'")


# Names of processing steps that are to be skipped. The cells below look for
# "downloadExcelSheet", "xlsx2teitable" and "runTEICorpo" in this list.

SKIP_PROCESSING = []#["runTEICorpo"]

2023-10-20 11:01:44,112 - SaxonC-HE 12.3 from Saxonica
2023-10-20 11:01:44,113 - /home/dschopper/data/WIBARAB/corpus/080_scripts_generic
2023-10-20 11:01:44,114 - ** setting up directories **
2023-10-20 11:01:44,115 - skipped existing directory 'data'
2023-10-20 11:01:44,115 - skipped existing directory 'lib'


## Setup

### Step 1: get the latest release of the TEI Stylesheets 

In [4]:
# Setup

# fetch the TEI Stylesheets    
def installFromGithub(libraryName):
    """Download and unpack the latest GitHub release of a repository into ``libDir``.

    The release is stored under ``libDir/<libraryName>/<tag>``. If that directory
    already exists, nothing is downloaded.

    If the environment variable ``GITHUB_TOKEN`` is set, it is sent as a bearer
    token with all requests.

    :param libraryName: repository in the form ``"owner/name"``, e.g. ``"TEIC/TEI"``
    :return: the path of the release directory, or ``1`` if the release could not
             be looked up or downloaded (an error is logged in that case)
    """
    import shutil

    auth = {}
    if 'GITHUB_TOKEN' in os.environ:
        auth = {"Authorization": "Bearer "+os.environ['GITHUB_TOKEN']}
    headers = {"Accept" : "application/vnd.github.v3+json"}
    repo = libraryName
    logging.info("** Fetching library "+repo+" **")
    libBasePath = libDir+"/"+repo

    # First we check which tag name the latest release has
    r = requests.get("https://api.github.com/repos/"+repo+"/releases/latest", headers={**headers, **auth})
    if r.status_code != 200:
        logging.error("An error occured fetching the latest release. Maybe there isn't any release? ")
        logging.error(r.content)
        return 1
    release = r.json()
    tag = release["tag_name"]

    # The release directory doubles as the "already installed" marker.
    libReleasePath = libBasePath+"/"+tag
    if os.path.exists(libReleasePath):
        logging.info("We have already the latest version ("+tag+"). Exiting")
        logging.info("")
        return libReleasePath

    assets = release.get("assets") or []
    if not assets:
        logging.error("Release "+tag+" of "+repo+" does not have any downloadable asset.")
        return 1
    url = assets[0]["browser_download_url"]

    # Check the download before anything is written: an error page must not
    # be stored as if it were the release archive.
    download = requests.get(url, headers=auth)
    if download.status_code != 200:
        logging.error("An error occured downloading "+url)
        logging.error(download.content)
        return 1

    zipfilename = os.path.basename(url)
    os.makedirs(libReleasePath, exist_ok=True)
    zipfilePath = libReleasePath +"/"+zipfilename
    try:
        with open(zipfilePath, 'wb') as zipfileHandle:
            zipfileHandle.write(download.content)
        with ZipFile(zipfilePath) as archive:
            archive.extractall(path=libReleasePath)
    except Exception:
        # A half-populated release directory would be mistaken for an installed
        # release on the next run, so remove it before passing the error on.
        shutil.rmtree(libReleasePath, ignore_errors=True)
        raise
    logging.info("Downloaded latest version ("+tag+") to "+libReleasePath)
    logging.info("")
    return libReleasePath


# Install (or re-use) the latest releases of the TEI Guidelines and the TEI Stylesheets.
# Each call yields the local release directory, or 1 if the release lookup failed.
pathToTEIGuidelines=installFromGithub("TEIC/TEI")
pathToTEIStylesheets=installFromGithub("TEIC/Stylesheets")


2023-10-20 11:01:44,134 - ** Fetching library TEIC/TEI **
2023-10-20 11:01:44,387 - We have already the latest version (P5_Release_4.6.0). Exiting
2023-10-20 11:01:44,389 - 
2023-10-20 11:01:44,398 - ** Fetching library TEIC/Stylesheets **
2023-10-20 11:01:44,732 - We have already the latest version (v7.55.0). Exiting
2023-10-20 11:01:44,732 - 


### Step 2: Download the latest version of the Excel Sheet

In [5]:
# TODO will need to add credentials if this is run in non-interactive mode
def downloadFromSP(sp_filepath, force=False):
    """Download a file from the project's SharePoint site into ``dataDir``.

    The credentials ``sp_username`` / ``sp_pwd`` from the configuration are used
    to open the SharePoint session.

    :param sp_filepath: URL-encoded path of the file below the SharePoint site
    :param force: download even if a local copy already exists
    :return: the local path ``dataDir/<file name>``; it is returned even if the
             download failed (an error is logged in that case)
    """
    url = "https://"+sp_baseURL+"/sites/"+sp_siteName+"/"+sp_filepath
    logging.info("attempting to download file from '"+url+"'")
    filename = os.path.basename(sp_filepath)
    downloadPath = dataDir+"/"+filename
    if os.path.exists(downloadPath) and not force:
        logging.info("skipping existing file "+downloadPath)
        return downloadPath

    s = sharepy.connect(sp_baseURL, username=sp_username, password=sp_pwd)
    # getfile() hands back the HTTP response; without looking at it a failed
    # download would go unnoticed and a stale or missing file would be used.
    response = s.getfile(url, filename=downloadPath)
    if response.status_code != 200:
        logging.error("download of '"+url+"' failed with HTTP status "+str(response.status_code))
    return downloadPath


# Fetch the recordings spreadsheet; an existing local copy is only re-used
# when "downloadExcelSheet" is listed in SKIP_PROCESSING.
pathToExcelSheet = downloadFromSP(sp_pathToRecordingsXLSX, force="downloadExcelSheet" not in SKIP_PROCESSING)
logging.info(pathToExcelSheet)

2023-10-20 11:01:44,746 - attempting to download file from 'https://oeawacat.sharepoint.com/sites/ACDH-CH_p_WIBARAB_BedoinTypeArabicNomadicSedentaryPeopleMidd/Shared%20Documents/Fieldwork%20data%20+%20analysis/WIBARAB_Recordings.xlsx'
2023-10-20 11:01:46,810 - data/WIBARAB_Recordings.xlsx


## Step 2: transform xlsx to TEI table

In [6]:
def transform(s, xsl, o, parameters=None):
    """Apply an XSLT stylesheet to a source document and write the result to a file.

    A fresh Saxon processor is created for every call and released at the end,
    because the processor keeps files open on Windows and thereby prevents
    moving or copying them.

    :param s: path to the source document
    :param xsl: path to the stylesheet
    :param o: path of the output file
    :param parameters: optional mapping of stylesheet parameter names to string values
    :return: ``o`` if the transformation ran without error and the output file
             exists, otherwise ``None`` (the error is logged)
    """
    # no shared mutable default; an empty list passed by older callers still works
    parameters = parameters or {}
    source = os.path.abspath(s)
    stylesheet = os.path.abspath(xsl)
    output = os.path.abspath(o)
    # An output file left over from an earlier run must not make a failed
    # transformation look successful, so failures are tracked explicitly.
    failed = False
    with saxonche.PySaxonProcessor(license=False) as proc:
        proc.set_configuration_property("xi", "on")
        saxon = proc.new_xslt30_processor()
        for name in parameters:
            saxon.set_parameter(name=name, value=proc.make_string_value(parameters[name]))
        try:
            executable = saxon.compile_stylesheet(stylesheet_file=stylesheet)
            executable.set_global_context_item(file_name=source)
            # From the docs saxonc.html#PyXsltExecutable-set_initial_match_selection
            # This method does not set the global context item for the transformation;
            # if that is required, it can be done separately using the set_global_context_item method.
            executable.apply_templates_returning_file(source_file=source, output_file=output)
        except saxonche.PySaxonApiError as e:
            failed = True
            logging.error(str(e))
            logging.error(source+" - "+stylesheet+" -> "+output+" failed")
        if proc.exception_occurred:
            failed = True
            logging.error(proc.get_error_message())
            logging.error(source+" - "+stylesheet+" -> "+output+" failed")
    if failed or not os.path.exists(output):
        logging.error("there was an error transforming "+s+" with stylesheet "+xsl)
        return None
    return o

In [7]:
def xlsx2teitable(xlsx, output):
    """Convert an Excel workbook to a TEI document using the TEIC Stylesheets.

    The workbook is unpacked into a directory next to it (same name without the
    file extension). Its ``_rels/.rels`` file is then transformed with
    ``xlsx/xlsxtotei.xsl`` from ``pathToTEIStylesheets``.

    :param xlsx: path to the .xlsx file
    :param output: path of the TEI document to be written
    :return: the result of ``transform`` (``output`` on success, otherwise ``None``)
    """
    # first, extract contents of XLSX document to a temp directory;
    # only the file extension is stripped, not every ".xlsx" in the path
    unzipPath = os.path.splitext(xlsx)[0]
    os.makedirs(unzipPath, exist_ok=True)
    with ZipFile(xlsx) as workbook:
        workbook.extractall(path=unzipPath)

    # then transform the .rels file using the TEIC Stylesheets
    pathToXlsxtoteiXSL=pathToTEIStylesheets+"/xml/tei/stylesheet/xlsx/xlsxtotei.xsl"

    # the stylesheet receives the unpacked directory as a file URI in both parameters
    unzipURI = pathlib.Path(os.path.abspath(unzipPath)).as_uri()
    params = {
        "inputDir" : unzipURI,
        "workDir" : unzipURI
    }

    return transform(
        s = unzipPath+"/_rels/.rels",
        xsl = pathToXlsxtoteiXSL,
        o = output,
        parameters=params
    )

In [8]:
# The TEI table is written next to the downloaded spreadsheet.
pathToTEItable=pathToExcelSheet.replace(".xlsx",".xml")

if "xlsx2teitable" not in SKIP_PROCESSING:
    xlsx2teitable(xlsx=pathToExcelSheet, output=pathToTEItable)
    # NOTE(review): the comment below still points to the SHAWI spreadsheet and
    # repository although this notebook downloads the WIBARAB spreadsheet - confirm
    # and update the text.
    debugstring="""<!-- 
   THIS FILE IS INCLUDED IN THE GIT REPOSITORY ONLY FOR DEBUGGING PURPOSES. 
   
   The source of this file is constantly being edited at 
   https://oeawacat.sharepoint.com/sites/ACDH-CH_p_ShawiTypeArabicDialects_Shawi/_layouts/15/Doc.aspx?sourcedoc={F01FF43B-2409-4E31-A5BF-653E0559B160}&file=SHAWI%20Recordings.xlsx&action=default&mobileredirect=true&cid=f7311564-c2b6-4b08-9a52-468547688408
   So this copy is most probably already outdated.
   
  To update it, you can either run https://gitlab.com/acdh-oeaw/shawibarab/shawi-data/-/blob/main/080_scripts_generic/080_01_ELAN2TEI/ELAN2TEI.ipyn
   *OR*  
   1) download the Excel file manually from Sharepoint
   2) and tranform it to TEI using oxgarage.tei-c.org/ 
   
-->
    """
    xmlDeclaration = '<?xml version="1.0" encoding="UTF-8"?>'
    with open(pathToTEItable, mode="r", encoding="UTF8") as f:
        src = f.read()
    # If the conversion did not overwrite an older file, the comment is already
    # in place and must not be inserted a second time.
    if debugstring not in src:
        new = src.replace(xmlDeclaration, xmlDeclaration+'\n'+debugstring, 1)
        with open(pathToTEItable, mode="wt", encoding="UTF8") as f:
            f.write(new)

    logging.info(pathToTEItable)

2023-10-20 11:01:47,205 - data/WIBARAB_Recordings.xml


## Step 3: transform TEI table to corpus header

In [9]:
# Turn the TEI table into the teiCorpus document with the project stylesheet table2corpus.xsl.
pathToTeitableToCorpusXSL=pathToStylesheetsDir+"/table2corpus.xsl"
params = {
    "pathToRecordings" : pathlib.Path(os.path.abspath(pathToRecordingsDir)).as_uri()
}
try:
    corpusResult = transform(pathToTEItable, pathToTeitableToCorpusXSL, pathToTeiCorpus, params)
except saxonche.PySaxonApiError as e:
    corpusResult = None
    logging.error("an error occured: " + str(e) + "\n" + pathToTEItable + ": " + pathToTeitableToCorpusXSL + " -> " + pathToTeiCorpus)
# transform() logs Saxon errors itself and signals them by returning None,
# so its result has to be checked before the output path is reported.
if corpusResult is None:
    logging.error("the corpus document " + pathToTeiCorpus + " could not be created")
else:
    logging.info(pathToTeiCorpus)

2023-10-20 11:01:47,313 - ../../103_tei_w/shawiCorpus.xml


## Step 4: Run TEICorpo

In [10]:
def installFromUrl(url, force=False):
    """Download a single file into ``libDir`` unless it is already there.

    :param url: URL of the file; the last segment of the URL path is used as file name
    :param force: download even if the file already exists locally
    :return: the local path ``libDir/<file name>``; it is returned even if the
             download failed (an error is logged and nothing is written in that case)
    """
    filename = os.path.basename(urlsplit(url).path)
    downloadpath = libDir+"/"+filename
    if os.path.exists(downloadpath) and not force:
        logging.info("skipping download")
        return downloadpath

    # only contact the server when the file is really needed
    r = requests.get(url)
    if r.status_code != 200:
        # do not store an error page under the name of the library
        logging.error("download of "+url+" failed with HTTP status "+str(r.status_code))
        return downloadpath
    with open(downloadpath, 'wb') as target:
        target.write(r.content)
    logging.info("file "+downloadpath+" downloaded")
    return downloadpath

# TODO check for filetype and automatically extract zip file 
# so this can be re-used for the insta
 
# Fetch the TeiCorpo jar (pinned to one commit) and the commons-io jar into libDir.
installFromUrl("https://github.com/christopheparisse/teicorpo/blob/e06a01ad5cb4c3aef631b3749ce59b5eb6f5ea11/teicorpo.jar?raw=true")
installFromUrl("https://repo1.maven.org/maven2/commons-io/commons-io/2.11.0/commons-io-2.11.0.jar")
# This value is passed to java as the "-cp" (class path) argument in runTEICorpo.
pathToTeiCorpo=libDir+"/*"
logging.info(pathToTeiCorpo)

2023-10-20 11:01:48,374 - skipping download
2023-10-20 11:01:48,499 - skipping download
2023-10-20 11:01:48,502 - lib/*


Collect all ELAN documents from pathToELANDir

In [11]:
# One record per ELAN transcription (*.eaf) found in pathToELANDir.
ELANDocs = []

for entry in os.scandir(pathToELANDir):
    eafName = os.path.basename(entry)
    if not eafName.endswith(".eaf"):
        continue
    stem = Path(entry).stem
    ELANDocs.append({
        "filepath" : os.path.abspath(entry),
        "filename" : eafName,
        "basename" : stem,
        # where the manually annotated TEI version of this document is expected;
        # whether that file exists is not checked here
        "TEI_annotated" : os.path.abspath(pathToAnnotatedTEIDir+"/"+stem+".xml")
    })

for record in ELANDocs:
    logging.info(record["filepath"])

2023-10-20 11:01:48,512 - /home/dschopper/data/WIBARAB/corpus/122_elan/SAU_2022_Speaker10_marriage.eaf
2023-10-20 11:01:48,513 - /home/dschopper/data/WIBARAB/corpus/122_elan/SAU_2022_Speaker20_traditionaltools.eaf
2023-10-20 11:01:48,514 - /home/dschopper/data/WIBARAB/corpus/122_elan/SAU_2022_Speaker20_shepherd.eaf
2023-10-20 11:01:48,516 - /home/dschopper/data/WIBARAB/corpus/122_elan/SAU_2022_Speaker14_childhood_womenswork.eaf


In [12]:
def runTEICorpo(docs=None):
    """Convert ELAN documents to TEI with TeiCorpo.

    All output of one run goes into a directory below ``pathToTEIDir`` that is
    named after the current date and time (to the minute). Every entry of
    ``docs`` is extended in place with the keys

    * ``filepath_tmp_TEI``: absolute path of the TeiCorpo output (``ELAN_<basename>.xml``)
    * ``tmpDir``: the run directory
    * ``TEI``: absolute path ``pathToTEIDir/<basename>.xml``

    :param docs: list of dicts with the keys ``filepath`` and ``basename``;
                 ``None`` is treated as an empty list
    """
    if docs is None:
        docs = []
    runtime = datetime.now().strftime("%Y-%m-%d_%H-%M")
    tmpDir = pathToTEIDir+"/"+runtime
    os.makedirs(tmpDir, exist_ok=True)
    for i in docs:
        pathToInput = i["filepath"]
        filenameTEI = i["basename"]+".xml"
        pathToOutput = tmpDir+"/"+"ELAN_"+filenameTEI
        i["filepath_tmp_TEI"] = os.path.abspath(pathToOutput)
        i["tmpDir"] = tmpDir
        i["TEI"] = os.path.abspath(pathToTEIDir + "/" + filenameTEI)
        res = subprocess.run(["java", "-cp", pathToTeiCorpo, "-Dfile.encoding=UTF-8", "fr.ortolang.teicorpo.TeiCorpo", "-from","elan", "-to","tei", "-o",pathToOutput, pathToInput], capture_output=True, encoding="UTF-8")
        print(res.stdout)
        print(res.stderr)
        print(pathToOutput)
        # make a failed java call visible instead of relying on the reader
        # spotting it in the printed stderr
        if res.returncode != 0:
            logging.error("TeiCorpo exited with code "+str(res.returncode)+" for "+pathToInput)

Run TEICorpo on all `ELANDocs`, writing the paths of the TEI output back into each entry of that list.

In [13]:
# TeiCorpo is only run when the step is not listed in SKIP_PROCESSING.
if "runTEICorpo" not in SKIP_PROCESSING:
    runTEICorpo(docs=ELANDocs)

TeiCorpo (version 1.4.44) 14/12/2021 08:07 Version TEI_CORPO: 0.9.1


../../103_tei_w/2023-10-20_11-01/ELAN_SAU_2022_Speaker10_marriage.xml
TeiCorpo (version 1.4.44) 14/12/2021 08:07 Version TEI_CORPO: 0.9.1


../../103_tei_w/2023-10-20_11-01/ELAN_SAU_2022_Speaker20_traditionaltools.xml
TeiCorpo (version 1.4.44) 14/12/2021 08:07 Version TEI_CORPO: 0.9.1


../../103_tei_w/2023-10-20_11-01/ELAN_SAU_2022_Speaker20_shepherd.xml
TeiCorpo (version 1.4.44) 14/12/2021 08:07 Version TEI_CORPO: 0.9.1


../../103_tei_w/2023-10-20_11-01/ELAN_SAU_2022_Speaker14_childhood_womenswork.xml


## Step 5: Merge metadata and TEICorpo Output

In [14]:
def mergeMetadata(docInfo, p):
    """Merge metadata into the TeiCorpo output of one document.

    Applies ``mergeHeaderAndTranscription.xsl`` from ``pathToStylesheetsDir`` to
    ``docInfo["filepath_tmp_TEI"]`` and writes ``<basename>_00_metaMerged.xml``
    into ``docInfo["tmpDir"]``.

    :param docInfo: record of one ELAN document (an entry of ``ELANDocs``)
    :param p: stylesheet parameters, handed through to ``transform``
    :return: the result of ``transform``
    """
    source = docInfo["filepath_tmp_TEI"]
    stylesheet = pathToStylesheetsDir+"/mergeHeaderAndTranscription.xsl"
    target = docInfo["tmpDir"]+'/'+docInfo["basename"]+"_00_metaMerged.xml"
    return transform(s=source, xsl=stylesheet, o=target, parameters=p)

## Step 6: Tokenization of unannotated texts

Run a local copy of [xsl-tokenizer](https://github.com/acdh-oeaw/xsl-tokenizer)

The merged TEI document is tokenized for further manual annotation.

### Step 6.0: (Re-)generate tokenizer stylesheets (optional)

Regenerate the XSLs used in the following steps.
This cannot be done with saxonpy (XIncludes are not resolved); use the Saxon command line instead:
```bash
java -jar Saxon-HE-9.9.1-8.jar -s:profile.xml -xi:on -xsl:xsl/make_xsl.xsl
```

For all the ELAN files converted to TEI:

### Step 6.1: Remove new lines

Remove new lines and store to intermediate document:

In [15]:
def removeNL(docInfo):
    """Remove new lines from the merged document.

    Applies ``./tokenizer/xsl/rmNl.xsl`` to ``<basename>_00_metaMerged.xml`` and
    writes ``<basename>_01_nlRmd.xml``; both files live in ``docInfo["tmpDir"]``.

    :param docInfo: record of one ELAN document (an entry of ``ELANDocs``)
    :return: the result of ``transform``
    """
    prefix = docInfo["tmpDir"]+'/'+docInfo["basename"]
    source = prefix+"_00_metaMerged.xml"
    target = prefix+"_01_nlRmd.xml"
    return transform(s=source, xsl="./tokenizer/xsl/rmNl.xsl", o=target)

### Step 6.2: create w tags


In [16]:
def tokenize(docInfo):
    """Tokenize the document from which new lines were removed.

    Applies ``./tokenizer/wrapper_toks.xsl`` to ``<basename>_01_nlRmd.xml`` and
    writes ``<basename>_02_toks.xml``; both files live in ``docInfo["tmpDir"]``.

    :param docInfo: record of one ELAN document (an entry of ``ELANDocs``)
    :return: the result of ``transform``
    """
    prefix = docInfo["tmpDir"]+'/'+docInfo["basename"]
    source = prefix+"_01_nlRmd.xml"
    target = prefix+"_02_toks.xml"
    return transform(s=source, xsl="./tokenizer/wrapper_toks.xsl", o=target)

### Step 6.3: Add part attributes to w tags

Add Part-Attributes and explicit token links:

In [17]:
def addP(docInfo):
    """Add part attributes and explicit token links to the tokens.

    Applies ``./tokenizer/wrapper_addP.xsl`` to ``<basename>_02_toks.xml`` and
    writes ``<basename>_03_tokenized.xml``; both files live in ``docInfo["tmpDir"]``.

    :param docInfo: record of one ELAN document (an entry of ``ELANDocs``)
    :return: the result of ``transform``
    """
    prefix = docInfo["tmpDir"]+'/'+docInfo["basename"]
    source = prefix+"_02_toks.xml"
    target = prefix+"_03_tokenized.xml"
    return transform(s=source, xsl="./tokenizer/wrapper_addP.xsl", o=target)

### Step 6.4: apply project-specific post-processing

Do some post tokenization processing specific to the Shawi project.

In [18]:
def postProcess(docInfo):
    """Run the project-specific post-tokenization stylesheet.

    Applies ``./tokenizer/postTokenization/1.xsl`` to ``<basename>_03_tokenized.xml``
    and writes ``<basename>_04_posttok.xml``; both files live in ``docInfo["tmpDir"]``.

    :param docInfo: record of one ELAN document (an entry of ``ELANDocs``)
    :return: the result of ``transform``
    """
    prefix = docInfo["tmpDir"]+'/'+docInfo["basename"]
    source = prefix+"_03_tokenized.xml"
    target = prefix+"_04_posttok.xml"
    return transform(s=source, xsl="./tokenizer/postTokenization/1.xsl", o=target)

### Step 6.5: move token namespace from xtoks to TEI

**--> This step creates the files which data curators will copy to `010_manannot` and annotate using the TEI enricher**

In [19]:
def createTEIForAnnotation(docInfo):
    """Create the TEI document that is handed over for manual annotation.

    Applies ``./tokenizer/custom_xtoks2tei.xsl`` (with ``preserve-ws`` set to
    ``"false"``) to ``<basename>_04_posttok.xml`` in ``docInfo["tmpDir"]`` and
    writes the result to ``docInfo["TEI"]``.

    :param docInfo: record of one ELAN document (an entry of ``ELANDocs``)
    :return: the result of ``transform``
    """
    source = docInfo["tmpDir"]+'/'+docInfo["basename"]+"_04_posttok.xml"
    target = docInfo["TEI"]
    stylesheetParams = {"preserve-ws": "false"}
    return transform(s=source, xsl="./tokenizer/custom_xtoks2tei.xsl", o=target, parameters=stylesheetParams)

## Step 7: Create NoSkE input

We create verticals from the unannotated texts and attach the token annotations from `010_manannot` to them.

### Step 7.1 Create XML vertical from tokenized XML documents

We take the tokenized XML document (before its tokens are moved to the TEI namespace) and create an XML vertical from it:

In [20]:
def createXMLVert(docInfo):
    """Create an XML vertical from the post-processed tokenized document.

    Applies ``./tokenizer/custom_xtoks2vert.xsl`` to ``<basename>_04_posttok.xml``
    and writes ``<basename>_05_vert.xml``; both files live in ``docInfo["tmpDir"]``.

    :param docInfo: record of one ELAN document (an entry of ``ELANDocs``)
    :return: the result of ``transform``
    """
    prefix = docInfo["tmpDir"]+'/'+docInfo["basename"]
    source = prefix+"_04_posttok.xml"
    target = prefix+"_05_vert.xml"
    return transform(s=source, xsl="./tokenizer/custom_xtoks2vert.xsl", o=target)

### Step 7.2: attach manual annotations to the XML vertical

In [21]:
def attachAnnotationsToXMLVert(docInfo):
    """Attach the manual annotations to the XML vertical.

    Applies ``copyAnaToVert.xsl`` from ``pathToStylesheetsDir`` to
    ``<basename>_05_vert.xml`` and writes ``<basename>_05_vert_annot.xml``; both
    files live in ``docInfo["tmpDir"]``. The stylesheet receives the file URI of
    ``docInfo["TEI_annotated"]`` as parameter ``path_to_annotated_doc``.

    :param docInfo: record of one ELAN document (an entry of ``ELANDocs``)
    :return: the result of ``transform``
    """
    source = docInfo["tmpDir"]+'/'+docInfo["basename"]+"_05_vert.xml"
    stylesheet = pathToStylesheetsDir+"/copyAnaToVert.xsl"
    target = docInfo["tmpDir"] + "/" + docInfo["basename"] + "_05_vert_annot.xml"
    annotatedURI = pathlib.Path(os.path.abspath(docInfo["TEI_annotated"])).as_uri()
    return transform(s=source, xsl=stylesheet, o=target, parameters={"path_to_annotated_doc": annotatedURI})

### Step 7.3 convert XML vertical to text vertical

Create a vertical for NoSkE.

In [22]:
def createNoSkEVert(docInfo):
    """Convert the annotated XML vertical to a text vertical.

    Applies ``./tokenizer/wrapper_vert2txt.xsl`` to ``<basename>_05_vert_annot.xml``
    in ``docInfo["tmpDir"]`` and writes ``<basename>.txt`` into ``noSkEVertDir``.

    :param docInfo: record of one ELAN document (an entry of ``ELANDocs``)
    :return: the result of ``transform``
    """
    source = docInfo["tmpDir"] + "/" + docInfo["basename"] + "_05_vert_annot.xml"
    target = noSkEVertDir + "/" + docInfo["basename"] + ".txt"
    return transform(s=source, xsl="./tokenizer/wrapper_vert2txt.xsl", o=target)

## Run Steps 5–7

In [23]:
print(os.path.abspath(pathToTeiCorpus))
mergeParam = { "pathToCorpusDoc": pathlib.Path(os.path.abspath(pathToTeiCorpus)).as_uri() }

# The processing steps in the order in which they are applied to every document.
# Each step hands back the result of transform(), which is None if the step failed.
pipelineSteps = [
    ("mergeMetadata", lambda docInfo: mergeMetadata(docInfo, mergeParam)),
    ("removeNL", removeNL),
    ("tokenize", tokenize),
    ("addP", addP),
    ("postProcess", postProcess),
    ("createTEIForAnnotation", createTEIForAnnotation),
    ("createXMLVert", createXMLVert),
    ("attachAnnotationsToXMLVert", attachAnnotationsToXMLVert),
    ("createNoSkEVert", createNoSkEVert),
]

for doc in ELANDocs:
    # These keys are written by runTEICorpo; they are missing if that step was skipped.
    if "filepath_tmp_TEI" not in doc or "TEI" not in doc:
        logging.error(doc["basename"]+": no TEICorpo output recorded (was runTEICorpo skipped?) - document skipped")
        continue
    print(doc["basename"]+': '+doc["filepath_tmp_TEI"]+" -> "+doc["TEI"])
    failedStep = None
    for stepName, step in pipelineSteps:
        # Stop at the first failing step: the steps after it would only
        # produce follow-up errors or work on files from an earlier run.
        if step(doc) is None:
            failedStep = stepName
            break
    if failedStep is None:
        print(doc["basename"]+": done.")
    else:
        logging.error(doc["basename"]+": step "+failedStep+" failed, remaining steps skipped")

/home/dschopper/data/WIBARAB/corpus/103_tei_w/shawiCorpus.xml
SAU_2022_Speaker10_marriage: /home/dschopper/data/WIBARAB/corpus/103_tei_w/2023-10-20_11-01/ELAN_SAU_2022_Speaker10_marriage.xml -> /home/dschopper/data/WIBARAB/corpus/103_tei_w/SAU_2022_Speaker10_marriage.xml


$input=file:///home/dschopper/data/WIBARAB/corpus/103_tei_w/2023-10-20_11-01/ELAN_SAU_2022_Speaker10_marriage.xml
$pathToCorpusDoc=file:///home/dschopper/data/WIBARAB/corpus/103_tei_w/shawiCorpus.xml
2023-10-20 11:01:50,474 - XTMM9000: Processing terminated by xsl:message at line 34 in mergeHeaderAndTranscription.xsl. Line number: 34
$IDcandidates=SHAWI Corpus, SAU_2022_Speaker10_marriage_1, SHAWI Corpus, SAU_2022_Speaker10_marriage_2, SHAWI Corpus, SAU_2022_Speaker20_traditionaltools, SHAWI Corpus, SAU_2022_Speaker20_shepherd, SHAWI Corpus, SAU_2022_Speaker14_childhood_womenswork, SHAWI Corpus, KUW_2022_Speaker8_neighbourhood, SHAWI Corpus, KUW_2022_Speaker8_education, SHAWI Corpus, KUW_2022_Speaker8_family, SHAWI Corpus, KUW_2022_Speaker8_wedding, SHAWI Corpus, KUW_2022_Speaker8_lifedevelopments_cooking, SHAWI Corpus, SAU_2022_Speaker14_houses_education, SHAWI Corpus, SAU_2022_Speaker14_education_2, SHAWI Corpus, SAU_2022_Speaker14_childhood, SHAWI Corpus, SAU_2022_Speaker14_weddings

NameError: name 'pathToShawiStylesheetsDir' is not defined

## Replace TEI elements with x-includes in corpus document