# Word- & RST-Annotation Merge

#### Prerequisites:
* **Python 3** *(tested with Python 3.9)*
* **pandas** *(pip package)*:
~~~
    pip install pandas
~~~
* **xlrd** *(pip package)*:
~~~
    pip install xlrd
~~~ 
* **openpyxl** *(pip package)*:
~~~
    pip install openpyxl
~~~ 
* **xlsxwriter** *(pip package)*:
~~~
    pip install xlsxwriter
~~~ 
* **nltk** *(pip package)*:
~~~
    pip install nltk
~~~ 
* **punkt** *(nltk package)*:
~~~
    python
    >>> import nltk
    >>> nltk.download('punkt')
~~~    

#### Usage:
 - Change the parameters in the cell below to the desired values
 - Click on *Run All Cells*

In [None]:
# Parameters
# Number of relation columns ("Rel-Level-1" ... "Rel-Level-N") emitted per word;
# words with fewer relations get empty strings in the remaining columns.
maximumRelationLevelToShow = 3

# Data directories
# Folder whose files are listed as RST annotation inputs.
rstFolder = "input/RstAnnotations"
# Folder whose files are listed as Word annotation inputs (read with pd.read_excel).
wordFolder = "input/WordAnnotations"
# Folder the merged "<name>_withRST" .csv/.xlsx files are written to.
outputFolder = "output"

In [None]:
import os
import pandas as pd

def extractRstAnnotations(filePath):
    """Build a DataFrame of words and their relation levels from an RST file.

    The file is handed to ``WordRelationsExtractor``; every entry of its
    ``getWordRelationsTable()`` result is read as ``(word, relations)``.

    Args:
        filePath: Path of the RST annotation file to process.

    Returns:
        A DataFrame with the column "RST-ZH2" (the word) followed by
        "Rel-Level-1" ... "Rel-Level-N", where N is the notebook-level
        ``maximumRelationLevelToShow``. Missing levels are filled with "".
    """
    from WordRelationsExtractor import WordRelationsExtractor

    levelCount = maximumRelationLevelToShow
    relationTable = WordRelationsExtractor(filePath).getWordRelationsTable()

    tableRows = []
    for entry in relationTable:
        token, tokenRelations = entry[0], entry[1]
        # Keep exactly `levelCount` relation cells, padding short lists with ""
        relationCells = [
            tokenRelations[level] if level < len(tokenRelations) else ""
            for level in range(levelCount)
        ]
        tableRows.append([token] + relationCells)

    columnNames = ["RST-ZH2"] + [
        "Rel-Level-" + str(level + 1) for level in range(levelCount)
    ]
    return pd.DataFrame(data=tableRows, columns=columnNames)

def readWordAnnotations(filePath):
    """Load a Word-annotation spreadsheet and drop punctuation-only rows.

    Args:
        filePath: Path of the Excel file to load with ``pd.read_excel``.

    Returns:
        A DataFrame containing only the rows whose 'ZH2' cell holds at least
        one ASCII letter; rows with a missing 'ZH2' value are dropped too.

    Raises:
        KeyError: If the sheet has no 'ZH2' column.
    """
    def filterOutPunctuations(df):
        # na=False makes empty cells count as "no letter", so they are removed
        return df[df['ZH2'].str.contains('[A-Za-z]', na=False)]

    # Read the file named by the argument. The earlier code read a global
    # `wordFilePath` here, which only worked when the caller happened to
    # define a global of that name holding the same path.
    return filterOutPunctuations(pd.read_excel(filePath))

def performAnnotationMerging(rstFilePath, wordFilePath, outputFilePath):
    """Merge one RST annotation file with one Word annotation file.

    Both tables are aligned back-to-back, i.e. from their last rows upwards,
    because some Word-annotated files still contain the title at the
    beginning. The Word table is the left side of the join, so its row count
    determines the row count of the result.

    Args:
        rstFilePath: Path of the RST annotation file.
        wordFilePath: Path of the Word annotation Excel file.
        outputFilePath: Output path without extension; ".csv" and ".xlsx"
            are appended for the two files written.
    """
    def tailFirst(frame):
        # Reverse the row order and renumber the index from 0
        return frame[::-1].reset_index(drop=True)

    rstFrame = extractRstAnnotations(rstFilePath)
    wordFrame = readWordAnnotations(wordFilePath)

    # Join on the reversed index so the tails line up, then restore the order
    merged = tailFirst(tailFirst(wordFrame).join(tailFirst(rstFrame)))

    # Write merged output
    merged.to_csv(outputFilePath + ".csv")
    merged.to_excel(outputFilePath + ".xlsx", engine='xlsxwriter')

In [None]:
# Loop through all RST and Word annotation files, match and merge them.
# Create the output folder up front; writing the merged files fails otherwise.
if outputFolder:
    os.makedirs(outputFolder, exist_ok=True)

for rstFile in os.listdir(rstFolder):
    # Drop the extension regardless of its length (a fixed-width slice would
    # cut into the name for extensions that are not exactly 3 characters).
    textName = os.path.splitext(rstFile)[0]
    for wordFile in os.listdir(wordFolder):
        # A Word file belongs to the RST file when its name contains the text name
        if textName in wordFile:
            print("Merge: " +rstFile+ " & "+wordFile)
            rstFilePath = os.path.join(rstFolder, rstFile)
            wordFilePath = os.path.join(wordFolder, wordFile)
            outputFilePath = os.path.join(outputFolder, os.path.splitext(wordFile)[0] + "_withRST")
            performAnnotationMerging(rstFilePath, wordFilePath, outputFilePath)
            # Only the first matching Word file is merged for each RST file
            break