# Goal: read in XML data from UD treebanks

In [87]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

In [25]:
# Relative path of the directory that holds the raw treebank XML files.
DATA_DIR = '../data/raw/treebank_data-master/v2.0/Greek/nonArethusaCompliant/'
# os.listdir() returns names in arbitrary order; sort them so that positional
# picks such as files[0] select the same file on every run and every machine.
files = sorted(os.listdir(DATA_DIR))

In [32]:
# Peek at the first two entries to confirm the directory listing worked.
files[:2]

['tlg0003.tlg001.perseus-grc1.1.tb.xml', 'tlg0059.tlg001.perseus-grc1.tb.xml']

In [99]:
# Let's start with Perseus: take the first file in the listing.
# NOTE: despite its name, file_dir holds the path of a single file, not a directory.
file_dir = os.path.join(DATA_DIR, files[0])

# Parse the XML document and keep a handle on its root element.
tree = ET.parse(file_dir)
root = tree.getroot()
# Look up the element at path plugins/sg/labels below the root.
# find() returns None when no such element exists.
node = root.find('plugins/sg/labels')

def print_children(root, level=10, ind=0):
    """Print an indented outline of the children of ``root``.

    Each direct child is printed as ``<tag>: <text of its 'long' child>``,
    indented by two spaces per ``ind``. Every ``nested`` element found directly
    under a child is then walked recursively, one indent step deeper.

    Args:
        root: Element whose direct children are printed.
        level: How many further ``nested`` levels may be descended into;
            at 0, ``nested`` elements are not followed.
        ind: Current indent depth (number of two-space steps).

    Returns:
        None. The outline is written to stdout via ``print``.
    """
    tab = "  " * ind  # identical for every child at this depth
    for child in root:
        # A child without a <long> element used to raise AttributeError on
        # `.text`. Report its description as None instead, which is also what
        # an empty <long/> element prints.
        long_el = child.find('long')
        long = long_el.text if long_el is not None else None
        print(tab + f"{child.tag}: {long}")
        if level > 0:
            # findall() with a bare tag matches direct children only.
            for element in child.findall('nested'):
                print_children(element, level - 1, ind + 1)

In [93]:
# Flatten the treebank into one row per <word> element.
# Each mapping goes from DataFrame column name to the XML attribute it is read
# from. The first three columns come from the enclosing <sentence> element and
# are repeated on every word row; the rest come from the <word> element itself.
sentence_keys = {
    "sentence_id": "id",
    "document_id": "document_id",
    "subdoc": "subdoc",
}
word_keys = {
    "word_id": "id",
    "form": "form",
    "lemma": "lemma",
    "postag": "postag",
    "relation": "relation",
    "head": "head",
}

# Column-wise storage: column name -> list of values, in row order.
data = {column: [] for column in (*sentence_keys, *word_keys)}
for sentence in root.iter('sentence'):
    # Only direct <word> children become rows; any other child element of a
    # sentence is skipped instead of being treated as a word.
    for word in sentence.findall('word'):
        # Element.get() yields None for an absent attribute, so an incomplete
        # element produces a missing value (visible via df.isna()) rather than
        # aborting the whole load with a KeyError.
        for column, attribute in sentence_keys.items():
            data[column].append(sentence.get(attribute))
        for column, attribute in word_keys.items():
            data[column].append(word.get(attribute))

# Make df
df = pd.DataFrame(data)

In [94]:
# Display the assembled frame (pandas shows only the first and last rows of a long frame).
df

Unnamed: 0,sentence_id,document_id,subdoc,word_id,form,lemma,postag,relation,head
0,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,1,Θουκυδίδης,Θουκυδίδης,n-s---mn-,SBJ,3
1,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,2,Ἀθηναῖος,Ἀθηναῖος,n-s---mn-,ATR,1
2,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,3,ξυνέγραψε,συγγράφω,v3saia---,PRED,0
3,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,4,τὸν,ὁ,l-s---ma-,ATR,5
4,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,5,πόλεμον,πόλεμος,n-s---ma-,OBJ_AP,10
...,...,...,...,...,...,...,...,...,...
25261,942,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.146,7,καὶ,καί,c--------,COORD,6
25262,942,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.146,8,πρόφασις,πρόφασις,n-s---fn-,PNOM_CO,7
25263,942,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.146,9,τοῦ,ὁ,l-s---ng-,ATR,10
25264,942,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.146,10,πολεμεῖν,πολεμέω,v--pna---,ATR,8


In [112]:
# Show every row in which at least one column holds a missing value.
df[df.isna().any(axis=1)]

Unnamed: 0,sentence_id,document_id,subdoc,word_id,form,lemma,postag,relation,head
151,4,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.2.1,30,[0],,,PRED_CO,18
152,4,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.2.1,31,[1],,,PRED_CO,18
259,6,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.2.3,35,[0],,,SBJ_AP_CO,29
640,20,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.4.1,23,[0],,,ADV,5
768,22,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.5.2,50,[0],,,APOS,28
...,...,...,...,...,...,...,...,...,...
23702,881,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.139.3,62,[0],,,OBJ,24
23703,881,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.139.3,63,[1],,,APOS,27
24073,895,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.141.1,37,[0],,,OBJ_CO,10
24358,905,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.141.7,53,[0],,,ATR,46
