# Goal: read in XML data from UD treebanks

In [34]:
import os
import xml.etree.ElementTree as ET

In [36]:
# Directory holding the raw data; the path is relative to the current working directory.
DATA_DIR = '../data/raw/'
# Names of the entries directly under DATA_DIR.
# NOTE: os.listdir() returns entries in arbitrary order, so positions in this list are not stable.
folders = os.listdir(DATA_DIR)

In [37]:
# Display the entries found under DATA_DIR
folders

['treebank_data-master', '.gitkeep', 'Greek-Dependency-Trees-master', 'zip']

In [366]:
# Let's start with Perseus.
# The folder is named explicitly rather than taken as folders[0]: os.listdir()
# returns entries in arbitrary order, so index 0 is not guaranteed to be this
# directory after a kernel restart or on another machine.
perseus_folder = 'treebank_data-master'
file_name = perseus_folder + '/v2.0/Greek/TAGSETS.xml'
file_dir = os.path.join(DATA_DIR, file_name)

# Load XML
tree = ET.parse(file_dir)
root = tree.getroot()

# Element whose children are the label entries printed in the cells below
node = root.find('plugins/sg/labels')
if node is None:
    # Fail here with a clear message instead of a TypeError later when iterating None
    raise ValueError(f"'plugins/sg/labels' not found in {file_dir}")

# Define function to print out XML tree
def print_children(root, level=10, ind=0):
    """Print the label entries below ``root`` as an indented outline.

    Each direct child of ``root`` is printed as ``"<tag>: <long text>"``, where
    the long text is the text of the child's ``<long>`` sub-element. If the
    child has ``<nested>`` sub-elements and ``level`` is greater than zero, the
    entries inside them are printed recursively, indented one step further.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Element whose direct children are the entries to print.
    level : int, default 10
        How many layers of ``<nested>`` elements to descend into. ``0`` prints
        only the direct children of ``root``.
    ind : int, default 0
        Current indentation depth; each step is two spaces.

    Returns
    -------
    None
        The outline is written to standard output.
    """
    tab = "  " * ind  # identical for every entry at this depth
    for child in root:
        long_element = child.find('long')
        # An entry without a <long> child is printed with "None" as its
        # description (same as an empty <long>) instead of raising AttributeError.
        description = long_element.text if long_element is not None else None
        print(tab + f"{child.tag}: {description}")
        if level > 0:
            # findall() only matches direct children tagged 'nested'
            for nested in child.findall('nested'):
                print_children(nested, level - 1, ind + 1)

In [367]:
# Coarse labels: level=0 prints only the top-level entries, without descending into nested ones
print_children(node, level=0)

adj: adjective
art: article
prn: pronoun
adp: adposition
noun: noun
verb: verb
adv: adverb
nrl: numeral


In [371]:
# Finer labels: level=1 also prints one layer of nested sub-labels
print_children(node, level=1)

adj: adjective
  prp: adjective proper
  sbs: substantive adjective
  vrb__prp: verbal adjective proper
  vrb__sbs: substantive verbal adjective
  nna: none of the above
  idk: I do not know
art: article
prn: pronoun
  sbs: substantive pronoun
  adj: adjective pronoun
  nna: none of the above
  idk: I do not know
adp: adposition
noun: noun
  nmn: nominative
  gnt: genitive
  dtv: dative
  acc: accusative
  vct: vocative
  nna: none of the above
  idk: I do not know
verb: verb
  fnt: finite verb
  prt: participle
  inf: infinitive
  nna: none of the above
  idk: I do not know
adv: adverb
  tmp: time
  plc: place
  src: source
  cau: cause
  cns: result
  mnn: manner
  msr: measure
  ins__mns: instrument or means
  fnl: purpose
  cnc: concession
  ngt: negation
  prt: particle
  nna: none of the above
  idk: I do not know
nrl: numeral
  crd: cardinal
  ord: ordinal
  adv: adverb
  nna: none of the above
  idk: I do not know


In [376]:
# Even finer labels: level=3 descends through up to three layers of nested sub-labels
print_children(node, level=3)

adj: adjective
  prp: adjective proper
  sbs: substantive adjective
    nmn: nominative
      dpd: dependent
      ind: independent
      nna: none of the above
      idk: I do not know
    gnt: genitive
      dpd: dependent
      ind: independent
      nna: none of the above
      idk: I do not know
    dtv: dative
      dpd: dependent
      nna: none of the above
      idk: I do not know
    acc: accusative
      dpd: dependent
      ind: independent
      nna: none of the above
      idk: I do not know
    vct: vocative
    nna: none of the above
    idk: I do not know
  vrb__prp: verbal adjective proper
  vrb__sbs: substantive verbal adjective
    nmn: nominative
      dpd: dependent
      ind: independent
      nna: none of the above
      idk: I do not know
    gnt: genitive
      dpd: dependent
      ind: independent
      nna: none of the above
      idk: I do not know
    dtv: dative
      dpd: dependent
      nna: none of the above
      idk: I do not know
    acc: accusative

In [378]:
# Finest labels: level=10 descends through up to ten layers of nested sub-labels
# (presumably deeper than this tag set actually goes — TODO confirm)
print_children(node, level=10)

adj: adjective
  prp: adjective proper
  sbs: substantive adjective
    nmn: nominative
      dpd: dependent
      ind: independent
        ctc: citation
        pnd: nominativus pendens
        exc: exclamation
        nna: none of the above
        idk: I do not know
      nna: none of the above
      idk: I do not know
    gnt: genitive
      dpd: dependent
        prp: proper
          pss__bln: possession or belonging
          dvd: divided whole
          qlt: quality
          exp: explanation
          mtr__cnt: material or contents
          msr: measure
            tmp: time
            spc: space
            dgr: degree
            nna: none of the above
            idk: I do not know
          sbj__obj: subjective or objective
            sbj: subjective
            obj: objective
            nna: none of the above
            idk: I do not know
          prc__val: price and value
          crm__acn: crime and accountability
          tpc: topic
          cnn: connection
  