# setup

In [1]:
from lxml import etree
import pandas as pd

In [2]:
!ls /veld/input/xmlanntools/

DEU100.ann.json  DEU100.conllu	DEU100.txt
DEU100.ann.xml	 DEU100.json	DEU100.xml


In [3]:
!ls /veld/input/teitok/

DEU100.xml  DEU100_tokenized.xml  DEU100_tokenized_parsed.xml


In [4]:
root_xmlanntools = etree.parse("/veld/input/xmlanntools/DEU100.ann.xml").getroot()
root_teitok = etree.parse("/veld/input/teitok/DEU100_tokenized_parsed.xml").getroot()

In [5]:
def search_xpath(root, xpath):
    return root.xpath(xpath, namespaces={"xmlns": "http://www.tei-c.org/ns/1.0"})

# create data structure for all linguistic annotation data

In [6]:
def get_attrib_count(root, xpath):
    attrib_count = {}
    attrib_key_list = ["token", "lemma", "upos", "xpos", "deprel", "feats"]
    for attrib_key in attrib_key_list:
        attrib_count[attrib_key] = {}
    for t in search_xpath(root, xpath):
        for attrib_key in attrib_key_list:
            if attrib_key == "token":
                token = t.text
                token_count = attrib_count[attrib_key].get(token, 0)
                attrib_count[attrib_key][token] = token_count + 1
            elif attrib_key == "feats":
                token_attrib_value = t.attrib.get(attrib_key)
                if token_attrib_value:
                    feats_tmp = token_attrib_value.split("|")
                    for feat in feats_tmp:
                        feats_split = feat.split("=")
                        if len(feats_split) == 2:
                            feat_key, feat_value = feats_split
                            if feat_key not in attrib_count[attrib_key]:
                                attrib_count[attrib_key][feat_key] = {}
                            feat_value_count = attrib_count[attrib_key][feat_key].get(feat_value, 0)
                            attrib_count[attrib_key][feat_key][feat_value] = feat_value_count + 1
            else:
                token_attrib_value = t.attrib[attrib_key]
                attrib_value_count = attrib_count[attrib_key].get(token_attrib_value, 0)
                attrib_count[attrib_key][token_attrib_value] = attrib_value_count + 1
    return attrib_count

attrib_count_xmlanntools = get_attrib_count(root_xmlanntools, "//xmlns:w")
attrib_count_teitok = get_attrib_count(root_teitok, "//xmlns:tok")

# count of tokens

In [7]:
def count_tokens():
    print("xmlanntools:", len(search_xpath(root_xmlanntools, "//xmlns:w")))
    print("teitok:", len(search_xpath(root_teitok, "//xmlns:tok")))

count_tokens()

xmlanntools: 49724
teitok: 51743


# count of sentences

In [8]:
def count_sentences():
    print("xmlanntools:", len(search_xpath(root_xmlanntools, "//xmlns:s")))
    print("teitok:", len(search_xpath(root_teitok, "//xmlns:s")))

count_sentences()

xmlanntools: 2869
teitok: 3373


# convert to dataframe for visual comparison

In [9]:
def convert_to_summary_df(attrib_count_xmlanntools, attrib_count_teitok, key): 

    def get_indices_sorted(l):
        l_indexed = [[i, v] for i, v in enumerate(l)]
        l_indexed = sorted(l_indexed, key=lambda x : -x[1])
        l_sorted = [l_i[0] for l_i in l_indexed]
        return l_sorted

    def convert_to_summary_lists(attrib_count, sub_key_list):
        count_abs_list = []
        count_rel_list = []
        count_sum = sum(v for v in attrib_count_xmlanntools[key].values())
        for sub_key in sub_key_list:
            abs_count = attrib_count[key].get(sub_key, 0)
            count_abs_list.append(abs_count)
            if count_sum != 0:
                count_rel_list.append(round(100 / count_sum * abs_count, 2))
            else:
                count_rel_list.append(0)
        return count_abs_list, count_rel_list

    if key not in attrib_count_xmlanntools:
        attrib_count_xmlanntools[key] = {}
    if key not in attrib_count_teitok:
        attrib_count_teitok[key] = {}
    sub_key_list = list(set(attrib_count_xmlanntools[key].keys()) | set(attrib_count_teitok[key].keys()))
    count_abs_list_xmlanntools, count_rel_list_xmlanntools = convert_to_summary_lists(attrib_count_xmlanntools, sub_key_list)
    count_abs_list_teitok, count_rel_list_teitok = convert_to_summary_lists(attrib_count_teitok, sub_key_list)
    avg_count_list = [(cx + ct) / 2 for cx, ct in zip(count_abs_list_xmlanntools, count_abs_list_teitok)]
    avg_count_list_sorted_indices = get_indices_sorted(avg_count_list)   
    df_data = {
        key: [sub_key_list[i] for i in avg_count_list_sorted_indices], 
        f"xmlanntools: absolute count": [count_abs_list_xmlanntools[i] for i in avg_count_list_sorted_indices], 
        f"xmlanntools: percentage": [count_rel_list_xmlanntools[i] for i in avg_count_list_sorted_indices],
        f"teitok: absolute count": [count_abs_list_teitok[i] for i in avg_count_list_sorted_indices], 
        f"teitok: percentage": [count_rel_list_teitok[i] for i in avg_count_list_sorted_indices],
        f"both: average absolute count": [avg_count_list[i] for i in avg_count_list_sorted_indices],
    }
    return pd.DataFrame(df_data)

# upos (universal part of speech)

In [10]:
convert_to_summary_df(attrib_count_xmlanntools, attrib_count_teitok, "upos")

Unnamed: 0,upos,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,PUNCT,7495,15.07,9536,19.18,8515.5
1,NOUN,7560,15.2,7209,14.5,7384.5
2,PRON,7105,14.29,7204,14.49,7154.5
3,VERB,5653,11.37,5757,11.58,5705.0
4,DET,4273,8.59,4264,8.58,4268.5
5,ADP,3964,7.97,3981,8.01,3972.5
6,ADJ,3743,7.53,3728,7.5,3735.5
7,ADV,2490,5.01,2516,5.06,2503.0
8,AUX,2179,4.38,2241,4.51,2210.0
9,CCONJ,2091,4.21,2119,4.26,2105.0


# xpos (extended part of speech)

In [11]:
convert_to_summary_df(attrib_count_xmlanntools, attrib_count_teitok, "xpos")

Unnamed: 0,xpos,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,NN,7546,15.18,7209,14.5,7377.5
1,$.,3327,6.69,5355,10.77,4341.0
2,"$,",4161,8.37,4161,8.37,4161.0
3,PPER,3917,7.88,3965,7.97,3941.0
4,ART,3853,7.75,3843,7.73,3848.0
5,VVFIN,3729,7.5,3738,7.52,3733.5
6,APPR,2978,5.99,2972,5.98,2975.0
7,ADV,2153,4.33,2154,4.33,2153.5
8,ADJA,2168,4.36,2132,4.29,2150.0
9,KON,1787,3.59,1819,3.66,1803.0


# deprel (dependency relation)

In [12]:
convert_to_summary_df(attrib_count_xmlanntools, attrib_count_teitok, "deprel")

Unnamed: 0,deprel,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,punct,7445,14.97,8593,17.28,8019.0
1,det,5408,10.88,5386,10.83,5397.0
2,nsubj,4657,9.37,4813,9.68,4735.0
3,advmod,3799,7.64,3826,7.69,3812.5
4,case,3699,7.44,3682,7.4,3690.5
5,obj,3614,7.27,3673,7.39,3643.5
6,root,2869,5.77,3373,6.78,3121.0
7,obl,2676,5.38,2675,5.38,2675.5
8,conj,2563,5.15,2564,5.16,2563.5
9,amod,2023,4.07,1983,3.99,2003.0


# feats (various features)

In [13]:
feat_keys = list(set(set(attrib_count_xmlanntools["feats"].keys()) | set(attrib_count_teitok["feats"].keys())))
feat_keys = sorted(feat_keys)
for k in feat_keys:
    df_k = convert_to_summary_df(attrib_count_xmlanntools["feats"], attrib_count_teitok["feats"], k)
    display(df_k)

Unnamed: 0,AdpType,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Prep,3423,98.99,3417,98.81,3420.0
1,Circ,28,0.81,28,0.81,28.0
2,Post,7,0.2,6,0.17,6.5


Unnamed: 0,Aspect,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Perf,1008,100.0,1046,103.77,1027.0


Unnamed: 0,Case,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Dat,5542,34.41,5513,34.23,5527.5
1,Nom,5118,31.77,5219,32.4,5168.5
2,Acc,4255,26.42,4276,26.55,4265.5
3,Gen,1193,7.41,1194,7.41,1193.5


Unnamed: 0,ConjType,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Comp,304,100.0,300,98.68,302.0


Unnamed: 0,Degree,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Pos,3549,90.42,3549,90.42,3549.0
1,Cmp,191,4.87,193,4.92,192.0
2,Sup,185,4.71,184,4.69,184.5


Unnamed: 0,Foreign,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Yes,98,100.0,116,118.37,107.0


Unnamed: 0,Gender,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Masc,6112,36.87,6266,37.8,6189.0
1,Fem,6424,38.76,5816,35.09,6120.0
2,Neut,4039,24.37,4108,24.78,4073.5


Unnamed: 0,Hyph,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Yes,14,100.0,0,0.0,7.0


Unnamed: 0,Mood,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Ind,5646,99.14,5708,100.23,5677.0
1,Imp,49,0.86,55,0.97,52.0


Unnamed: 0,NumType,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Card,59,100.0,62,105.08,60.5


Unnamed: 0,Number,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Sing,21479,80.76,21097,79.32,21288.0
1,Plur,5117,19.24,5299,19.92,5208.0


Unnamed: 0,PartType,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Vbp,506,61.11,530,64.01,518.0
1,Inf,289,34.9,299,36.11,294.0
2,Res,33,3.99,48,5.8,40.5


Unnamed: 0,Person,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,3,20650,91.95,20282,90.31,20466.0
1,1,1012,4.51,1085,4.83,1048.5
2,2,796,3.54,898,4.0,847.0


Unnamed: 0,Polarity,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Neg,432,100.0,450,104.17,441.0


Unnamed: 0,Poss,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Yes,1220,100.0,1240,101.64,1230.0


Unnamed: 0,PronType,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Prs,5737,48.23,5805,48.8,5771.0
1,Art,4298,36.13,4288,36.05,4293.0
2,Rel,722,6.07,723,6.08,722.5
3,"Ind,Neg,Tot",514,4.32,526,4.42,520.0
4,Dem,332,2.79,345,2.9,338.5
5,Int,292,2.45,320,2.69,306.0


Unnamed: 0,PunctType,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Peri,3327,44.39,5355,71.45,4341.0
1,Comm,4161,55.52,4161,55.52,4161.0
2,Brck,7,0.09,20,0.27,13.5


Unnamed: 0,Reflex,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Yes,600,100.0,600,100.0,600.0


Unnamed: 0,Tense,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Pres,2823,50.12,2838,50.38,2830.5
1,Past,2810,49.88,2847,50.54,2828.5


Unnamed: 0,Variant,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Short,1518,100.0,1539,101.38,1528.5


Unnamed: 0,VerbForm,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Fin,5695,72.71,5763,73.58,5729.0
1,Inf,1129,14.42,1189,15.18,1159.0
2,Part,1008,12.87,1046,13.36,1027.0


Unnamed: 0,VerbType,xmlanntools: absolute count,xmlanntools: percentage,teitok: absolute count,teitok: percentage,both: average absolute count
0,Mod,506,100.0,530,104.74,518.0
