In [2]:
import io, os, sys
import statistics

In [3]:
# Print the kernel's working directory; the relative '../' data paths used in
# later cells resolve against it.
!pwd

/Users/liu.ying/Spaceship/learner_spanish_depparse/scripts


In [4]:
# Read CoNLL-U format
def conll_read_sentence(file_handle):
    """Yield the sentences of a CoNLL-U stream as lists of token rows.

    Args:
        file_handle: Any iterable of text lines (for example an open file).

    Yields:
        One list per sentence. Each element is the list of the ten
        tab-separated columns of a word line, with two normalisations applied:
        an ID of 'q1' becomes '1' and a DEPREL of 'ROOT' becomes 'root'.

    Lines starting with '#' are skipped. Empty-node rows (ID containing '.')
    and multiword-token range rows (ID containing '-') are left out of the
    yielded sentences. Any other line that does not have exactly ten columns
    (including a blank line) closes the current sentence. Sentences that end
    up with no word rows are not yielded.
    """
    sent = []
    for line in file_handle:
        line = line.strip('\n')
        if line.startswith('#'):
            continue
        toks = line.split("\t")
        if len(toks) == 10:
            if '.' in toks[0]:
                # Empty node: skip it without ending the sentence.
                continue
            if toks[0] == 'q1':
                toks[0] = '1'
            if toks[7] == 'ROOT':
                toks[7] = 'root'
            sent.append(toks)
        elif sent:
            # Build a filtered copy instead of removing while iterating,
            # which would skip the row following each removed range row.
            words = [w for w in sent if '-' not in w[0]]
            if words:
                yield words
            sent = []
    # The last sentence may lack a trailing blank line; filter it the same way.
    words = [w for w in sent if '-' not in w[0]]
    if words:
        yield words

In [12]:
# Collect meta information including doc id and sent id
def collect_meta(file_handle):
    """Collect the '# doc_id' and '# sent_id' values of a CoNLL-U file.

    Args:
        file_handle: Path of the file to read (it is passed to ``open``,
            despite the parameter name). The file is read as UTF-8.

    Returns:
        A ``(doc_meta_list, sent_meta_list)`` tuple holding the values in
        order of appearance.

    Only the first ' = ' on a line separates key from value, so a value that
    itself contains ' = ' is kept whole. Lines without that separator, or
    whose key is not exactly '# doc_id' / '# sent_id', are ignored.
    """
    doc_meta_list = []
    sent_meta_list = []
    with open(file_handle, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip('\n')
            key, sep, value = line.partition(' = ')
            if not sep:
                continue
            key = key.rstrip()
            if key == '# doc_id':
                doc_meta_list.append(value)
            elif key == '# sent_id':
                sent_meta_list.append(value)
    return doc_meta_list, sent_meta_list

In [8]:
# Collect dependency annotations from a given *.conllu file
def dependency_annotations(file_handle):
    """Load a *.conllu file and split its annotations into parallel lists.

    Args:
        file_handle: Path of the CoNLL-U file (it is passed to ``open`` and to
            ``collect_meta``).

    Returns:
        A 7-tuple ``(doc_meta_list, sent_meta_list, all_sents,
        pos_annotations, head_annotations, deprel_annotations,
        both_annotations)``. The first two come from ``collect_meta``;
        ``all_sents`` holds the token rows produced by ``conll_read_sentence``;
        the remaining four hold, per sentence, column 3, column 6, column 7
        and the string ``column6 + '_' + column7`` of every token.
    """
    doc_meta_list, sent_meta_list = collect_meta(file_handle)

    # Read with the same explicit encoding collect_meta uses for this file,
    # rather than the platform default.
    with open(file_handle, 'r', encoding='utf-8') as f:
        all_sents = list(conll_read_sentence(f))

    pos_annotations = [[tok[3] for tok in sent] for sent in all_sents]
    head_annotations = [[tok[6] for tok in sent] for sent in all_sents]
    deprel_annotations = [[tok[7] for tok in sent] for sent in all_sents]
    both_annotations = [[tok[6] + '_' + tok[7] for tok in sent] for sent in all_sents]

    return doc_meta_list, sent_meta_list, all_sents, pos_annotations, head_annotations, deprel_annotations, both_annotations

In [22]:
# The same sentences annotated independently by two annotators.
annotator1_file = '../random_trees_inter_annotator.conllu'
annotator2_file = '../rpugh-annotated_random_iaa.conllu'

# Unpack each file's meta lists, token rows and per-column annotation lists.
(
    doc_meta_list1,
    sent_meta_list1,
    all_sents1,
    pos_annotations1,
    head_annotations1,
    deprel_annotations1,
    both_annotations1,
) = dependency_annotations(annotator1_file)
(
    doc_meta_list2,
    sent_meta_list2,
    all_sents2,
    pos_annotations2,
    head_annotations2,
    deprel_annotations2,
    both_annotations2,
) = dependency_annotations(annotator2_file)

In [24]:
# Sanity check: both files should list the same doc ids in the same order.
# A length difference is reported explicitly instead of raising IndexError
# (second list shorter) or going unnoticed (second list longer).
if len(doc_meta_list1) != len(doc_meta_list2):
    print('Document meta information does not match')
    print('Number of doc ids differs:', len(doc_meta_list1), len(doc_meta_list2))

for i, (doc_id1, doc_id2) in enumerate(zip(doc_meta_list1, doc_meta_list2)):
    if doc_id1 != doc_id2:
        print('Document meta information does not match')
        print(i, doc_id1, doc_id2)

In [26]:
## Loop through each sentence and compare the dependencies
# Four independent, initially empty accumulators (not referenced by the other
# cells visible here).
(
    overall_pos_agree_score,
    overall_head_agree_score,
    overall_deprel_agree_score,
    overall_both_agree_score,
) = ([], [], [], [])

def compute_agree_score(annotation_list1, annotation_list2):
    """Return the mean per-sentence agreement between two annotation sets.

    Args:
        annotation_list1: One list of labels per sentence (first annotator).
        annotation_list2: The same structure for the second annotator.

    Returns:
        For every sentence, the fraction of positions carrying identical
        labels is computed; the mean of those fractions is returned rounded to
        two decimals. Sentences with no tokens are left out of the mean, and
        0.0 is returned when there is nothing to compare.

    Raises:
        ValueError: If the two inputs differ in the number of sentences, or a
            sentence pair differs in the number of tokens. Position-wise
            comparison is not meaningful in either case.
    """
    if len(annotation_list1) != len(annotation_list2):
        raise ValueError(
            'Annotation lists differ in number of sentences: '
            + str(len(annotation_list1)) + ' vs ' + str(len(annotation_list2))
        )

    sentence_scores = []
    for i, (annotation1, annotation2) in enumerate(zip(annotation_list1, annotation_list2)):
        if len(annotation1) != len(annotation2):
            raise ValueError(
                'Sentence ' + str(i) + ' differs in number of tokens: '
                + str(len(annotation1)) + ' vs ' + str(len(annotation2))
            )
        if not annotation1:
            # Nothing to compare; avoid dividing by zero.
            continue
        matches = sum(1 for label1, label2 in zip(annotation1, annotation2) if label1 == label2)
        sentence_scores.append(matches / len(annotation1))

    if not sentence_scores:
        return 0.0
    return round(sum(sentence_scores) / len(sentence_scores), 2)

# Report one agreement figure per annotation layer.
for agree_label, agree_first, agree_second in (
    ('POS tag agreement:', pos_annotations1, pos_annotations2),
    ('Syntactic head agreement:', head_annotations1, head_annotations2),
    ('Syntactic deprel agreement:', deprel_annotations1, deprel_annotations2),
    ('Syntactic both agreement:', both_annotations1, both_annotations2),
):
    print(agree_label, compute_agree_score(agree_first, agree_second))


POS tag agreement: 0.98
Syntactic head agreement: 0.93
Syntactic deprel agreement: 0.91
Syntactic both agreement: 0.88


In [27]:
# Collect every sentence whose token rows differ between the two annotators,
# with HEAD and DEPREL rewritten as 'annotator1|annotator2'.
disagree_doc_meta_list = []
disagree_sent_meta_list = []
disagree_sents = []
for i, sent1 in enumerate(all_sents1):
    sent2 = all_sents2[i]
    if sent1 != sent2:
        # NOTE(review): indexing the meta lists by sentence position assumes
        # one '# doc_id' and one '# sent_id' line per sentence - confirm.
        disagree_doc_meta_list.append(doc_meta_list1[i])
        disagree_sent_meta_list.append(sent_meta_list1[i])
        new_sent = []
        for z in range(len(sent1)):
            # Work on a copy: writing into sent1[z] itself would alter
            # all_sents1, and re-running the cell would then produce
            # compounded values such as 'a|b|b'.
            tok = list(sent1[z])
            tok[6] = sent1[z][6] + '|' + sent2[z][6]
            tok[7] = sent1[z][7] + '|' + sent2[z][7]
            new_sent.append(tok)

        disagree_sents.append(new_sent)
        # Surface form of the sentence; not used by the cells visible here.
        words = ' '.join([tok[1] for tok in new_sent])

# Write the disagreeing sentences: one line with the doc id value, one line
# with the sent id value, the tab-joined token rows, then a blank line.
# The encoding is explicit so the output matches the UTF-8 input files
# regardless of the platform default.
with open('../disagree_sents.conllu', 'w', encoding='utf-8') as f:
    for doc_meta, sent_meta, sent in zip(disagree_doc_meta_list, disagree_sent_meta_list, disagree_sents):
        f.write(doc_meta + '\n')
        f.write(sent_meta + '\n')

        for tok in sent:
            f.write('\t'.join(tok) + '\n')
        f.write('\n')


In [30]:
## Descriptive statistics for the cross_annotated conllu file
# Count sentences and tokens in the first annotator's file. The file is read
# as UTF-8 explicitly instead of relying on the platform default encoding.
with open('../random_trees_inter_annotator.conllu', encoding='utf-8') as f:
    trees = list(conll_read_sentence(f))

num_sents = len(trees)
num_tokens = sum(len(sent) for sent in trees)

print('Total number of sentences:', num_sents)
print('Total number of tokens:', num_tokens)

Total number of sentences: 48
Total number of tokens: 805


In [29]:
## Checking for duplicates
# Read all sentences, then count how many are distinct. Two sentences count
# as duplicates when their sequences of word forms (column 1) are identical.
with open('../all_random_sents_new_sentid.conllu', encoding='utf-8') as f:
    trees = list(conll_read_sentence(f))

num_sents = len(trees)
unique_sents = {tuple(tok[1] for tok in sent) for sent in trees}

print('Total number of sentences:', num_sents)
print('Total number of unique sentences', len(unique_sents))

Total number of sentences: 383
Total number of unique sentences 383
