In [1]:
import sys
from tqdm import tqdm
from collections import Counter

In [2]:
## Some hyperparameters

# Language whose pair of corpora is compared; it selects which corpus-path
# cell below takes effect.
language = "english"
# language = "slovenian"
# language = "romanian"

# Which comparison to run:
#   "within-1": train vs. dev split of corpus 1
#   "within-2": train vs. dev split of corpus 2
#   "between":  train split of corpus 1 vs. train split of corpus 2
# comparison = "within-1"
# comparison = "within-2"
comparison = "between"

# Fail fast on a typo: an unknown value would otherwise match none of the
# later `if` branches and only surface as a NameError further down.
if language not in ("english", "slovenian", "romanian"):
    raise ValueError(f"Unsupported language: {language!r}")
if comparison not in ("within-1", "within-2", "between"):
    raise ValueError(f"Unsupported comparison: {comparison!r}")

In [3]:
if language == "english":
    # Corpus 1: GUM (Georgetown University Multilayer) corpus from the UD website
    corpus1_dir = "ud-en-gum/"
    train1 = f"{corpus1_dir}en_gum-ud-train.conllu"
    dev1 = f"{corpus1_dir}en_gum-ud-dev.conllu"

    # Corpus 2: WSJ corpus with different annotation conventions
    # (converted from Stanford dependencies?)
    corpus2_dir = "wsj-DIFF-CONVENTIONS/"
    train2 = f"{corpus2_dir}train.conllu"
    dev2 = f"{corpus2_dir}dev.conllu"
    test2 = f"{corpus2_dir}test.conllu"

In [4]:
if language == "slovenian":
    # Corpus 1: SSJ corpus
    corpus1_dir = "UD_Slovenian-SSJ/"
    train1 = f"{corpus1_dir}sl_ssj-ud-train.conllu"
    dev1 = f"{corpus1_dir}sl_ssj-ud-dev.conllu"

    # Corpus 2: SST corpus
    # NOTE(review): dev2 points at the SST *test* file -- presumably because
    # no dev split is available for this corpus; confirm.
    corpus2_dir = "UD_Slovenian-SST/"
    train2 = f"{corpus2_dir}sl_sst-ud-train.conllu"
    dev2 = f"{corpus2_dir}sl_sst-ud-test.conllu"

In [5]:
if language == "romanian":
    # Corpus 1: RRT corpus
    corpus1_dir = "UD_Romanian-RRT/"
    train1 = f"{corpus1_dir}ro_rrt-ud-train.conllu"
    dev1 = f"{corpus1_dir}ro_rrt-ud-dev.conllu"

    # Corpus 2: Nonstandard corpus
    corpus2_dir = "UD_Romanian-Nonstandard/"
    train2 = f"{corpus2_dir}ro_nonstandard-ud-train.conllu"
    dev2 = f"{corpus2_dir}ro_nonstandard-ud-dev.conllu"

In [6]:
# Module-level registries of the dependency relation labels seen so far:
# process_ud_data appends to gum_relations, process_wsj_data to wsj_relations.
gum_relations, wsj_relations = [], []

In [7]:
def process_ud_data(ud_file):
    """
    Reads a CoNLL-U file and indexes its dependency arcs by word pair.

    The sentence ID and text are taken from the "# sent_id = " and "# text ="
    comment lines. If a sentence has no such comment, the ID falls back to the
    sentence's 0-based position in the file and the text to the space-joined
    word forms. Rows whose ID column is not a plain integer (multiword-token
    ranges such as "1-2", empty nodes such as "1.1") are skipped, and heads are
    resolved through the ID column rather than by row position.

    Args:
        ud_file: path of the CoNLL-U file to read (opened as UTF-8).

    Returns:
        A tuple (pair_to_relations, pair_relation_to_sentences):

        pair_to_relations dict:
            (head word, dependent word) -> list of relations, one per occurrence
            e.g. {("Need", "'ll"): ["aux"], ...}

        pair_relation_to_sentences dict:
            ((head word, dependent word), relation) -> list of [sentence id, sentence text]
            e.g. {(("funnel", "Large"), "amod"): [["GUM_whow_x-1", "Large funnel or strainer to hold filter"]], ...}

        The head word of a root token (HEAD column 0) is "#ROOT#".

    Raises:
        ValueError: if the HEAD column of a kept row is not an integer.
        KeyError: if a HEAD refers to an ID that is not in the sentence.

    Side effects:
        Prints progress information and appends every relation label not seen
        before to the module-level list gum_relations.
    """
    print(f"\nProcessing '{ud_file}' data file...")
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(ud_file, encoding="utf-8") as infile:
        ud_lines = infile.readlines()

    # Group the file into (sentence id, sentence text, token rows) triples.
    sentences = []
    sentence_id, sentence_text, tokens = None, None, []
    # The extra blank line flushes a final sentence that is not followed by one.
    for line in ud_lines + ["\n"]:
        if not line.strip():
            # Blank line = sentence boundary; repeated blank lines add nothing.
            if tokens:
                if sentence_id is None:
                    sentence_id = len(sentences)
                if sentence_text is None:
                    sentence_text = " ".join(fields[1] for fields in tokens)
                sentences.append((sentence_id, sentence_text, tokens))
            sentence_id, sentence_text, tokens = None, None, []
        elif line.startswith("#"):
            # maxsplit=1 keeps text that itself contains the marker intact
            if line.startswith("# sent_id = "):
                sentence_id = line.split(" sent_id = ", 1)[1].strip()
            elif line.startswith("# text ="):
                sentence_text = line.split(" text =", 1)[1].strip()
        else:
            fields = line.split("\t")
            # keep only rows with a plain integer ID (no "1-2" / "1.1" rows)
            if fields[0].isdigit():
                tokens.append(fields)

    pair_to_relations = {}
    pair_relation_to_sentences = {}
    for sentence_id, sentence_text, tokens in tqdm(sentences):
        # Resolve heads through the ID column instead of the row position.
        forms_by_id = {int(fields[0]): fields[1] for fields in tokens}
        for fields in tokens:
            word = fields[1]
            head_idx = int(fields[6])
            head_word = forms_by_id[head_idx] if head_idx != 0 else "#ROOT#"
            relation = fields[7]
            if relation not in gum_relations:
                gum_relations.append(relation)
            word_pair = (head_word, word)
            pair_to_relations.setdefault(word_pair, []).append(relation)
            pair_relation = (word_pair, relation)
            pair_relation_to_sentences.setdefault(pair_relation, []).append(
                [sentence_id, sentence_text]
            )

    print(f"\t{len(pair_to_relations):,} head-dependent:relation pairs")
    print(f"\t{len(pair_relation_to_sentences):,} head-dependent-relation:sentence triples")
    return pair_to_relations, pair_relation_to_sentences

In [8]:
def process_wsj_data(ud_file):
    """
    Reads a CoNLL-U style file and indexes its dependency arcs by word pair.

    Unlike process_ud_data, this reader does not use sentence-level comments:
    the sentence ID is the sentence's 0-based position in the file and the
    sentence text is the space-joined word forms. Comment lines ("#...") are
    ignored. Rows whose ID column is not a plain integer are skipped, and heads
    are resolved through the ID column.

    Args:
        ud_file: path of the file to read (opened as UTF-8).

    Returns:
        A tuple (pair_to_relations, pair_relation_to_sentences):

        pair_to_relations dict:
            (head word, dependent word) -> list of relations, one per occurrence
            e.g. {("Need", "'ll"): ["aux"], ...}

        pair_relation_to_sentences dict:
            ((head word, dependent word), relation) -> list of [sentence index, sentence text]
            e.g. {(("funnel", "Large"), "amod"): [[12, "Large funnel or strainer to hold filter"]], ...}

        The head word of a root token (HEAD column 0) is "#ROOT#".

    Raises:
        ValueError: if the HEAD column of a kept row is not an integer.
        KeyError: if a HEAD refers to an ID that is not in the sentence.

    Side effects:
        Prints progress information and appends every relation label not seen
        before to the module-level list wsj_relations.
    """
    print(f"\nProcessing '{ud_file}' data file...")
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(ud_file, encoding="utf-8") as infile:
        ud_lines = infile.readlines()

    # Group the token rows into sentences.
    sentences = []
    tokens = []
    # The extra blank line flushes a final sentence that is not followed by one.
    for line in ud_lines + ["\n"]:
        if not line.strip():
            if tokens:
                sentences.append(tokens)
            tokens = []
        elif line.startswith("#"):
            # IDs and text are derived from the tokens, so comments carry
            # nothing this reader needs (and must not be mixed into the rows).
            continue
        else:
            fields = line.split("\t")
            if fields[0].isdigit():
                tokens.append(fields)

    pair_to_relations = {}
    pair_relation_to_sentences = {}
    # enumerate() gives each sentence its own index in O(1); looking the
    # sentence up with list.index() was quadratic and gave duplicate sentences
    # the index of their first occurrence.
    for sentence_id, sentence in enumerate(tqdm(sentences)):
        sentence_text = " ".join(fields[1] for fields in sentence)
        forms_by_id = {int(fields[0]): fields[1] for fields in sentence}
        for fields in sentence:
            word = fields[1]
            head_idx = int(fields[6])
            head_word = forms_by_id[head_idx] if head_idx != 0 else "#ROOT#"
            relation = fields[7]
            if relation not in wsj_relations:
                wsj_relations.append(relation)
            word_pair = (head_word, word)
            pair_to_relations.setdefault(word_pair, []).append(relation)
            pair_relation = (word_pair, relation)
            pair_relation_to_sentences.setdefault(pair_relation, []).append(
                [sentence_id, sentence_text]
            )

    print(f"\t{len(pair_to_relations):,} head-dependent:relation pairs")
    print(f"\t{len(pair_relation_to_sentences):,} head-dependent-relation:sentence triples")
    return pair_to_relations, pair_relation_to_sentences

In [9]:
# Corpus 1 is always read with process_ud_data. Corpus 2 is read with
# process_wsj_data for English only (the original note here said that reader
# must be kept "for now for English data"); for the other languages corpus 2
# goes through process_ud_data as well.
# NOTE(review): this presumes the English corpus-2 files lack the
# "# sent_id"/"# text" comments that process_ud_data reads -- confirm.
read_corpus2 = process_wsj_data if language == "english" else process_ud_data

if comparison == "within-1":
    # train vs. dev split of corpus 1
    train_file = train1
    dev_file = dev1
    train_output = language+"_train_mismatches_within_corpus1"+".tsv"
    dev_output = language+"_dev_mismatches_within_corpus1"+".tsv"
    # get list for training data
    train_list, train_sentences = process_ud_data(train_file)
    # get list for testing data
    dev_list, dev_sentences = process_ud_data(dev_file)

elif comparison == "within-2":
    # train vs. dev split of corpus 2
    train_file = train2
    dev_file = dev2
    train_output = language+"_train_mismatches_within_corpus2"+".tsv"
    # was "..._within_corpus_2.tsv"; now named like the other outputs
    dev_output = language+"_dev_mismatches_within_corpus2"+".tsv"
    # get list for training data
    train_list, train_sentences = read_corpus2(train_file)
    # get list for testing data
    dev_list, dev_sentences = read_corpus2(dev_file)

elif comparison == "between":
    # Corpus 1 plays the "train" role and corpus 2 the "dev" role; both sides
    # use their training split.
    train_file = train1
    dev_file = train2
    train_output = language+"_train_mismatches_between_corpora"+".tsv"
    dev_output = language+"_dev_mismatches_between_corpora"+".tsv"
    # get list for corpus 1
    train_list, train_sentences = process_ud_data(train_file)
    # get list for corpus 2
    dev_list, dev_sentences = read_corpus2(dev_file)

else:
    # Without this, an unknown value would leave train_list etc. undefined.
    raise ValueError(f"Unknown comparison: {comparison!r}")

  0%|          | 0/4287 [00:00<?, ?it/s]


Processing 'ud-en-gum/en_gum-ud-train.conllu' data file...


100%|██████████| 4287/4287 [00:00<00:00, 10534.96it/s]


	62,876 head-dependent:relation pairs
	63,502 head-dependent-relation:sentence triples

Processing 'wsj-DIFF-CONVENTIONS/train.conllu' data file...


100%|██████████| 39832/39832 [00:39<00:00, 1018.69it/s]

	467,290 head-dependent:relation pairs
	484,472 head-dependent-relation:sentence triples





In [10]:
# for triple in train_sentences:
# #     print(triple, train_sentences[triple])
#     for sentence in train_sentences[triple]:
#         print(triple, sentence)

In [11]:
# for pair in dev_list:
#     print(pair, dev_list[pair])

In [12]:
# for triple in dev_sentences:
# #     print(triple, dev_sentences[triple])
#     for sentence in dev_sentences[triple]:
#         print(triple, sentence)

In [13]:
# Compare the two pair -> relations dictionaries.
# For every (head, dependent) pair present on both sides, record the relation
# occurrences that the other side never uses for that pair. Duplicates are kept
# on purpose: each list entry is one occurrence.
dev_mismatches = {}
train_mismatches = {}
for dev_pair, dev_relations in dev_list.items():
    train_relations = train_list.get(dev_pair)
    # only pairs that occur on both sides can disagree
    if train_relations is None:
        continue
    # Build each lookup set once per pair rather than once per list element.
    train_relation_set = set(train_relations)
    dev_relation_set = set(dev_relations)
    # TODO: decide which ones we actually care about...
    # relations in dev NOT in train
    not_in_train = [x for x in dev_relations if x not in train_relation_set]
    # relations in train NOT in dev
    not_in_dev = [x for x in train_relations if x not in dev_relation_set]
    if not_in_train:
        dev_mismatches[dev_pair] = not_in_train
    if not_in_dev:
        train_mismatches[dev_pair] = not_in_dev

In [14]:
# Report how many (head, dependent) pairs disagree in each direction.
print(
    f"\n{len(dev_mismatches)} pairs with a relation in dev but not in train\n"
    f"{len(train_mismatches)} pairs with a relation in train but not in dev"
)


3246 pairs with a relation in dev but not in train
2462 pairs with a relation in train but not in dev


In [15]:
# dev_mismatches

In [16]:
# train_mismatches

In [17]:
def generate_human_readable_output(filename, mismatches, sentences, this_data, other_data, verbose=True):
    """
    Writes a human-readable TSV of mismatching relations and returns a lookup
    of the same statistics.

    One row is written per (head word, dependent word, relation) triple, with
    the relations of each pair taken in first-occurrence order (duplicates
    removed), so repeated runs produce identical files.

    Args:
        filename: the output filename (written as UTF-8)

        mismatches: the dictionary of word pairs with a relation in one file but not the other
            (head word, dependent word) -> list of relations

        sentences: the dictionary of word pairs, their relations, and the sentences they're found in
            ((head word, dependent word), relation) -> list of [sentence id, sentence text]

        this_data: (head word, dependent word) -> list of all relations of that
            pair in the data the mismatches come from

        other_data: same as this_data, for the data being compared against

        verbose: when True (the default), also print every row that is written

    Returns:
        dict mapping (head word, dependent word, relation) to a dict with keys
        "most_common_label", "most_common_count", "proportion_these",
        "most_common_label_other", "most_common_count_other", "proportion_other".

    Raises:
        KeyError: if a pair is missing from this_data or other_data, or a
            (pair, relation) triple is missing from sentences.
    """
    columns = ["ID",
               "SENTENCE",
               "HEAD WORD",
               "DEPENDENT WORD",
               "RELATION",
               "TOP RELATION IN THIS DATA",
               "COUNT OF TOP RELATION IN THIS DATA",
               "PROPORTION OF TOP RELATION IN THIS DATA",
               "TOP RELATION IN OTHER DATA",
               "COUNT OF TOP RELATION IN OTHER DATA",
               "PROPORTION OF TOP RELATION IN OTHER DATA"]
    conversion_dict = {}
    # Explicit UTF-8: the sentence texts may contain characters that the
    # platform's default encoding cannot represent.
    with open(filename, "w", encoding="utf-8") as output:
        output.write("\t".join(columns) + "\n")
        for pair, mismatched_relations in mismatches.items():
            if not mismatched_relations:
                continue
            head_word, dependent_word = pair

            # These statistics depend only on the pair, so compute them once
            # per pair instead of once per relation.
            these_relations = Counter(this_data[pair])
            other_relations = Counter(other_data[pair])
            most_common_label, most_common_count = these_relations.most_common(1)[0]
            most_common_label_other, most_common_count_other = other_relations.most_common(1)[0]
            proportion_these = most_common_count / sum(these_relations.values())
            proportion_other = most_common_count_other / sum(other_relations.values())

            # dict.fromkeys removes duplicates but, unlike set(), keeps a
            # reproducible (first-occurrence) order.
            for relation in dict.fromkeys(mismatched_relations):
                occurrences = sentences[(pair, relation)]
                sentence_ids = [str(occurrence[0]) for occurrence in occurrences]
                sentence_texts = [occurrence[1] for occurrence in occurrences]
                line = "\t".join([
                    "('" + "', '".join(sentence_ids) + "')",
                    "('" + "', '".join(sentence_texts) + "')",
                    head_word,
                    dependent_word,
                    relation,
                    most_common_label,
                    str(most_common_count),
                    str(proportion_these),
                    most_common_label_other,
                    str(most_common_count_other),
                    str(proportion_other),
                ]) + "\n"
                conversion_dict[(head_word, dependent_word, relation)] = {
                    "most_common_label": most_common_label,
                    "most_common_count": most_common_count,
                    "proportion_these": proportion_these,
                    "most_common_label_other": most_common_label_other,
                    "most_common_count_other": most_common_count_other,
                    "proportion_other": proportion_other,
                }
                if verbose:
                    print("LINE:\t"+line)
                output.write(line)
    return conversion_dict

In [18]:
# Write the dev-side mismatch report; "this" data is dev, "other" data is train.
dev_conversion = generate_human_readable_output(
    filename=dev_output,
    mismatches=dev_mismatches,
    sentences=dev_sentences,
    this_data=dev_list,
    other_data=train_list,
)

LINE:	('0', '28074')	('In an Oct. 19 review of `` The Misanthrope '' at Chicago 's Goodman Theatre ( `` Revitalized Classics Take the Stage in Windy City , '' Leisure & Arts ) , the role of Celimene , played by Kim Cattrall , was mistakenly attributed to Christina Haag .', 'The success of the NWA financing , and the failure of the UAL deal , also seem to highlight the important new role in takeover financing being played by Japanese banks .')	role	played	acl	acl	2	1.0	acl:relcl	1	1.0

LINE:	('36990')	('The combination of solid loan growth with tight expense control gave Wells Fargo a 1.25 % return on average assets for the quarter , about 40 % higher than Security Pacific 's and a profit ratio matched by only two or three other major banks in the U.S. .')	U.S.	in	amod	case	199	0.995	case	4	1.0

LINE:	('1849', '3352', '3354', '5867', '5872', '5874', '6222', '6247', '6289', '6396', '6398', '6399', '6414', '6414', '6415', '6649', '6736', '6807', '7042', '7094', '7132', '7347', '7726', '78

LINE:	('5171')	('But at least Burger King has signed on , and says that by year end it wo n't be using any shell eggs .')	using	it	nsubj	dobj	4	0.8	obj	1	1.0

LINE:	('3724', '6889', '11966', '13188')	('`` I 'm using it a lot , '' she says .', 'They renamed it Swiss Cantobank and are using it to expand abroad .', '`` It 's not as if we 're teaching language per se , '' he says , `` We 're just using it . ''', 'But unlike Mr. Ruder , who during the 1987 crash damaged himself by saying rather offhandedly that the markets might be closed , Mr. Breeden is turning the market drop to his own advantage , using it to further his agenda for the SEC .')	using	it	dobj	dobj	4	0.8	obj	1	1.0

LINE:	('3727')	('A surprising 78 % of people said they exercise regularly , up from 73 % in 1981 .')	said	up	advmod	advmod	1	1.0	ccomp	1	1.0

LINE:	('3733', '15921', '26111')	('`` It 's hard to know if people are responding truthfully .', '`` It 's hard to know right now if the change is fundamental or cyclical 

LINE:	('5173', '17412', '18633')	('So there is reason to believe that Michael 's hopes for a bacteria-free , long-shelf-life egg were n't all hype .', 'So the potato crop , once 47 million tons , is down to 35 million .', 'So too , according to many reports , is British Airways PLC , despite its public withdrawal from the buy-out .')	is	So	dep	dep	3	0.75	advmod	1	1.0

LINE:	('5180', '32470')	('Other institutional users reportedly include Marriott , which is moving away from fresh eggs on a region-by-region basis .', 'The parties are currently negotiating over who would manage the building , which will be emptied of 6,000 employees from Sears ' merchandise group , which is moving elsewhere .')	moving	which	nsubj	nsubj	2	1.0	obl	1	1.0

LINE:	('5187', '8500', '12971', '17022', '28345', '29779')	('( The company did n't put out a public announcement .', 'If there is n't , { the deal } wo n't be put forward '' to shareholders .', 'Seeking to allay European concerns , U.S. Agriculture Secreta

LINE:	('38401')	('This fatuous statement was not taken seriously when enacted in 1972 , and should not now be confused with the operative provisions of the statute .')	confused	be	auxpass	auxpass	1	1.0	aux:pass	1	1.0

LINE:	('38421')	('To combat that problem , National Geographic , like other magazines , began offering regional editions allowing advertisers to appear in only a portion of its magazines -- for example , ads can run only in the magazines sent to subscribers in the largest 25 markets .')	run	example	nmod	nmod	1	1.0	obl	1	1.0

LINE:	('38454')	('While not specifically mentioned in the FBI charges , dual trading became a focus of attempts to tighten industry regulations .')	mentioned	not	neg	neg	1	1.0	advmod	1	1.0

LINE:	('38498')	('By 1982 , however , the patent status of the Lanier microcassette had changed , permitting Dictaphone to develop its own competitive micro system , which it did .')	did	which	dobj	dobj	1	1.0	obj	1	1.0

LINE:	('38520')	('`` They either pick it up ,

In [19]:
# Write the train-side mismatch report; "this" data is train, "other" data is dev.
train_conversion = generate_human_readable_output(
    filename=train_output,
    mismatches=train_mismatches,
    sentences=train_sentences,
    this_data=train_list,
    other_data=dev_list,
)

LINE:	('GUM_bio_padalecki-15')	('In 2000, he was cast as Dean Forester on the television series Gilmore Girls, a role he played until 2005.')	role	played	acl:relcl	acl:relcl	1	1.0	acl	2	1.0

LINE:	('GUM_academic_thrones-11')	('The study of how people, as fans, access and manage information within a transmedia system provides valuable insight that contributes not only to practitioners and scholars of the media industry, but to the wider context of cultural studies, by offering findings on this new model of the fan as consumer and information-user.')	access	information	obj	obj	1	1.0	nmod	1	0.5

LINE:	('GUM_interview_chomsky-15', 'GUM_interview_chomsky-16')	('I mean the theory was, whether you believe it or not, that it would be a defensive alliance against potential Soviet aggression, that’s the basic doctrine.', 'Well there’s no defense against Soviet aggression, so whether you believe that doctrine or not that’s gone.')	believe	not	conj	conj	2	1.0	neg	6	1.0

LINE:	('GUM_bio_gordon-27',

LINE:	('GUM_academic_replication-3', 'GUM_bio_higuchi-33', 'GUM_voyage_tulsa-10')	('Replication studies are considered by many to play a fundamental role in any scientific endeavor.', 'The last two are considered her best work.', 'The winters are considered to be very mild.')	considered	are	aux:pass	aux:pass	3	1.0	auxpass	9	1.0

LINE:	('GUM_academic_theropod-20')	('Interestingly, femur length tended to be greater in the experimental group than in both the control-weight and the control group (by 4 and 7%, respectively), although not signifcant.')	%	respectively	nmod	nmod	1	1.0	advmod	7	1.0

LINE:	('GUM_news_warming-18')	('The U.S. Environmental Protection Agency says some crops may have higher yields with increased levels of carbon dioxide.')	have	yields	obj	obj	1	1.0	dobj	3	1.0

LINE:	('GUM_fiction_giants-8', 'GUM_fiction_giants-21', 'GUM_fiction_wedding-38', 'GUM_news_crane-5')	('The young were mostly dead, and the old men had been taken away, they told us, to learn important new thi

LINE:	('GUM_news_election-1')	('New faces emerge as veteran politicians step down in Hong Kong legislative election')	step	down	compound:prt	compound:prt	1	1.0	advmod	1	1.0

LINE:	('GUM_academic_lighting-3')	('Electrical Engineering Department, Bataan Peninsula State University, 2100, Balanga City Bataan, Philippines')	Department	University	list	list	1	1.0	dep	1	1.0

LINE:	('GUM_fiction_veronique-36')	('It took my tired head a long time to sort that out.')	took	head	obj	obl	1	0.5	nmod	1	1.0

LINE:	('GUM_bio_theodorus-31')	('Within a few months his prediction came true, and Apa Horsiesios once more took his place as the head of the communities in both title and authority. [1]')	took	head	obl	obl	1	0.5	nmod	1	1.0

LINE:	('GUM_interview_herrick-34')	('As a result some folks believe wikiHow is a more humane and enjoyable place to work than other places online.')	believe	result	obl	obl	1	1.0	ccomp	1	1.0

LINE:	('GUM_interview_onion-60')	('If someone is continually telling unfunny jokes, do 

LINE:	('GUM_fiction_frankenstein-56')	('“We went in but she kept grabbing stuff and I had to take her out and now she’s upset.”')	went	kept	conj	conj	1	1.0	advcl	1	1.0

LINE:	('GUM_whow_arrogant-54')	('The more they hate someone, the more dangerous that person is to their fantasy land.')	person	that	det	mark	1	0.5	mark	1	1.0

LINE:	('GUM_whow_basil-38')	('It's best to put basil somewhere where it will get a good deal of sunshine and have well-drained soil.')	best	get	advcl	advcl	1	1.0	acl:relcl	1	1.0

LINE:	('GUM_fiction_oversite-52')	('I don’t dream about Renata, although when the dreams wake me up, it’s thinking about Renata that keeps me awake.')	thinking	it	nsubj	nsubj	1	1.0	nmod	1	1.0

LINE:	('GUM_whow_procrastinating-33')	('Simply force yourself to use the next 120 seconds to be productive and do the duty you normally would push off for hours or days. [1].')	do	duty	obj	obj	1	1.0	dobj	1	1.0

LINE:	('GUM_voyage_cleveland-18')	('Serving as a global model for urban rebirth, Clevelan

In [20]:
# Look at the statistics stored for one example (head, dependent, relation) triple.
dev_conversion["role", "played", "acl"]

{'most_common_label': 'acl',
 'most_common_count': 2,
 'proportion_these': 1.0,
 'most_common_label_other': 'acl:relcl',
 'most_common_count_other': 1,
 'proportion_other': 1.0}

In [21]:
# Preview the first few entries instead of echoing the whole dictionary into
# the notebook output.
dict(list(train_conversion.items())[:5])

{('role', 'played', 'acl:relcl'): {'most_common_label': 'acl:relcl',
  'most_common_count': 1,
  'proportion_these': 1.0,
  'most_common_label_other': 'acl',
  'most_common_count_other': 2,
  'proportion_other': 1.0},
 ('access', 'information', 'obj'): {'most_common_label': 'obj',
  'most_common_count': 1,
  'proportion_these': 1.0,
  'most_common_label_other': 'nmod',
  'most_common_count_other': 1,
  'proportion_other': 0.5},
 ('believe', 'not', 'conj'): {'most_common_label': 'conj',
  'most_common_count': 2,
  'proportion_these': 1.0,
  'most_common_label_other': 'neg',
  'most_common_count_other': 6,
  'proportion_other': 1.0},
 ('according', 'to', 'fixed'): {'most_common_label': 'fixed',
  'most_common_count': 6,
  'proportion_these': 1.0,
  'most_common_label_other': 'mwe',
  'most_common_count_other': 314,
  'proportion_other': 1.0},
 ('said', 'had', 'xcomp'): {'most_common_label': 'xcomp',
  'most_common_count': 1,
  'proportion_these': 0.5,
  'most_common_label_other': 'ccomp'

In [64]:
def apply_conversions(filename, conversion_dictionary, verbose=True):
    """
    Relabels dependency relations of a CoNLL-U file according to a conversion
    dictionary and returns the converted sentences.

    For every token row whose (head word, word, relation) triple is a key of
    conversion_dictionary with a non-empty value, the relation column is
    replaced by that value's "most_common_label_other". The input file itself
    is not modified.

    Heads are resolved through the ID column, so the result does not depend on
    a "# text =" comment being present. Rows whose ID column is not a plain
    integer (e.g. "1-2" ranges) are kept in the result but never relabelled.

    Args:
        filename: path of the CoNLL-U file to read (opened as UTF-8)

        conversion_dictionary: (head word, word, relation) -> dict containing
            at least the key "most_common_label_other", e.g. the value returned
            by generate_human_readable_output

        verbose: when True (the default), print every conversion that is applied

    Returns:
        A list with one [comment_lines, token_rows] item per sentence, in file
        order. comment_lines are the sentence's "#" lines exactly as read;
        token_rows are the tab-split rows after conversion (the last column
        keeps its line terminator).

    Raises:
        ValueError: if the HEAD column of a plain-integer row is not an integer.
        KeyError: if a HEAD refers to an ID that is not in the sentence.
    """
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(filename, encoding="utf-8") as infile:
        lines = infile.readlines()

    # Group the file into [comment lines, token rows] items.
    sentences = []
    metadata, tokens = [], []
    # The extra blank line flushes a final sentence that is not followed by one.
    for line in lines + ["\n"]:
        if line.startswith("#"):
            metadata.append(line)
        elif not line.strip():
            if metadata or tokens:
                sentences.append([metadata, tokens])
            metadata, tokens = [], []
        else:
            tokens.append(line.split("\t"))

    for metadata, tokens in sentences:
        forms_by_id = {int(fields[0]): fields[1] for fields in tokens if fields[0].isdigit()}
        for fields in tokens:
            if not fields[0].isdigit():
                # range / empty-node rows carry no relation to convert
                continue
            word = fields[1]
            head_idx = int(fields[6])
            head_word = forms_by_id[head_idx] if head_idx != 0 else "#ROOT#"
            relation = fields[7]
            triple = (head_word, word, relation)
            conversion = conversion_dictionary.get(triple)
            if not conversion:
                continue
            new_relation = conversion["most_common_label_other"]
            if verbose:
                print("\n")
                print(triple)
                print(conversion)
                print("OLD:\t"+relation)
                print("NEW:\t"+new_relation)
                print(fields)
            fields[7] = new_relation
            if verbose:
                print(fields)
    # Hand the converted rows back; previously they were dropped on return.
    return sentences


In [65]:
# Bind whatever apply_conversions returns to a name so that it stays available
# to later cells and is not echoed as the cell's output.
train_converted = apply_conversions(train_file, train_conversion)



('Kingdom', 'United', 'amod')
{'most_common_label': 'amod', 'most_common_count': 5, 'proportion_these': 0.7142857142857143, 'most_common_label_other': 'compound', 'most_common_count_other': 4, 'proportion_other': 1.0}
OLD:	amod
NEW:	compound
['8', 'United', 'United', 'PROPN', 'NNP', 'Number=Sing', '9', 'amod', '_', 'Entity=(place-9\n']
['8', 'United', 'United', 'PROPN', 'NNP', 'Number=Sing', '9', 'compound', '_', 'Entity=(place-9\n']


('Kingdom', 'United', 'amod')
{'most_common_label': 'amod', 'most_common_count': 5, 'proportion_these': 0.7142857142857143, 'most_common_label_other': 'compound', 'most_common_count_other': 4, 'proportion_other': 1.0}
OLD:	amod
NEW:	compound
['7', 'United', 'United', 'PROPN', 'NNP', 'Number=Sing', '8', 'amod', '_', 'Entity=(place-9\n']
['7', 'United', 'United', 'PROPN', 'NNP', 'Number=Sing', '8', 'compound', '_', 'Entity=(place-9\n']


('Kingdom', 'United', 'amod')
{'most_common_label': 'amod', 'most_common_count': 5, 'proportion_these': 0.714285714285

NEW:	advmod
['4', 'where', 'where', 'SCONJ', 'WRB', 'PronType=Int', '7', 'mark', '_', '_\n']
['4', 'where', 'where', 'SCONJ', 'WRB', 'PronType=Int', '7', 'advmod', '_', '_\n']


('expected', 'were', 'aux:pass')
{'most_common_label': 'aux:pass', 'most_common_count': 1, 'proportion_these': 1.0, 'most_common_label_other': 'auxpass', 'most_common_count_other': 4, 'proportion_other': 1.0}
OLD:	aux:pass
NEW:	auxpass
['6', 'were', 'be', 'AUX', 'VBD', 'Mood=Ind|Tense=Past|VerbForm=Fin', '7', 'aux:pass', '_', '_\n']
['6', 'were', 'be', 'AUX', 'VBD', 'Mood=Ind|Tense=Past|VerbForm=Fin', '7', 'auxpass', '_', '_\n']


('using', 'method', 'obj')
{'most_common_label': 'obj', 'most_common_count': 1, 'proportion_these': 1.0, 'most_common_label_other': 'dobj', 'most_common_count_other': 2, 'proportion_other': 1.0}
OLD:	obj
NEW:	dobj
['33', 'method', 'method', 'NOUN', 'NN', 'Number=Sing', '29', 'obj', '_', 'Entity=abstract-130)\n']
['33', 'method', 'method', 'NOUN', 'NN', 'Number=Sing', '29', 'dobj', '_'


('due', 'to', 'fixed')
{'most_common_label': 'fixed', 'most_common_count': 10, 'proportion_these': 1.0, 'most_common_label_other': 'mwe', 'most_common_count_other': 9, 'proportion_other': 1.0}
OLD:	fixed
NEW:	mwe
['20', 'to', 'to', 'PART', 'TO', '_', '19', 'fixed', '_', '_\n']
['20', 'to', 'to', 'PART', 'TO', '_', '19', 'mwe', '_', '_\n']


('forced', 'him', 'obj')
{'most_common_label': 'obj', 'most_common_count': 1, 'proportion_these': 1.0, 'most_common_label_other': 'dobj', 'most_common_count_other': 1, 'proportion_other': 1.0}
OLD:	obj
NEW:	dobj
['29', 'him', 'he', 'PRON', 'PRP', 'Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs', '28', 'obj', '_', 'Entity=(person-1)\n']
['29', 'him', 'he', 'PRON', 'PRP', 'Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs', '28', 'dobj', '_', 'Entity=(person-1)\n']


('organization', 'Institute', 'nmod')
{'most_common_label': 'nmod', 'most_common_count': 1, 'proportion_these': 1.0, 'most_common_label_other': 'nsubj', 'most_common_count_o

{'most_common_label': 'obj', 'most_common_count': 1, 'proportion_these': 1.0, 'most_common_label_other': 'nsubj', 'most_common_count_other': 1, 'proportion_other': 1.0}
OLD:	obj
NEW:	nsubj
['12', 'it', 'it', 'PRON', 'PRP', 'Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs', '11', 'obj', '_', 'Entity=(person-12)|SpaceAfter=No\n']
['12', 'it', 'it', 'PRON', 'PRP', 'Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs', '11', 'nsubj', '_', 'Entity=(person-12)|SpaceAfter=No\n']


('said', 'once', 'obl')
{'most_common_label': 'obl', 'most_common_count': 1, 'proportion_these': 1.0, 'most_common_label_other': 'advmod', 'most_common_count_other': 3, 'proportion_other': 1.0}
OLD:	obl
NEW:	advmod
['9', 'once', 'once', 'ADV', 'RB', 'NumType=Mult', '7', 'obl', '_', 'SpaceAfter=No\n']
['9', 'once', 'once', 'ADV', 'RB', 'NumType=Mult', '7', 'advmod', '_', 'SpaceAfter=No\n']


('said', 'thing', 'obj')
{'most_common_label': 'obj', 'most_common_count': 1, 'proportion_these': 1.0, 'most_common_l

('opportunity', 'this', 'nsubj')
{'most_common_label': 'nsubj', 'most_common_count': 1, 'proportion_these': 1.0, 'most_common_label_other': 'det', 'most_common_count_other': 3, 'proportion_other': 1.0}
OLD:	nsubj
NEW:	det
['7', 'this', 'this', 'PRON', 'DT', 'Number=Sing|PronType=Dem', '11', 'nsubj', '_', 'Entity=(event-50)\n']
['7', 'this', 'this', 'PRON', 'DT', 'Number=Sing|PronType=Dem', '11', 'det', '_', 'Entity=(event-50)\n']


('improve', 'coverage', 'obj')
{'most_common_label': 'obj', 'most_common_count': 1, 'proportion_these': 1.0, 'most_common_label_other': 'nsubj', 'most_common_count_other': 1, 'proportion_other': 1.0}
OLD:	obj
NEW:	nsubj
['15', 'coverage', 'coverage', 'NOUN', 'NN', 'Number=Sing', '13', 'obj', '_', 'Entity=abstract-64)\n']
['15', 'coverage', 'coverage', 'NOUN', 'NN', 'Number=Sing', '13', 'nsubj', '_', 'Entity=abstract-64)\n']


('think', 'have', 'conj')
{'most_common_label': 'conj', 'most_common_count': 1, 'proportion_these': 0.3333333333333333, 'most_common_l


('have', 'all', 'obl')
{'most_common_label': 'obl', 'most_common_count': 1, 'proportion_these': 1.0, 'most_common_label_other': 'nmod', 'most_common_count_other': 4, 'proportion_other': 0.5714285714285714}
OLD:	obl
NEW:	nmod
['20', 'all', 'all', 'PRON', 'DT', '_', '16', 'obl', '_', 'Entity=abstract-88)|SpaceAfter=No\n']
['20', 'all', 'all', 'PRON', 'DT', '_', '16', 'nmod', '_', 'Entity=abstract-88)|SpaceAfter=No\n']


('night', 'night', 'appos')
{'most_common_label': 'appos', 'most_common_count': 2, 'proportion_these': 1.0, 'most_common_label_other': 'nmod', 'most_common_count_other': 1, 'proportion_other': 1.0}
OLD:	appos
NEW:	nmod
['11', 'night', 'night', 'NOUN', 'NN', 'Number=Sing', '7', 'appos', '_', 'Entity=time-9)|SpaceAfter=No\n']
['11', 'night', 'night', 'NOUN', 'NN', 'Number=Sing', '7', 'nmod', '_', 'Entity=time-9)|SpaceAfter=No\n']


('visited', 'was', 'conj')
{'most_common_label': 'aux:pass', 'most_common_count': 1, 'proportion_these': 0.5, 'most_common_label_other': 'auxpa

NEW:	nsubj
['25', 'you', 'you', 'PRON', 'PRP', 'Case=Nom|Person=2|PronType=Prs', '23', 'obl', '_', 'Entity=(person-2)\n']
['25', 'you', 'you', 'PRON', 'PRP', 'Case=Nom|Person=2|PronType=Prs', '23', 'nsubj', '_', 'Entity=(person-2)\n']


('do', 'that', 'obj')
{'most_common_label': 'obj', 'most_common_count': 3, 'proportion_these': 0.6, 'most_common_label_other': 'dobj', 'most_common_count_other': 20, 'proportion_other': 0.5405405405405406}
OLD:	obj
NEW:	dobj
['30', 'that', 'that', 'PRON', 'WDT', 'PronType=Dem', '32', 'obj', '_', '_\n']
['30', 'that', 'that', 'PRON', 'WDT', 'PronType=Dem', '32', 'dobj', '_', '_\n']


('own', 'do', 'ccomp')
{'most_common_label': 'ccomp', 'most_common_count': 1, 'proportion_these': 1.0, 'most_common_label_other': 'aux', 'most_common_count_other': 2, 'proportion_other': 1.0}
OLD:	ccomp
NEW:	aux
['32', 'do', 'do', 'VERB', 'VBP', 'Mood=Ind|Tense=Pres|VerbForm=Fin', '27', 'ccomp', '_', 'SpaceAfter=No\n']
['32', 'do', 'do', 'VERB', 'VBP', 'Mood=Ind|Tense=Pres|V

['23', 'International', 'International', 'PROPN', 'NNP', 'Number=Sing', '25', 'compound', '_', '_\n']


('found', 'which', 'nsubj:pass')
{'most_common_label': 'nsubj:pass', 'most_common_count': 3, 'proportion_these': 1.0, 'most_common_label_other': 'nsubjpass', 'most_common_count_other': 1, 'proportion_other': 0.3333333333333333}
OLD:	nsubj:pass
NEW:	nsubjpass
['27', 'which', 'which', 'PRON', 'WDT', 'PronType=Rel', '30', 'nsubj:pass', '_', '_\n']
['27', 'which', 'which', 'PRON', 'WDT', 'PronType=Rel', '30', 'nsubjpass', '_', '_\n']


('found', 'be', 'aux:pass')
{'most_common_label': 'aux:pass', 'most_common_count': 2, 'proportion_these': 1.0, 'most_common_label_other': 'auxpass', 'most_common_count_other': 6, 'proportion_other': 1.0}
OLD:	aux:pass
NEW:	auxpass
['29', 'be', 'be', 'AUX', 'VB', 'VerbForm=Inf', '30', 'aux:pass', '_', '_\n']
['29', 'be', 'be', 'AUX', 'VB', 'VerbForm=Inf', '30', 'auxpass', '_', '_\n']


('been', 'there', 'expl')
{'most_common_label': 'expl', 'most_common_cou