In [1]:
import sys
from tqdm import tqdm

In [2]:
"""
Set some hyperparameters
"""
## Comparison Type
# comparison = "within"
comparison = "between"

## Data Directory
data_dir = "data/"

## Output Directory
output_dir = "output/"

In [3]:
gum_relations = []

In [4]:
# def process_data(input_file, input_type):
    

In [5]:
def process_ud_data(ud_file):
    """
    Processes UD data into dictionaries of word pairs, relations, and sentences.
    """
    print(f"\nProcessing '{ud_file}' data file...")
    with open(ud_file) as infile:
        ud_lines = infile.readlines()
    # get list of sentences
    sentences = []
    sentence = []
    for line in ud_lines:
#         if "# sent_id = " in line:
#             sentence.append(line.split(" sent_id = ")[1].strip())
#         if "# text =" in line:
#             sentence.append(line.split(" text = ")[1].strip())
        if line[0] == "#":
            continue
        elif len(line.strip()) == 0:
            sentences.append(sentence)
            sentence = []
        else:
            split = line.split("\t")
            sentence.append(split)

    pair_to_relations = {}
    pair_relation_to_sentences = {}
    for sentence in tqdm(sentences):
#         print("\nSENTENCE:\t",sentence)
        # each sentence is a list of lines split on tabs
        sentence_id = sentences.index(sentence)
#         print("Sentence ID:\t"+str(sentence_id))
        sentence_text = []
        for line in sentence:
            word = line[1]
            sentence_text.append(word)
#         print("Sentence Text:\t"+" ".join(sentence_text))
#         print("Sentence ID:\t"+sentence_id)
#         print("Sentence Text:\t"+sentence_text)
        for line in sentence:
#             print("LINE:\t",line)
            word_idx = line[0]
#             print("word idx:\t",word_idx)
            word = line[1]
#             print("word:\t"+word)
            head_idx = int(line[6])
#             print("head idx:\t",head_idx)
#             head_idx+1 needed to match place in list
            head_word = sentence[head_idx-1][1] if head_idx != 0 else "#ROOT#"
#             print("head word:\t"+head_word)
            relation = line[7]
            if relation not in gum_relations:
                gum_relations.append(relation)
#             print("relation:\t"+relation)
            word_pair = (head_word, word)
#             print("word pair:\t",word_pair)
            if word_pair in pair_to_relations:
                pair_to_relations[word_pair].append(relation)
            else:
                pair_to_relations[word_pair] = [relation]
            pair_relation = (word_pair, relation)
            if pair_relation in pair_relation_to_sentences:
                pair_relation_to_sentences[pair_relation].append([sentence_id, " ".join(sentence_text)])
            else:
                pair_relation_to_sentences[pair_relation] = [[sentence_id, " ".join(sentence_text)]]
    """
    pair_to_relations dict:
        e.g. {("Need", "'ll"): ["aux"], ...}

    pair_relation_to_sentences dict:
        e.g. {(("funnel", "Large"), "amod"): ["Large funnel or strainer to hold filter"], ...}
    """
    print(f"\t{len(pair_to_relations):,} head-dependent:relation pairs")
    print(f"\t{len(pair_relation_to_sentences):,} head-dependent-relation:sentence triples")
    return pair_to_relations, pair_relation_to_sentences

In [6]:
wsj_relations = []

In [7]:
def process_wsj_data(ud_file):
    """
    Processes WSJ data into dictionaries of word pairs, relations, and sentences.
    """
    print(f"\nProcessing '{ud_file}' data file...")
    with open(ud_file) as infile:
        ud_lines = infile.readlines()
    # get list of sentences
    sentences = []
    sentence = []
    for line in ud_lines:
#         if "# sent_id = " in line:
#             sentence.append(line.split(" sent_id = ")[1].strip())
#         if "# text =" in line:
#             sentence.append(line.split(" text = ")[1].strip())
        if line[0] == "#":
            continue
        elif len(line.strip()) == 0:
            sentences.append(sentence)
            sentence = []
        else:
            split = line.split("\t")
            sentence.append(split)

    pair_to_relations = {}
    pair_relation_to_sentences = {}
    for sentence in tqdm(sentences):
#         print("\nSENTENCE:\t",sentence)
        # each sentence is a list of lines split on tabs
        sentence_id = sentences.index(sentence)
#         print("Sentence ID:\t"+str(sentence_id))
        sentence_text = []
        for line in sentence:
            word = line[1]
            sentence_text.append(word)
#         print("Sentence Text:\t"+" ".join(sentence_text))
        for line in sentence:
#             print("LINE:\t",line)
            word_idx = line[0]
#             print("word idx:\t",word_idx)
            word = line[1]
#             print("word:\t"+word)
            head_idx = int(line[6])
#             print("head idx:\t",head_idx)
#             head_idx+1 needed to match place in list
            head_word = sentence[head_idx-1][1] if head_idx != 0 else "#ROOT#"
#             print("head word:\t"+head_word)
            relation = line[7]
            if relation not in wsj_relations:
                wsj_relations.append(relation)
#             print("relation:\t"+relation)
            word_pair = (head_word, word)
#             print("word pair:\t",word_pair)
            if word_pair in pair_to_relations:
                pair_to_relations[word_pair].append(relation)
            else:
                pair_to_relations[word_pair] = [relation]
            pair_relation = (word_pair, relation)
            if pair_relation in pair_relation_to_sentences:
                pair_relation_to_sentences[pair_relation].append([sentence_id, " ".join(sentence_text)])
            else:
                pair_relation_to_sentences[pair_relation] = [[sentence_id, " ".join(sentence_text)]]
    """
    pair_to_relations dict:
        e.g. {("Need", "'ll"): ["aux"], ...}

    pair_relation_to_sentences dict:
        e.g. {(("funnel", "Large"), "amod"): ["Large funnel or strainer to hold filter"], ...}
    """
    print(f"\t{len(pair_to_relations):,} head-dependent:relation pairs")
    print(f"\t{len(pair_relation_to_sentences):,} head-dependent-relation:sentence triples")
    return pair_to_relations, pair_relation_to_sentences

In [8]:
# GUM (Georgetown University Multilayer) corpus from UD website
gum_directory = data_dir+"ud-en-gum/"
gum_train = gum_directory+"en_gum-ud-train.conllu"
gum_dev = gum_directory+"en_gum-ud-dev.conllu"

# wsj corpus with different conventions
# converted from Stanford dependencies (?)
wsj_directory = data_dir+"wsj-DIFF-CONVENTIONS/"
wsj_train = wsj_directory+"train.conllu"
wsj_dev = wsj_directory+"dev.conllu"
wsj_test = wsj_directory+"test.conllu"

In [9]:
if comparison == "within":
    train_file = gum_train
    dev_file = gum_dev
    train_output = output_dir+"train_mismatches_within_corpus.tsv"
    dev_output = output_dir+"dev_mismatches_within_corpus.tsv"
    # get list for training data
    train_list, train_sentences = process_ud_data(train_file)
    # get list for testing data
    dev_list, dev_sentences = process_ud_data(dev_file)

    
elif comparison == "between":
    train_file = wsj_train
    dev_file = gum_train
    train_output = output_dir+"train_mismatches_between_corpus.tsv"
    dev_output = output_dir+"dev_mismatches_between_corpus.tsv"
    # get list for training data
    train_list, train_sentences = process_wsj_data(train_file)
    # get list for testing data
    dev_list, dev_sentences = process_ud_data(dev_file)


Processing 'data/wsj-DIFF-CONVENTIONS/train.conllu' data file...


100%|██████████| 39832/39832 [00:45<00:00, 879.21it/s] 
  0%|          | 0/4287 [00:00<?, ?it/s]

	467,290 head-dependent:relation pairs
	484,472 head-dependent-relation:sentence triples

Processing 'data/ud-en-gum/en_gum-ud-train.conllu' data file...


100%|██████████| 4287/4287 [00:00<00:00, 4696.68it/s]

	62,876 head-dependent:relation pairs
	63,502 head-dependent-relation:sentence triples





In [10]:
# for pair in train_list:
#     print(pair, train_list[pair])

In [11]:
# for triple in train_sentences:
# #     print(triple, train_sentences[triple])
#     for sentence in train_sentences[triple]:
#         print(triple, sentence)

In [12]:
# for pair in dev_list:
#     print(pair, dev_list[pair])

In [13]:
# for triple in dev_sentences:
# #     print(triple, dev_sentences[triple])
#     for sentence in dev_sentences[triple]:
#         print(triple, sentence)

In [14]:
# compare the two lists
dev_mismatches = {}
train_mismatches = {}
for dev_pair in dev_list.keys():
    # If the (head, dependent) pair in dev is in train
    if dev_pair in train_list.keys():
        # get the relations for that pair in dev
        dev_relations = dev_list[dev_pair]
#         print("dev pair:\t\t",dev_pair)
#         print("dev relations:\t\t",dev_relations)
        # and in train
        train_relations = train_list[dev_pair]
#         print("train relations:\t",train_relations)
        # TODO: decide which ones we actually care about...
        # get the relations in dev NOT in train
        not_in_train = [x for x in dev_relations if x not in set(train_relations)]
        # and get the relations in train NOT in dev
        not_in_dev = [x for x in train_relations if x not in set(dev_relations)]
#         print("relations not in dev:\t",not_in_train)
#         print("relations not in train:\t",not_in_dev)
        # if there are relations not in dev/train, add entry to dev_mismatches or train_mismatches for that pair-relation combo
        if len(not_in_train) != 0:
            dev_mismatches[dev_pair] = list(set(not_in_train))
#             print("dev_mismatches:\t",dev_mismatches)
        if len(not_in_dev) != 0:
            train_mismatches[dev_pair] = list(set(not_in_dev))

In [15]:
print("")
print(f"{len(dev_mismatches)} pairs with a relation in dev but not in train")
print(f"{len(train_mismatches)} pairs with a relation in train but not in dev")


2462 pairs with a relation in dev but not in train
3246 pairs with a relation in train but not in dev


In [16]:
def generate_human_readable_output(filename, mismatches, sentences):
    """
    Generates human-readable output file for evaluation.
    
    Args:
        filename: the output filename
        
        mismatches: the dictionary of word pairs with a relation in one file but not the other
        
        sentences: the dictionary of word pairs, their relations, and the sentences they're found in
    """
    with open(filename, "w") as output:
        header = ("ID" + "\t" +
                  "SENTENCE" + "\t" + 
                  "HEAD WORD" + "\t" + 
                  "DEPENDENT WORD" + "\t" + 
                  "RELATION" + "\t" + 
                  "CORRECT?" + "\n")
        output.write(header)
        for pair in mismatches:
            head_word = pair[0]
            dependent_word = pair[1]
            for relation in mismatches[pair]:
                triple = (pair, relation)
                for sentence in sentences[triple]:
                    sentence_id = str(sentence[0])
                    sentence_text = sentence[1]
                    print(sentence_id)
                    print(sentence_text)
                    line = (sentence_id + "\t" +
                            sentence_text + "\t" +
                            head_word + "\t" +
                            dependent_word + "\t" +
                            relation + "\n"
                            )
                    print("LINE:\t"+line)
                    output.write(line)

In [17]:
generate_human_readable_output(dev_output, dev_mismatches, dev_sentences)

2
Claire Bailey-Ross claire.bailey-ross@port.ac.uk University of Portsmouth , United Kingdom
LINE:	2	Claire Bailey-Ross claire.bailey-ross@port.ac.uk University of Portsmouth , United Kingdom	Kingdom	United	amod

3
Andrew Beresford a.m.beresford@durham.ac.uk Durham University , United Kingdom
LINE:	3	Andrew Beresford a.m.beresford@durham.ac.uk Durham University , United Kingdom	Kingdom	United	amod

4
Daniel Smith daniel.smith2@durham.ac.uk Durham University , United Kingdom
LINE:	4	Daniel Smith daniel.smith2@durham.ac.uk Durham University , United Kingdom	Kingdom	United	amod

5
Claire Warwick c.l.h.warwick@durham.ac.uk Durham University , United Kingdom
LINE:	5	Claire Warwick c.l.h.warwick@durham.ac.uk Durham University , United Kingdom	Kingdom	United	amod

2319
Why do you think the Paralympic movement has so little visibility in the US compared to other countries like Australia , the United Kingdom and even Canada ?
LINE:	2319	Why do you think the Paralympic movement has so little vis

955
He became a Christian at a young age , when he responded to the altar call at his church " to accept Christ as my Saviour . "
LINE:	955	He became a Christian at a young age , when he responded to the altar call at his church " to accept Christ as my Saviour . "	call	to	case

3772
Hours of stress , or regular exposure to stressful situations , can cause serious health problems .
LINE:	3772	Hours of stress , or regular exposure to stressful situations , can cause serious health problems .	problems	health	amod

958
He ministered for a short time among the Tarahumara Indians in Chihuahua , Mexico , until health problems due to an inadequate diet and the high altitude forced him to leave .
LINE:	958	He ministered for a short time among the Tarahumara Indians in Chihuahua , Mexico , until health problems due to an inadequate diet and the high altitude forced him to leave .	forced	him	obj

959
Sometime in this period , Nida became a founding charter member of Wycliffe Bible Translators , 

The young were mostly dead , and the old men had been taken away , they told us , to learn important new things and to come back when they were ready to contribute fully .
LINE:	1322	The young were mostly dead , and the old men had been taken away , they told us , to learn important new things and to come back when they were ready to contribute fully .	told	us	obj

1322
The young were mostly dead , and the old men had been taken away , they told us , to learn important new things and to come back when they were ready to contribute fully .
LINE:	1322	The young were mostly dead , and the old men had been taken away , they told us , to learn important new things and to come back when they were ready to contribute fully .	ready	when	mark

1328
The traveler stood up from the table .
LINE:	1328	The traveler stood up from the table .	stood	up	compound:prt

1331
“ Nonetheless , ” the traveler said , and she walked out .
LINE:	1331	“ Nonetheless , ” the traveler said , and she walked out .	said

LINE:	2600	Some may not know exactly what Stardust or Stardust @ home is .	know	not	advmod

2601
Can you explain more about it for us ?
LINE:	2601	Can you explain more about it for us ?	more	it	obl

2604
Stardust is a NASA Discovery mission that was launched in 1999 .
LINE:	2604	Stardust is a NASA Discovery mission that was launched in 1999 .	launched	that	nsubj:pass

2604
Stardust is a NASA Discovery mission that was launched in 1999 .
LINE:	2604	Stardust is a NASA Discovery mission that was launched in 1999 .	launched	was	aux:pass

2787
The attack was launched on Wednesday by a user labelled " Anonymous " , on the website " Insurgency Wiki " , a spinoff of 4chan .
LINE:	2787	The attack was launched on Wednesday by a user labelled " Anonymous " , on the website " Insurgency Wiki " , a spinoff of 4chan .	launched	was	aux:pass

2609
By " sample return " of course I mean a mission that brings back extraterrestrial material .
LINE:	2609	By " sample return " of course I mean a mission that

Kristen Wilkins , curator of " Andy Warhol : Photographs and Prints from the University Collection " at the University of Southern Indiana , January 23 - March 9 2014 .
LINE:	2956	Kristen Wilkins , curator of " Andy Warhol : Photographs and Prints from the University Collection " at the University of Southern Indiana , January 23 - March 9 2014 .	University	Indiana	nmod

2943
USI 's art gallery , like 189 other educational galleries and museums around the country , is a recipient of a major Warhol donor program , and this program is cultivating new interest in Warhol 's photographic legacy .
LINE:	2943	USI 's art gallery , like 189 other educational galleries and museums around the country , is a recipient of a major Warhol donor program , and this program is cultivating new interest in Warhol 's photographic legacy .	program	this	det

2951
The Andy Warhol Foundation for the Visual Arts made two donations to USI Art Collections , in 2007 and a second recently .
LINE:	2951	The Andy Warh

In [18]:
generate_human_readable_output(train_output, train_mismatches, train_sentences)

2684
Any sale of Intelogic could have an impact on the battle between Mr. Edelman and New York attorney Martin Ackerman for control of Datapoint .
LINE:	2684	Any sale of Intelogic could have an impact on the battle between Mr. Edelman and New York attorney Martin Ackerman for control of Datapoint .	have	impact	dobj

4711
Index arbitrage , Mr. Carpenter said last week , does n't have a `` negative impact on the market as a whole '' and Kidder 's customers were `` sophisticated '' enough to know that .
LINE:	4711	Index arbitrage , Mr. Carpenter said last week , does n't have a `` negative impact on the market as a whole '' and Kidder 's customers were `` sophisticated '' enough to know that .	have	impact	dobj

4886
The high court 's action , refusing to hear appeals by several drug companies , is likely to have a significant impact at several levels .
LINE:	4886	The high court 's action , refusing to hear appeals by several drug companies , is likely to have a significant impact at sever

26511
So throughout the decade I have maintained my share of individual retirement accounts and CDs , and tinkered with stocks , bonds and mutual funds , as well as preserving my necessary position in the residential real-estate market .
LINE:	26511	So throughout the decade I have maintained my share of individual retirement accounts and CDs , and tinkered with stocks , bonds and mutual funds , as well as preserving my necessary position in the residential real-estate market .	as	as	mwe

26682
Newport Beach operations differ from the Hollywood boiler rooms in style as well as in dollars .
LINE:	26682	Newport Beach operations differ from the Hollywood boiler rooms in style as well as in dollars .	as	as	mwe

27306
I suggest that The Wall Street Journal ( as well as other U.S. news publications of like mind ) should put its money where its mouth is : Lend computer equipment to replace that damaged at El Espectador , buy ad space , publish stories under the bylines of El Espectador journal

11455
The bugs will cause problems in `` specific and rare circumstances that will not occur in typical applications '' such as word-processing and spreadsheets , said Michael Slater , editor of the Microprocessor Report , an industry newsletter .
LINE:	11455	The bugs will cause problems in `` specific and rare circumstances that will not occur in typical applications '' such as word-processing and spreadsheets , said Michael Slater , editor of the Microprocessor Report , an industry newsletter .	such	as	mwe

11457
The real culprits , they said , are computer makers such as IBM that have jumped the gun to unveil 486-based products .
LINE:	11457	The real culprits , they said , are computer makers such as IBM that have jumped the gun to unveil 486-based products .	such	as	mwe

11475
Machines using the 486 are expected to challenge higher-priced work stations and minicomputers in applications such as so-called servers , which connect groups of computers together , and in computer-aided de

LINE:	5425	Earlier this month the accounting firm of Ernst & Young and the securities firm of Goldman , Sachs & Co. , the experts hired by the creditors , contended that Eastern would have difficulty meeting earnings targets the airline was projecting .	have	difficulty	dobj

13296
He predicted that PWA would have little difficulty attracting prospective buyers .
LINE:	13296	He predicted that PWA would have little difficulty attracting prospective buyers .	have	difficulty	dobj

16796
`` I suspect October was n't as good as the third quarter , and they 'll have difficulty matching the third quarter in the fourth quarter , '' Mr. Blumstein said .
LINE:	16796	`` I suspect October was n't as good as the third quarter , and they 'll have difficulty matching the third quarter in the fourth quarter , '' Mr. Blumstein said .	have	difficulty	dobj

25805
Bankers say , however , that the government may have difficulty selling the institution even without a floor price .
LINE:	25805	Bankers say , h


11374
Mr. Antar , the SEC said , made more than $ 60 million from the sale of his shares between 1985 and 1987 .
LINE:	11374	Mr. Antar , the SEC said , made more than $ 60 million from the sale of his shares between 1985 and 1987 .	more	than	mwe

11472
Unveiled last April , the chip crams 1.2 million transistors on a sliver of silicon , more than four times as many as on Intel 's earlier model , 80386 .
LINE:	11472	Unveiled last April , the chip crams 1.2 million transistors on a sliver of silicon , more than four times as many as on Intel 's earlier model , 80386 .	more	than	mwe

11631
When problems surface , the temptation becomes strong to summarily overhaul a market system that has served for more than 100 years .
LINE:	11631	When problems surface , the temptation becomes strong to summarily overhaul a market system that has served for more than 100 years .	more	than	mwe

11816
As their varied strategies suggest , there is more than one way to respond to a disaster -- though it 's

31913
Dr. Grossman , who also is president of New England Medical Center Hospitals in Boston , noted that the hospitals he runs deal with more than 100 utilization management firms and that many of them have different procedures and requirements .
LINE:	31913	Dr. Grossman , who also is president of New England Medical Center Hospitals in Boston , noted that the hospitals he runs deal with more than 100 utilization management firms and that many of them have different procedures and requirements .	more	than	mwe

31948
On two occasions the inflation rate rose to more than 10 % a year .
LINE:	31948	On two occasions the inflation rate rose to more than 10 % a year .	more	than	mwe

32046
And Southwest lawmakers were a driving force behind $ 54.6 million for U.S.-Mexico border facilities , or more than double the administration 's request .
LINE:	32046	And Southwest lawmakers were a driving force behind $ 54.6 million for U.S.-Mexico border facilities , or more than double the administration

12858
The more we think about it , the more we suspect Mr. Brady does indeed have enough power where he already is .
LINE:	12858	The more we think about it , the more we suspect Mr. Brady does indeed have enough power where he already is .	have	is	advcl

1698
If you have not , it is probable that a thorough airing of the dispute by calm and rational debate would have been the better course .
LINE:	1698	If you have not , it is probable that a thorough airing of the dispute by calm and rational debate would have been the better course .	have	not	neg

2164
`` We have previously had discussions with representatives of Pinkerton 's Inc. concerning the { sale of the company } and we concluded that we did not have liability under the contract , '' says American Brands .
LINE:	2164	`` We have previously had discussions with representatives of Pinkerton 's Inc. concerning the { sale of the company } and we concluded that we did not have liability under the contract , '' says American Brands .	h

28152
Though he himself does n't expect a recession soon , Mr. Wyss advises people who do that `` the best thing to be in is long that is , 20-year to 30-year Treasury bonds . ''
LINE:	28152	Though he himself does n't expect a recession soon , Mr. Wyss advises people who do that `` the best thing to be in is long that is , 20-year to 30-year Treasury bonds . ''	is	that	nsubj

28207
Mr. Katz advocates issues with low price-earnings ratios -- that is , low prices in relation to the company 's earnings per share .
LINE:	28207	Mr. Katz advocates issues with low price-earnings ratios -- that is , low prices in relation to the company 's earnings per share .	is	that	nsubj

31081
The effete Tyrannosaurus Rex Had strict Cretaceous views on sex , And that is why you only see him Reproduced in the museum .
LINE:	31081	The effete Tyrannosaurus Rex Had strict Cretaceous views on sex , And that is why you only see him Reproduced in the museum .	is	that	nsubj

32514
One possible solution for Wall St

31376
`` After more than 200 years , something new has happened to pencils , '' said Arthur D. Little in a 1974 report that publicly described the previously secret item .
LINE:	31376	`` After more than 200 years , something new has happened to pencils , '' said Arthur D. Little in a 1974 report that publicly described the previously secret item .	described	that	nsubj

28972
It was during the quiet exodus down the pristine concrete ramps of the Stick that I really understood the point of all those Walkmen and Watchmen .
LINE:	28972	It was during the quiet exodus down the pristine concrete ramps of the Stick that I really understood the point of all those Walkmen and Watchmen .	understood	that	advmod

10012
At its November meeting , it will try to revise its quotas to satisfy Persian Gulf members that can produce far more oil than their allotments .
LINE:	10012	At its November meeting , it will try to revise its quotas to satisfy Persian Gulf members that can produce far more oil than t

31952
And that increase in the money supply would not have happened without the consent of the Federal Reserve .
LINE:	31952	And that increase in the money supply would not have happened without the consent of the Federal Reserve .	increase	that	det

39054
Most of that increase came in the first hour after the phone lines opened at 8 a.m .
LINE:	39054	Most of that increase came in the first hour after the phone lines opened at 8 a.m .	increase	that	det

6792
Companies that actually market speed as part of their service train their managers to lead and participate in teams that increase speed and improve quality in everyday operations .
LINE:	6792	Companies that actually market speed as part of their service train their managers to lead and participate in teams that increase speed and improve quality in everyday operations .	increase	that	nsubj

11610
The Senate bill contains many provisions that will increase the costs of trading .
LINE:	11610	The Senate bill contains many provisions t


2858
The upgrade reflected the 20 % decline in shares of the bank since the firm lowered its rating in early October , based on the belief the stock had become expensive .
LINE:	2858	The upgrade reflected the 20 % decline in shares of the bank since the firm lowered its rating in early October , based on the belief the stock had become expensive .	%	20	compound

2866
Ocean Drilling & Research dropped 1 1/4 to 21 1/2 following news of a restructuring plan that calls for the company to reorganize its drilling business into a separate company and offer a 15 % to 20 % stake to the public .
LINE:	2866	Ocean Drilling & Research dropped 1 1/4 to 21 1/2 following news of a restructuring plan that calls for the company to reorganize its drilling business into a separate company and offer a 15 % to 20 % stake to the public .	%	20	compound

3146
In late May , Newmark & Lewis announced a plan to cut prices 5 % to 20 % and eliminate what it called a `` standard discount-retailing practice '' of ne

On U.S. military presence in Asia : Asked if his offer to allow the American military to use facilities in Singapore would help preserve America 's presence in the region at bases in the Philippines , he said , `` What we have done is make it easier for the Philippines to continue to host American bases without it being said they are lackeys of the imperialists and the only ones in Asia or in Southeast Asia .
LINE:	1886	On U.S. military presence in Asia : Asked if his offer to allow the American military to use facilities in Singapore would help preserve America 's presence in the region at bases in the Philippines , he said , `` What we have done is make it easier for the Philippines to continue to host American bases without it being said they are lackeys of the imperialists and the only ones in Asia or in Southeast Asia .	said	is	ccomp

2933
People familiar with the exchange said another idea likely to be approved is expanding the monthly reports on program trading to cover specific

11783
According to the report , inventories outside the farm sector grew at an annual rate of $ 24.6 billion in the third quarter , up from a $ 19.5 billion pace in the second quarter .
LINE:	11783	According to the report , inventories outside the farm sector grew at an annual rate of $ 24.6 billion in the third quarter , up from a $ 19.5 billion pace in the second quarter .	grew	up	advmod

33306
They grew up with more brand choices than any generation and have shown less allegiance so far .
LINE:	33306	They grew up with more brand choices than any generation and have shown less allegiance so far .	grew	up	advmod

13181
As the stock market lurched into a 190-point free fall on Oct. 13 , Mr. Breeden found himself scurrying around the sixth floor of the SEC -- from his desk , where the New York Stock Exchange was on an open line , to his assistant 's office , where the Commodity Futures Trading Commission was connected , to a third room , where a computer monitored market moves .
LINE:	1


21400
At such prices , according to Mr. Savaiko , bargain hunting and short-covering -- buying back of contracts previously sold -- by speculators is n't uncommon .
LINE:	21400	At such prices , according to Mr. Savaiko , bargain hunting and short-covering -- buying back of contracts previously sold -- by speculators is n't uncommon .	according	to	mwe

21480
The issue was oversubscribed and `` doing very well , '' according to an official with lead underwriter Morgan Stanley .
LINE:	21480	The issue was oversubscribed and `` doing very well , '' according to an official with lead underwriter Morgan Stanley .	according	to	mwe

21525
He paid an immense fine and was lucky , according to a local wag , to escape the gas chamber .
LINE:	21525	He paid an immense fine and was lucky , according to a local wag , to escape the gas chamber .	according	to	mwe

21653
The FTC 's request was `` not unusual '' and Emerson will make a `` full and prompt '' response , according to a spokesman .
LINE:	2165

Traders said the market was exceptionally thin , as small investors remain on the sidelines .
LINE:	18376	Traders said the market was exceptionally thin , as small investors remain on the sidelines .	thin	was	cop

25546
Volume was extremely thin at 351.3 million shares , the lightest volume of the week and modestly under Thursday 's 387.4 million shares .
LINE:	25546	Volume was extremely thin at 351.3 million shares , the lightest volume of the week and modestly under Thursday 's 387.4 million shares .	thin	was	cop

35496
Trading was relatively thin at an estimated 650 million shares , though brisker than Monday 's 526 million .
LINE:	35496	Trading was relatively thin at an estimated 650 million shares , though brisker than Monday 's 526 million .	thin	was	cop

12515
Sex and violence are routinely included even when they are irrelevant to the script , and high-tech special effects are continually substituted for good plot and character development .
LINE:	12515	Sex and violence are rou

21260
`` General Foods 's relationships with its agencies are based on the agencies ' work , and will continue to be , '' said David Hurwitt , a vice president of Kraft General Foods .
LINE:	21260	`` General Foods 's relationships with its agencies are based on the agencies ' work , and will continue to be , '' said David Hurwitt , a vice president of Kraft General Foods .	based	are	auxpass

25420
Both Westwood Brick and Westwood Group are based in Boston .
LINE:	25420	Both Westwood Brick and Westwood Group are based in Boston .	based	are	auxpass

26854
But against that combined increase of 12 % in students chosen by academic criteria , the plan eliminates a large category in which admissions now are based on grades , test scores and `` supplemental points '' for factors such as high-school curriculum , English-language proficiency and an essay .
LINE:	26854	But against that combined increase of 12 % in students chosen by academic criteria , the plan eliminates a large category in whic

37284
`` It helps us , and people feel better talking to someone who 's gone through the same thing , '' an SBA official says .
LINE:	37284	`` It helps us , and people feel better talking to someone who 's gone through the same thing , '' an SBA official says .	gone	who	nsubjpass

1604
Mr. Bush said that the December meeting , which was announced simultaneously in Moscow , will be held in the unusual setting of ships at sea to hold down the `` fanfare '' and force the two sides to limit participation to just small groups of advisers .
LINE:	1604	Mr. Bush said that the December meeting , which was announced simultaneously in Moscow , will be held in the unusual setting of ships at sea to hold down the `` fanfare '' and force the two sides to limit participation to just small groups of advisers .	announced	was	auxpass

2740
LTV 's planned increase , which was announced in an Oct. 26 memo to district managers , does n't affect electrogalvanized steel or tin plate .
LINE:	2740	LTV 's plann

26920
Or an investor who wants his accountant 's advice may be told , `` You seem like a guy who can make up his own mind . ''
LINE:	26920	Or an investor who wants his accountant 's advice may be told , `` You seem like a guy who can make up his own mind . ''	told	be	auxpass

36445
And Mr. Shultz curtly told Mr. Abrams that the general should be told that only he could repair his tarnished image .
LINE:	36445	And Mr. Shultz curtly told Mr. Abrams that the general should be told that only he could repair his tarnished image .	told	be	auxpass

175
`` It may very well be what the Japanese banks want , '' he told Radio New Zealand .
LINE:	175	`` It may very well be what the Japanese banks want , '' he told Radio New Zealand .	told	be	ccomp

20400
Fortunately , these same parents do want their children to get a decent education as traditionally understood , and they have enough common sense to know what that demands .
LINE:	20400	Fortunately , these same parents do want their children to ge

526
What they wo n't tell you is not to go aloft in anything you do n't want to get wet .
LINE:	526	What they wo n't tell you is not to go aloft in anything you do n't want to get wet .	tell	you	dobj

3798
Mr. Busch says there is a scientific explanation for all haunts , and he can even tell you how to encourage the spirits .
LINE:	3798	Mr. Busch says there is a scientific explanation for all haunts , and he can even tell you how to encourage the spirits .	tell	you	dobj

4115
The odds are against him , as he himself would no doubt tell you .
LINE:	4115	The odds are against him , as he himself would no doubt tell you .	tell	you	dobj

6169
As for Lincoln , if you ca n't guess why he 's so sweet to his sister when everybody else hates her , then I 'm not going to tell you .
LINE:	6169	As for Lincoln , if you ca n't guess why he 's so sweet to his sister when everybody else hates her , then I 'm not going to tell you .	tell	you	dobj

9926
Says a spokeswoman for the Brussels Bourse : `` Nob

8402
Moreover , in the relatively unregulated Indian stock markets , investors frequently do n't know what they are getting when they subscribe to an issue .
LINE:	8402	Moreover , in the relatively unregulated Indian stock markets , investors frequently do n't know what they are getting when they subscribe to an issue .	know	n't	neg

9053
But while analysts say that municipal bonds still offer good value , you would n't know it by the way institutional investors are rushing to dump their holdings .
LINE:	9053	But while analysts say that municipal bonds still offer good value , you would n't know it by the way institutional investors are rushing to dump their holdings .	know	n't	neg

9543
Lilly officials said they had seen reports of hypoglycemic unawareness among some patients making the shift from animal to human insulin , but did n't know if the problem had caused any deaths .
LINE:	9543	Lilly officials said they had seen reports of hypoglycemic unawareness among some patients making

Mr. Noriega did n't suffer from any hesitation once he had the pistol .
LINE:	20947	Mr. Noriega did n't suffer from any hesitation once he had the pistol .	had	once	mark

462
A breakdown showed that food prices were the most active part of growth with a rise of 0.6 % .
LINE:	462	A breakdown showed that food prices were the most active part of growth with a rise of 0.6 % .	part	that	mark

2231
The Sun concluded that Mr. Pierce is only part of the problem -- and a part that 's gone .
LINE:	2231	The Sun concluded that Mr. Pierce is only part of the problem -- and a part that 's gone .	part	that	mark

2713
`` Those who testified { yesterday } have consistently maintained that anyone who did n't agree with them is part of a coverup , a whitewash , or the subject of excessive influence , '' Mr. Bickwit said .
LINE:	2713	`` Those who testified { yesterday } have consistently maintained that anyone who did n't agree with them is part of a coverup , a whitewash , or the subject of excessive inf

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [19]:
# unique relations in gum
for item in gum_relations:
    print(item)

amod
root
cc
conj
punct
case
nmod
flat
list
compound
mark
aux
nsubj
obl
det
obj
acl:relcl
cop
acl
advmod
nmod:poss
advcl
nsubj:pass
aux:pass
compound:prt
xcomp
appos
fixed
parataxis
csubj
dep
nmod:tmod
nummod
expl
ccomp
cc:preconj
det:predet
iobj
orphan
obl:tmod
goeswith
obl:npmod
nmod:npmod
vocative
discourse
dislocated
reparandum
csubj:pass


In [20]:
# unique relations in wsj
for item in wsj_relations:
    print(item)

case
det
compound
nummod
nmod
punct
nmod:poss
amod
nsubj
dep
dobj
cc
conj
nsubjpass
acl
auxpass
advmod
root
ccomp
mark
xcomp
nmod:tmod
appos
nmod:npmod
aux
cop
neg
acl:relcl
advcl
mwe
det:predet
csubj
parataxis
compound:prt
iobj
expl
cc:preconj
discourse
csubjpass


In [21]:
# uniquely gum relations
for item in gum_relations:
    if item not in wsj_relations:
        print(item)

flat
list
obl
obj
nsubj:pass
aux:pass
fixed
orphan
obl:tmod
goeswith
obl:npmod
vocative
dislocated
reparandum
csubj:pass


In [22]:
# uniquely wsj relations
for item in wsj_relations:
    if item not in gum_relations:
        print(item)

dobj
nsubjpass
auxpass
neg
mwe
csubjpass
