In [22]:
import json
import ijson
from collections import Counter

In [6]:
PAD_ID = 0
PAD_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'
EMB_INIT_RANGE = 1.0
VOCAB_PREFIX = [PAD_TOKEN, UNK_TOKEN]
SUBJ_NER_TO_ID = {PAD_TOKEN: 0, UNK_TOKEN: 1, 'ORGANIZATION': 2, 'PERSON': 3}
OBJ_NER_TO_ID = {PAD_TOKEN: 0, UNK_TOKEN: 1, 'PERSON': 2, 'ORGANIZATION': 3, 'DATE': 4, 'NUMBER': 5, 'TITLE': 6, 'COUNTRY': 7, 'LOCATION': 8, 'CITY': 9, 'MISC': 10, 'STATE_OR_PROVINCE': 11, 'DURATION': 12, 'NATIONALITY': 13, 'CAUSE_OF_DEATH': 14, 'CRIMINAL_CHARGE': 15, 'RELIGION': 16, 'URL': 17, 'IDEOLOGY': 18}

In [7]:
def load_glove_vocab(file, wv_dim):
    """
    Load all words from glove.
    """
    vocab = set()
    with open(file, encoding='utf8') as f:
        for line in f:
            elems = line.split()
            token = ''.join(elems[0:-wv_dim])
            vocab.add(token)
    return vocab


In [40]:
def load_tokens_tac(filename):
    with open(filename) as infile:
        #data = ijson.items(infile, 'item')
        data = json.load(infile)
        tokens = []
        for i, d in enumerate(data):
            #if i % 1000 == 0:
            #    print('processing ', i)
            ts = d['token']
            ss, se, os, oe = d['subj_start'], d['subj_end'], d['obj_start'], d['obj_end']
            # do not create vocab for entity words
            ts[ss:se+1] = ['<PAD>']*(se-ss+1)
            ts[os:oe+1] = ['<PAD>']*(oe-os+1)
            tokens += list(filter(lambda t: t!='<PAD>', ts))
    print("{} tokens from {} examples loaded from {}.".format(len(tokens), i, filename))
    return tokens

def load_tokens_ucca(filename):
    with open(filename) as infile:
        #data = ijson.items(infile, 'item')
        data = json.load(infile)
        tokens = []
        for i, d in enumerate(data):
            
            ts = d['ucca_tokens']

            tac_to_ucca = { int(key):val for key, val in d['tac_to_ucca'].items() }
            ss = tac_to_ucca[d['subj_start']][0]
            se = tac_to_ucca[d['subj_end']][-1]
            os = tac_to_ucca[d['obj_start']][0]
            oe = tac_to_ucca[d['obj_end']][-1]


            # do not create vocab for entity words
            ts[ss:se+1] = ['<PAD>']*(se-ss+1)
            ts[os:oe+1] = ['<PAD>']*(oe-os+1)
            tokens += list(filter(lambda t: t!='<PAD>', ts))
    print("{} tokens from {} examples loaded from {}.".format(len(tokens), i, filename))
    return tokens

In [41]:
def build_vocab(tokens, glove_vocab, min_freq):
    """ build vocab from tokens and glove words. """
    counter = Counter(t for t in tokens)
    # if min_freq > 0, use min_freq, otherwise keep all glove words
    if min_freq > 0:
        v = sorted([t for t in counter if counter.get(t) >= min_freq], key=counter.get, reverse=True)
    else:
        v = sorted([t for t in counter if t in glove_vocab], key=counter.get, reverse=True)
    # add special tokens and entity mask tokens
    v = VOCAB_PREFIX + entity_masks() + v
    print("vocab built with {}/{} words.".format(len(v), len(counter)))
    return v


In [42]:
def entity_masks():
    """ Get all entity mask tokens as a list. """
    masks = []
    subj_entities = list(SUBJ_NER_TO_ID.keys())[2:]
    obj_entities = list(OBJ_NER_TO_ID.keys())[2:]
    masks += ["SUBJ-" + e for e in subj_entities]
    masks += ["OBJ-" + e for e in obj_entities]
    return masks


In [None]:
glove_dir = r'C:\Users\JYellin\re_1\glove'
wv_file = glove_dir + '\\' + r'glove.840B.300d.txt'
wv_dim = 300

print("loading glove...")
glove_vocab = load_glove_vocab(wv_file, wv_dim)
print("{} words loaded from glove.".format(len(glove_vocab)))


In [38]:
data_dir = r'C:\Users\JYellin\re_1\Others-Code\gcn-over-pruned-trees\dataset\tacred\enhanced\json-enhanced7\corenlp'
train_file = data_dir + r'\remote-out.json'
dev_file = data_dir + r'\dev.json'
test_file = data_dir + r'\test.json'
train_tokens = load_tokens(train_file)
#dev_tokens = load_tokens(dev_file)
#test_tokens = load_tokens(test_file)
print("building vocab...")
v = build_vocab(train_tokens, glove_vocab, 0)


2303883 tokens from 68105 examples loaded from C:\Users\JYellin\re_1\Others-Code\gcn-over-pruned-trees\dataset\tacred\enhanced\json-enhanced7\corenlp\remote-out.json.
building vocab...
vocab built with 53952/59418 words.


In [44]:
data_dir = r'C:\Users\JYellin\re_1\Others-Code\gcn-over-pruned-trees\dataset\tacred\enhanced\json-enhanced7\corenlp'
train_file = data_dir + r'\local-out.json'
dev_file = data_dir + r'\dev.json'
test_file = data_dir + r'\test.json'

train_tokens_tac = load_tokens_tac(train_file)
v_tac = build_vocab(train_tokens_tac, glove_vocab, 0)

train_tokens_ucca = load_tokens_ucca(train_file)
v_ucca = build_vocab(train_tokens_ucca, glove_vocab, 0)



2303883 tokens from 68105 examples loaded from C:\Users\JYellin\re_1\Others-Code\gcn-over-pruned-trees\dataset\tacred\enhanced\json-enhanced7\corenlp\local-out.json.
vocab built with 53952/59418 words.
2359378 tokens from 68105 examples loaded from C:\Users\JYellin\re_1\Others-Code\gcn-over-pruned-trees\dataset\tacred\enhanced\json-enhanced7\corenlp\local-out.json.
vocab built with 50990/54674 words.


In [45]:
set_tac = set(v_tac)
set_ucca = set(v_ucca)

In [47]:
diff = set_tac - set_ucca

In [51]:
len(diff)

4302

In [52]:
len(set_ucca)

50990

In [54]:
diff

{'anti-Islamic',
 'red-flagged',
 'wide-screen',
 'DVD-RW',
 '24-28',
 'all.When',
 'Anti-globalization',
 'child-rearing',
 '!!',
 'ex-rebels',
 'semi-regularly',
 '84-55',
 'long-haul',
 'post-Watergate',
 'third-place',
 'dead-tree',
 'customer-focused',
 'Sol-Angel',
 'nine-time',
 'Latin-infused',
 'holier-than-thou',
 're-name',
 'D-CA',
 'two-page',
 'neo-Classical',
 'free-market',
 'hot-seat',
 'R-Penn',
 '9-week-old',
 'all-Spanish',
 'START-UPS',
 'post-race',
 'growth-oriented',
 'Abdul-Aziz',
 '26-year-old',
 'financial-services',
 'second-best',
 ':43:00',
 'D-Mont.',
 'Tehran-based',
 'power-broker',
 'pro-life',
 'ex-military',
 'average-looking',
 'detail-oriented',
 'tough-guy',
 'pre-match',
 'Co-Moderator',
 '____',
 'Lashkar-e-Taiba',
 'Swiss-German',
 'D-Mass.',
 'coast-to-coast',
 'Pte.',
 'semi-retired',
 'die-hard',
 'next-to-last',
 'record-setting',
 'slam-dunk',
 'six-week',
 'mash-ups',
 'alien-ness',
 'seven-minute',
 'stem-cell',
 'seven-member',
 'active

In [55]:
import csv
with open('diff.csv', 'w', newline='') as myfile:
     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
     wr.writerow(diff)