In [None]:
# Directory holding the T-REx JSON files that the loops below glob for.
trex_path = 'trex_data'
# Directory receiving the CoreNLP annotation files (same file names as in trex_path).
annot_path = 'corenlp_annotations'

### Get the T-REx dataset

In [None]:
%%bash -s "$trex_path"

# The cell body runs in its own bash process and cannot see Python variables.
# IPython expands "$trex_path" on the magic line above and hands it to bash
# as the first positional argument.
dest="$1"

wget https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/8760241/TREx.zip
# -p: do not fail when the directory is already there (re-running the cell).
mkdir -p "$dest"
# -d selects the extraction directory; a bare trailing argument would be
# interpreted as the name of an archive member to extract.
unzip TREx.zip -d "$dest"

### Annotate with Stanford CoreNLP

In [None]:
%%bash

# Upgrade pip and install pycorenlp, the package the StanfordCoreNLP client is imported from.
pip install -U pip pycorenlp

In [None]:
# Host part of the CoreNLP server URL ('http://<hostname>:9000' is built from it).
# Left empty here; fill it in before running the annotation cells.
hostname = ''

In [None]:
from pycorenlp import StanfordCoreNLP

# URL of the CoreNLP server: the configured host on port 9000.
container = ''.join(['http://', hostname, ':9000'])
nlp = StanfordCoreNLP(container)

# Properties passed along with every annotate() request.
nlp_properties = dict([
    ('annotators', 'tokenize,ssplit,pos,depparse,ner'),
    ('outputFormat', 'json'),
])

In [None]:
import glob
import json
import os

import pandas as pd
from tqdm import tqdm_notebook as tqdm

def filter_triples(triples):
    """Keep only triples whose predicate, object and subject all have a truthy 'surfaceform'.

    Args:
        triples: iterable of triple dicts; each must provide 'predicate',
            'object' and 'subject' entries that can be indexed by 'surfaceform'.

    Returns:
        list: the qualifying triples, in their original order.
    """
    required_parts = ('predicate', 'object', 'subject')
    return [
        candidate for candidate in triples
        if all(candidate[part]['surfaceform'] for part in required_parts)
    ]

# Annotate every T-REx file with CoreNLP and store the sentences under annot_path.
# Files are visited in sorted order and skipped when their annotation file already
# exists, so an interrupted run can be restarted as-is instead of hand-editing a
# start offset into the (arbitrarily ordered) glob result.
os.makedirs(annot_path, exist_ok=True)

for dataset_file in sorted(glob.glob(os.path.join(trex_path, '*.json'))):
    annot_file = dataset_file.replace(trex_path, annot_path)
    if os.path.exists(annot_file):
        continue

    dataset = pd.read_json(dataset_file)
    # Drop triples without surface forms, then documents left without any triple,
    # so only documents that can yield features are sent to the server.
    dataset = dataset.assign(triples=dataset['triples'].map(filter_triples))
    dataset = dataset[dataset['triples'].map(len) > 0]
    nlp_annot = {}

    for doc_uri, text in tqdm(zip(dataset['docid'], dataset['text']), total=dataset.shape[0]):
        docid = doc_uri.split('/')[-1]  # last path segment of the document URI
        nlp_annot[docid] = nlp.annotate(text, properties=nlp_properties)['sentences']

    # Write under a temporary name and rename afterwards: a crash in the middle
    # of the dump must not leave a truncated file that a later run would skip.
    tmp_file = annot_file + '.tmp'
    with open(tmp_file, 'w') as annot_out:
        json.dump(nlp_annot, annot_out)
    os.replace(tmp_file, annot_file)

### Extract features 

In [None]:
import nltk
nltk.download('stopwords')

import networkx as nx
import nltk
import string
import re

# English stop words, used to filter the bag-of-words and trigger features.
stopwords_list = nltk.corpus.stopwords.words('english')
# Matches any single digit. Raw string: in a plain literal "\d" is an invalid
# string escape, which newer Python versions warn about.
_digits = re.compile(r'\d')

def extract_tokens(annotation, arg1, arg2):
    """Locate the first sentence that contains both arguments and their token indices.

    Args:
        annotation: sequence of sentence annotations, each with a 'tokens' list
            whose items carry 'index', 'characterOffsetBegin' and
            'characterOffsetEnd'.
        arg1: argument annotation whose 'boundaries' entry holds the
            (begin, end) character offsets of the first argument.
        arg2: same, for the second argument.

    Returns:
        [sentence_position, indices_of_arg1, indices_of_arg2] for the first
        sentence in which both arguments are found, otherwise [-1, -1, -1].
    """
    def find_in_sentence(sentence_annotation, argument_annotation):
        # Token 'index' values are used as-is; 0 doubles as "start not seen yet".
        first = 0
        for tok in sentence_annotation['tokens']:
            span = argument_annotation.get('boundaries')
            if tok.get('characterOffsetBegin') == span[0]:
                first = tok['index']
            if tok.get('characterOffsetEnd') != span[1]:
                continue
            last = tok['index']
            if first == last:
                # The argument is a single token.
                return [last]
            if first:
                return list(range(first, last + 1))
        return None

    for sentence_position, sentence in enumerate(annotation):
        first_span = find_in_sentence(sentence, arg1)
        if not first_span:
            continue
        second_span = find_in_sentence(sentence, arg2)
        if second_span:
            return [sentence_position, first_span, second_span]
    return [-1, -1, -1]

def _get_bow_between(tokens, tok1, tok2):
    """Build the bag-of-words string for the tokens lying between two entities.

    Args:
        tokens: sentence tokens; token index i is read from tokens[i-1] and
            must provide 'originalText'.
        tok1: list of token indices of one entity.
        tok2: list of token indices of the other entity.

    Returns:
        str: space-joined, lower-cased words between the two entities, with
        surrounding punctuation stripped and stop words and words containing
        digits removed.
    """
    first_span, second_span = sorted([tok1, tok2])
    between = range(max(first_span) + 1, min(second_span))
    raw_words = [tokens[position - 1]['originalText'] for position in between]

    cleaned = []
    for text in raw_words:
        # Strip one punctuation character at a time, in string.punctuation order.
        for mark in string.punctuation:
            text = text.strip(mark)
        if text != '':
            cleaned.append(text.lower())

    # NOTE(review): the isupper() test runs on already lower-cased words, so it
    # looks like it can hardly ever reject anything -- confirm the intent.
    kept = [
        word for word in cleaned
        if word not in stopwords_list
        and not _digits.search(word)
        and not word[0].isupper()
    ]
    return ' '.join(kept)

def _get_pos_between(tokens, tok1, tok2):
    result = []
    tok_left, tok_right = sorted([tok1, tok2])
    for pos in [tokens[i-1]['pos'] for i in range(max(tok_left) + 1, min(tok_right))]:
        if pos not in string.punctuation:
            result.append(pos)
    return '_'.join(result)

def _get_dep_path(dependencies, start, end):
    """
    Finds the shortest dependency path between two tokens in a sentence.
        Args:
            dependencies(list): dependency edges, each providing 'governor'
                and 'dependent' token numbers
            start(int): Number of the first token
            end(int): Number of the second token
        Returns:
            list of token numbers [start ... end] in the order in which they
            appear on the shortest path; edge direction is ignored
    """
    # Every governor/dependent pair becomes one edge of an undirected graph.
    graph = nx.Graph([(dep['governor'], dep['dependent']) for dep in dependencies])
    return list(nx.shortest_path(graph, source=start, target=end))

def _get_shortest_path(dependencies, left_set, right_set):
    """
    Finds the shortest dependency path between two sets of tokens in a sentence.
    """
    result = [1] * len(dependencies)
    for a in left_set:
        for b in right_set:
            candidate = _get_dep_path(dependencies, a, b)
            if len(candidate) < len(result):
                result = candidate
    return result    

def _get_words_dep(tokens, dependency_path):
    result = [tokens[i-1]['word'] for i in dependency_path[1:-1]]
    return ' '.join(result)

def _get_trigger(tokens, dependency_path):
    """Return the non-stop-word lemmas strictly inside a dependency path.

    Args:
        tokens: sentence tokens; token number i is read from tokens[i-1] and
            must provide 'lemma'.
        dependency_path: list of token numbers; its first and last entries
            (the two endpoints) are left out.

    Returns:
        str: the remaining lemmas in path order, joined by '|'.
    """
    inner_lemmas = [tokens[node - 1]['lemma'] for node in dependency_path[1:-1]]
    return '|'.join(lemma for lemma in inner_lemmas if lemma not in stopwords_list)

def _get_entity_type(tokens, tok):
    _replace = {
        'PERSON_PERSON': 'PERSON',
        'ORGANIZATION_ORGANIZATION': 'ORGANIZATION'
    }
    result = '_'.join([tokens[token-1].get('ner') for token in tok])
    for key, value in _replace.items():
        result = result.replace(key, value)
    return result

def process_document(document, annotation):
    """Build one feature record per triple whose two arguments share a sentence.

    Args:
        document: mapping-like row providing a 'docid' string and a 'triples'
            list; each triple provides 'object', 'subject', 'predicate' and
            'sentence_id'.
        annotation: sentence annotations of this document; the matching
            sentence is read for 'tokens' and 'enhancedPlusPlusDependencies'.

    Returns:
        list of dicts. Keys starting with '_' hold bookkeeping values, the
        remaining keys hold the feature values.
    """
    def underscored(text):
        # Replace every run of whitespace by a single underscore.
        return '_'.join(text.split())

    docid = document['docid'].split('/')[-1]
    records = []

    for triple in document['triples']:
        complete = (triple['object']['surfaceform']
                    and triple['subject']['surfaceform']
                    and triple['predicate']['surfaceform'])
        if not complete:
            continue

        # The triple's object is passed as the first argument, its subject as the second.
        act_sent, tok1, tok2 = extract_tokens(annotation, triple['object'], triple['subject'])
        if act_sent <= -1:
            # The two arguments were not found together in any sentence.
            continue

        surface1 = underscored(triple['object']['surfaceform'])
        surface2 = underscored(triple['subject']['surfaceform'])
        surface_pred = underscored(triple['predicate']['surfaceform'])

        sentence_tokens = annotation[act_sent]['tokens']
        bow = _get_bow_between(sentence_tokens, tok1, tok2)
        dependency_path = _get_shortest_path(
            annotation[act_sent]['enhancedPlusPlusDependencies'], tok1, tok2)
        trigger = _get_trigger(sentence_tokens, dependency_path)
        pos = _get_pos_between(sentence_tokens, tok1, tok2)
        ent1 = _get_entity_type(sentence_tokens, tok1)
        ent2 = _get_entity_type(sentence_tokens, tok2)
        path = _get_words_dep(sentence_tokens, dependency_path)

        record = {
            '_docid': docid,
            '_tok1': tok1,
            '_tok2': tok2,
            '_pred': surface_pred,
            '_sent_id': triple['sentence_id'],
            '_sentence': act_sent,
            '_dep_path': dependency_path,
            ## Titov features
            'bow': bow,
            'e1': surface1,
            'e2': surface2,
            'trigger': trigger,
            'pos': pos,
            'pairtype': ent1 + '_' + ent2,
            'e1type': ent1,
            'e2type': ent2,
            'path': path
        }
        records.append(record)
    return records

In [None]:
def extract_triples(data_chunk, annot_chunk):
    """Collect the feature records of every annotated document in a chunk.

    Args:
        data_chunk: DataFrame whose rows provide 'docid' (and whatever
            process_document reads from a row).
        annot_chunk: dict mapping the last '/'-separated segment of a docid
            to that document's sentence annotations.

    Returns:
        pandas.DataFrame with one row per record; documents with a missing or
        empty annotation contribute nothing.
    """
    records = []
    for _, document in data_chunk.iterrows():
        doc_key = document['docid'].split('/')[-1]
        sentences = annot_chunk.get(doc_key)
        if not sentences:
            continue
        records.extend(process_document(document, sentences))
    return pd.DataFrame(records)

In [None]:
# Directory receiving one pickled feature table per T-REx file.
data_path = 'final_data'
os.makedirs(data_path, exist_ok=True)  # safe to re-run, unlike a plain `mkdir`

# Files are visited in sorted order and skipped when their feature file already
# exists, so an interrupted run can be restarted as-is instead of hand-editing a
# start offset into the (arbitrarily ordered) glob result.
for dataset_file in tqdm(sorted(glob.glob(os.path.join(trex_path, '*.json')))):
    feature_file = dataset_file.replace(trex_path, data_path)
    if os.path.exists(feature_file):
        continue

    data_chunk = pd.read_json(dataset_file)
    with open(dataset_file.replace(trex_path, annot_path), 'r') as annot_in:
        annot_chunk = json.load(annot_in)

    features = extract_triples(data_chunk, annot_chunk)
    if features.empty:
        # No record at all means no columns either; indexing '_sentence'
        # below would raise a KeyError.
        continue
    features = features[features['_sentence'] > -1]  # filter entities not given in the same sentence
    # Note: the pickle keeps the source file name, including its '.json' suffix.
    features.to_pickle(feature_file)

In [None]:
def convert_yao_like(path, of):
    """Merge the pickled feature tables into tab-separated train/dev/test files.

    The files matching ``path + '*.json'`` are taken in sorted order, so the
    split is the same on every run: the first 60% of the files form the
    training set and the rest is halved into dev and test (test receives the
    extra file when the remainder is odd). Columns whose name starts with '_'
    are dropped. The output has neither a header row nor an index column.

    Args:
        path: prefix of the input files, e.g. 'final_data/'.
        of: prefix of the output files; '_train.csv', '_dev.csv' and
            '_test.csv' are appended to it.
    """
    all_files = sorted(glob.glob(path + '*.json'))
    _train = int(len(all_files) * 0.6)
    _dev = (len(all_files) - _train) // 2

    # Whole-value replacements, applied in this order.
    collapse_rules = (
        ('PERSON_PERSON', 'PERSON'),
        ('ORGANIZATION_ORGANIZATION_ORGANIZATION', 'ORGANIZATION'),
        ('ORGANIZATION_ORGANIZATION', 'ORGANIZATION'),
    )
    # Only the single-entity type columns are collapsed. In 'pairtype' a value
    # such as 'PERSON_PERSON' is a genuine pair of two PERSON entities and
    # must be left alone.
    type_columns = ('e1type', 'e2type')

    result = []
    for file in all_files:
        # NOTE(review): read_pickle can execute code embedded in the file;
        # only point this at pickles produced by this notebook.
        df = pd.read_pickle(file)
        for column in type_columns:
            if column not in df.columns:
                continue
            for old, new in collapse_rules:
                df[column] = df[column].replace(old, new)
        result.append(df[[key for key in df.keys() if key[0] != '_']])

    def _write_split(frames, suffix):
        # pd.concat raises on an empty list, which happens for tiny inputs;
        # an empty split is written as an empty file instead.
        split = pd.concat(frames) if frames else pd.DataFrame()
        split.to_csv(of + suffix, sep='\t', index=False, header=False)

    _write_split(result[:_train], '_train.csv')
    _write_split(result[_train:_train + _dev], '_dev.csv')
    _write_split(result[_train + _dev:], '_test.csv')

In [None]:
# Write trex_train.csv, trex_dev.csv and trex_test.csv from the pickles under final_data/.
convert_yao_like('final_data/', 'trex')