In [None]:
# Directory the T-REx JSON files are unpacked into and later read from.
trex_path = 'trex_data'
# Directory name substituted for `trex_path` to locate the CoreNLP annotation files.
annot_path = 'corenlp_annotations'

### Get the T-REx dataset

In [None]:
%%bash -s "$trex_path"
# Python variables are not visible inside the bash subprocess. IPython expands
# `$trex_path` on the magic line above, and the value arrives here as $1.
trex_path="$1"

# -nc: do not download the archive again when the cell is re-run.
wget -nc https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/8760241/TREx.zip
mkdir -p "$trex_path"
# -d selects the extraction directory; without it unzip treats the extra
# argument as the name of an archive member. -n keeps already extracted files
# so a re-run never stops on an overwrite prompt.
unzip -n TREx.zip -d "$trex_path"

### Annotate with Stanford CoreNLP

In [None]:
%%bash

# Upgrade pip itself and install (or upgrade) the pycorenlp client package.
pip install -U pip pycorenlp

In [None]:
# Host part of the CoreNLP server URL. It is empty here, so it presumably has
# to be filled in before the client is created — TODO confirm.
hostname = ''

In [None]:
from pycorenlp import StanfordCoreNLP

# Address of the CoreNLP server: port 9000 on `hostname`.
container = ''.join(['http://', hostname, ':9000'])
nlp = StanfordCoreNLP(container)

# Properties passed along with every annotation request.
nlp_properties = dict(
    annotators='tokenize,ssplit,pos,depparse,ner',
    outputFormat='json',
)

In [None]:
# Scratch cell: map a few sample keys to the same placeholder string.
temp = {}

for value in (1, 2, 3, 38):
    temp[value] = 'Opa'

In [None]:
import glob
import json
import os

import pandas as pd
from tqdm import tqdm_notebook as tqdm

def filter_triples(triples):
    """Keep the triples whose predicate, object and subject all have a surface form.

    Args:
        triples(list): Triples, each a dict with 'predicate', 'object' and
            'subject' entries that carry a 'surfaceform' value
    Returns:
        list of the triples, in input order, for which all three
        'surfaceform' values are truthy
    """
    required_parts = ('predicate', 'object', 'subject')
    return [
        candidate for candidate in triples
        if all(candidate[part]['surfaceform'] for part in required_parts)
    ]

# Annotate every T-REx file with CoreNLP and write one annotation file per
# dataset file, mapping the short document id to its list of sentences.
for dataset_file in glob.glob(os.path.join(trex_path, '*.json')):
    dataset = pd.read_json(dataset_file)
    dataset = dataset[dataset['triples'].map(len) > 0]
    # assign() returns a new frame, so no column is written into a filtered slice.
    dataset = dataset.assign(triples=dataset['triples'].map(filter_triples))
    dataset = dataset[dataset['triples'].map(len) > 0]  # filter documents before applying nlp annotation
    nlp_annot = {}

    for document in tqdm(range(dataset.shape[0])):
        row = dataset.iloc[document]
        docid = row.docid.split('/')[-1]
        nlp_annot[docid] = nlp.annotate(row.text, properties=nlp_properties)['sentences']

    annot_file = dataset_file.replace(trex_path, annot_path)
    # The target directory is not created anywhere else in the notebook.
    os.makedirs(os.path.dirname(annot_file) or '.', exist_ok=True)
    # json.dump needs a writable file object, not a path string.
    with open(annot_file, 'w') as output:
        json.dump(nlp_annot, output)

### Extract features 

In [None]:
import nltk
# Fetch the NLTK stop word corpus that `stopwords_list` is loaded from.
nltk.download('stopwords')

import networkx as nx
import nltk
import string
import re

# English stop words from NLTK; used to filter words in the feature helpers.
stopwords_list = nltk.corpus.stopwords.words('english')
# Matches any single digit. Written as a raw string because '\d' in a plain
# string literal is an invalid escape sequence that newer Python versions warn about.
_digits = re.compile(r'\d')

def extract_tokens(annotation, arg1, arg2):
    """
    Finds the first sentence in which both arguments occur as single tokens.
        Args:
            annotation(list): Sentences of one document in Stanford CoreNLP style
            arg1(dict): First argument; 'boundaries' holds its [begin, end] character offsets
            arg2(dict): Second argument, same layout as arg1
        Returns:
            [sentence number, token index of arg1, token index of arg2], where the
            sentence number counts from 0 and the token indices are the tokens'
            'index' values (starting at 1); [-1, -1, -1] if no sentence holds both
    """
    def find_in_sentence(sentence_annotation, argument_annotation):
        # A token matches only when its offsets equal the argument boundaries exactly.
        boundaries = argument_annotation.get('boundaries')
        if not boundaries:
            return None
        begin, end = boundaries[0], boundaries[1]
        for token in sentence_annotation['tokens']:
            if token.get('characterOffsetBegin') == begin \
                and token.get('characterOffsetEnd') == end:
                return token['index']  # begin with 1!
        return None

    for i, sentence in enumerate(annotation):
        tok1 = find_in_sentence(sentence, arg1)
        if tok1 is None:
            # Keep looking: the arguments may sit in a later sentence.
            continue
        tok2 = find_in_sentence(sentence, arg2)
        if tok2 is not None:
            return [i, tok1, tok2]

    # Only give up after every sentence has been checked.
    return [-1, -1, -1]

def _get_bow_between(tokens, tok1, tok2):
    """
    Collects the lower-cased words located strictly between two tokens.
        Args:
            tokens(list): Tokens of one sentence, each with an 'originalText' entry
            tok1(int): Number of the first token (counted from 1)
            tok2(int): Number of the second token (counted from 1)
        Returns:
            space-separated words between the two tokens, in sentence order, with
            surrounding punctuation stripped and stop words, words containing a
            digit and empty strings removed
    """
    # The arguments may come in either order; always walk left to right.
    first, last = sorted((tok1, tok2))
    result = []
    # Token numbers start at 1, so list positions first .. last-2 are the
    # tokens strictly between the two arguments.
    for position in range(first, last - 1):
        word = tokens[position]['originalText'].strip(string.punctuation)
        if not word:
            continue
        word = word.lower()
        # NOTE(review): the word is already lower-cased here, so the isupper()
        # test can hardly ever fire; it may have been meant to run before
        # lower-casing — confirm.
        if word in stopwords_list or _digits.search(word) or word[0].isupper():
            continue
        result.append(word)

    return ' '.join(result)

def _get_pos_between(tokens, tok1, tok2):
    """
    Collects the part-of-speech tags of the tokens strictly between two tokens.
        Args:
            tokens(list): Tokens of one sentence, each with a 'pos' entry
            tok1(int): Number of the first token (counted from 1)
            tok2(int): Number of the second token (counted from 1)
        Returns:
            tags between the two tokens joined with '_', in sentence order;
            tags that occur inside string.punctuation are left out
    """
    # The arguments may come in either order; always walk left to right.
    first, last = sorted((tok1, tok2))
    result = []
    # Token numbers start at 1, so list positions first .. last-2 are the
    # tokens strictly between the two arguments.
    for position in range(first, last - 1):
        pos = tokens[position]['pos']
        if pos not in string.punctuation:
            result.append(pos)
    return '_'.join(result)

def _get_dep_path(dependencies, start, end):
    """
    Finds the shortest dependency path between two tokens in a sentence.
        Args:
            dependencies(list): List of dependencies in Stanford CoreNLP style
            start(int): Number of the first token
            end(int): Number of the second token
        Returns:
            list of token numbers [start ... end] as they are presented in the
            shortest dependency path; an empty list if either token is missing
            from the dependencies or the two tokens are not connected
    """
    # Direction is irrelevant for the path, so the edges go into an undirected graph.
    edges = [(edge['governor'], edge['dependent']) for edge in dependencies]
    graph = nx.Graph(edges)
    try:
        return list(nx.shortest_path(graph, source=start, target=end))
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # One unconnected token pair should not abort the whole extraction.
        return []

def _get_words_dep(tokens, dependency_path):
    """
    Joins the words of the inner tokens of a dependency path.
        Args:
            tokens(list): Tokens of one sentence, each with a 'word' entry
            dependency_path(list): Token numbers (counted from 1) along the path
        Returns:
            space-separated words of all path tokens except the first and the last
    """
    words = []
    for node in dependency_path[1:-1]:
        # Path entries count from 1, list positions from 0.
        words.append(tokens[node - 1]['word'])
    return ' '.join(words)

def _get_trigger(tokens, dependency_path):
    """
    Joins the non-stop-word lemmas of the inner tokens of a dependency path.
        Args:
            tokens(list): Tokens of one sentence, each with a 'lemma' entry
            dependency_path(list): Token numbers (counted from 1) along the path
        Returns:
            '|'-separated lemmas of all path tokens except the first and the
            last, skipping lemmas found in stopwords_list
    """
    inner_lemmas = (tokens[node - 1]['lemma'] for node in dependency_path[1:-1])
    return '|'.join(lemma for lemma in inner_lemmas if lemma not in stopwords_list)

def _get_entity_type(token):
    """Return the value stored under 'ner' in a token, or None if the key is absent."""
    entity_type = token.get('ner')
    return entity_type

def process_document(document, annotation=None):
    """
    Builds one feature record per triple whose arguments are found in the same sentence.
        Args:
            document: T-REx document providing 'docid' and 'triples'
            annotation(list): Sentences of the document in Stanford CoreNLP style;
                if omitted they are read from '<annot_path>/<docid>.json' under
                the key 'sentences'
        Returns:
            list of dicts, one per usable triple, holding bookkeeping fields
            (keys starting with '_') and the extracted features
    """
    docid = document['docid'].split('/')[-1]
    if annotation is None:
        # Close the file deterministically instead of leaking the handle.
        with open(os.path.join(annot_path, docid + '.json'), 'r') as annot_file:
            annotation = json.load(annot_file)['sentences']
    result = []

    for triple in document['triples']:
        if not (triple['object']['surfaceform']
                and triple['subject']['surfaceform']
                and triple['predicate']['surfaceform']):
            continue

        located = extract_tokens(annotation,
                                 triple['object'],
                                 triple['subject'])
        # Skip the triple when nothing usable came back, whether that is an
        # empty result or the [-1, -1, -1] "not found" marker.
        if not located or located[0] < 0:
            continue
        act_sent, tok1, tok2 = located

        sentence = annotation[act_sent]
        tokens = sentence['tokens']
        dependency_path = _get_dep_path(sentence['enhancedPlusPlusDependencies'], tok1, tok2)
        # Token numbers count from 1, list positions from 0.
        ent1 = _get_entity_type(tokens[tok1 - 1])
        ent2 = _get_entity_type(tokens[tok2 - 1])

        result.append({
            '_docid': docid,
            '_tok1': tok1,
            '_tok2': tok2,
            '_pred': '_'.join(triple['predicate']['surfaceform'].split()),
            '_sent_id': triple['sentence_id'],
            '_sentence': act_sent,
            '_dep_path': dependency_path,
            ## Titov features
            'bow': _get_bow_between(tokens, tok1, tok2),
            'e1': '_'.join(triple['object']['surfaceform'].split()),
            'e2': '_'.join(triple['subject']['surfaceform'].split()),
            'trigger': _get_trigger(tokens, dependency_path),
            'pos': _get_pos_between(tokens, tok1, tok2),
            # format() also copes with a token that carries no 'ner' value.
            'pairtype': '{}_{}'.format(ent1, ent2),
            'e1type': ent1,
            'e2type': ent2,
            'path': _get_words_dep(tokens, dependency_path)
        })
    return result

In [None]:
def extract_triples(data_chunk, annot_chunk):
    """
    Extracts the feature records of all annotated documents in a data chunk.
        Args:
            data_chunk(pd.DataFrame): T-REx documents, one per row, with a 'docid' column
            annot_chunk(str or dict): Path of a JSON annotation file, or the
                already loaded mapping from short document id to its annotation
        Returns:
            pd.DataFrame with one row per record returned by process_document;
            documents without an annotation are skipped
    """
    if isinstance(annot_chunk, str):
        with open(annot_chunk, 'r') as annot_file:
            annotations = json.load(annot_file)
    else:
        annotations = annot_chunk

    result = []

    for row in range(data_chunk.shape[0]):
        document = data_chunk.iloc[row]
        # Annotations are keyed by the last path segment of the document id.
        annotation = annotations.get(document['docid'].split('/')[-1])
        if annotation:
            result += process_document(document, annotation)

    return pd.DataFrame(result)

In [None]:
data_path = 'final_data'
# to_pickle does not create missing directories, so make sure the target exists.
os.makedirs(data_path, exist_ok=True)

# Build the feature table of every T-REx file from its annotation file.
for dataset_file in tqdm(glob.glob(os.path.join(trex_path, '*.json'))):
    data_chunk = pd.read_json(dataset_file)
    annot_chunk = dataset_file.replace(trex_path, annot_path)
    features = extract_triples(data_chunk, annot_chunk)
    # NOTE(review): the output keeps the source file name, '.json' suffix
    # included, although the content is a pickle — confirm before renaming.
    features.to_pickle(dataset_file.replace(trex_path, data_path))