In [None]:
%%bash

pip install -U spacy==2.1.0 
pip install -U neuralcoref --no-binary neuralcoref

In [None]:
! python -m spacy validate

In [None]:
! python -m spacy download en_core_web_sm

In [None]:
trex_path = 'trex_data'
annot_path = 'trex_corenlp_annotations'

In [None]:
import glob
import json
from tqdm import tqdm_notebook as tqdm
import os
import pandas as pd

### Get T-Rex dataset 

In [None]:
%%bash -s "$trex_path"

# The body of a %%bash cell is handed to bash unchanged, so Python variables are
# not visible inside it. The target directory is therefore passed on the magic
# line (where IPython does expand it) and read here as positional argument $1.
wget https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/8760241/TREx.zip
mkdir -p "$1"
unzip TREx.zip -d "$1"

### Initialize coreference resolver

In [None]:
import spacy
import neuralcoref

coref = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(coref)

def resolve_coreference(text):
    """
    Runs the `coref` spaCy pipeline on a text and returns the `coref_resolved`
    extension attribute of the resulting document.
    """
    return coref(text)._.coref_resolved

In [None]:
resolve_coreference("I go to the park. It is beautiful.")

### Initialize Stanford CoreNLP

In [None]:
%%bash

pip install -U pip pycorenlp

In [None]:
HOSTNAME = 'vmh2.isa.ru'
PORT = 9002

### Make annotations

In [None]:
aligned_documents = {}

In [None]:
import pickle

aligned_trex_path = 'trex_simpleqa_aligned'
aligned_documents = pickle.load(open(f'{aligned_trex_path}.pkl', 'rb'))
aligned_documents.update(pickle.load(open(f'{aligned_trex_path}_pt2.pkl', 'rb')))
aligned_documents.update(pickle.load(open(f'{aligned_trex_path}_pt3.pkl', 'rb')))
aligned_documents.update(pickle.load(open(f'{aligned_trex_path}_pt4.pkl', 'rb')))

In [None]:
len(aligned_documents.keys())

In [None]:
! ls -laht trex_data/*.json | wc -l

In [None]:
from pycorenlp import StanfordCoreNLP
import time


nlp = StanfordCoreNLP(f'http://{HOSTNAME}:{PORT}')
# Properties sent to the CoreNLP server with every annotation request.
# Each annotator is listed exactly once; 'lemma' is requested explicitly because
# OpenIE lists it among its prerequisites (previously 'tokenize' was given twice
# and 'lemma' was missing).
nlp_properties = {
    'annotators': 'tokenize,ssplit,pos,lemma,ner,depparse,natlog,openie',
    'outputFormat': 'json'
}

# Relation strings that are ignored: triplets whose relation is exactly one of
# these are dropped by filter_ner().
deprec_rels = {'to', 'for', 'by', 'with', 'also', 'as of',
               'had', 'said', 'said in', 'felt', 'on', 'gave', 'saw', 'found', 'did',
               'at', 'as', 'e', 'de', 'mo', '’s', 'v', 'yr', 'al',
               "'", 'na', 'v.', "d'", 'et', 'mp', 'di', 'y',
               'ne', 'c.', 'be', 'ao', 'mi', 'im', 'h',
               'has', 'between', 'are', 'returned', 'began', 'became',
               'along', 'doors as', 'subsequently terrytoons in',
              }


def filter_ner(sentence):
    """
    Selects the OpenIE triplets of a sentence that are worth keeping.
        Args:
            sentence(dict): sentence annotation with an 'openie' list of triplets
                ('subject', 'relation', 'object') and an 'entitymentions' list
                (each mention has a 'text' field)
        Returns:
            list of triplets whose relation has at least 2 characters, is not in
            `deprec_rels`, and whose subject or object equals the text of an
            entity mention; every triplet appears at most once
    """
    entity_texts = {entity['text'] for entity in sentence.get('entitymentions', ())}
    kept = []

    for triplet in sentence['openie']:
        relation = triplet['relation']
        if len(relation) < 2 or relation in deprec_rels:
            continue
        # A single membership test per triplet: the former per-entity loop appended
        # the same triplet once for every matching mention.
        if triplet['subject'] in entity_texts or triplet['object'] in entity_texts:
            kept.append(triplet)

    return kept


def corenlp_annotate(text):
    """
    Resolves coreference in a text and annotates the result with CoreNLP.
        Args:
            text(str): raw document text
        Returns:
            the 'sentences' entry of the server response, or None when the server
            did not deliver a parsed (dict) response on either of two attempts
    """
    text = resolve_coreference(text)

    for attempt in range(2):
        result = nlp.annotate(text, properties=nlp_properties)
        if isinstance(result, dict):
            # Previously the outcome of the second attempt was computed but never returned.
            return result['sentences']
        if attempt == 0:
            # Non-dict answer (e.g. a plain string): wait before the single retry.
            time.sleep(10)

    return None

# ids of documents for which no annotation could be obtained
broken_ids = []

# The annotation files are written next to each other in annot_path; create it up front.
os.makedirs(annot_path, exist_ok=True)

for dataset_file in tqdm(aligned_documents.keys()):
    annot_file = dataset_file.replace(trex_path, annot_path)
    if os.path.isfile(annot_file):
        # an annotation file already exists for this chunk, nothing to do
        continue

    dataset = pd.read_json(dataset_file).loc[aligned_documents[dataset_file]]
    nlp_annot = {}

    for document in tqdm(range(dataset.shape[0])):
        row = dataset.iloc[document]
        docid = row.docid.split('/')[-1]

        doc_annot = corenlp_annotate(row.text)
        if not doc_annot:
            # record the document id (the builtin `id` used to be appended here)
            broken_ids.append(docid)
            continue

        # keep only the sentences that still have triplets after filtering
        clean_annot = []
        for sentence in doc_annot:
            sentence['openie'] = filter_ner(sentence)
            if sentence['openie']:
                clean_annot.append(sentence)

        nlp_annot[docid] = clean_annot

    with open(annot_file, 'w') as annot_out:
        json.dump(nlp_annot, annot_out)

In [None]:
text = "Who was Homi K. Bhabha especially influenced by?"

In [None]:
nlp.annotate(text, properties=nlp_properties)

In [None]:
nlp_annot

In [None]:
! ls -laht trex_corenlp_annotations/ | wc -l

In [None]:
! ls -laht $annot_path

In [None]:
from pycorenlp import StanfordCoreNLP
import time


nlp = StanfordCoreNLP(f'http://{HOSTNAME}:{PORT}')
# Properties sent to the CoreNLP server with every annotation request.
# Each annotator is listed exactly once; 'lemma' is requested explicitly because
# OpenIE lists it among its prerequisites (previously 'tokenize' was given twice
# and 'lemma' was missing).
nlp_properties = {
    'annotators': 'tokenize,ssplit,pos,lemma,ner,depparse,natlog,openie',
    'outputFormat': 'json'
}

# Relation strings that are ignored: triplets whose relation is exactly one of
# these are dropped by filter_ner().
deprec_rels = {'in', 'is', 'was', 'of', "'s", 'to', 'for', 'by', 'with', 'also', 'as of',
               'had', 'said', 'said in', 'felt', 'on', 'gave', 'saw', 'found', 'did',
               'at', 'as', 'e', 'de', 'mo', '’s', 'v', 'yr', 'al',
               "'", 'na', 'v.', "d'", 'et', 'mp', 'di', 'y',
               'ne', 'c.', 'be', 'ao', 'mi', 'im', 'h',
               'has', 'between', 'are', 'returned', 'began', 'became',
               'along', 'doors as', 'subsequently terrytoons in',
              }


def filter_ner(sentence):
    """
    Selects the OpenIE triplets of a sentence that are worth keeping.
        Args:
            sentence(dict): sentence annotation with an 'openie' list of triplets
                ('subject', 'relation', 'object') and an 'entitymentions' list
                (each mention has a 'text' field)
        Returns:
            list of triplets whose relation has at least 3 characters, is not in
            `deprec_rels`, and whose subject or object equals the text of an
            entity mention; every triplet appears at most once
    """
    entity_texts = {entity['text'] for entity in sentence.get('entitymentions', ())}
    kept = []

    for triplet in sentence['openie']:
        relation = triplet['relation']
        if len(relation) < 3 or relation in deprec_rels:
            continue
        # A single membership test per triplet: the former per-entity loop appended
        # the same triplet once for every matching mention.
        if triplet['subject'] in entity_texts or triplet['object'] in entity_texts:
            kept.append(triplet)

    return kept


def corenlp_annotate(text):
    """
    Resolves coreference in a text and annotates the result with CoreNLP.
        Args:
            text(str): raw document text
        Returns:
            the 'sentences' entry of the server response, or None when the server
            did not deliver a parsed (dict) response on either of two attempts
    """
    text = resolve_coreference(text)

    for attempt in range(2):
        result = nlp.annotate(text, properties=nlp_properties)
        if isinstance(result, dict):
            # Previously the outcome of the second attempt was computed but never returned.
            return result['sentences']
        if attempt == 0:
            # Non-dict answer (e.g. a plain string): wait before the single retry.
            time.sleep(10)

    return None

# ids of documents for which no annotation could be obtained
broken_ids = []

# NOTE(review): [196:] looks like a manual resume offset into glob's listing — confirm it is still wanted
for dataset_file in tqdm(glob.glob(os.path.join(trex_path, '*.json'))[196:]):
    dataset = pd.read_json(dataset_file).loc[aligned_documents[dataset_file]]
    nlp_annot = {}

    for document in tqdm(range(dataset.shape[0])):
        row = dataset.iloc[document]
        docid = row.docid.split('/')[-1]

        doc_annot = corenlp_annotate(row.text)
        if not doc_annot:
            # record the document id (the builtin `id` used to be appended here)
            broken_ids.append(docid)
            continue

        # keep only the sentences that still have triplets after filtering
        clean_annot = []
        for sentence in doc_annot:
            sentence['openie'] = filter_ner(sentence)
            if sentence['openie']:
                clean_annot.append(sentence)

        nlp_annot[docid] = clean_annot

    with open(dataset_file.replace(trex_path, annot_path), 'w') as annot_out:
        json.dump(nlp_annot, annot_out)

In [None]:
len(glob.glob(os.path.join(trex_path, '*.json')))

In [None]:
! ls -laht $annot_path

In [None]:
! ls -laht $annot_path | wc -l

In [None]:
json.dump(nlp_annot, open(dataset_file.replace(trex_path, annot_path), 'w'))

In [None]:
! mkdir trex_corenlp_annotations

In [None]:
nlp_annot.keys()

In [None]:
import requests


def get_item(item):
    """
    Queries the Wikidata `wbsearchentities` action for an item.
        Args:
            item(str): search string sent as the 'search' parameter
        Returns:
            'label' of the first search hit, or None if the hit list is empty
    """
    query = {
        'format': 'json',
        'search': item,
        'action': 'wbsearchentities',
        'type': 'item',
        'language': 'en',
    }
    response = requests.get('https://www.wikidata.org/w/api.php', params=query)
    payload = response.json()
    if len(payload['search']):
        return payload['search'][0]['label']
    return None

In [None]:
get_item('Q6255339')

In [None]:
! ls -laht $annot_path

### Extract features 

In [None]:
import nltk
nltk.download('stopwords')

import networkx as nx
import nltk
import string
import re

stopwords_list = nltk.corpus.stopwords.words('english')
_digits = re.compile('\d')

def extract_tokens(annotation, arg1, arg2):
    """
    Locates two arguments inside the same annotated sentence.
        Args:
            annotation(list): sentence annotations, each with a 'tokens' list
            arg1(dict): argument with 'boundaries' (character offsets) and/or 'surfaceform'
            arg2(dict): second argument, same layout
        Returns:
            [sentence position, token indices of arg1, token indices of arg2] for the
            first sentence containing both arguments, [-1, -1, -1] otherwise
    """
    def find_in_sentence(sentence_annotation, argument_annotation):
        span = argument_annotation.get('boundaries')

        if not span:
            # No character offsets: only a single token equal to the surface form can match.
            for tok in sentence_annotation['tokens']:
                if tok['originalText'] == argument_annotation['surfaceform']:
                    return [tok['index']]
            return None

        first = 0  # index of the token opening the span; 0 means "not seen yet"
        for tok in sentence_annotation['tokens']:
            if tok.get('characterOffsetBegin') == span[0]:
                first = tok['index']
            if tok.get('characterOffsetEnd') != span[1]:
                continue
            last = tok['index']
            if first == last:
                # the span is a single token
                return [last]
            if first:
                return list(range(first, last + 1))
        return None

    for position, sentence in enumerate(annotation):
        first_span = find_in_sentence(sentence, arg1)
        if not first_span:
            continue
        second_span = find_in_sentence(sentence, arg2)
        if second_span:
            return [position, first_span, second_span]

    return [-1, -1, -1]

def _get_bow_between(tokens, tok1, tok2):
    """
    Collects the words located between two token spans.
        Args:
            tokens(list): token annotations of the sentence (token numbers start at 1)
            tok1(list): token numbers of the first span
            tok2(list): token numbers of the second span
        Returns:
            space-separated lowercased words between the spans, without stopwords,
            words containing digits and empty leftovers of punctuation stripping
    """
    left_span, right_span = sorted([tok1, tok2])
    in_between = [tokens[number - 1]['originalText']
                  for number in range(max(left_span) + 1, min(right_span))]

    kept = []
    for raw in in_between:
        # strip the punctuation marks one by one, in the order of string.punctuation
        for mark in string.punctuation:
            raw = raw.strip(mark)
        if raw == '':
            continue
        lowered = raw.lower()
        if lowered in stopwords_list:
            continue
        # NOTE(review): the uppercase test runs on the already lowercased word
        if _digits.search(lowered) or lowered[0].isupper():
            continue
        kept.append(lowered)

    return ' '.join(kept)

def _get_pos_between(tokens, tok1, tok2):
    result = []
    tok_left, tok_right = sorted([tok1, tok2])
    for pos in [tokens[i-1]['pos'] for i in range(max(tok_left) + 1, min(tok_right))]:
        if pos not in string.punctuation:
            result.append(pos)
    return '_'.join(result)

def _get_dep_path(dependencies, start, end):
    """
    Finds the shortest dependency path between two tokens in a sentence.
        Args:
            dependencies(list): List of dependencies in Stanford CoreNLP style
                (each with 'governor' and 'dependent' token numbers)
            start(int): Number of the first token
            end(int): Number of the second token
        Returns:
            list of tokens [start ... end] as they are presented in the shortest dependency path
    """
    # The dependencies are treated as undirected edges; the per-edge lookup table that
    # used to be built here was never read and has been dropped.
    edges = [(edge['governor'], edge['dependent']) for edge in dependencies]
    graph = nx.Graph(edges)
    return list(nx.shortest_path(graph, source=start, target=end))

def _get_shortest_path(dependencies, left_set, right_set):
    """
    Finds the shortest dependency path between two sets of tokens in a sentence.
        Args:
            dependencies(list): List of dependencies in Stanford CoreNLP style
            left_set(list): Token numbers of the first span
            right_set(list): Token numbers of the second span
        Returns:
            the shortest of the paths between any token of left_set and any token of
            right_set (the first one found wins ties); [] if one of the sets is empty
    """
    best = None
    for a in left_set:
        for b in right_set:
            candidate = _get_dep_path(dependencies, a, b)
            # `None` marks "nothing found yet"; the former placeholder list of ones
            # could leak into the result when no candidate was strictly shorter.
            if best is None or len(candidate) < len(best):
                best = candidate
    return best if best is not None else []

def _get_words_dep(tokens, dependency_path):
    result = [tokens[i-1]['word'] for i in dependency_path[1:-1]]
    return ' '.join(result)

def _get_trigger(tokens, dependency_path):
    """
    Returns the '|'-separated lemmas of the tokens lying strictly inside a dependency
    path, with the lemmas found in `stopwords_list` removed.
    """
    lemmas = [tokens[node - 1]['lemma'] for node in dependency_path[1:-1]]
    return '|'.join(lemma for lemma in lemmas if lemma not in stopwords_list)

def _get_entity_type(tokens, tok):
    _replace = {
        'PERSON_PERSON': 'PERSON',
        'ORGANIZATION_ORGANIZATION': 'ORGANIZATION'
    }
    result = '_'.join([tokens[token-1].get('ner') for token in tok])
    for key, value in _replace.items():
        result = result.replace(key, value)
    return result

def process_document(document, annotation):
    """
    Builds one feature record for every triple of a document whose two arguments are
    found in the same annotated sentence.
        Args:
            document: mapping with the 'docid' and the 'triples' of the document
            annotation(list): sentence annotations of the document
        Returns:
            list of dicts; keys starting with '_' hold bookkeeping values,
            the other keys hold the features
    """
    doc_key = document['docid'].split('/')[-1]
    records = []

    for triple in document['triples']:
        if not (triple['object']['surfaceform'] and triple['subject']['surfaceform']):
            continue

        # the object is passed first, so e1/tok1 describe the object and e2/tok2 the subject
        sent_no, span1, span2 = extract_tokens(annotation, triple['object'], triple['subject'])
        if sent_no <= -1:
            continue

        first_surface = '_'.join(triple['object']['surfaceform'].split())
        second_surface = '_'.join(triple['subject']['surfaceform'].split())

        sentence = annotation[sent_no]
        sentence_tokens = sentence['tokens']
        words_between = _get_bow_between(sentence_tokens, span1, span2)
        dep_nodes = _get_shortest_path(sentence['enhancedPlusPlusDependencies'], span1, span2)
        trigger_lemmas = _get_trigger(sentence_tokens, dep_nodes)
        tags_between = _get_pos_between(sentence_tokens, span1, span2)
        first_type = _get_entity_type(sentence_tokens, span1)
        second_type = _get_entity_type(sentence_tokens, span2)
        dep_words = _get_words_dep(sentence_tokens, dep_nodes)

        record = {
            '_docid': doc_key,
            '_tok1': span1,
            '_tok2': span2,
            '_sent_id': triple['sentence_id'],
            '_sentence': sent_no,
            '_dep_path': dep_nodes,
            ## Titov features
            'bow': words_between,
            'e1': first_surface,
            'e2': second_surface,
            'trigger': trigger_lemmas,
            'pos': tags_between,
            'pairtype': first_type + '_' + second_type,
            'e1type': first_type,
            'e2type': second_type,
            'path': dep_words,
            'relation': triple['predicate']['uri'],
        }
        records.append(record)

    return records

In [None]:
def extract_triples(data_chunk, annot_chunk):
    """
    Extracts the feature records of all annotated documents of a chunk.
        Args:
            data_chunk(pd.DataFrame): documents, one per row, with a 'docid' column
            annot_chunk(dict): annotations keyed by the last '/'-separated part of the docid
        Returns:
            pd.DataFrame with one row per (e1, e2) pair; an empty frame when no
            record could be produced
    """
    records = []

    for _, row in data_chunk.iterrows():
        annotation = annot_chunk.get(row['docid'].split('/')[-1])
        if annotation:
            records.extend(process_document(row, annotation))

    frame = pd.DataFrame(records)
    if frame.empty:
        # without records there are no 'e1'/'e2' columns and drop_duplicates would raise
        return frame
    return frame.drop_duplicates(['e1', 'e2'])

In [None]:
def process_file(dataset_file):
    """
    Extracts the features of one dataset file and stores them as a pickled DataFrame.
        Args:
            dataset_file(str): path of a JSON dataset file below `trex_path`; the
                annotations are read from the same path with `trex_path` replaced by
                `annot_path`, the result is written to the path with `trex_path`
                replaced by `data_path` (the file name is kept as it is)
    """
    data_chunk = pd.read_json(dataset_file)
    with open(dataset_file.replace(trex_path, annot_path), 'r') as annot_in:
        annot_chunk = json.load(annot_in)

    features = extract_triples(data_chunk, annot_chunk)
    # an empty result has no columns at all, so only filter when the column is there
    if '_sentence' in features.columns:
        features = features[features['_sentence'] > -1]  # filter entities not given in the same sentence
    features.to_pickle(dataset_file.replace(trex_path, data_path))

In [None]:
%%time
from multiprocessing import Pool
import glob
import os

data_path = 'final_data_ra'
! mkdir $data_path

pool = Pool(4)
files = glob.glob(os.path.join(trex_path, '*.json'))

In [None]:
%%time
from multiprocessing import Pool

data_path = 'final_data_ra'
! mkdir $data_path

pool = Pool(4)
files = glob.glob(os.path.join(trex_path, '*.json'))
pool.map(process_file, files)

In [None]:
process_file(files[0])

In [None]:
annot_chunk = json.load(open(files[0].replace(trex_path, annot_path), 'r'))

In [None]:
annot_chunk.keys()

In [None]:
annot_chunk['Q15052'][0].keys()

In [None]:
annot_chunk['Q15052'][1]['openie']

In [None]:
data_chunk.head()

In [None]:
data_chunk.iloc[0].triples

In [None]:
for file in files:
    data_chunk = pd.read_json(file)
    data_chunk = data_chunk[data_chunk.text.str.contains("Yves Klein")]
    if data_chunk.shape[0] != 0:
        print(file)
        print(data_chunk)

In [None]:
def get_relations(path):
    """
    Gathers the 'relation' column of every pickled DataFrame matching `path + '*.json'`.
        Args:
            path(str): prefix of the files to read (a directory name ending with a separator)
        Returns:
            pd.Series with the relations of all files, in the order the files were listed
    """
    collected = []
    for pickled_file in glob.glob(path + '*.json'):
        frame = pd.read_pickle(pickled_file)
        collected.extend(frame['relation'].values.tolist())
    return pd.Series(collected)

In [None]:
path = 'final_data_ra/'
result = []

all_files = glob.glob(path + '*.json')    
for file in all_files:
    df = pd.read_pickle(file)
    break

In [None]:
df.head()

In [None]:
rels = get_relations('final_data_ra/')

In [None]:
rels.unique().shape

In [None]:
rels.value_counts()[:100]

In [None]:
rels.unique().shape

In [None]:
rels.value_counts()[50:]

In [None]:
def convert_yao_like(path, of):
    """
    Converts the pickled feature frames into tab-separated train/dev/test files.
        Args:
            path(str): prefix of the pickled frames; all files matching `path + '*.json'` are read
            of(str): prefix of the output files `<of>_train.csv`, `<of>_dev.csv`, `<of>_test.csv`
    The files are split 60% / 20% / 20% (by file, not by row). Columns whose name
    starts with '_' are not written; the CSV files have no header and no index.
    """
    # repeated tags of multi-token entities are reduced to a single tag
    collapsed_types = {
        'PERSON_PERSON': 'PERSON',
        'ORGANIZATION_ORGANIZATION_ORGANIZATION': 'ORGANIZATION',
        'ORGANIZATION_ORGANIZATION': 'ORGANIZATION',
    }

    # glob returns the files in arbitrary order; sorting makes the split reproducible
    all_files = sorted(glob.glob(path + '*.json'))
    _train = int(len(all_files) * 0.6)
    _dev = (len(all_files) - _train) // 2

    frames = []
    for file in tqdm(all_files):
        df = pd.read_pickle(file)
        # Only the entity type columns are rewritten. Replacing over the whole frame
        # also turned the pair type of two persons ('PERSON_PERSON') into 'PERSON'.
        type_columns = [column for column in ('e1type', 'e2type') if column in df.columns]
        for column in type_columns:
            df[column] = df[column].replace(collapsed_types)
        if len(type_columns) == 2 and 'pairtype' in df.columns:
            # keep the pair type in line with the rewritten entity types
            df['pairtype'] = df['e1type'] + '_' + df['e2type']
        frames.append(df[[key for key in df.keys() if key[0] != '_']])

    splits = (
        ('_train.csv', frames[:_train]),
        ('_dev.csv', frames[_train:_train + _dev]),
        ('_test.csv', frames[_train + _dev:]),
    )
    for suffix, chunk in splits:
        # pd.concat refuses an empty list, which happens when there are only a few files
        split = pd.concat(chunk) if chunk else pd.DataFrame()
        split.to_csv(of + suffix, sep='\t', index=None, header=False)

In [None]:
convert_yao_like('final_data_ra/', 'trex_ra')

In [None]:
! head -1000 trex_ra_train.csv >> data-sample.csv