# Prepare feature rich dataset ``data/dataset.pkl`` out of corenlp annotations

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
! pip install wget

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import os
from pathlib import Path
import wget

W2V_MODEL_PATH ='models/'
W2V_MODEL_NAME = 'GoogleNews-vectors-negative300.bin.gz'  # 1.6G

directory = os.path.dirname(W2V_MODEL_PATH)
if not Path(directory).is_dir():
    print(f'Creating directory at {directory}',
          ' for saving word2vec pre-trained model')
    os.makedirs(directory)
if not Path(W2V_MODEL_PATH).is_file():
    w2v_archive = os.path.join(directory, W2V_MODEL_NAME)
    if not Path(w2v_archive).is_file():
        url = f'https://s3.amazonaws.com/dl4j-distribution/{W2V_MODEL_NAME}'
        print(f'Downloading word2vec pre-trained model to {w2v_archive}')
        wget.download(url, os.path.join(directory, W2V_MODEL_NAME))

In [4]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

if W2V_MODEL_NAME[-4:] in ['.vec', '.bin']:
    word2vec_model = KeyedVectors.load_word2vec_format(W2V_MODEL_PATH + W2V_MODEL_NAME,
                                                       binary=W2V_MODEL_NAME[-4:] == '.bin')
elif W2V_MODEL_NAME[-7:] == '.bin.gz':
    word2vec_model = KeyedVectors.load_word2vec_format(W2V_MODEL_PATH + W2V_MODEL_NAME, binary=True)
    
else:
    word2vec_model = Word2Vec.load(W2V_MODEL_PATH + W2V_MODEL_NAME)
    
word2vec_vector_length = len(word2vec_model.wv.get_vector('tree'))

  del sys.path[0]


In [5]:
! pip install iteration_utilities

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [6]:
_ner_kinds = ['TITLE', 'COUNTRY', 'DATE', 'PERSON', 'ORGANIZATION', 'MISC',
       'LOCATION', 'NUMBER', 'CAUSE_OF_DEATH', 'NATIONALITY', 'ORDINAL',
       'DURATION', 'CRIMINAL_CHARGE', 'CITY', 'RELIGION',
       'STATE_OR_PROVINCE', 'IDEOLOGY', 'SET', 'URL', 'PERCENT', 'TIME',
       'MONEY', 'HANDLE']

In [259]:
import networkx as nx
import multiprocessing
import numpy as np
from iteration_utilities import unique_everseen


def _extract_plain_features(document):
    
    def _extract(sentence):
        
        def get_postags_sequence(span, words):
            columns = ['JJ', 'CD', 'VBD', '', 'RB', 'VBN', 'PRP', 'IN', 'VBP', 'TO', 'NNP', 'VB', 
                       'VBZ', 'VBG', 'POS', 'NNS', 'NN', 'MD']
        
            sequence = [token['pos'] for token in sentence['tokens'][span[0]:span[1]] 
                         if token['originalText'] in words][:3]

            sequence = [[int(column == postag) for column in columns] for postag in sequence]
            
            result = np.zeros((3, len(columns)))
            
            if sequence:
                result[:len(sequence)] = sequence
            
            return result
        
        def get_ner_occurrences(span, words, obj=True):
            _ner_kinds = ['TITLE', 'COUNTRY', 'DATE', 'PERSON', 'ORGANIZATION', 'MISC',
                           'LOCATION', 'NUMBER', 'CAUSE_OF_DEATH', 'NATIONALITY', 'ORDINAL',
                           'DURATION', 'CRIMINAL_CHARGE', 'CITY', 'RELIGION',
                           'STATE_OR_PROVINCE', 'IDEOLOGY', 'SET', 'URL', 'PERCENT', 'TIME',
                           'MONEY', 'HANDLE']
            
            mentions = [token['ner'] for token in sentence['tokens'][span[0]:span[1]] 
                                 if token['originalText'] in words]
            
            mentions = [[int(_ner_kind == mention) for _ner_kind in _ner_kinds] for mention in mentions][:3]
            result = np.zeros((3, len(_ner_kinds)))
            
            if mentions:
                result[:len(mentions)] = mentions

            return result

        def tag_lemma(span, words, tag=False):
            if tag:
                return [token['lemma'].lower() + '_' + _penn_tagset[token['pos']]['fPOS'] for token in sentence['tokens'][span[0]:span[1]] 
                                 if token['originalText'] in words]
            else:
                return [token['lemma'].lower() for token in sentence['tokens'][span[0]:span[1]] 
                                 if token['originalText'] in words]

        def remove_repetition(words):
            if words[:len(words)//2] == words[len(words)//2:]:
                return words[:len(words)//2]
            return words
        
        def get_tokens(words, span):            
            return [token['originalText'].lower() for token in sentence['tokens'][span[0]:span[1]]
                                if token['originalText'] in words]
            
        def _build_dep_path(dependencies, tokens, start: int, end: int):
            edges = []
            deps = {}

            for edge in dependencies:
                edges.append((edge['governor'], edge['dependent']))
                deps[(min(edge['governor'], edge['dependent']),
                      max(edge['governor'], edge['dependent']))] = edge

            graph = nx.Graph(edges)
            path = nx.shortest_path(graph, source=start, target=end)
            return path[:-1]  # exclude right end

        def _tokens_by_index(indexes, tokens):
            return [token['originalText'] for token in tokens if token['index'] in indexes]

        def _lemmas_by_index(indexes, tokens):
            return [token['lemma'].lower() for token in tokens if token['index'] in indexes]
        
        def _embed(placeholder, words):
            for j in range(len(words)):
                if j == len(placeholder):
                    break

                word = words[j]
                if word and word in word2vec_model:
                    placeholder[j, :] = word2vec_model[word]
            return placeholder

        def _embed_arg(row):
            result = []
            result.append(_embed(np.zeros((3, word2vec_vector_length)), row['lemmas']))

            return result

        deprecated = set(['one', 'he', 'she', 'they', 'his', 'her', 'its', 'our', 'day', 'co.', 'inc.', 
              'society', 'people', 'inventor', 'head', 'poet', 'doctor', 'teacher', 'inventor', 
              'thanksgiving day', 'halloween',
              'sales person', 'model', 'board', 'technology', 'owner', 'one', 'two'])
        
        triplets = sentence['openie']
        #filtered_triplets = filter(lambda obj: obj['object'].lower() not in deprecated and obj['subject'].lower() not in deprecated, triplets)
        filtered_triplets = filter(lambda obj: obj['subject'].lower() not in deprecated, triplets)
        filtered_triplets = filter(lambda obj: len(obj['object']) > 2 or len(obj['subject']) > 2, filtered_triplets)
        filtered_triplets = filter(lambda obj: len(obj['relation'].split()) < 4, filtered_triplets)
        filtered_triplets = list(filtered_triplets)
        
        subjects, relations, objects, dep_path = [], [], [], []
        
        for triplet in filtered_triplets:

            _subject = {
                'tokens': get_tokens(triplet['subject'], triplet['subjectSpan']),
                'lemmas': tag_lemma(triplet['subjectSpan'], triplet['subject']),
                'dist_to_rel': triplet['relationSpan'][0] - triplet['subjectSpan'][0],
                'rel_pos': triplet['subjectSpan'][0] / len(sentence['tokens']),
                'ner': get_ner_occurrences(triplet['subjectSpan'], triplet['subject']),
                'postag': get_postags_sequence(triplet['subjectSpan'], triplet['subject']),
            }
            _subject.update({
                'w2v': _embed(np.zeros((3, word2vec_vector_length)), _subject['lemmas']),
            })
            
            _relation = {
                'tokens': get_tokens(triplet['relation'], triplet['relationSpan']),
                'lemmas': tag_lemma(triplet['relationSpan'], triplet['relation']),
                'dist_to_rel': 0,
                'rel_pos': triplet['relationSpan'][0] / len(sentence['tokens']),
                'ner': get_ner_occurrences(triplet['relationSpan'], triplet['relation']),
                'postag': get_postags_sequence(triplet['relationSpan'], triplet['relation'])
            }
            _relation.update({
                'w2v': _embed(np.zeros((3, word2vec_vector_length)), _relation['lemmas']),
            })
            
            _object = {
                'tokens': get_tokens(triplet['object'], triplet['objectSpan']),
                'lemmas': tag_lemma(triplet['objectSpan'], triplet['object']),
                'dist_to_rel': triplet['relationSpan'][0] - triplet['objectSpan'][0],
                'rel_pos': triplet['objectSpan'][0] / len(sentence['tokens']),
                'ner': get_ner_occurrences(triplet['objectSpan'], triplet['object']),
                'postag': get_postags_sequence(triplet['objectSpan'], triplet['object'])
            }
            _object.update({
                'w2v': _embed(np.zeros((3, word2vec_vector_length)), _object['lemmas']),
            })
            
            _dependency_path = ' '.join(_lemmas_by_index(_build_dep_path(sentence['basicDependencies'], 
                                                          sentence['tokens'], 
                                                          triplet['subjectSpan'][0], 
                                                          triplet['objectSpan'][-1]), sentence['tokens']))
            subjects.append(_subject)
            relations.append(_relation)
            objects.append(_object)
            dep_path.append(_dependency_path)
            
        #return pd.DataFrame(result, columns=header)
        return subjects, relations, objects
    
    subjects, relations, objects = [], [], []
    for sentence in document:
        _subject, _relation, _object = _extract(sentence)
        subjects += _subject
        relations += _relation
        objects += _object

    
    return subjects, relations, objects

def _mark_ner_object(row):
    return row['relation'] + (row['DATE_obj'] == 1) * ' date'\
                           + (row['LOCATION_obj'] == 1) * ' location'
    
def _extract_features(document):
    def _embed_arg(row):
        result = []
        result.append(_embed(np.zeros((3, word2vec_vector_length)), row['lemmas']))
            
        return result
    
    features = {}
    features['subject'], features['relation'], features['object'] = _extract_plain_features(document[1])

    #features.insert(loc=0, column='docid', value=document[0])
    #max_len = {'obj': 3, 'rel': 3, 'subj': 3}
    #features['w2v_subj'] = list(map(_embed_arg, features['subject']))
    #features['w2v_rel'] = list(map(_embed_arg, features['relation']))
    #features['w2v_obj'] = list(map(_embed_arg, features['object']))
    
    #print(features['w2v_subj'])
    
    #features['w2v_subj'] = features['subject'].map(lambda words: _embed(np.zeros((max_len['subj'], word2vec_vector_length)), words.lower().split()))
    #features['w2v_rel'] = features['relation'].map(lambda words: _embed(np.zeros((max_len['rel'], word2vec_vector_length)), words.lower().split()))
    #features['w2v_obj'] = features['object'].map(lambda words: _embed(np.zeros((max_len['obj'], word2vec_vector_length)), words.lower().split()))
       
    return pd.DataFrame(features)


def remove_repetitions(annot):
    for i in range(len(annot)):
        for j in range(len(annot[i])):
            annot[i][j]['openie'] = list(unique_everseen(annot[i][j]['openie']))
    return annot

                                           
class FeaturesProcessor:
    
    def __init__(self):
        self.pool = multiprocessing.Pool(processes=1)
                 
    def __call__(self, data):
        features = pd.concat(self.pool.map(_extract_features, data))
        return features

In [277]:
from glob import glob
from tqdm import tqdm_notebook as tqdm
import pandas as pd

#DATA_PATH = 'data/corenlp_annotations_ner_pairs'  #'data/filtered_annotations'
DATA_PATH = 'data/corenlp_annotations_only_ner'
RESULT_PATH = 'data/processed_separately'
! mkdir $RESULT_PATH 
result = []
extr = FeaturesProcessor()

def extract_matrix(row):
    _matrix = np.concatenate([row['ner'], row['postag'], row['w2v'], np.array([[row['dist_to_rel'], row['rel_pos']]] * 3)], axis=1)
    return _matrix

for file in tqdm(glob(DATA_PATH + '/*.json')):
    tmp = pd.read_json(file)
    tmp = tmp[tmp.loc[:, 1].map(len) > 0]
    tmp[1] = remove_repetitions(tmp[1].values)
    result = extr(tmp[[0, 1]].values)
    result['subject_matr'] = result.subject.map(extract_matrix)
    result['object_matr'] = result.object.map(extract_matrix)
    result['relation_matr'] = result.relation.map(extract_matrix)
    result.to_pickle(file.replace(DATA_PATH, RESULT_PATH).replace('.json', '.pkl'))

mkdir: cannot create directory ‘data/processed_separately’: File exists


HBox(children=(IntProgress(value=0, max=92), HTML(value='')))




In [278]:
result.head()

Unnamed: 0,subject,relation,object,subject_matr,object_matr,relation_matr
0,"{'tokens': ['eisen'], 'lemmas': ['eisen'], 'di...","{'tokens': ['is'], 'lemmas': ['be'], 'dist_to_...","{'tokens': ['american'], 'lemmas': ['american'...","[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,"{'tokens': ['eisen'], 'lemmas': ['eisen'], 'di...","{'tokens': ['was'], 'lemmas': ['be'], 'dist_to...","{'tokens': ['old'], 'lemmas': ['old'], 'dist_t...","[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,"{'tokens': ['eisen'], 'lemmas': ['eisen'], 'di...","{'tokens': ['graduated'], 'lemmas': ['graduate...","{'tokens': ['1985'], 'lemmas': ['1985'], 'dist...","[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,"{'tokens': ['eisen'], 'lemmas': ['eisen'], 'di...","{'tokens': ['lived', 'in'], 'lemmas': ['live',...","{'tokens': ['tennessee'], 'lemmas': ['tennesse...","[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
0,"{'tokens': ['radical', 'entertainment', 'inc.'...","{'tokens': ['is'], 'lemmas': ['be'], 'dist_to_...","{'tokens': ['video', 'game', 'developer'], 'le...","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


#### Collect to one file 

In [279]:
! mv processed_ data/processed

mv: cannot stat 'processed_': No such file or directory


In [280]:
! ls -laht data/processed

total 687M
drwxr-xr-x 10 root root 4.0K Oct 22 14:37 ..
-rw-r--r--  1 root root 4.1M Oct  1 12:11 it_wiki_part_27.pkl
-rw-r--r--  1 root root 9.8M Oct  1 12:11 it_wiki_part_72.pkl
-rw-r--r--  1 root root  24M Oct  1 12:11 it_wiki_part_2.pkl
-rw-r--r--  1 root root 7.5M Oct  1 12:11 it_wiki_part_57.pkl
-rw-r--r--  1 root root 6.1M Oct  1 12:11 it_wiki_part_73.pkl
-rw-r--r--  1 root root 3.4M Oct  1 12:11 it_wiki_part_40.pkl
-rw-r--r--  1 root root 7.0M Oct  1 12:11 it_wiki_part_37.pkl
-rw-r--r--  1 root root 9.3M Oct  1 12:11 it_wiki_part_50.pkl
-rw-r--r--  1 root root 7.5M Oct  1 12:11 it_wiki_part_81.pkl
-rw-r--r--  1 root root 6.0M Oct  1 12:10 it_wiki_part_79.pkl
-rw-r--r--  1 root root 7.9M Oct  1 12:10 it_wiki_part_47.pkl
-rw-r--r--  1 root root 7.0M Oct  1 12:10 it_wiki_part_35.pkl
-rw-r--r--  1 root root 6.5M Oct  1 12:10 it_wiki_part_70.pkl
-rw-r--r--  1 root root  14M Oct  1 12:10 it_wiki_part_76.pkl
-rw-r--r--  1 root root 6.1M Oct  1 12:10 it_wiki_part_25.pkl

In [281]:
from glob import glob
from tqdm import tqdm_notebook as tqdm
import pandas as pd

DATA_PATH = 'data/processed_separately'

result = []
for file in tqdm(glob(DATA_PATH + '/*.pkl')):
    result.append(pd.read_pickle(file))
    
result = pd.concat(result)

HBox(children=(IntProgress(value=0, max=92), HTML(value='')))




In [282]:
result.shape

(32419, 6)