# Prepare the feature-rich dataset ``data/dataset.pkl`` from CoreNLP annotations

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
import wget

# Directory (with trailing slash, later cells concatenate it with the file
# name) and file name of the pre-trained word2vec archive.
W2V_MODEL_PATH = 'models/'
W2V_MODEL_NAME = 'GoogleNews-vectors-negative300.bin.gz'

directory = os.path.dirname(W2V_MODEL_PATH)
if not Path(directory).is_dir():
    print(f'Creating directory at {directory}',
          ' for saving word2vec pre-trained model')
    os.makedirs(directory)

# Download the archive only when it is not on disk yet.  The former outer
# guard `Path(W2V_MODEL_PATH).is_file()` tested the *directory* path, which is
# never a regular file, so it was always true and only obscured this check.
w2v_archive = os.path.join(directory, W2V_MODEL_NAME)
if not Path(w2v_archive).is_file():
    url = f'https://s3.amazonaws.com/dl4j-distribution/{W2V_MODEL_NAME}'
    print(f'Downloading word2vec pre-trained model to {w2v_archive}')
    wget.download(url, w2v_archive)

In [None]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

# Full path of the embedding file (W2V_MODEL_PATH already ends with a slash).
w2v_model_file = W2V_MODEL_PATH + W2V_MODEL_NAME

if W2V_MODEL_NAME.endswith(('.vec', '.bin')):
    # Plain word2vec format: text for '.vec', binary for '.bin'.
    word2vec_model = KeyedVectors.load_word2vec_format(w2v_model_file,
                                                       binary=W2V_MODEL_NAME.endswith('.bin'))
elif W2V_MODEL_NAME.endswith('.bin.gz'):
    # Gzipped binary word2vec format.
    word2vec_model = KeyedVectors.load_word2vec_format(w2v_model_file, binary=True)
else:
    # Anything else is treated as a model saved by gensim.  Keep only its
    # KeyedVectors so every branch yields the same kind of object and
    # `word in word2vec_model` / `word2vec_model[word]` behave uniformly.
    word2vec_model = Word2Vec.load(w2v_model_file).wv

# Dimensionality of a single word vector.  `vector_size` avoids both the
# dependency on the word 'tree' being in the vocabulary and the `.wv`
# attribute, which KeyedVectors only exposed in gensim 3.x.
word2vec_vector_length = word2vec_model.vector_size

In [None]:
import networkx as nx
import multiprocessing
import numpy as np
from iteration_utilities import unique_everseen


def _extract_plain_features(sentence):
    
    # postag sequences occuring in dataset more than 20 times
    cols_postag_subj = ['VBN~IN~NNP~IN~CD', 'VBN~IN~NNP', 'VBN~IN~CD', 'NN~NN~NN~VBN~IN~NNP', 'NNP', 'NNP~NNP~NNP~NNP', 'NN', 'JJ~NN~NN', 'JJ~CD', 'NN~CD', 'NNP~NNPS', 'NNP~NNP~NNS', 'JJ~NNP~NNP', 'RB~VBN', 'PRP', 'CD', 'NNP~NNP~IN~NNP~NNP', 'JJ~NN', 'NN~IN~NN~NN', 'CD~NNS', 'NN~IN~NNP', 'NN~NN', 'NN~NN~NN', 'JJ~NN~NN~NN', 'JJ~NN~VBN~IN~NNP', 'NN~VBN~IN~NNP', 'NNP~CD', 'NN~NN~VBN~IN~NN~NNP', 'NN~NN~VBN~IN~JJ~NN~NNP', 'NNP~NNP~NNP', 'JJ~NNS', 'NNP~NNP', 'NN~NN~CD', 'NNP~NN', 'NN~IN~CD', 'JJ~NN~NN~VBN~IN~CD', 'NN~NN~VBN~IN~CD', 'CD~NNP~CD', 'NN~NN~VBN~IN~NNP~NNP~NNP', 'NN~IN~CD~NNS', 'NNP~NNP~NN~NN', 'JJ', 'NNP~POS~NN', 'PRP$~NN~NN', 'JJ~NN~NN~NN~VBN', 'NN~NN~NN~VBN', 'NNP~NN~NN', 'JJ~NNP', 'NNP~NNP~IN~NNP', 'PRP$~JJ~NN', 'PRP$~NN', 'VBN~IN~CD~IN~NNP~NNP', 'JJ~JJ', 'PRP$~CD~NN', 'NNP~NN~VBN', 'NN~IN~NNP~NNPS', 'NN~IN~NNP~NNP', 'CD~CD~NNS', 'JJ~NN~IN~NN', 'NN~NN~NN~CD', 'RB~NN', 'VBN~IN~NNP~NNP~IN~CD', 'CD~NN', 'NN~IN~NNP~NNP~NNP', 'JJ~NN~VBN~IN~NN', 'JJ~NN~VBN', 'JJ~JJ~NN', 'JJ~JJ~NN~VBN', 'PRP$~NNS', 'CD~NNS~IN~NN', 'CD~JJ~NNS', 'NN~VBN~IN~NNP~NNP~NNP', 'JJ~NN~IN~NNP~NNP', 'NN~NNP~NNP', 'JJ~NN~NNP~NNP', 'NNS', 'NN~NN~NN~NN', 'NN~NN~IN~NN', 'NNP~IN~NNP', 'NNP~NNS', 'JJ~JJ~NN~NN', 'CD~NN~IN~NN', 'NN~CD~CD', 'NN~NN~VBN~IN~NNP', 'JJ~NN~NN~VBN~IN~NNP', 'NNP~NNP~NN', 'JJ~NN~NN~NN~CD', 'NNP~CD~NN', 'JJ~NN~NN~NN~VBN~IN~NNP', 'JJ~JJ~NN~VBN~IN~NNP', 'JJ~NN~NNS', 'NN~VBN~IN~NNP~NNP', 'JJ~NN~VBN~IN~NNP~NNP', 'RB~NNP', 'NNP~NNP~CD', 'NNS~IN~NNP', 'NN~NN~VBN', 'CD~NNS~IN~NNS', 'CD~NN~NNS', 'NN~JJ~NN', 'NNS~IN~NNP~NNP', 'NN~IN~JJ~NN', 'NNP~NNP~NNP~IN~NNP', 'NN~NNS', 'JJ~NNP~NN', 'JJ~NN~IN~NNS', 'NN~NN~IN~NNP', 'NN~IN~PRP$~NN', 'NN~IN~NN', 'JJ~VBN', 'CD~NN~NN', 'PRP$~NN~IN~NN', 'PRP$~NN~JJ~NN', 'JJ~NN~NN~VBN', 'JJ~JJ~JJ~NN', 'VBN~IN~CD~IN~NN', 'CD~TO~CD', 'NNS~NN', 'CD~NNS~VBN', 'NN~VBN', 'NNP~IN~NNP~NNP', 'NN~NNP', 'JJ~NN~NNP', 'VBN~IN~CD~TO~CD', 'JJ~NN~VBN~IN~CD', 'NN~VBN~IN~CD', 'JJ~NN~IN~NNP', 'JJ~NN~IN~JJ~NN', 'JJ~JJ~NN~NN~VBN', 'VBN~IN~NNP~NNP', 'NN~NN~IN~NNP~NNP', 
'NNS~IN~NN', 'JJ~NNS~IN~NNP', 'NN~IN~NN~IN~NNP', 'NN~NN~VBN~IN~NNP~NNP', 'JJ~NN~NN~VBN~IN~NNP~NNP', 'NN~IN~JJ~NNS', 'NN~NN~CD~NN', 'NN~IN~CD~JJ~NNS', 'JJ~JJ~NNS', 'CD~JJ~NN', 'PRP$~JJ~NN~IN~NN', 'PRP$~NN~NNS', 'VBN~IN~NNP~IN~CD~TO~CD']
    cols_postag_obj = ['VBN~IN~NNP~IN~CD', 'VBN~IN~NNP', 'VBN~IN~CD', 'NN~NN~NN~VBN~IN~NNP', 'NNP', 'NNP~NNP~NNP~NNP', 'NN', 'JJ~NN~NN', 'JJ~CD', 'NN~CD', 'NNP~NNPS', 'NNP~NNP~NNS', 'JJ~NNP~NNP', 'RB~VBN', 'PRP', 'CD', 'NNP~NNP~IN~NNP~NNP', 'JJ~NN', 'NN~IN~NN~NN', 'CD~NNS', 'NN~IN~NNP', 'NN~NN', 'NN~NN~NN', 'JJ~NN~NN~NN', 'JJ~NN~VBN~IN~NNP', 'NN~VBN~IN~NNP', 'NNP~CD', 'NN~NN~VBN~IN~NN~NNP', 'NN~NN~VBN~IN~JJ~NN~NNP', 'NNP~NNP~NNP', 'JJ~NNS', 'NNP~NNP', 'NN~NN~CD', 'NNP~NN', 'NN~IN~CD', 'JJ~NN~NN~VBN~IN~CD', 'NN~NN~VBN~IN~CD', 'CD~NNP~CD', 'NN~NN~VBN~IN~NNP~NNP~NNP', 'NN~IN~CD~NNS', 'NNP~NNP~NN~NN', 'JJ', 'NNP~POS~NN', 'PRP$~NN~NN', 'JJ~NN~NN~NN~VBN', 'NN~NN~NN~VBN', 'NNP~NN~NN', 'JJ~NNP', 'NNP~NNP~IN~NNP', 'PRP$~JJ~NN', 'PRP$~NN', 'VBN~IN~CD~IN~NNP~NNP', 'JJ~JJ', 'PRP$~CD~NN', 'NNP~NN~VBN', 'NN~IN~NNP~NNPS', 'NN~IN~NNP~NNP', 'CD~CD~NNS', 'JJ~NN~IN~NN', 'NN~NN~NN~CD', 'RB~NN', 'VBN~IN~NNP~NNP~IN~CD', 'CD~NN', 'NN~IN~NNP~NNP~NNP', 'JJ~NN~VBN~IN~NN', 'JJ~NN~VBN', 'JJ~JJ~NN', 'JJ~JJ~NN~VBN', 'PRP$~NNS', 'CD~NNS~IN~NN', 'CD~JJ~NNS', 'NN~VBN~IN~NNP~NNP~NNP', 'JJ~NN~IN~NNP~NNP', 'NN~NNP~NNP', 'JJ~NN~NNP~NNP', 'NNS', 'NN~NN~NN~NN', 'NN~NN~IN~NN', 'NNP~IN~NNP', 'NNP~NNS', 'JJ~JJ~NN~NN', 'CD~NN~IN~NN', 'NN~CD~CD', 'NN~NN~VBN~IN~NNP', 'JJ~NN~NN~VBN~IN~NNP', 'NNP~NNP~NN', 'JJ~NN~NN~NN~CD', 'NNP~CD~NN', 'JJ~NN~NN~NN~VBN~IN~NNP', 'JJ~JJ~NN~VBN~IN~NNP', 'JJ~NN~NNS', 'NN~VBN~IN~NNP~NNP', 'JJ~NN~VBN~IN~NNP~NNP', 'RB~NNP', 'NNP~NNP~CD', 'NNS~IN~NNP', 'NN~NN~VBN', 'CD~NNS~IN~NNS', 'CD~NN~NNS', 'NN~JJ~NN', 'NNS~IN~NNP~NNP', 'NN~IN~JJ~NN', 'NNP~NNP~NNP~IN~NNP', 'NN~NNS', 'JJ~NNP~NN', 'JJ~NN~IN~NNS', 'NN~NN~IN~NNP', 'NN~IN~PRP$~NN', 'NN~IN~NN', 'JJ~VBN', 'CD~NN~NN', 'PRP$~NN~IN~NN', 'PRP$~NN~JJ~NN', 'JJ~NN~NN~VBN', 'JJ~JJ~JJ~NN', 'VBN~IN~CD~IN~NN', 'CD~TO~CD', 'NNS~NN', 'CD~NNS~VBN', 'NN~VBN', 'NNP~IN~NNP~NNP', 'NN~NNP', 'JJ~NN~NNP', 'VBN~IN~CD~TO~CD', 'JJ~NN~VBN~IN~CD', 'NN~VBN~IN~CD', 'JJ~NN~IN~NNP', 'JJ~NN~IN~JJ~NN', 'JJ~JJ~NN~NN~VBN', 'VBN~IN~NNP~NNP', 'NN~NN~IN~NNP~NNP', 
'NNS~IN~NN', 'JJ~NNS~IN~NNP', 'NN~IN~NN~IN~NNP', 'NN~NN~VBN~IN~NNP~NNP', 'JJ~NN~NN~VBN~IN~NNP~NNP', 'NN~IN~JJ~NNS', 'NN~NN~CD~NN', 'NN~IN~CD~JJ~NNS', 'JJ~JJ~NNS', 'CD~JJ~NN', 'PRP$~JJ~NN~IN~NN', 'PRP$~NN~NNS', 'VBN~IN~NNP~IN~CD~TO~CD']
    cols_postag_rel = ['VBZ', 'VBZ~DT~JJ~NN', 'VBD~RB~VBN~IN', 'VBD~VBN~IN', 'MD~VB', 'MD~RB~VB', 'VBD~NNS~IN', 'VBD', 'NN~IN', 'VBN', 'VBN~IN', 'IN', 'VB', 'VBZ~RB~TO', 'VBZ~TO', 'RB~VBN~IN', 'VBD~RB~VBN', 'VBD~VBN', 'VBG~IN', 'VB~NN', 'VBD~RB~VBN~TO', 'VB~NN~IN', 'VBZ~VBN', 'VBD~TO', 'VBZ~IN', 'VBD~IN', 'VBZ~NN~NN', 'VBG', 'POS~NNP', 'NN~NN', 'VBZ~NN', 'VBZ~NN~IN', '', 'VBZ~VBN~IN', 'VBZ~JJ~TO', 'POS', 'VBD~NN', 'VBD~NN~IN', 'RB', 'VBP', 'VBZ~DT~NN~IN', 'VBD~JJ~NN~IN', 'VBZ~JJ~NN', 'VBZ~JJ~NN~IN', 'POS~NN', 'MD', 'CD~IN', 'VB~NNS~IN', 'NN', 'NNP~IN', 'VBD~VBN~TO', 'VBG~NN', 'VBD~NNS', 'RB~VBD', 'VBP~RB~VBN', 'VBP~VBN', 'RB~VBZ', 'NNS~IN', 'JJ~NNS~IN', 'RB~VBZ~IN', 'VBP~JJ~NNS', 'NNS', 'VBZ~RB~VBN', 'VBN~TO', 'JJ~NN', 'VBD~DT~JJ~NN', 'JJ~IN', 'RB~VBN', 'VB~NNS', 'VBZ~NN~NNS', 'VBZ~RB~VBN~IN', 'VBG~TO', 'NN~NN~NN', 'VBZ~JJ~IN', 'VBD~JJ~IN', 'VBD~JJ', 'VB~JJ', 'VBZ~VBG', 'NN~TO', 'MD~VB~IN', 'VBD~DT~NN', 'VBP~IN', 'VBZ~NNS~IN', 'RB~VBP', 'VBD~VBG', 'RB~VBG', 'JJ~NN~IN', 'JJ~JJ~NN~IN', 'VBD~JJ~TO', 'VBN~NN~IN', 'VBD~JJ~NN', 'NN~NNS', 'VBD~NN~NN', 'VBG~NNS~IN', 'VBZ~DT~NN', 'VBD~NN~TO', 'VBD~JJ~VBN~IN', 'VBP~VBN~IN', 'VBP~NNS', 'RB~VB~NNS~IN', 'VBN~NN', 'RB~VB', 'VB~IN', 'VBZ~DT~NN~TO', 'VBN~VBN', 'VBZ~JJ', 'VBZ~NN~TO', 'VBP~VBN~TO', 'TO', 'VBD~DT~NN~NN', 'VB~TO', 'VBD~NN~NNS~IN', 'VBD~DT~NN~IN', 'VBD~NNP~IN', 'VBZ~VBN~TO', 'RB~VBD~NNS~IN', 'RB~VBD~IN', 'VBG~NN~IN', 'VBD~VBN~RP~IN', 'VBD~VBN~RP', 'VBP~JJ', 'VBD~RP', 'VBD~NN~NN~IN', 'VBZ~NN~NNS~IN', 'RB~VBZ~VBN', 'VBD~NNP', 'VBP~RB~VBN~IN', 'VBP~RB', 'VBZ~NNS', 'MD~VB~RP', 'VBP~DT~NN~IN', 'VBD~DT~NN~TO', 'JJ', 'VBD~RB', 'RB~VBP~IN', 'NNP', 'RB~VBG~IN', 'VBD~NNP~NNP', 'VBG~VBN', 'VBZ~NNP~IN', 'RB~VBD~TO', 'VBZ~RB~JJ~IN', 'VBD~NNP~NNP~NNP', 'RB~VBD~NN', 'VBZ~DT~NNP', 'VBZ~JJ~JJ~NN', 'VBP~JJ~IN', 'VBZ~VBN~NN~IN', 'RBS~VBN~IN', 'VBZ~VBN~NN', 'VBZ~NNP', 'VBG~VBN~IN', 'RBS~JJ~NN~IN', 'VBD~RB~TO', 'JJ~JJ~NN', 'MD~VB~NN', 'JJ~VBN~IN', 'VB~VBN~IN', 'VBD~RP~IN', 'VBG~NNS', 'RB~VBD~NN~NN', 'VBP~NN', 'VBZ~JJS~NN', 'VBN~VBN~IN', 
'VBG~DT~NN', 'IN~IN', 'RB~IN', 'VB~VBN', 'VBP~VBN~NN', 'JJ~TO', 'VBP~TO', 'VBD~JJ~NNS', 'VBD~RB~IN', 'DT~IN', 'VBD~NN~NNS', 'JJ~CC~JJ~NNS', 'JJR~NN', 'VBD~DT~JJ', 'VBD~RBS~JJ~NN', 'VBP~JJ~NNS~IN', 'VBZ~CD', 'VBZ~CD~IN', 'NN~NN~IN', 'VBD~RP~NN', 'VBP~NNS~IN', 'RB~VBD~RB', 'VBD~JJ~VBN', 'MD~NN~IN', 'RB~VBD~NN~IN', 'RB~NNP', 'VBG~RB~TO', 'VBZ~RBS~JJ~IN', 'VBG~NNP~NNS', 'VBZ~RB~JJ~NN', 'VBP~NN~IN', 'VBP~NNP', 'RB~VBG~RB', 'VBG~RB', 'VBP~RP~TO', 'VB~RB~RBR', 'VB~RBR', 'VBD~NN~NN~NNS', 'VBZ~VB~NN~IN', 'VBZ~VBG~NN~IN', 'RBR', 'NNS~NNP~IN', 'JJ~NNP', 'RB~JJ~NNP', 'NNP~NN~NN~IN', 'VBZ~NNS~RB~IN', 'VBG~CD~NN~IN', 'VBG~PRP$~NNS~TO', 'RBS~RB']
    
    def get_postags_sequence(span, words, columns):
        sequence = '~'.join([token['pos'] for token in sentence[0]['tokens'][span[0]:span[1]] 
                     if token['originalText'] in words])
    
        result = tuple(int(sequence == column) for column in columns)
        return result
    
    def get_before(span):
        return sentence[0]['tokens'][span[0]]['before']
    
    def get_after(span):
        return sentence[0]['tokens'][span[-1] - 1]['after']
    
    _ner_kinds = ['TITLE', 'COUNTRY', 'DATE', 'PERSON', 'ORGANIZATION', 'MISC',
       'LOCATION', 'NUMBER', 'CAUSE_OF_DEATH', 'NATIONALITY', 'ORDINAL',
       'DURATION', 'CRIMINAL_CHARGE', 'CITY', 'RELIGION',
       'STATE_OR_PROVINCE', 'IDEOLOGY', 'SET', 'URL', 'PERCENT', 'TIME',
       'MONEY', 'HANDLE']
    
    def get_ner_occurrences(span, words):
        mentions = [token['ner'] for token in sentence[0]['tokens'][span[0]:span[1]] 
                             if token['originalText'] in words]
        result = tuple(int(_ner_kind in mentions) for _ner_kind in _ner_kinds)
        return result
    
    def tag_lemma(span, words, tag=False):
        if tag:
            return [token['lemma'].lower() + '_' + _penn_tagset[token['pos']]['fPOS'] for token in sentence[0]['tokens'][span[0]:span[1]] 
                             if token['originalText'] in words]
        else:
            return [token['lemma'].lower() for token in sentence[0]['tokens'][span[0]:span[1]] 
                             if token['originalText'] in words]
    
    def remove_repetition(words):
        if words[:len(words)//2].strip() == words[len(words)//2:].strip():
            return words[:len(words)//2].strip()
        return words
    
    def _build_dep_path(dependencies, tokens, start: int, end: int):
        edges = []
        deps = {}

        for edge in dependencies:
            edges.append((edge['governor'], edge['dependent']))
            deps[(min(edge['governor'], edge['dependent']),
                  max(edge['governor'], edge['dependent']))] = edge

        graph = nx.Graph(edges)
        path = nx.shortest_path(graph, source=start, target=end)
        return path[:-1]  # exclude right end

    def _tokens_by_index(indexes, tokens):
        return [token['originalText'] for token in tokens if token['index'] in indexes]
    
    def _lemmas_by_index(indexes, tokens):
        return [token['lemma'].lower() for token in tokens if token['index'] in indexes]

    result = []
    header = ['subject', 'relation', 'object', 
              'dep_path',
              'distance_0', 'distance_1', 
              'rel_pos_subj', 'rel_pos_rel', 'rel_pos_obj',
              #'postags_subj', 'postags_rel', 'postags_obj',
              'lemma_subj', 'lemma_rel', 'lemma_obj'] +\
            [ner + '_subj' for ner in _ner_kinds] +\
            [ner + '_rel' for ner in _ner_kinds] +\
            [ner + '_obj' for ner in _ner_kinds] +\
            [col + '_subj' for col in cols_postag_subj] +\
            [col + '_obj' for col in cols_postag_obj] +\
            [col + '_rel' for col in cols_postag_rel]

    for triplet in sentence[0]['openie']:
        result.append((
            remove_repetition(triplet['subject']), 
            remove_repetition(triplet['relation']), 
            remove_repetition(triplet['object']),
            ' '.join(_lemmas_by_index(_build_dep_path(sentence[0]['basicDependencies'], 
                                                      sentence[0]['tokens'], 
                                                      triplet['subjectSpan'][0], 
                                                      triplet['objectSpan'][-1]), sentence[0]['tokens'])),
            triplet['relationSpan'][0] - triplet['subjectSpan'][0],
            triplet['objectSpan'][0] - triplet['relationSpan'][0],
            triplet['subjectSpan'][0] / len(sentence[0]['tokens']),
            triplet['relationSpan'][0] / len(sentence[0]['tokens']),
            triplet['objectSpan'][0] / len(sentence[0]['tokens']),
            tag_lemma(triplet['subjectSpan'], triplet['subject']),
            tag_lemma(triplet['relationSpan'], triplet['relation']),
            tag_lemma(triplet['objectSpan'], triplet['object']),
        ) +\
            get_ner_occurrences(triplet['subjectSpan'], triplet['subject']) +\
            get_ner_occurrences(triplet['relationSpan'], triplet['relation']) +\
            get_ner_occurrences(triplet['objectSpan'], triplet['object']) +\
            get_postags_sequence(triplet['subjectSpan'], triplet['subject'], cols_postag_subj) +\
            get_postags_sequence(triplet['objectSpan'], triplet['relation'], cols_postag_obj) +\
            get_postags_sequence(triplet['relationSpan'], triplet['object'], cols_postag_rel)
        ) 
        
    return pd.DataFrame(result, columns=header)

def _embed(placeholder, words):
    for j in range(len(words)):
        if j == len(placeholder):
            break
            
        word = words[j]
        if word and word in word2vec_model:
            placeholder[j, :] = word2vec_model[word]
    return placeholder
    
def _extract_features(sentence):
    """Return the full feature table for one ``(docid, annotation)`` pair.

    ``sentence[1]`` is passed to ``_extract_plain_features``; ``sentence[0]``
    becomes the leading ``docid`` column.  Three further columns hold, per
    row, a zero-padded matrix of word vectors built by ``_embed`` from the
    subject, relation and object lemma lists.
    """
    docid, annotation = sentence[0], sentence[1]
    features = _extract_plain_features(annotation)
    features.insert(loc=0, column='docid', value=docid)

    # (column suffix, number of rows of the embedding matrix), in the order
    # in which the columns are appended.
    for suffix, max_tokens in (('subj', 10), ('rel', 10), ('obj', 25)):
        features['w2v_' + suffix] = features['lemma_' + suffix].map(
            lambda words, rows=max_tokens: _embed(
                np.zeros((rows, word2vec_vector_length)), words))

    return features

def remove_repetitions(annot):
    """Drop duplicate OpenIE triplets from every sentence, keeping first occurrences.

    ``annot`` is a sequence of sequences of sentence dicts; each dict's
    ``'openie'`` entry is replaced in place by its de-duplicated list.
    The same ``annot`` object is returned.
    """
    for sentences in annot:
        for sentence in sentences:
            sentence['openie'] = list(unique_everseen(sentence['openie']))
    return annot

                                           
class FeaturesProcessor:
    """Callable that runs ``_extract_features`` over many documents in worker processes.

    The instance owns a ``multiprocessing.Pool``.  Release it with
    :meth:`close` or by using the instance as a context manager; otherwise
    the worker processes stay alive until the interpreter exits.
    """

    def __init__(self, processes=4):
        """Start the worker pool.

        processes: number of worker processes (defaults to the former
            hard-coded value of 4).
        """
        self.pool = multiprocessing.Pool(processes=processes)

    def __call__(self, data):
        """Return the concatenated feature tables of all items in ``data``.

        Each item is handed to ``_extract_features``.  For empty ``data`` an
        empty ``DataFrame`` is returned, because ``pd.concat`` raises
        ``ValueError`` when given no frames.
        """
        frames = self.pool.map(_extract_features, data)
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames)

    def close(self):
        """Stop accepting work and wait for the worker processes to exit."""
        self.pool.close()
        self.pool.join()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

In [None]:
from glob import glob
from tqdm import tqdm_notebook as tqdm
import pandas as pd

DATA_PATH = 'data/corenlp_annotations_ner_pairs'  #'data/filtered_annotations'
RESULT_PATH = 'data/processed'
! mkdir $RESULT_PATH 
result = []
extr = FeaturesProcessor()

for file in tqdm(glob(DATA_PATH + '/*.json')):
    tmp = pd.read_json(file)
    tmp = tmp[tmp.loc[:, 1].map(len) > 0]
    tmp[1] = remove_repetitions(tmp[1].values)
    result = extr(tmp[[0, 1]].values)
    result.to_pickle(file.replace(DATA_PATH, RESULT_PATH).replace('.json', '.pkl'))

In [10]:
# Preview the first rows of the object-dtype columns of `result`.
result.select_dtypes(include='object').head()

Unnamed: 0,subject,relation,object,dep_path,lemma_subj,lemma_rel,lemma_obj,w2v_subj,w2v_rel,w2v_obj
0,Houthi movement,is,Islamic religious-political-armed movement,,"[houthi, movement]",[be],"[islamic, religious-political-armed, movement]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[-0.228515625, -0.08837890625, 0.1279296875, ...","[[-0.158203125, 0.0361328125, 0.3515625, 0.437..."
1,Houthi movement,is,Islamic movement,,"[houthi, movement]",[be],"[islamic, movement]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[-0.228515625, -0.08837890625, 0.1279296875, ...","[[-0.158203125, 0.0361328125, 0.3515625, 0.437..."
0,CUBIT Cubit,is,computer user interface system for multi-touch...,system device design,"[cubit, cubit]",[be],"[computer, user, interface, system, for, multi...","[[0.322265625, 0.1259765625, 0.142578125, -0.0...","[[-0.228515625, -0.08837890625, 0.1279296875, ...","[[0.107421875, -0.201171875, 0.123046875, 0.21..."
1,CUBIT Cubit,is,computer user interface system for multi-touch...,system device design,"[cubit, cubit]",[be],"[computer, user, interface, system, for, multi...","[[0.322265625, 0.1259765625, 0.142578125, -0.0...","[[-0.228515625, -0.08837890625, 0.1279296875, ...","[[0.107421875, -0.201171875, 0.123046875, 0.21..."
2,CUBIT Cubit,is,computer user interface system for devices des...,system device design,"[cubit, cubit]",[be],"[computer, user, interface, system, for, devic...","[[0.322265625, 0.1259765625, 0.142578125, -0.0...","[[-0.228515625, -0.08837890625, 0.1279296875, ...","[[0.107421875, -0.201171875, 0.123046875, 0.21..."


#### Collect into one file

In [19]:
! mv data/processed/processed_ processed_

In [20]:
! rm -r data/processed

In [24]:
! mv processed_ data/processed

In [26]:
! ls data

annotations			      enwiki-latest-pages-articles.xml.bz2.1
categories.filter		      filtered_annotations
corenlp_annotations_filtered	      it_wiki_articles.json
corenlp_annotations_ner_pairs	      prepare_wiki_dataset.ipynb
corenlp_annotations_new		      processed
dataset.pkl			      sample_dataset.pkl
enwiki-latest-pages-articles.xml.bz2  stopwords.filter


In [30]:
from glob import glob
from tqdm import tqdm_notebook as tqdm
import pandas as pd

# Directory holding the per-file feature pickles.
DATA_PATH = 'data/processed'

# Load every pickle (with a progress bar) and stack them into one table.
result = pd.concat([pd.read_pickle(file)
                    for file in tqdm(glob(DATA_PATH + '/*.pkl'))])

HBox(children=(IntProgress(value=0, max=132), HTML(value='')))




In [31]:
# (rows, columns) of the merged feature table.
result.shape

(48983, 586)

In [32]:
# Persist the merged feature table as a single pickle.
result.to_pickle('data/dataset.pkl')

#### BOW for relation 

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the relation strings (at most 1000 terms that
# occur in at least 2 and at most 95% of the rows).
vectorizer_rel = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)
bow_rel = vectorizer_rel.fit_transform(result['relation'].values).toarray()

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer_rel, 'get_feature_names_out'):
    vocabulary_rel = vectorizer_rel.get_feature_names_out()
else:
    vocabulary_rel = vectorizer_rel.get_feature_names()
columns = [column + '_bow_rel' for column in vocabulary_rel]

# Append the count columns; both sides get a fresh 0..n-1 index so rows align.
features = pd.concat([result.reset_index(drop=True), pd.DataFrame(bow_rel, columns=columns)], axis=1)

In [39]:
import pickle

# Persist the fitted relation vectorizer; the context manager guarantees the
# file is flushed and closed (the bare open() call left the handle dangling).
with open('models/relation_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer_rel, vectorizer_file)

#### BOW for syntactic dependency path

In [37]:
# Bag-of-words counts over the dependency-path lemma strings, with English
# stop words removed (at most 1000 terms, same document-frequency limits as
# for the relation vectorizer).
vectorizer_path = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=1000)
bow_path = vectorizer_path.fit_transform(features['dep_path'].values).toarray()

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer_path, 'get_feature_names_out'):
    vocabulary_path = vectorizer_path.get_feature_names_out()
else:
    vocabulary_path = vectorizer_path.get_feature_names()
columns = [column + '_bow_path' for column in vocabulary_path]

# Append the count columns; both sides get a fresh 0..n-1 index so rows align.
features = pd.concat([features.reset_index(drop=True), pd.DataFrame(bow_path, columns=columns)], axis=1)

In [40]:
# Persist the fitted dependency-path vectorizer; the context manager
# guarantees the file is flushed and closed.
with open('models/path_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer_path, vectorizer_file)

In [42]:
# (rows, columns) after appending both bag-of-words blocks.
features.shape

(48983, 2586)

## Simple clustering example

In [None]:
# X is another name for the feature table (no copy is made).
X = features
# Identifier, raw-text and lemma-list columns that are dropped before clustering.
excluding_cols = ['docid', 'subject', 'relation', 'object', 'dep_path', 'lemma_subj', 'lemma_rel', 'lemma_obj']
# Columns holding one embedding matrix per row; also dropped before clustering.
embedding_cols = ['w2v_subj', 'w2v_rel', 'w2v_obj']

In [47]:
from sklearn.cluster import KMeans

# Cluster the rows into 100 groups, using every column except the
# identifier/text columns and the embedding matrices.
clustering_input = X.drop(columns=embedding_cols + excluding_cols)
kmeans = KMeans(n_clusters=100)
kmeans.fit(clustering_input)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=100, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [58]:
def show_cluster_sample(number):
    """Return up to ten randomly ordered triplets assigned to cluster ``number``.

    Reads the module-level ``X`` and the fitted ``kmeans``; only the
    ``docid``, ``subject``, ``relation`` and ``object`` columns are returned.
    """
    in_cluster = kmeans.labels_ == number
    members = X.loc[in_cluster, ['docid', 'subject', 'relation', 'object']]
    shuffled = members.sample(frac=1)
    return shuffled.iloc[:10]

In [65]:
# Random sample of triplets from cluster 3.
show_cluster_sample(3)

Unnamed: 0,docid,subject,relation,object
5820,37976837,He,is known as,founder director of Defense
21530,1281824,Koster,is recognized for,his work
19517,14581096,model,is,three-layer model for network design proposed ...
34127,142597,He,is best known for,his work in fields of operational research
47459,48327467,Spindel,is known for,his involvement in union leader Jimmy Hoffa 's...
17933,16064919,Baer,is known for,his contributions at University of Kansas
19251,27717386,He,is son of,Zhiuli Shartava killed by Abkhaz militias duri...
25902,8207891,Vaynerchuk,is best known for,his work as chairman as CEO
5822,37976837,He,is best known as,founder director
42085,1628191,He,is leader,cares about his people


In [66]:
# Random sample of triplets from cluster 90.
show_cluster_sample(90)

Unnamed: 0,docid,subject,relation,object
12358,8783360,Chevrolet Volt,is,plug-in hybrid car also marketed as Holden Volt
35592,576403,Renault Mégane,is,small family car produced by car manufacturer ...
37976,1822962,Daewoo Lanos,is,subcompact car produced by manufacturer Daewoo...
44577,373793,Volkswagen Polo,is,car produced by manufacturer Volkswagen
44837,409837,Mercedes-Benz C-Class,is line of,compact executive cars produced
10492,43797566,Opel Karl,is,city car designated as their entry model for l...
41739,10523335,MG 3,is,subcompact car produced by giant SAIC
19421,14507998,Elegia,is,genus of grass-like plants described as genus ...
12371,8783360,Chevrolet Volt,is,plug-in hybrid car marketed in rebadged varian...
2811,9557050,Suzuki Lapin,is,kei car with five-door hatchback body manufact...
