In [1]:
import argparse
import json
from datetime import datetime

import utils

In [2]:
from collections import defaultdict

In [13]:
parser = argparse.ArgumentParser()
parser.add_argument('--split', default='train', choices=['train', 'dev', 'test'])
parser.add_argument('--partialfile', required=False, help='Partially constructed outfile used for initial '
                                                          'population of feature values for the subset of all '
                                                          'entities')
parser.add_argument('--outfile', default='data/dbpedia-overlap-features-test.json')
args = parser.parse_args(args=[])
print(args)

Namespace(outfile='data/dbpedia-overlap-features-test.json', partialfile=None, split='train')


In [4]:
word_probs = utils.load_word_probs()

In [5]:
len(word_probs)

6098921

In [6]:
question_neighbors = utils.load_question_neighbors()

In [7]:
len(question_neighbors)

79330

In [20]:
neighbor_triples = utils.load_neighbor_triples()

In [21]:
len(neighbor_triples)

453745

In [22]:
if args.partialfile:
    with open(args.partialfile) as pf:
        neighbor_features = json.load(pf)
else:
    neighbor_features = {}

i = 0

In [23]:
lukovnikov_entities_path = 'data/entities.json'

In [24]:
with open(lukovnikov_entities_path) as entities_file:
    lukovnikov_entities = json.load(entities_file)

In [25]:
len(lukovnikov_entities)

83502

In [26]:
question_entities = defaultdict(list)

In [27]:
for question_id, entities in lukovnikov_entities.items():
    for entity in entities:
        question_entities[question_id].append((entity, 1.0))

In [14]:
split_total_questions = 0
qblink_split = utils.load_qblink_split(args.split)

In [15]:
len(qblink_split)

22818

In [None]:
for sequence in qblink_split:
    for question in ['q1', 'q2', 'q3']:
        question_id = str(sequence[question]['t_id'])
        question_answer = f"<http://dbpedia.org/resource/{sequence[question]['wiki_page']}>"
        if (not sequence[question]['wiki_page'] or question_id not in question_neighbors or
                question_answer not in question_neighbors[question_id]):
            continue

        question_text = sequence[question]['quetsion_text']
        question_tokens = set(utils.tokenize(question_text))

        if question == 'q1':
            previous_answer = None
        elif question == 'q2':
            previous_answer = f"<http://dbpedia.org/resource/{sequence['q1']['wiki_page']}>"
        elif question == 'q3':
            previous_answer = f"<http://dbpedia.org/resource/{sequence['q2']['wiki_page']}>"
        if previous_answer is not None:
            previous_answer_tokens = set(utils.uri_tokens(previous_answer))
        else:
            previous_answer_tokens = None
        # features_total = [0.0] * 10

        if question_id not in neighbor_features:
            neighbor_features[question_id] = {}

        for candidate_entity in question_neighbors[question_id]:  # candidate_entity is e_i in illustration.pdf
            if candidate_entity in neighbor_features[question_id]:
                continue

            features = {
                'f_o(q, p)': 0.0,
                'f_o(q, lit)': 0.0,
                'f_o(q, cat)': 0.0,
                'f_o(q, ent)': 0.0,
                'f_o(a, s)': 0.0
            }

            counts = {
                'f_o(q, p)': 0,
                'f_o(q, lit)': 0,
                'f_o(q, cat)': 0,
                'f_o(q, ent)': 0,
                'f_o(a, s)': 0
            }

            if 'lit' in neighbor_triples[candidate_entity]:
                for lit_triple in neighbor_triples[candidate_entity]['lit']:
                    pred = lit_triple[0]
                    lit_text = lit_triple[1]
                    pred_tokens = utils.uri_tokens(pred)
                    features['f_o(q, p)'] += utils.word_overlap_score(question_tokens, pred_tokens, word_probs)
                    counts['f_o(q, p)'] += 1

                    lit_tokens = lit_text.split(' ')
                    features['f_o(q, lit)'] += utils.word_overlap_score(question_tokens, lit_tokens, word_probs)
                    counts['f_o(q, lit)'] += 1
                    if previous_answer_tokens is not None:
                        score = utils.word_overlap_score(previous_answer_tokens, lit_tokens, word_probs)
                        features['f_o(a, s)'] += score
                        counts['f_o(a, s)'] += 1
                        # print(previous_answer_tokens, lit_tokens, score)

            if 'cat' in neighbor_triples[candidate_entity]:
                for cat_triple in neighbor_triples[candidate_entity]['cat']:
                    cat = cat_triple[1]
                    cat_tokens = utils.uri_tokens(cat)
                    features['f_o(q, cat)'] += utils.word_overlap_score(question_tokens, cat_tokens, word_probs)
                    counts['f_o(q, cat)'] += 1
                    if previous_answer_tokens is not None:
                        score = utils.word_overlap_score(previous_answer_tokens, cat_tokens, word_probs)
                        features['f_o(a, s)'] += score
                        counts['f_o(a, s)'] += 1

            ent_triples = []
            if 'subj' in neighbor_triples[candidate_entity]:
                ent_triples.extend(neighbor_triples[candidate_entity]['subj'])
            if 'obj' in neighbor_triples[candidate_entity]:
                ent_triples.extend(neighbor_triples[candidate_entity]['obj'])
            for ent_triple in ent_triples:
                pred = ent_triple[0]
                ent = ent_triple[1]
                pred_tokens = utils.uri_tokens(pred)
                features['f_o(q, p)'] += utils.word_overlap_score(question_tokens, pred_tokens, word_probs)
                counts['f_o(q, p)'] += 1

                ent_tokens = utils.uri_tokens(ent)
                features['f_o(q, ent)'] += utils.word_overlap_score(question_tokens, ent_tokens, word_probs)
                counts['f_o(q, ent)'] += 1
                if previous_answer_tokens is not None:
                    score = utils.word_overlap_score(previous_answer_tokens, ent_tokens, word_probs)
                    features['f_o(a, s)'] += score
                    counts['f_o(a, s)'] += 1
                    # print(previous_answer_tokens, ent_tokens, score)

            neighbor_features[question_id][candidate_entity] = [
                utils.div_pos(features['f_o(q, p)'], counts['f_o(q, p)']),
                utils.div_pos(features['f_o(q, lit)'], counts['f_o(q, lit)']),
                utils.div_pos(features['f_o(q, cat)'], counts['f_o(q, cat)']),
                utils.div_pos(features['f_o(q, ent)'], counts['f_o(q, ent)']),
                utils.div_pos(features['f_o(a, s)'], counts['f_o(a, s)'])
            ]
        i += 1
        print(f'Processed {i} items. Current time: {datetime.now().strftime("%H:%M:%S")}.')
print(f"Processed split {args.split}. Total number of filtered questions: {i}.")

Processed 1 items. Current time: 10:53:09.
Processed 2 items. Current time: 10:53:09.
Processed 3 items. Current time: 10:53:10.
Processed 4 items. Current time: 10:53:13.
Processed 5 items. Current time: 10:53:14.
Processed 6 items. Current time: 10:53:15.
Processed 7 items. Current time: 10:53:17.
Processed 8 items. Current time: 10:53:17.
Processed 9 items. Current time: 10:53:18.
Processed 10 items. Current time: 10:53:21.
Processed 11 items. Current time: 10:53:22.
Processed 12 items. Current time: 10:53:29.
Processed 13 items. Current time: 10:53:29.
Processed 14 items. Current time: 10:53:33.
Processed 15 items. Current time: 10:53:36.
Processed 16 items. Current time: 10:53:36.
Processed 17 items. Current time: 10:53:41.
Processed 18 items. Current time: 10:53:42.
Processed 19 items. Current time: 10:53:44.
Processed 20 items. Current time: 10:53:53.
Processed 21 items. Current time: 10:53:55.
Processed 22 items. Current time: 10:53:56.
Processed 23 items. Current time: 10:53:5

Processed 186 items. Current time: 11:00:32.
Processed 187 items. Current time: 11:00:32.
Processed 188 items. Current time: 11:00:35.
Processed 189 items. Current time: 11:00:36.
Processed 190 items. Current time: 11:00:36.
Processed 191 items. Current time: 11:00:36.
Processed 192 items. Current time: 11:00:36.
Processed 193 items. Current time: 11:00:46.
Processed 194 items. Current time: 11:00:46.
Processed 195 items. Current time: 11:00:47.
Processed 196 items. Current time: 11:00:49.
Processed 197 items. Current time: 11:01:02.
Processed 198 items. Current time: 11:01:02.
Processed 199 items. Current time: 11:01:07.
Processed 200 items. Current time: 11:01:11.
Processed 201 items. Current time: 11:01:12.
Processed 202 items. Current time: 11:01:12.
Processed 203 items. Current time: 11:01:14.
Processed 204 items. Current time: 11:01:19.
Processed 205 items. Current time: 11:01:20.
Processed 206 items. Current time: 11:01:20.
Processed 207 items. Current time: 11:01:31.
Processed 

Processed 369 items. Current time: 11:09:47.
Processed 370 items. Current time: 11:09:52.
Processed 371 items. Current time: 11:09:52.
Processed 372 items. Current time: 11:09:53.
Processed 373 items. Current time: 11:09:55.
Processed 374 items. Current time: 11:09:57.
Processed 375 items. Current time: 11:10:02.
Processed 376 items. Current time: 11:10:02.
Processed 377 items. Current time: 11:10:03.
Processed 378 items. Current time: 11:10:03.
Processed 379 items. Current time: 11:10:03.
Processed 380 items. Current time: 11:10:04.
Processed 381 items. Current time: 11:10:09.
Processed 382 items. Current time: 11:10:09.
Processed 383 items. Current time: 11:10:09.
Processed 384 items. Current time: 11:10:09.
Processed 385 items. Current time: 11:10:23.
Processed 386 items. Current time: 11:10:23.
Processed 387 items. Current time: 11:10:26.
Processed 388 items. Current time: 11:10:27.
Processed 389 items. Current time: 11:10:32.
Processed 390 items. Current time: 11:10:34.
Processed 

Processed 552 items. Current time: 11:16:27.
Processed 553 items. Current time: 11:16:31.
Processed 554 items. Current time: 11:16:33.
Processed 555 items. Current time: 11:16:38.
Processed 556 items. Current time: 11:16:38.
Processed 557 items. Current time: 11:16:41.
Processed 558 items. Current time: 11:16:43.
Processed 559 items. Current time: 11:16:44.
Processed 560 items. Current time: 11:16:47.
Processed 561 items. Current time: 11:16:47.
Processed 562 items. Current time: 11:16:58.
Processed 563 items. Current time: 11:16:58.
Processed 564 items. Current time: 11:16:58.
Processed 565 items. Current time: 11:16:59.
Processed 566 items. Current time: 11:17:00.
Processed 567 items. Current time: 11:17:00.
Processed 568 items. Current time: 11:17:01.
Processed 569 items. Current time: 11:17:01.
Processed 570 items. Current time: 11:17:01.
Processed 571 items. Current time: 11:17:01.
Processed 572 items. Current time: 11:17:05.
Processed 573 items. Current time: 11:17:11.
Processed 

Processed 736 items. Current time: 11:23:37.
Processed 737 items. Current time: 11:23:41.
Processed 738 items. Current time: 11:23:41.
Processed 739 items. Current time: 11:23:45.
Processed 740 items. Current time: 11:23:47.
Processed 741 items. Current time: 11:23:59.
Processed 742 items. Current time: 11:24:04.
Processed 743 items. Current time: 11:24:05.
Processed 744 items. Current time: 11:24:05.
Processed 745 items. Current time: 11:24:07.
Processed 746 items. Current time: 11:24:07.
Processed 747 items. Current time: 11:24:10.
Processed 748 items. Current time: 11:24:11.
Processed 749 items. Current time: 11:24:13.
Processed 750 items. Current time: 11:24:14.
Processed 751 items. Current time: 11:24:14.
Processed 752 items. Current time: 11:24:20.
Processed 753 items. Current time: 11:24:20.
Processed 754 items. Current time: 11:24:20.
Processed 755 items. Current time: 11:24:20.
Processed 756 items. Current time: 11:24:24.
Processed 757 items. Current time: 11:24:24.
Processed 

Processed 919 items. Current time: 11:31:33.
Processed 920 items. Current time: 11:31:33.
Processed 921 items. Current time: 11:31:35.
Processed 922 items. Current time: 11:31:35.
Processed 923 items. Current time: 11:31:37.
Processed 924 items. Current time: 11:31:40.
Processed 925 items. Current time: 11:31:43.
Processed 926 items. Current time: 11:31:47.
Processed 927 items. Current time: 11:31:47.
Processed 928 items. Current time: 11:32:01.
Processed 929 items. Current time: 11:32:01.
Processed 930 items. Current time: 11:32:06.
Processed 931 items. Current time: 11:32:08.
Processed 932 items. Current time: 11:32:08.
Processed 933 items. Current time: 11:32:11.
Processed 934 items. Current time: 11:32:12.
Processed 935 items. Current time: 11:32:13.
Processed 936 items. Current time: 11:32:19.
Processed 937 items. Current time: 11:32:21.
Processed 938 items. Current time: 11:32:24.
Processed 939 items. Current time: 11:32:40.
Processed 940 items. Current time: 11:32:55.
Processed 

Processed 1101 items. Current time: 11:38:49.
Processed 1102 items. Current time: 11:38:52.
Processed 1103 items. Current time: 11:38:53.
Processed 1104 items. Current time: 11:38:53.
Processed 1105 items. Current time: 11:38:55.
Processed 1106 items. Current time: 11:38:59.
Processed 1107 items. Current time: 11:39:06.
Processed 1108 items. Current time: 11:39:06.
Processed 1109 items. Current time: 11:39:07.
Processed 1110 items. Current time: 11:39:07.
Processed 1111 items. Current time: 11:39:08.
Processed 1112 items. Current time: 11:39:09.
Processed 1113 items. Current time: 11:39:13.
Processed 1114 items. Current time: 11:39:13.
Processed 1115 items. Current time: 11:39:15.
Processed 1116 items. Current time: 11:39:16.
Processed 1117 items. Current time: 11:39:17.
Processed 1118 items. Current time: 11:39:20.
Processed 1119 items. Current time: 11:39:21.
Processed 1120 items. Current time: 11:39:21.
Processed 1121 items. Current time: 11:39:22.
Processed 1122 items. Current time

In [None]:
with open(args.outfile, 'w') as f:
    json.dump(neighbor_features, f, sort_keys=False,
              indent=4, separators=(',', ': '))