In [28]:
%config IPCompleter.greedy=True
import json
from collections import defaultdict
from operator import itemgetter
import re
import numpy as np
import random

In [3]:
def json_read(filename):
    """Load and return the JSON document stored in ``filename``.

    The file is opened explicitly as UTF-8 rather than with the platform's
    default encoding, so non-ASCII content (e.g. Cyrillic question text)
    decodes identically on every operating system.

    Returns whatever ``json.load`` produces for the file's content.
    """
    with open(filename, 'r', encoding='utf-8') as inf:
        return json.load(inf)

def json_dump(obj, filename, ea=False, indent=4):
    """Serialize ``obj`` as JSON into ``filename``, overwriting the file.

    Parameters:
        obj: the JSON-serializable object to write.
        filename: path of the output file.
        ea: forwarded to ``json.dump`` as ``ensure_ascii``; with the default
            ``False`` non-ASCII characters are written as-is.
        indent: forwarded to ``json.dump`` as ``indent``.

    The file is opened explicitly as UTF-8: because non-ASCII characters are
    written unescaped by default, relying on the platform's default encoding
    could fail or produce a file that ``json_read`` cannot decode.
    """
    with open(filename, 'w', encoding='utf-8') as ouf:
        json.dump(obj, ouf, ensure_ascii=ea, indent=indent)

In [85]:
# Load the full dataset; the cells below slice it and read dict fields such as
# 'tags', 'uid', 'answers' and 'query' from each entry.
dataset = json_read('../dataset compilation/kbqa_russian_dataset.json')

In [111]:
# UID collections for the test and dev splits; later cells select dataset
# entries with `entry['uid'] in test_uids` / `entry['uid'] in dev_uids`.
test_uids = json_read('../dataset compilation/test_uids.json')
dev_uids = json_read('../dataset compilation/dev_uids.json')

## Linking

### Labels

In [26]:
# Scan the label index once: count the label lines and collect the distinct
# entity ids, where an id is the text before the first ':' on a line.
cnt_labels = 0
index_entities = set()
# Explicit UTF-8 so the file decodes the same way regardless of the
# platform's default encoding.
with open('../entity retrieval/labels_token.txt', encoding='utf-8') as inf:
    for line in inf:
        line = line.strip()
        if not line:
            # A blank line carries no label; counting it would inflate
            # cnt_labels and register '' as an entity id.
            continue
        cnt_labels += 1
        qid = line.partition(':')[0]
        index_entities.add(qid)

In [27]:
# Report the size of the `index_entities` set and the `cnt_labels` line tally.
print(f'Unique entities: {len(index_entities)}')
print(f'Total labels: {cnt_labels}')

Unique entities: 4114595
Total labels: 5430657


### Question matches

In [57]:
# Suffixes that are substituted into the `q_matches_{bn}.json` file names
# when the per-question match files are loaded.
bns = ['1_5000', '5001_10000', '10001_14000', '14001_end']

In [58]:
# Gather, for every entry of every q_matches file, the length of its
# 'matches' list, then turn the tallies into a NumPy array.
# NOTE(review): the directory below is an absolute path on one user's machine;
# consider making it relative/configurable so the notebook runs elsewhere.
match_lens = []
for bn in bns:
    matches_data = json_read(f'/Users/ne0n/itmo-jb/kbqa/kbqa-russian-dataset/toloka/questions/q_matches_{bn}.json')
    match_lens.extend(len(item['matches']) for item in matches_data)
match_lens = np.array(match_lens)

In [61]:
# Mean of the per-entry 'matches' list lengths collected in `match_lens`.
print(f'Average length of candidates list for questions is {match_lens.mean()}')

Average length of candidates list for questions is 8.55832368885911


## Dataset

### Dataset entities

In [87]:
# Entities referenced by the first 1200 dataset entries:
#   q_entities   - every item of an entry's 'question_uris'
#   a_entities   - the 'value' of every answer whose 'type' is 'uri'
#   all_entities - the union of the two
# NOTE(review): the 1200 cut-off is hard-coded; presumably it skips the
# 'no-answer' entries — confirm and name the constant.
q_entities = set()
a_entities = set()
for entry in dataset[:1200]:
    q_entities.update(entry['question_uris'])
    a_entities.update(
        ans['value'] for ans in entry['answers'] if ans['type'] == 'uri'
    )
all_entities = q_entities | a_entities

In [88]:
# Report the sizes of the three entity sets (union, question-side, answer-side).
print(f'Overall entities: {len(all_entities)}')
print(f'Question entities: {len(q_entities)}')
print(f'Answer entities: {len(a_entities)}')

Overall entities: 2357
Question entities: 1218
Answer entities: 1250


### Dataset relations

In [130]:
# Collect the distinct property references (prefix + ':P<digits>') that occur
# in the 'query' string of the first 1200 dataset entries.
# Recognised prefixes: wdt:, p:, ps:, pq:.
# The previous pattern used the character class `[s|q]`, which also accepts a
# literal '|' (i.e. 'p|:P12'); `[sq]` matches only the intended ps:/pq:.
# The prefixes cannot overlap one another inside a query, so a single scan
# finds the same references as three separate scans did.
relation_pattern = re.compile(r'(?:wdt|p[sq]?):P\d+')
all_relations = set()
for entry in dataset[:1200]:
    all_relations.update(relation_pattern.findall(entry['query']))

In [132]:
# NOTE(review): the hard-coded `+ 3` is not explained anywhere in this
# notebook — presumably it accounts for relations the regex extraction does
# not capture; confirm and replace it with a named, documented constant.
print(f'Overall relations: {len(all_relations) + 3}')

Overall relations: 242


### Dataset tags

In [115]:
# Frequency of every tag over the whole dataset (one increment per tag
# occurrence in an entry's 'tags').
tag_count = defaultdict(int)
for tags in (entry['tags'] for entry in dataset):
    for tag in tags:
        tag_count[tag] += 1

In [116]:
def _count_tags(entries, uids):
    """Count tag occurrences over the entries whose 'uid' is in ``uids``.

    ``uids`` is copied into a set once so that each membership test is a
    hash lookup instead of a scan of the whole uid collection.
    Assumes the uids are hashable (e.g. ints or strings).

    Returns a ``defaultdict(int)`` mapping tag -> number of occurrences.
    """
    uid_set = set(uids)
    counts = defaultdict(int)
    for entry in entries:
        if entry['uid'] in uid_set:
            for tag in entry['tags']:
                counts[tag] += 1
    return counts


# Same tally for both splits; previously this was two copy-pasted loops.
tag_count_test = _count_tags(dataset, test_uids)
tag_count_dev = _count_tags(dataset, dev_uids)

In [117]:
# Whole-dataset tag frequencies, most frequent first.
for tag, freq in sorted(tag_count.items(), key=lambda kv: kv[1], reverse=True):
    print(tag, freq)

1-hop 958
no-answer 300
multi-constraint 131
multi-hop 69
reverse 35
qualifier-constraint 26
exclusion 22
ranking 19
0-hop 15
qualifier-answer 6
count 5


In [118]:
# Test-split tag frequencies, most frequent first.
for tag, freq in sorted(tag_count_test.items(), key=lambda kv: kv[1], reverse=True):
    print(tag, freq)

1-hop 760
no-answer 240
multi-constraint 110
multi-hop 55
reverse 29
qualifier-constraint 22
exclusion 18
ranking 16
0-hop 12
qualifier-answer 5
count 4


In [119]:
# Dev-split tag frequencies, most frequent first.
for tag, freq in sorted(tag_count_dev.items(), key=lambda kv: kv[1], reverse=True):
    print(tag, freq)

1-hop 198
no-answer 60
multi-constraint 21
multi-hop 14
reverse 6
qualifier-constraint 4
exclusion 4
ranking 3
0-hop 3
qualifier-answer 1
count 1


In [122]:
# Count how often each distinct combination of tags occurs over the whole
# dataset; a combination is the sorted tuple of an entry's tags, so the
# order in which tags were listed does not matter.
tag_comb_count = defaultdict(int)
for entry in dataset:
    combo = tuple(sorted(entry['tags']))
    tag_comb_count[combo] += 1

In [123]:
# Print the tag combinations by descending count, then display the total
# number of counted entries as the cell's value.
for combo, freq in sorted(tag_comb_count.items(), key=lambda kv: kv[1], reverse=True):
    print(combo, freq)
s = sum(tag_comb_count.values())
s

('1-hop',) 921
('no-answer',) 300
('multi-constraint',) 96
('multi-hop',) 69
('1-hop', 'exclusion') 21
('qualifier-constraint',) 16
('multi-constraint', 'reverse') 15
('0-hop',) 15
('1-hop', 'reverse') 12
('multi-constraint', 'ranking') 8
('qualifier-constraint', 'ranking') 6
('multi-constraint', 'ranking', 'reverse') 5
('1-hop', 'count') 4
('multi-constraint', 'qualifier-answer') 4
('qualifier-constraint', 'reverse') 3
('qualifier-answer',) 2
('exclusion', 'multi-constraint') 1
('multi-constraint', 'qualifier-constraint') 1
('count', 'multi-constraint') 1


1500

In [124]:
# Tag-combination counts restricted to entries whose uid is in `dev_uids`.
# Note: this rebinds `tag_comb_count`, replacing the whole-dataset counts
# computed earlier under the same name.
tag_comb_count = defaultdict(int)
for entry in dataset:
    if entry['uid'] not in dev_uids:
        continue
    tag_comb_count[tuple(sorted(entry['tags']))] += 1

In [125]:
# Print the current `tag_comb_count` by descending count, then display the
# total number of counted entries as the cell's value.
for combo, freq in sorted(tag_comb_count.items(), key=lambda kv: kv[1], reverse=True):
    print(combo, freq)
s = sum(tag_comb_count.values())
s

('1-hop',) 191
('no-answer',) 60
('multi-constraint',) 15
('multi-hop',) 14
('1-hop', 'exclusion') 4
('multi-constraint', 'reverse') 3
('0-hop',) 3
('qualifier-constraint',) 3
('1-hop', 'reverse') 2
('qualifier-constraint', 'ranking') 1
('multi-constraint', 'qualifier-answer') 1
('1-hop', 'count') 1
('multi-constraint', 'ranking') 1
('multi-constraint', 'ranking', 'reverse') 1


300

In [72]:
# Candidate pool: (uid, question text, query) for every entry whose only tag
# is '1-hop'.
cands = [
    (entry['uid'], entry['question_text'], entry['query'])
    for entry in dataset
    if tuple(sorted(entry['tags'])) == ('1-hop',)
]
# Shuffle with a dedicated, fixed-seed generator so that a fresh
# "Restart & Run All" draws the same sample; the previous unseeded
# `random.shuffle` produced a different order on every run.
random.Random(0).shuffle(cands)
cands[:15]

[(209,
  'Кто автор бессмертной "Похвалы Глупости"?',
  'SELECT ?answer \nWHERE {\n  wd:Q569869 wdt:P50 ?answer\n}'),
 (534,
  'Кто из австралийских писателей является автором книги «Ковчег Шиндлера», по которой был поставлен оскароносный фильм?',
  'SELECT ?answer \nWHERE {\n  wd:Q1975638 wdt:P50 ?answer\n}'),
 (458,
  'При каком российском монархе возник Государственный Музей Эрмитаж?',
  'SELECT ?answer \nWHERE {\n  wd:Q132783 wdt:P112 ?answer\n}'),
 (694,
  'В какой стране расположен вулкан Гарибальди?',
  'SELECT ?answer \nWHERE {\n  wd:Q183027 wdt:P17 ?answer\n}'),
 (118,
  'Какой продукт составляет основу сырников',
  'SELECT ?answer \nWHERE {\n  wd:Q1726195 wdt:P186 ?answer\n}'),
 (3140,
  'В каком подмосковном городе находится Ново-Иерусалимский монастырь?',
  'SELECT ?answer\nWHERE {\n  wd:Q773979 wdt:P131 ?answer.\n}'),
 (38,
  'В каком виде спорта прославилась Курникова?',
  'SELECT ?answer \nWHERE {\n  wd:Q131120 wdt:P641 ?answer\n}'),
 (388,
  'как звали собаку билла клин

In [73]:
# Print the uid (first tuple field) of the first 189 shuffled candidates.
# NOTE(review): 189 is hard-coded and unexplained here — confirm its origin.
for uid in map(itemgetter(0), cands[:189]):
    print(uid)

209
534
458
694
118
3140
38
388
357
3114
4
56
3111
96
573
261
642
801
492
413
658
92
776
107
840
164
475
457
224
154
306
310
104
295
76
793
3139
236
419
609
626
331
816
764
549
526
726
483
280
814
3004
566
373
423
552
211
31
436
286
243
57
2042
676
630
645
222
808
745
115
260
343
289
578
592
580
25
68
3048
484
14
3075
737
90
3056
251
327
461
792
2116
533
541
101
3062
431
834
532
640
629
41
176
481
584
339
415
22
465
2026
571
281
127
47
550
358
159
84
7
581
138
519
435
203
304
311
3023
405
221
365
770
334
330
324
494
811
789
256
335
408
40
2097
145
3036
129
3097
279
262
759
499
258
611
242
228
65
3059
738
177
277
714
3090
859
668
425
732
3109
623
393
3102
616
688
480
779
344
543
3092
818
3115
89
467
361
158
527
82
74
821
156
215
835
422
657
3126


### Question length

In [133]:
def _n_words(text):
    """Number of non-empty pieces left after splitting ``text`` on non-word characters."""
    return sum(1 for piece in re.split(r'\W', text) if piece)


# Per-question word counts for the original ('question_text') and the
# English ('question_eng') wording of every dataset entry.
word_lens = [_n_words(entry['question_text']) for entry in dataset]
word_lens_eng = [_n_words(entry['question_eng']) for entry in dataset]

print(f'Average question length: {np.array(word_lens).mean()} words')
print(f'Average english question length: {np.array(word_lens_eng).mean()} words')
print(f'Median question length: {np.median(np.array(word_lens))} words')
print(f'Median english question length: {np.median(np.array(word_lens_eng))} words')

Average question length: 7.987333333333333 words
Average english question length: 10.582666666666666 words
Median question length: 7.0 words
Median english question length: 10.0 words


### Questions with list answers

In [134]:
# Number of dataset entries whose 'answers' list holds more than one answer.
cnt = sum(1 for entry in dataset if len(entry['answers']) > 1)
print(f'Number of questions with list of answers: {cnt}')

Number of questions with list of answers: 131


### Types of answers

In [135]:
# Tally questions by the 'type' of their first answer; entries with an empty
# 'answers' list are left out of the tally.
tps = defaultdict(int)
for answers in (entry['answers'] for entry in dataset):
    if answers:
        tps[answers[0]['type']] += 1
print(f'{tps["uri"]} URI answers')
print(f'{tps["literal"]} literal answers')

1154 URI answers
46 literal answers
