In [5]:
%config IPCompleter.greedy=True
import re
import json
import csv
import requests
import re
from collections import defaultdict
from operator import itemgetter
from tqdm import tqdm_notebook as tqdm
from time import time, sleep
import numpy as np
import pandas as pd
from qwikidata.sparql import return_sparql_query_results

In [6]:
def json_read(filename):
    with open(filename, 'r') as inf:
        res = json.load(inf)
    return res

def json_dump(obj, filename, ea=False, indent=4):
    with open(filename, 'w') as ouf:
        json.dump(obj, ouf, ensure_ascii=ea, indent=indent)

### Dataset schema

In [131]:
class DatasetEntry(dict):
    def __init__(
        self,
        uid=None,
        question_text=None,
        query=None,
        answer_text=None,
        question_uris=None,
        question_props=None,
        answer_uris=None,
        tags=None
    ):
        self['uid'] = uid
        self['question_text'] = question_text
        self['query'] = query
        self['answer_text'] = answer_text
        self['question_uris'] = question_uris
        self['question_props'] = question_props
        self['answer_uris'] = answer_uris
        self['tags'] = tags if tags is not None else []
        
    def set_uid(self, uid):
        self['uid'] = uid
        
    def set_question_text(self, qt):
        self['question_text'] = qt
        
    def set_query(self, query):
        self['query'] = query
        
    def set_answer_text(self, at):
        self['answer_text'] = answer_text
    
    def set_question_uris(self, question_uris):
        self['question_uris'] = question_uris
        
    def add_question_uri(self, question_uri):
        if self['question_uris'] is None:
            self['question_uris'] = [question_uri]
        else:
            self['question_uris'].append(question_uri)
        
    def add_question_prop(self, question_prop):
        if self['question_props'] is None:
            self['question_uris'] = [question_uri]
        else:
            self['question_uris'].append(question_uri)
        
    def set_answer_uris(self, answer_uris):
        self['answer_uris'] =  set(answer_uris)
        
    def add_answer_uri(self, answer_uri):
        if self['answer_uris'] is None:
            self['answer_uris'] = set([answer_uri])
        else:
            self['answer_uris'].add(answer_uri)
            
    def add_tag(self, tag):
        self['tags'].append(tag)
    
    def validate(self):
        return self['uid'] is not None and \
               self['question_text'] is not None and \
               self['query'] is not None and \
               self['answer_text'] is not None and \
               self['question_uris'] is not None and \
               self['answer_uris'] is not None
    
    def json(self):
        return self

### 1-hop fully correct

In [132]:
def build_sparql_query_for_simple_question(qe_uri, prop_uri):
    qid = re.sub(r'.*/', '', qe_uri)
    pid = re.sub(r'.*/', '', prop_uri)
    query = \
    f'''
SELECT ?answer 
WHERE {{
  wd:{qid} wdt:{pid} ?answer
}}
    '''.strip()
    return query

In [49]:
data = pd.read_csv('1hop_fully_correct.csv', header=0)

In [55]:
data = data.fillna('')

In [56]:
data.head()

Unnamed: 0,q,a,qe_name,qe_link,prop_name,prop_link,ans_link
0,Что может вызвать цунами?,Землетрясение,Цунами,http://www.wikidata.org/entity/Q8070,причиной является,http://www.wikidata.org/prop/direct/P828,http://www.wikidata.org/entity/Q7944
1,Кто написал роман «Хижина дяди Тома»?,Г. Бичер-Стоу,Хижина дяди Тома,http://www.wikidata.org/entity/Q2222,автор,http://www.wikidata.org/prop/direct/P50,http://www.wikidata.org/entity/Q102513
2,Кто автор пьесы «Ромео и Джульетта»?,Шекспир,Ромео и Джульетта,http://www.wikidata.org/entity/Q83186,автор,http://www.wikidata.org/prop/direct/P50,http://www.wikidata.org/entity/Q692
3,Как называется столица Румынии?,Бухарест,Румыния,http://www.wikidata.org/entity/Q218,столица,http://www.wikidata.org/prop/direct/P36,http://www.wikidata.org/entity/Q19660
4,Какой стране принадлежит знаменитый остров Пасхи?,Чили,Остров Пасхи,http://www.wikidata.org/entity/Q14452,государство,http://www.wikidata.org/prop/direct/P17,http://www.wikidata.org/entity/Q298


In [57]:
uid = 0
dataset_1hop_fully_correct = []
for index, row in data.iterrows():
    question_text = row['q']
    answer_text = row['a']
    answer_uri = row['ans_link']
    qe_uri = row['qe_link']
    prop_uri = row['prop_link']
    query = build_sparql_query_for_simple_question(qe_uri, prop_uri)
    dataset_entry = DatasetEntry(
        uid=uid,
        question_text=question_text,
        query=query,
        answer_text=answer_text
    )
    dataset_entry.add_question_uri(qe_uri)
    dataset_entry.add_answer_uri(answer_uri)
    dataset_entry.add_tag('simple')
    if not dataset_entry.validate():
        raise Exception
    uid += 1
    dataset_1hop_fully_correct.append(dataset_entry)

In [58]:
dataset_1hop_fully_correct_json = [entry.json() for entry in dataset_1hop_fully_correct]
json_dump(dataset_1hop_fully_correct_json, 'dataset_1hop_fully_correct.json')

### 2-hop fully correct

In [133]:
def build_sparql_query_for_2hop_question(qe_uri, prop_uri_1, prop_uri_2):
    qid = re.sub(r'.*/', '', qe_uri)
    pid_1 = re.sub(r'.*/', '', prop_uri_1)
    pid_2 = re.sub(r'.*/', '', prop_uri_2)
    query = \
    f'''
SELECT ?answer 
WHERE {{
  wd:{qid} wdt:{pid_1} [ wdt:{pid_2} ?answer ].
}}
    '''.strip()
    return query

In [88]:
data = pd.read_csv('2hop_fully_correct.csv', header=0)

In [89]:
data.head()

Unnamed: 0,q,a,qe_name,qe_link,prop_1_name,prop_1_link,prop_2_name,prop_2_link,ans_link
0,Какой специалист занимается изучением неопозна...,Уфолог,неопознанный летающий объект,http://www.wikidata.org/entity/Q421,изучается в,http://www.wikidata.org/prop/direct/P2579,практикуется,http://www.wikidata.org/prop/direct/P3095,http://www.wikidata.org/entity/Q18921227
1,"Как называется кривая, по которой Земля движет...",Эллипс,Земля,http://www.wikidata.org/entity/Q2,тип орбиты,http://www.wikidata.org/prop/direct/P522,имеет форму,http://www.wikidata.org/prop/direct/P1419,http://www.wikidata.org/entity/Q40112
2,Какая девичья фамилия была у Петуньи Дурсли?,Эванс,Петуния Дурсль,http://www.wikidata.org/entity/Q1963397,отец,http://www.wikidata.org/prop/direct/P22,фамилия,http://www.wikidata.org/prop/direct/P734,http://www.wikidata.org/entity/Q1260663
3,В каком селе Эдуард Успенский прописал кота Ма...,Простоквашино,Кот Матроскин,http://www.wikidata.org/entity/Q4235896,представлено в работе,http://www.wikidata.org/prop/direct/P1441,история происходит в месте,http://www.wikidata.org/prop/direct/P840,http://www.wikidata.org/entity/Q4381521
4,Какая страна – родина саксофона?,Бельгия,саксофон,http://www.wikidata.org/entity/Q9798,первооткрыватель или изобретатель,http://www.wikidata.org/prop/direct/P61,гражданство,http://www.wikidata.org/prop/direct/P27,http://www.wikidata.org/entity/Q31


In [90]:
uid = 1000
dataset_2hop_fully_correct = []
for index, row in data.iterrows():
    question_text = row['q']
    answer_text = row['a']
    answer_uri = row['ans_link']
    qe_uri = row['qe_link']
    prop_uri_1 = row['prop_1_link']
    prop_uri_2 = row['prop_2_link']
    query = build_sparql_query_for_2hop_question(qe_uri, prop_uri_1, prop_uri_2)
    dataset_entry = DatasetEntry(
        uid=uid,
        question_text=question_text,
        query=query,
        answer_text=answer_text
    )
    dataset_entry.add_question_uri(qe_uri)
    dataset_entry.add_answer_uri(answer_uri)
    dataset_entry.add_tag('2-hop')
    if not dataset_entry.validate():
        raise Exception
    uid += 1
    dataset_2hop_fully_correct.append(dataset_entry)

In [93]:
for entry in dataset_2hop_fully_correct:
    query = entry.query
    ans_uri = entry.answer_uris[0]
    print(ans_uri)
    answer_uris = return_sparql_query_results(query)['results']['bindings']
    answer_uris = [dct['answer']['value'] for dct in answer_uris]
    entry.set_answer_uris(answer_uris)
    print(answer_uris)
    print()
    sleep(1)

http://www.wikidata.org/entity/Q18921227
['http://www.wikidata.org/entity/Q18921227']

http://www.wikidata.org/entity/Q40112
['http://www.wikidata.org/entity/Q40112']

http://www.wikidata.org/entity/Q1260663
['http://www.wikidata.org/entity/Q1260663']

http://www.wikidata.org/entity/Q4381521
['http://www.wikidata.org/entity/Q4381521', 'http://www.wikidata.org/entity/Q4381521']

http://www.wikidata.org/entity/Q31
['http://www.wikidata.org/entity/Q31']

http://www.wikidata.org/entity/Q34664
['http://www.wikidata.org/entity/Q34664']

http://www.wikidata.org/entity/Q804402
['http://www.wikidata.org/entity/Q804402']

http://www.wikidata.org/entity/Q202
['http://www.wikidata.org/entity/Q202']

http://www.wikidata.org/entity/Q1297
['http://www.wikidata.org/entity/Q1297']

http://www.wikidata.org/entity/Q718
['http://www.wikidata.org/entity/Q718']

http://www.wikidata.org/entity/Q4381521
['http://www.wikidata.org/entity/Q4381521', 'http://www.wikidata.org/entity/Q4381521']

http://www.wikidata

In [97]:
for entry in dataset_2hop_fully_correct:
    aus = entry.answer_uris
    entry.answer_uris = list(set(aus))

In [98]:
dataset_2hop_fully_correct_json = [entry.json() for entry in dataset_2hop_fully_correct]
json_dump(dataset_2hop_fully_correct_json, 'dataset_2hop_fully_correct.json')

In [101]:
print('SELECT ?answer \nWHERE {\n  wd:Q13 wdt:P279 [ wdt:P642 ?answer ].\n}')

SELECT ?answer 
WHERE {
  wd:Q13 wdt:P279 [ wdt:P642 ?answer ].
}


### Edit bad chains

In [134]:
def get_query_tags(ql):
    tags = []
    if 'COUNT' in ql[0]:
        tags.append('count')
    if 'LIMIT' in ql[-1] and 'ORDER' in ql[-2]:
        tags.append('order-limit')
        ql = ql[:-2]
    ql = ql[2:-2]
    if 'FILTER' in ql[-1] and 'not in' in ql[-1]:
        tags.append('not-in')
        ql = ql[:-1]
    if ql[0].split()[0] == '?answer':
        tags.append('reverse')
    if len(ql) == 1:
        if 'VALUES' in ql[0]:
            tags.append('0-hop')
            return tags
        if '[' in ql[0]:
            clause = re.findall(r'\[(.*)\]', ql[0])[0]
            if ';' not in clause:
                tags.append('multihop')
            else:
                clauses = clause.split(';')
                if '?answer' in clause[1]:
                    tags.append('qualifier-answer')
                else:
                    tags.append('qualifier-constraint')
        else:
            tags.append('simple')
    else:
        tags.append('multi-constraint')
    return tags

In [135]:
re.findall(r'\[(.*)\]', '?answer p:P39 [ps:P39 wd:Q218295; pq:P1545 "1"]')

['ps:P39 wd:Q218295; pq:P1545 "1"']

#### From 1-hop

In [104]:
def build_initial_sparql_query_for_simple_question(qe_uri, prop_uri):
    qid = re.sub(r'.*/', '', qe_uri)
    pid = re.sub(r'.*/', '', prop_uri)
    query = \
    f'''
SELECT ?answer ?answerLabel
WHERE {{
  wd:{qid} wdt:{pid} ?answer
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],ru". }}
}}
    '''.strip()
    return query

In [102]:
data = pd.read_csv('1hop_to_edit.csv', header=0)

In [103]:
data.head()

Unnamed: 0,q,a,qe_name,qe_link,prop_name,prop_link,ans_link
0,Как звали знаменитого капитана Врунгеля?,Христофор Бонифатьевич,Приключения капитана Врунгеля,http://www.wikidata.org/entity/Q1985844,персонажи,http://www.wikidata.org/prop/direct/P674,http://www.wikidata.org/entity/Q4127228
1,Как звали знаменитого капитана Врунгеля?,Христофор Бонифатьевич,Приключения капитана Врунгеля,http://www.wikidata.org/entity/Q4378458,персонажи,http://www.wikidata.org/prop/direct/P674,http://www.wikidata.org/entity/Q4127228
2,В какой стране родился и вырос Че Гевара?,Аргентина,Эрнесто Че Гевара,http://www.wikidata.org/entity/Q5809,место жительства,http://www.wikidata.org/prop/direct/P551,http://www.wikidata.org/entity/Q414
3,Как звали жену Одиссея?,Пенелопа,Одиссея,http://www.wikidata.org/entity/Q35160,персонажи,http://www.wikidata.org/prop/direct/P674,http://www.wikidata.org/entity/Q165769
4,В каком городе жил и работал К.Э. Циолковский?,Калуга,Константин Эдуардович Циолковский,http://www.wikidata.org/entity/Q41239,место смерти,http://www.wikidata.org/prop/direct/P20,http://www.wikidata.org/entity/Q2837


In [112]:
sep = '-----'
with open('1hop_editor.txt', 'w') as ouf:
    for idx, row in data.iterrows():
        qe_uri = row['qe_link']
        prop_uri = row['prop_link']
        ans_uri = row['ans_link']
        print(idx, file=ouf)
        print(row['q'], file=ouf)
        print(row['a'], file=ouf)
        print(f'{qe_uri} [ {row["qe_name"]} ] ->', file=ouf)
        print(f'{prop_uri} {{ {row["prop_name"]} }} ->', file=ouf)
        print(row['ans_link'], file=ouf)
        print(build_initial_sparql_query_for_simple_question(qe_uri, prop_uri), file=ouf)
        print('edited=0', file=ouf)
        print(sep, file=ouf)

In [140]:
cnt = 0
with open('1hop_editor.txt', 'r') as inf:
    for line in inf:
        if line.strip() == sep:
            cnt += 1
        if line.startswith('edited'):
            mark = line.strip().replace('edited=', '')
            if mark == '1':
                cnt -= 1
print(cnt)

0


#### From 2-hop

In [122]:
def build_initial_sparql_query_for_2hop_question(qe_uri, prop_uri_1, prop_uri_2):
    qid = re.sub(r'.*/', '', qe_uri)
    pid_1 = re.sub(r'.*/', '', prop_uri_1)
    pid_2 = re.sub(r'.*/', '', prop_uri_2)
    query = \
    f'''
SELECT ?answer ?answerLabel
WHERE {{
  wd:{qid} wdt:{pid_1} [ wdt:{pid_2} ?answer ] .
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],ru". }}
}}
    '''.strip()
    return query

In [123]:
data = pd.read_csv('2hop_to_edit.csv', header=0)

In [124]:
data.head()

Unnamed: 0,q,a,qe_name,qe_link,prop_1_name,prop_1_link,prop_2_name,prop_2_link,ans_link
0,Где находится озеро Лох-Несс?,В Шотландии,Лох-Несс,http://www.wikidata.org/entity/Q49650,бассейн объекта в странах,http://www.wikidata.org/prop/direct/P205,заменил,http://www.wikidata.org/prop/direct/P1365,http://www.wikidata.org/entity/Q230791
1,Кто был соавтором С. В. Михалкова в написании ...,Эль-Регистан,СССР,http://www.wikidata.org/entity/Q15180,гимн,http://www.wikidata.org/prop/direct/P85,автор либретто,http://www.wikidata.org/prop/direct/P87,http://www.wikidata.org/entity/Q453447
2,Как называется мера счета мелких галантерейных...,Гросс,гросс,http://www.wikidata.org/entity/Q1547498,предположительно одно и то же с,http://www.wikidata.org/prop/direct/P460,предположительно одно и то же с,http://www.wikidata.org/prop/direct/P460,http://www.wikidata.org/entity/Q1547498
3,Какую болезнь в России когда-то называли инфлю...,Грипп,болезнь,http://www.wikidata.org/entity/Q12136,свойства у этого класса,http://www.wikidata.org/prop/direct/P1963,пример использования свойства,http://www.wikidata.org/prop/direct/P1855,http://www.wikidata.org/entity/Q2840
4,Какой голос у великого Лучано Паваротти?,Тенор,Лучано Паваротти,http://www.wikidata.org/entity/Q37615,певческий голос,http://www.wikidata.org/prop/direct/P412,подкласс от,http://www.wikidata.org/prop/direct/P279,http://www.wikidata.org/entity/Q27914


In [125]:
sep = '-----'
with open('2hop_editor.txt', 'w') as ouf:
    for idx, row in data.iterrows():
        qe_uri = row['qe_link']
        prop_uri_1 = row['prop_1_link']
        prop_uri_2 = row['prop_2_link']
        ans_uri = row['ans_link']
        print(idx, file=ouf)
        print(row['q'], file=ouf)
        print(row['a'], file=ouf)
        print(f'{qe_uri} [ {row["qe_name"]} ] ->', file=ouf)
        print(f'{prop_uri_1} {{ {row["prop_1_name"]} }} ->', file=ouf)
        print(f'{prop_uri_2} {{ {row["prop_2_name"]} }} ->', file=ouf)
        print(row['ans_link'], file=ouf)
        print(build_initial_sparql_query_for_2hop_question(qe_uri, prop_uri_1, prop_uri_2), file=ouf)
        print('edited=0', file=ouf)
        print(sep, file=ouf)

### Parse and tag questions

In [18]:
def get_query_tags(ql):
    tags = []
    if 'COUNT' in ql[0]:
        tags.append('count')
    if 'LIMIT' in ql[-1] and 'ORDER' in ql[-2]:
        tags.append('ranking')
        ql = ql[:-2]
    ql = ql[2:-2]
    if 'FILTER' in ql[-1] and 'not in' in ql[-1]:
        tags.append('exclusive')
        ql = ql[:-1]
    if ql[0].split()[0] == '?answer':
        tags.append('reverse')
    if len(ql) == 1:
        if 'VALUES' in ql[0]:
            tags.append('0-hop')
            return tags
        if '[' in ql[0]:
            clause = re.findall(r'\[(.*)\]', ql[0])[0]
            if ';' not in clause:
                tags.append('multi-hop')
            else:
                clauses = clause.split(';')
                if '?answer' in clause[1]:
                    tags.append('qualifier-answer')
                else:
                    tags.append('qualifier-constraint')
        else:
            tags.append('1-hop')
    else:
        tags.append('multi-constraint')
    return tags

In [19]:
def get_query_entities(ql):
    ent_set = set()
    for line in ql:
        ent_set.update(set(re.findall(r'Q\d+', line)))
    return ['http://www.wikidata.org/entity/' + ent for ent in ent_set]

In [45]:
def get_query_props(ql):
    prop_set = set()
    for line in ql:
        prop_set.update(set(re.findall(r'P\d+', line)))
        if 'skos:altLabel' in line:
            prop_set.add('altLabel')
        if 'rdfs:label' in line:
            prop_set.add('label')
    return list(prop_set)

In [301]:
edited_1hop_qs = []
cnt = 0
with open('1hop_editor.txt', 'r') as inf:
    lines = [line.strip('\n') for line in inf]
    ptr = 0
    while ptr < len(lines):
        cur_lines = []
        while lines[ptr] != sep:
            cur_lines.append(lines[ptr])
            ptr += 1
        ptr += 1
        q = cur_lines[1]
        a = cur_lines[2]
        query_lines = [line.strip() for line in cur_lines[6:-1]]
        entry = DatasetEntry(
            uid=(2000 + cnt),
            question_text=q,
            answer_text=a,
            question_uris=get_query_entities(query_lines),
            question_props=get_query_props(query_lines),
            answer_uris=[cur_lines[5]],
            tags=get_query_tags(query_lines)
        )
        query = '\n'.join(cur_lines[6:-1])
        query = query.replace(' ?answerLabel', '')
        query = query.replace('  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],ru". }\n', '')
        entry.set_query(query)
        assert(entry.validate())
        edited_1hop_qs.append(entry)
        cnt += 1 
print(cnt)
json_dump(edited_1hop_qs, 'dataset_1hop_edited.json')

124


In [302]:
edited_2hop_qs = []
cnt = 0
with open('2hop_editor.txt', 'r') as inf:
    lines = [line.strip('\n') for line in inf]
    ptr = 0
    while ptr < len(lines):
        cur_lines = []
        while lines[ptr] != sep:
            cur_lines.append(lines[ptr])
            ptr += 1
        ptr += 1
        q = cur_lines[1]
        a = cur_lines[2]
        query_lines = [line.strip() for line in cur_lines[7:-1]]
        entry = DatasetEntry(
            uid=(3000 + cnt),
            question_text=q,
            answer_text=a,
            question_uris=get_query_entities(query_lines),
            question_props=get_query_props(query_lines),
            answer_uris=[cur_lines[6]],
            tags=get_query_tags(query_lines)
        )
        query = '\n'.join(cur_lines[7:-1])
        query = query.replace(' ?answerLabel', '')
        query = query.replace('  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],ru". }\n', '')
        entry.set_query(query)
        assert(entry.validate())
        edited_2hop_qs.append(entry)
        cnt += 1 
print(cnt)
json_dump(edited_2hop_qs, 'dataset_2hop_edited.json')

149


In [154]:
sep = '-----'
edited_count_qs = []
cnt = 0
with open('count_editor.txt', 'r') as inf:
    lines = [line.strip('\n') for line in inf]
    ptr = 0
    while ptr < len(lines):
        cur_lines = []
        while lines[ptr] != sep:
            cur_lines.append(lines[ptr])
            ptr += 1
        ptr += 1
        q = cur_lines[1]
        a = cur_lines[2]
        query_lines = [line.strip() for line in cur_lines[7:-1]]
        entry = DatasetEntry(
            uid=(4000 + cnt),
            question_text=q,
            answer_text=a,
            question_uris=get_query_entities(query_lines),
            question_props=get_query_props(query_lines),
            answer_uris=[cur_lines[6]],
            tags=get_query_tags(query_lines)
        )
        query = '\n'.join(cur_lines[7:-1])
        query = query.replace(' ?answerLabel', '')
        query = query.replace('  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],ru". }\n', '')
        entry.set_query(query)
        assert(entry.validate())
        edited_count_qs.append(entry)
        cnt += 1 
print(cnt)
json_dump(edited_count_qs, 'dataset_count_edited.json')

22


### Merge

In [256]:
dataset = json_read('dataset_1hop_fully_correct.json') + \
          json_read('dataset_2hop_fully_correct.json') + \
          json_read('dataset_1hop_edited.json') + \
          json_read('dataset_2hop_edited.json')
json_dump(dataset, 'kbqa_russian_dataset.json')

### Editing queries

In [146]:
dataset = json_read('kbqa_russian_dataset.json')

In [147]:
editions = {}
with open('answers_check.txt', 'r') as inf:
    lines = [line.strip('\n') for line in inf]
    query_lines = []
    for line in lines:
        if line != sep:
            query_lines.append(line)
        else:
            uid = query_lines[0]
            query_lines = query_lines[1:]
            tags = get_query_tags(query_lines)
            qe_uris = get_query_entities(query_lines)
            prop_uris = get_query_props(query_lines)
            editions[uid] = {}
            editions[uid]['qe_uris'] = qe_uris
            editions[uid]['prop_uris'] = prop_uris
            editions[uid]['tags'] = tags
            query = '\n'.join(query_lines)
            query = query.replace(' ?answerLabel', '')
            query = query.replace('  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],ru". }\n', '')
            editions[uid]['query'] = query
            print(uid)
            print(query)
            print(qe_uris)
            print(prop_uris)
            print(tags)
            query_lines = []

FileNotFoundError: [Errno 2] No such file or directory: 'answers_check_count.txt'

In [364]:
editions['468']

{'qe_uris': ['http://www.wikidata.org/entity/Q165',
  'http://www.wikidata.org/entity/Q29'],
 'prop_uris': ['P206', 'P31', 'P279'],
 'tags': ['multi-constraint'],
 'query': 'SELECT ?answer \nWHERE {\n  wd:Q29 wdt:P206 ?answer .\n  ?answer wdt:P31/wdt:P279* wd:Q165 \n}'}

In [365]:
for entry in dataset:
    uid = str(entry['uid'])
    if uid in editions:
        new_e = editions[uid]
        entry['query'] = new_e['query']
        entry['question_uris'] = new_e['qe_uris']
        entry['question_props'] = new_e['prop_uris']
        entry['tags'] = new_e['tags']

In [366]:
dataset[455]

{'uid': 468,
 'question_text': 'Какое море омывает берега Испании?',
 'query': 'SELECT ?answer \nWHERE {\n  wd:Q29 wdt:P206 ?answer .\n  ?answer wdt:P31/wdt:P279* wd:Q165 \n}',
 'answer_text': 'Средиземное',
 'question_uris': ['http://www.wikidata.org/entity/Q165',
  'http://www.wikidata.org/entity/Q29'],
 'question_props': ['P206', 'P31', 'P279'],
 'answer_uris': ['http://www.wikidata.org/entity/Q4918'],
 'tags': ['multi-constraint']}

In [367]:
json_dump(dataset, 'kbqa_russian_dataset_edited.json')

### Get actual answers by queries

In [95]:
dataset = json_read('kbqa_russian_dataset.json')

In [70]:
len(dataset)

1500

In [65]:
return_sparql_query_results(dataset[0]['query'])['results']['bindings']

[{'answer': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q7944'}},
 {'answer': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q60186'}},
 {'answer': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/Q167903'}},
 {'answer': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/Q2580904'}},
 {'answer': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/Q5975740'}},
 {'answer': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/Q7692360'}}]

In [66]:
actual_answers = {}

In [71]:
for entry in tqdm(dataset[1055:1200]):
    sleep(0.75)
    uid = entry['uid']
    query = entry['query']
    response = requests.get('https://query.wikidata.org/sparql', params={'format': 'json', 'query': query})
    to_sleep = 3
    while response.status_code == 429:
        sleep(to_sleep)
        to_sleep += 2
        response = requests.get('https://query.wikidata.org/sparql', params={'format': 'json', 'query': query})
        print('slept')
    answers = response.json()['results']['bindings']
    actual_answers[uid] = answers

HBox(children=(IntProgress(value=0, max=145), HTML(value='')))

In [91]:
print(json.dumps([ans['answer'] for ans in actual_answers[2105]], indent=4, ensure_ascii=False))

[
    {
        "xml:lang": "ru",
        "type": "literal",
        "value": "CO"
    },
    {
        "xml:lang": "ru",
        "type": "literal",
        "value": "Carbon monoxide"
    },
    {
        "xml:lang": "ru",
        "type": "literal",
        "value": "Моноксид углерода"
    },
    {
        "xml:lang": "ru",
        "type": "literal",
        "value": "окись углерода"
    },
    {
        "xml:lang": "ru",
        "type": "literal",
        "value": "оксид углерода (II)"
    },
    {
        "xml:lang": "ru",
        "type": "literal",
        "value": "Оксид углерода(II)"
    },
    {
        "xml:lang": "ru",
        "type": "literal",
        "value": "Равновесие Будуара"
    },
    {
        "xml:lang": "ru",
        "type": "literal",
        "value": "угарный газ"
    },
    {
        "xml:lang": "ru",
        "type": "literal",
        "value": "Углерода окись"
    }
]


In [97]:
sep = '-----'
with open('final_check.txt', 'w') as ouf:
    for entry in dataset[:1200]:
        uid = entry['uid']
        ans_uris = sorted([ans['value'] for ans in entry['answers']])
        act_ans_uris = [m['answer']['value'] for m in actual_answers[uid]]
        act_ans_uris = sorted(list(set(act_ans_uris)))
        query = entry['query']
        query = query.replace('SELECT ?answer', 'SELECT ?answer ?answerLabel')
        query = query.replace('\n}', '\n  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],ru". }\n}')
        if len(ans_uris) != len(act_ans_uris):
            print(uid, file=ouf)
            print(entry['question_text'], file=ouf)
            print(entry['question_uris'], file=ouf)
            print(ans_uris, file=ouf)
            print(act_ans_uris, file=ouf)
            print(query, file=ouf)
            print(sep, file=ouf)
        else:
            for i in range(len(ans_uris)):
                if ans_uris[i] != act_ans_uris[i]:
                    print(uid, file=ouf)
                    print(entry['question_text'], file=ouf)
                    print(entry['question_uris'], file=ouf)
                    print(ans_uris, file=ouf)
                    print(act_ans_uris, file=ouf)
                    print(query, file=ouf)
                    print(sep, file=ouf)
                    break

In [155]:
actual_answers

{4000: [{'answer': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
    'type': 'literal',
    'value': '116'}}],
 4001: [{'answer': {'datatype': 'http://www.w3.org/2001/XMLSchema#decimal',
    'type': 'literal',
    'value': '33'}}],
 4002: [{'answer': {'datatype': 'http://www.w3.org/2001/XMLSchema#decimal',
    'type': 'literal',
    'value': '12'}}],
 4003: [{'answer': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
    'type': 'literal',
    'value': '2'}}],
 4004: [{'answer': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
    'type': 'literal',
    'value': '4'}}],
 4005: [{'answer': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
    'type': 'literal',
    'value': '300'}}],
 4006: [{'answer': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
    'type': 'literal',
    'value': '6'}}],
 4007: [{'answer': {'datatype': 'http://www.w3.org/2001/XMLSchema#decimal',
    'type': 'literal',
    'value': '13'}}],
 4008: [{'answer': {'datatype': '

In [156]:
for entry in dataset:
    uid = entry['uid']
    cur_answers = [e['answer'] for e in actual_answers[uid]]
    print(uid, cur_answers)
    tags = entry['tags']
    del entry['tags']
    entry['answers'] = cur_answers
    entry['tags'] = tags

4000 [{'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '116'}]
4001 [{'datatype': 'http://www.w3.org/2001/XMLSchema#decimal', 'type': 'literal', 'value': '33'}]
4002 [{'datatype': 'http://www.w3.org/2001/XMLSchema#decimal', 'type': 'literal', 'value': '12'}]
4003 [{'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '2'}]
4004 [{'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '4'}]
4005 [{'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '300'}]
4006 [{'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '6'}]
4007 [{'datatype': 'http://www.w3.org/2001/XMLSchema#decimal', 'type': 'literal', 'value': '13'}]
4008 [{'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '1724'}]
4009 [{'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '6'}]
4010 [{'type': 'uri'

In [159]:
dataset

[{'uid': 4000,
  'question_text': 'Сколько длилась столетняя война?',
  'query': 'SELECT ?answer\nWHERE {\n  wd:Q12551 wdt:P580 ?begin .\n  wd:Q12551 wdt:P582 ?end .\n  BIND (YEAR(?end) - YEAR(?begin) AS ?answer) .\n}',
  'answer_text': '116',
  'question_uris': ['http://www.wikidata.org/entity/Q12551'],
  'question_props': ['P580', 'P582'],
  'answers': [{'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
    'type': 'literal',
    'value': '116'}],
  'tags': ['multi-constraint']},
 {'uid': 4001,
  'question_text': 'Сколько всего букв в русском языке?',
  'query': 'SELECT ?answer\nWHERE {\n  wd:Q187846 wdt:P1114 ?answer\n}',
  'answer_text': '33',
  'question_uris': ['http://www.wikidata.org/entity/Q187846'],
  'question_props': ['P1114'],
  'answers': [{'datatype': 'http://www.w3.org/2001/XMLSchema#decimal',
    'type': 'literal',
    'value': '33'}],
  'tags': ['1-hop']},
 {'uid': 4002,
  'question_text': 'Сколько серий в фильме "Семнадцать мгновений весны"?',
  'query': 'SELEC

In [420]:
dataset = json_read('kbqa_russian_dataset.json')

In [421]:
tagset = set()
for entry in dataset:
    for tag in entry['tags']:
        tagset.add(tag)

In [422]:
json_dump(list(tagset), 'tagset.json')

In [424]:
dataset = json_read('kbqa_russian_dataset.json')

In [429]:
props = defaultdict(int)
for entry in dataset:
    for p in entry['question_props']:
        props[p] += 1

In [432]:
props = {k: v for k, v in sorted(props.items(), reverse=True, key=lambda x: x[1])}

In [440]:
props

{'P17': 144,
 'P50': 74,
 'P36': 61,
 'P131': 60,
 'P31': 57,
 'P276': 39,
 'P19': 39,
 'P279': 33,
 'P170': 32,
 'P106': 25,
 'P57': 24,
 'P1346': 24,
 'P641': 22,
 'P86': 22,
 'P159': 21,
 'P27': 19,
 'P527': 19,
 'P175': 18,
 'P361': 17,
 'P206': 17,
 'P495': 16,
 'P26': 16,
 'P735': 16,
 'P112': 15,
 'P61': 15,
 'P38': 12,
 'P161': 12,
 'P22': 11,
 'P40': 11,
 'P20': 11,
 'P840': 9,
 'P21': 9,
 'P180': 9,
 'P1303': 8,
 'P1441': 7,
 'P155': 7,
 'P403': 7,
 'P101': 7,
 'P144': 7,
 'P585': 7,
 'P453': 7,
 'P580': 7,
 'P25': 6,
 'P53': 6,
 'P460': 6,
 'P676': 6,
 'P706': 6,
 'P610': 6,
 'P54': 6,
 'P412': 6,
 'P127': 6,
 'P138': 6,
 'P39': 6,
 'P84': 5,
 'P800': 5,
 'P186': 5,
 'P2416': 5,
 'P97': 5,
 'P2579': 5,
 'P37': 5,
 'P156': 5,
 'P734': 5,
 'P1545': 5,
 'P582': 5,
 'P569': 5,
 'P157': 4,
 'P551': 4,
 'P837': 4,
 'P30': 4,
 'P59': 4,
 'P413': 4,
 'P136': 4,
 'P149': 4,
 'P1376': 4,
 'P509': 4,
 'P166': 4,
 'P793': 4,
 'P674': 4,
 'P1448': 4,
 'P1477': 4,
 'P1880': 3,
 'P171': 3,

In [436]:
len([k for k, v in props.items() if v == 1])

69

### Sample for Sveta

In [13]:
dataset = json_read('kbqa_russian_dataset.json')

In [14]:
import random as rnd

In [66]:
ids = rnd.sample(range(0, 800), 10)

In [114]:
for entry in dataset:
    if entry['uid'] in ids:
        if len(entry['tags']) == 1 and entry['tags'][0] == '1-hop':
            pass
        print(entry)
        samples.append({
            'question': entry['question_text'],
            'query': entry['query'],
            'answers': entry['answers'],
            'translation': ''
        })

{'uid': 159, 'question_text': 'Кто автор сказки "Бременские музыканты"?', 'query': 'SELECT ?answer \nWHERE {\n  wd:Q112015 wdt:P50 ?answer\n}', 'answer_text': 'Братья Гримм', 'question_uris': ['http://www.wikidata.org/entity/Q112015'], 'question_props': ['P50'], 'answers': [{'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q2793'}], 'tags': ['1-hop']}


In [130]:
json_dump(samples, 'samples_20.json')

In [122]:
trs = '''Which artist was criticized by French high society for the scandalous "Portrait of Madame X"?
Who wrote the novel "The phantom of the Opera"?
In which sport did Ekaterina Ilyukhina win Russia's first Olympic medal in 2010?
What period in the history of Japan followed immediately after the Heian period?
Who wrote the novel and play of the same name "the Lady with camellias"?
What musical instrument does Carlos Santana play?
Which city was the capital of the Orange Free State?
Who painted the portrait of Colonel Evgraf Davydov of the life guards of the hussar regiment, which has long been considered an image of Denis Davydov?
Which mountain is the highest point in Poland?
Which country hosted the XXI Olympic games
What is the name of the husband's sister?
Which actress was married to Luc Besson?
The birthplace of the singer Shakira?
In what musical direction did Ella Fitzgerald and Billie holiday become famous?
Who from participants of group "Beatles" was older than the other three?
In what country was Nicolaus Copernicus born?
What, according to Ohm's law, will happen if the voltage is divided by the resistance?
Who played Bormann in the popular television series " Seventeen moments of spring"
Some of the African players won the "Golden ball"?
Who is the author of the fairy tale "Bremen town musicians"?'''

In [127]:
trs = trs.split('\n')

In [128]:
for i, s in enumerate(samples):
    s['translation'] = trs[i]

In [129]:
samples

[{'question': 'Какой художник был подвергнут критике французского высшего общества за скандальный "Портрет мадам Икс"?',
  'query': 'SELECT ?answer \nWHERE {\n  wd:Q2664039 wdt:P170 ?answer\n}',
  'answers': [{'type': 'uri',
    'value': 'http://www.wikidata.org/entity/Q155626'}],
  'translation': 'Which artist was criticized by French high society for the scandalous "Portrait of Madame X"?'},
 {'question': 'Кто написал роман "Призрак оперы"?',
  'query': 'SELECT ?answer \nWHERE {\n  wd:Q272506 wdt:P50 ?answer\n}',
  'answers': [{'type': 'uri',
    'value': 'http://www.wikidata.org/entity/Q298827'}],
  'translation': 'Who wrote the novel "The phantom of the Opera"?'},
 {'question': 'В каком виде спорта первую для России олимпийскую медаль выиграла в 2010 году Екатерина Илюхина?',
  'query': 'SELECT ?answer \nWHERE {\n  wd:Q2153574 wdt:P641 ?answer\n}',
  'answers': [{'type': 'uri',
    'value': 'http://www.wikidata.org/entity/Q178131'}],
  'translation': "In which sport did Ekaterina I

### Translation

In [164]:
dataset = json_read('kbqa_russian_dataset.json')

In [166]:
with open('question_text_only.txt', 'w') as ouf:
    for entry in dataset:
        q = entry['question_text']
        print(q, file=ouf)

In [168]:
with open('question_text_only_eng.txt') as eng:
    translation = {}
    q_eng = [line.strip() for line in eng]
    for i, entry in enumerate(dataset):
        uid = entry['uid']
        translation[uid] = q_eng[i]
    json_dump(translation, 'questions_translation.json')

In [169]:
dataset = json_read('kbqa_russian_dataset.json')
translation = json_read('questions_translation.json')

In [170]:
for entry in dataset:
    uid = entry['uid']
    q_trans = translation[str(uid)]
    entry['question_eng'] = q_trans

In [172]:
json_dump(dataset, 'dataset_with_trans.json')

### Questions with no answer

In [183]:
noc_qs = json_read('/Users/ne0n/itmo-jb/kbqa/kbqa-russian-dataset/chain generation/no_chain_questions.json')

In [184]:
noc_qs_ftd = []
for q in noc_qs:
    if 'назыв' not in q:
        noc_qs_ftd.append(q)

In [185]:
noc_qs = {'q': noc_qs_ftd}

In [186]:
noc_qs_df = pd.DataFrame(noc_qs)

In [187]:
noc_qs_df.to_csv('/Users/ne0n/itmo-jb/kbqa/kbqa-russian-dataset/chain generation/no_chain_questions_ftd.csv', index=False)

In [196]:
no_answer_qs = pd.read_csv('no_chain_questions.csv')
no_answer_qs = list(no_answer_q.dropna()['q'].values)

In [201]:
json_dump(no_answer_qs, 'dataset_no_answer.json')

In [204]:
with open('qna_trans.txt') as inf:
    no_answer_qs_trans = [line.strip() for line in inf]

In [209]:
qa_map = {}
with open('/Users/ne0n/itmo-jb/kbqa/kbqa-russian-dataset/entity retrieval/quiz_dataset.txt') as inf:
    for i, line in enumerate(inf):
        t = line.strip()
        if i % 2 == 0:
            q = t
        else:
            qa_map[q] = t

In [202]:
dataset = json_read('kbqa_russian_dataset.json')

In [214]:
for i, q in enumerate(no_answer_qs):
    entry = {
        'uid': 5000 + i,
        'question_text': q,
        'query': None,
        'answer_text': qa_map[q],
        'question_uris': None,
        'question_props': None,
        'answers': [],
        'tags': ["no-answer"],
        'question_eng': no_answer_qs_trans[i]
    }
    dataset.append(entry)

In [216]:
json_dump(dataset, 'kbqa_russian_dataset_qna.json')

### Dev/Test split

In [8]:
dataset = json_read('kbqa_russian_dataset.json')

In [12]:
dev_uids = []
with open('dev_ids.txt') as inf:
    for line in inf:
        dev_uids.append(int(line.strip()))

In [13]:
json_dump(dev_uids, 'dev_uids.json')

In [14]:
test_uids = []
for entry in dataset:
    uid = entry['uid']
    if uid not in dev_uids:
        test_uids.append(uid)

In [17]:
json_dump(test_uids, 'test_uids.json')

### Get more precise relations

In [111]:
dataset = json_read('kbqa_russian_dataset.json')

In [116]:
for entry in dataset[:1200]:
    query = entry['query']
    props = entry['question_props']
    prec_props = []
    prec_props.extend(re.findall(r'wdt:P\d+', query))
    prec_props.extend(re.findall(r'p:P\d+', query))
    prec_props.extend(re.findall(r'p[s|q]:P\d+', query))
    prec_props.extend(re.findall(r'rdfs:label', query))
    prec_props.extend(re.findall(r'skos:altLabel', query))
    prec_props = list(set(prec_props))
    
    prec_props_ids = set([p.split(':')[-1] for p in prec_props])
    
    entry['question_props'] = prec_props

In [117]:
dataset[1000:1100]

[{'uid': 2100,
  'question_text': 'Какая река образуется при слиянии рек Катуни и Бии?',
  'query': 'SELECT ?answer\nWHERE {\n  wd:Q723864 wdt:P403 ?answer .\n  wd:Q859934 wdt:P403 ?answer\n}',
  'answer_text': 'Обь',
  'question_uris': ['http://www.wikidata.org/entity/Q723864',
   'http://www.wikidata.org/entity/Q859934'],
  'question_props': ['wdt:P403'],
  'answers': [{'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q973'}],
  'tags': ['multi-constraint'],
  'question_eng': 'What river is formed at the confluence of the katuni and Biya rivers?'},
 {'uid': 2101,
  'question_text': 'Какая актриса исполнила роль черепахи Тортиллы в советском фильме 1975 года «Приключения Буратино»?',
  'query': 'SELECT ?answer\nWHERE {\n  wd:Q3739484 p:P161 [ ps:P161 ?answer; pq:P453 wd:Q15642621 ]\n}',
  'answer_text': 'Рина Зелёная',
  'question_uris': ['http://www.wikidata.org/entity/Q15642621',
   'http://www.wikidata.org/entity/Q3739484'],
  'question_props': ['p:P161', 'pq:P453', 'ps:P161

In [118]:
json_dump(dataset, 'kbqa_russian_dataset.json')

### Find mistakes

In [34]:
def get_query_tags(ql):
    tags = []
    if 'COUNT' in ql[0]:
        tags.append('count')
    if 'LIMIT' in ql[-1] and 'ORDER' in ql[-2]:
        tags.append('ranking')
        ql = ql[:-2]
    else:
        ql = ql[:-1]
    ql = ql[2:]
    if 'FILTER' in ql[-1] and 'not in' in ql[-1]:
        tags.append('exclusion')
        ql = ql[:-1]
    if ql[0].split()[0] == '?answer':
        tags.append('reverse')
    if len(ql) == 1:
        if 'VALUES' in ql[0]:
            tags.append('0-hop')
            return tags
        if '[' in ql[0]:
            clause = re.findall(r'\[(.*)\]', ql[0])[0]
            if ';' not in clause:
                tags.append('multi-hop')
            else:
                clauses = clause.split(';')
                if '?answer' in clause[1]:
                    tags.append('qualifier-answer')
                else:
                    tags.append('qualifier-constraint')
        else:
            tags.append('1-hop')
    else:
        tags.append('multi-constraint')
    return tags

In [107]:
dataset = json_read('kbqa_russian_dataset.json')

In [52]:
# check tags

for entry in dataset[:1200]:
    uid = entry['uid']
    query = entry['query']
    q_uris = entry['question_uris']
    q_props = entry['question_props']
    tags = entry['tags']
    ql = query.split('\n')
    
    pred_tags = get_query_tags(ql)
    if sorted(tags) != sorted(pred_tags):
        print(uid)
        print(sorted(tags))
        print(sorted(pred_tags))
        print()

1012
['qualifier-answer']
['multi-hop']

1030
['qualifier-answer']
['multi-hop']

2024
['qualifier-constraint', 'ranking']
['multi-constraint', 'ranking']

2034
['qualifier-constraint', 'ranking']
['multi-constraint', 'ranking']

2051
['1-hop']
['multi-constraint']

2079
['qualifier-constraint', 'ranking']
['multi-constraint', 'ranking']

2119
['qualifier-constraint', 'ranking']
['multi-constraint', 'ranking']

3032
['1-hop']
['multi-constraint']

3033
['qualifier-constraint', 'ranking']
['multi-constraint', 'ranking']

3043
['1-hop']
['multi-constraint']

3073
['1-hop']
['multi-constraint']

3098
['1-hop']
['multi-constraint']

3104
['1-hop']
['multi-constraint']

3113
['multi-constraint', 'reverse']
['1-hop', 'reverse']

3118
['multi-constraint', 'qualifier-answer']
['qualifier-constraint']

3125
['multi-constraint', 'qualifier-answer']
['qualifier-constraint']

3133
['qualifier-constraint', 'ranking']
['multi-constraint', 'ranking']

3144
['multi-constraint', 'qualifier-answer']
['m

In [108]:
# check question uris

for entry in dataset[:1200]:
    uid = entry['uid']
    query = entry['query']
    q_uris = entry['question_uris']
    q_props = entry['question_props']
    tags = entry['tags']
    ql = query.split('\n')
    
    pred_q_uris = get_query_entities(ql)
    if sorted(q_uris) != sorted(pred_q_uris):
        print(uid)
        print(sorted(q_uris))
        print(sorted(pred_q_uris))
        print()
        continue
        
    for uri in q_uris:
        qid = uri.split('/')[-1]
        if qid not in query:
            print(uid)
            print(sorted(q_uris))
            print(sorted(pred_q_uris))
            print()

In [109]:
# check question props

for entry in dataset[:1200]:
    uid = entry['uid']
    query = entry['query']
    q_uris = entry['question_uris']
    q_props = entry['question_props']
    tags = entry['tags']
    ql = query.split('\n')
    
    pred_q_props = get_query_props(ql)
    if sorted(q_props) != sorted(pred_q_props):
        print(uid)
        print(sorted(q_props))
        print(sorted(pred_q_props))
        print()
        continue
        
    for prop in q_props:
        if prop not in query:
            print(uid)
            print(sorted(q_props))
            print(sorted(pred_q_props))
            print()

In [110]:
# duplicative answers

for entry in dataset[:1200]:
    answers = [ans['value'] for ans in entry['answers']]
    if len(answers) != len(set(answers)):
        print(entry['uid'])

### Create files for dev/test

In [120]:
test_uids = json_read('test_uids.json')
dev_uids = json_read('dev_uids.json')
dataset = json_read('kbqa_russian_dataset.json')

In [121]:
dataset_test = []
dataset_dev = []

In [122]:
for entry in dataset:
    if entry['uid'] in test_uids:
        dataset_test.append(entry)
    elif entry['uid'] in dev_uids:
        dataset_dev.append(entry)
    else:
        print('WTF')

In [123]:
print(len(dataset_test))
print(len(dataset_dev))

1200
300


In [124]:
json_dump(dataset_test, 'RuBQ_test.json')
json_dump(dataset_dev, 'RuBQ_dev.json')