In [3]:
%config IPCompleter.greedy=True
import re
import json
import csv
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from time import time

### Prepare Toloka tasks

#### Read matches data

In [150]:
# Load the list of items with their candidate matches.
# The encoding is pinned to UTF-8 so that reading does not depend on the
# platform's default locale (this file is rewritten later in the notebook
# with ensure_ascii=False, i.e. it may contain raw non-ASCII characters).
with open('matches.json', 'r', encoding='utf-8') as inf:
    items = json.load(inf)

In [100]:
# Collect every line with an even 0-based index from the quiz file as a
# question (trailing newline removed); lines with an odd index are skipped.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale (assumes the dataset is UTF-8 — TODO confirm).
qs = []
with open('quiz_dataset.txt', 'r', encoding='utf-8') as inf:
    for i, line in enumerate(inf):
        if i % 2 == 0:
            qs.append(line.strip('\n'))

In [101]:
# Give each item the question found at the same index in `qs`
# and a 1-based position id.
for idx, entry in enumerate(items):
    entry['question'] = qs[idx]
    entry['pos_id'] = idx + 1

In [148]:
# Remove the answer from the question text when it appears there
# verbatim, wrapped in parentheses.
for item in items:
    answer_in_parens = '({})'.format(item['text'])
    if answer_in_parens in item['question']:
        item['question'] = item['question'].replace(answer_in_parens, '')

In [149]:
# Persist the enriched items back to matches.json.
# ensure_ascii=False writes non-ASCII characters as-is, so the file must be
# opened with an encoding that can represent them; UTF-8 is pinned instead
# of relying on the platform default.
with open('matches.json', 'w', encoding='utf-8') as ouf:
    json.dump(items, ouf, ensure_ascii=False, indent=4)

#### Filter questions

In [145]:
def is_number(s):
    """Return True if ``s`` parses as a float, accepting ',' as decimal separator.

    Every ',' in ``s`` is replaced with '.' before parsing. Values that cannot
    be parsed (non-numeric text, or objects without a usable ``replace``
    method, such as None) yield False.
    """
    try:
        float(s.replace(',', '.'))
    except (AttributeError, TypeError, ValueError):
        # Only "this is not a number" failures are swallowed; a bare
        # ``except`` would also hide KeyboardInterrupt and SystemExit.
        return False
    return True

def has_multichoice(q):
    """Tell whether the question text contains one of the multiple-choice marker phrases.

    The check is a plain, case-sensitive substring search.
    """
    markers = ('из этого', 'из перечисл', 'из этих', 'из следующ')
    return any(marker in q for marker in markers)

def has_answer_in_question(q, a):
    """Tell whether the answer ``a``, wrapped in parentheses, occurs in question ``q``."""
    needle = '({})'.format(a)
    return needle in q

In [139]:
def filter_numbers_in_answer(item):
    """Return True when the item should be kept, i.e. its answer text is not a number."""
    answer = item['text']
    return not is_number(answer)

def filter_multichoice_in_question(item):
    """Return True when the item should be kept, i.e. its question is not multiple-choice."""
    question = item['question']
    return not has_multichoice(question)

In [154]:
# Keep only the items that pass both filters: the answer is not a number
# and the question is not a multiple-choice one.
items_filtered = [
    item
    for item in items
    if filter_numbers_in_answer(item) and filter_multichoice_in_question(item)
]

#### Create record from data

In [169]:
def match_to_task(match, i):
    """Build the option record for one candidate match.

    Args:
        match: dict with the keys 'name', 'description', 'ruwiki' and 'qid'.
        i: option number; stored as-is under 'value' and as a string under
            'hotkey'.

    Returns:
        dict with the keys 'text', 'link_text', 'url', 'value' and 'hotkey'
        (inserted in that order). In 'text' and 'url' every double quote is
        prefixed with a backslash and every comma is replaced by
        backslash + comma.
    """
    max_desc_len = 60  # longer descriptions are cut and suffixed with '...'

    name = match['name'] if match['name'] else '???'
    desc = match['description']
    if desc:
        if len(desc) > max_desc_len:
            # Leave room for the three dots so the result is exactly max_desc_len long.
            desc = desc[:max_desc_len - 3] + '...'
        name += f' ({desc})'
    # NOTE: '\\,' is an explicit backslash + comma. The previous spelling '\,'
    # produced the same value but is an invalid escape sequence that newer
    # Python versions warn about.
    # Presumably the escaping is required by the task upload format — TODO confirm.
    name = name.replace('"', '\\"').replace(',', '\\,')

    if match['ruwiki']:
        # Prefer the Russian Wikipedia article when there is one.
        page = match['ruwiki'].replace(' ', '_')
        url = f'https://ru.wikipedia.org/wiki/{page}'
        link_text = 'Википедия'
    else:
        # Otherwise fall back to the Wikidata entity page.
        page = match['qid']
        url = f'https://www.wikidata.org/wiki/{page}'
        link_text = 'Викиданные'
    url = url.replace(',', '\\,').replace('"', '\\"')

    # Key order matters: the caller serializes the items in insertion order.
    return {
        'text': name,
        'link_text': link_text,
        'url': url,
        'value': i,
        'hotkey': str(i),
    }

def data_to_task(data):
    """Turn one item into a row for the tasks TSV file.

    Args:
        data: dict with the keys 'question', 'text' (the answer) and
            'matches' (an iterable of match dicts accepted by match_to_task).

    Returns:
        A list of five strings: the question, the answer, the serialized
        matches, and two empty strings for the golden and hint columns.
    """
    question = data['question']
    answer = data['text']

    serialized_matches = []
    # Options are numbered starting from 1.
    for number, match in enumerate(data['matches'], start=1):
        option = match_to_task(match, number)
        # Fields inside one option are separated by backslash + comma, while
        # whole options are separated by a plain comma. '\\,' is the explicit
        # spelling of what used to be written as the invalid escape '\,'.
        fields = '\\,'.join(f'"{key}":"{value}"' for key, value in option.items())
        serialized_matches.append('{' + fields + '}')

    golden = ''
    hint = ''
    return [question, answer, ','.join(serialized_matches), golden, hint]

#### Create TSV file

In [170]:
header = ['INPUT:question', 'INPUT:answer', 'INPUT:matches', 'GOLDEN:match', 'HINT:text']

# newline='' is what the csv module asks for when a file is handed to
# csv.writer: it stops the text layer from translating the writer's own line
# terminator on platforms where '\n' would otherwise become '\r\n'.
# The encoding is pinned to UTF-8 because the rows contain non-ASCII text.
with open('task.tsv', 'w', encoding='utf-8', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n', quotechar='"')
    writer.writerow(header)
    # Only the first 10 filtered items are exported.
    for item in items_filtered[0:10]:
        writer.writerow(data_to_task(item))

In [162]:
# Scratch check: slicing past the end of a string returns the whole string without raising.
'hfkdwj'[:10]

'hfkdwj'