In [1]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time

In [4]:
# Load the label file into memory, grouping labels by their qid.
# Each line is split as "<qid>:<label>".
with open('labels_new_v3.txt', 'r') as inf:
    labels = defaultdict(list)
    for line in tqdm(inf):
        # Split on the first ':' only, so a label may itself contain colons.
        qid, label = line.strip('\n').split(':', 1)
        labels[qid].append(label)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [2]:
class Timer:
    """Context manager that prints the wall-clock time spent inside its ``with`` body.

    Args:
        job_name: optional label. When truthy the report reads
            ``"<job_name> executed in <seconds> s."``; otherwise only
            ``"<seconds> s."`` is printed.

    Attributes:
        start_time: ``time()`` value captured on entry (``None`` before entry).
        elapsed: seconds spent in the body, set on exit (``None`` before exit).
    """

    def __init__(self, job_name=None):
        self.job_name = job_name
        self.start_time = None
        self.elapsed = None

    def __enter__(self):
        self.start_time = time()
        # Return the timer itself so ``with Timer() as t`` binds a usable object
        # (previously ``t`` would have been None).
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.elapsed = time() - self.start_time
        if self.job_name:
            line = f'{self.job_name} executed in {self.elapsed:.3f} s.'
        else:
            line = f'{self.elapsed:.3f} s.'
        print(line)
        # Implicitly returns None: exceptions raised in the body still propagate.

In [3]:
def json_read(filename):
    """Load and return the JSON document stored in ``filename``.

    The file is read as UTF-8 explicitly: ``json_dump`` writes non-ASCII text
    by default (``ensure_ascii=False``), so relying on the platform's default
    encoding could fail or mis-decode on systems where it is not UTF-8.
    """
    with open(filename, 'r', encoding='utf-8') as inf:
        return json.load(inf)

def json_dump(obj, filename, ea=False, indent=4):
    """Serialize ``obj`` as JSON into ``filename``.

    Args:
        obj: JSON-serializable object.
        filename: destination path (overwritten if it exists).
        ea: forwarded to ``json.dump`` as ``ensure_ascii``; the default ``False``
            keeps non-ASCII characters (e.g. Cyrillic) readable in the file.
        indent: forwarded to ``json.dump`` as ``indent``.

    The file is written as UTF-8 explicitly, because with ``ensure_ascii=False``
    the output may contain characters the platform's default encoding cannot
    represent.
    """
    with open(filename, 'w', encoding='utf-8') as ouf:
        json.dump(obj, ouf, ensure_ascii=ea, indent=indent)

### Apply analyzer to labels

In [4]:
# Shared Mystem instance used by the analyzer helpers below.
mystem = Mystem()
# Case-preserving word tokenizer built from CountVectorizer's analyzer.
# The pattern is a raw string: '\w' inside a plain literal is an invalid escape
# sequence, which newer Python versions warn about; the regex itself is unchanged.
simple_tokenizer = CountVectorizer(lowercase=False, token_pattern=r'\w+').build_analyzer()

def tokenizer(text):
    """Split ``text`` with ``simple_tokenizer`` and return the tokens joined by single spaces."""
    tokens = simple_tokenizer(text)
    return ' '.join(tokens)

def analyzer(text):
    """Lemmatize ``text`` with Mystem, then tokenize the result and join tokens with spaces.

    The original body called ``simple_analyzer``, a name that is never defined
    in this notebook (NameError on first call); the tokenizer built alongside
    ``mystem`` is ``simple_tokenizer``.
    """
    lemmatized = ''.join(mystem.lemmatize(text))
    return ' '.join(simple_tokenizer(lemmatized))

def analyzer2(text):
    """Tokenize ``text`` first, then lemmatize the space-joined tokens with Mystem.

    Unlike ``analyzer``, the lemmatizer output is concatenated as-is (no second
    tokenization pass). The original body called ``simple_analyzer``, a name
    that is never defined in this notebook (NameError on first call); it now
    uses ``simple_tokenizer``.
    """
    tokenized = ' '.join(simple_tokenizer(text))
    return ''.join(mystem.lemmatize(tokenized))

In [119]:
# Disabled variant: writes Mystem-analyzed labels for the test label file.
# with open('labels_new_v3_test.txt', 'r') as inf, open('labels_mystem_test.txt', 'w') as ouf:
#     for line in tqdm(inf):
#         qid, label = line.strip('\n').split(':', 1)
#         ouf.write(qid + ':' + analyzer(label) + '\n')
 
# create file with only tokenized labels
# (reads "<qid>:<label>" lines from labels_raw.txt and writes "<qid>:<tokenized label>"
# lines to labels_token.txt, the file the index generators read via `documents_file`)
with open('labels_raw.txt', 'r') as inf, open('labels_token.txt', 'w') as ouf:
    for line in tqdm(inf):
        # split on the first ':' only, so labels containing colons stay intact
        qid, label = line.strip('\n').split(':', 1)
        ouf.write(qid + ':' + tokenizer(label) + '\n')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




### Create elastic search index

In [5]:
# Client for the local Elasticsearch node; shared by the indexing and search cells below.
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

In [120]:
def create_es_action(key, data, index):
    """Build one bulk action that indexes ``{'qid': key, 'label': data}`` into ``index``."""
    return {
        "_index": index,
        "_source": {'qid': key, 'label': data},
    }

# Input for both document generators below: one "<qid>:<label>" entry per line.
documents_file = 'labels_token.txt'

def label_list_documents_generator():
    """Yield one ``label_list`` bulk action per run of consecutive lines sharing a qid.

    Lines of ``documents_file`` are split as "<qid>:<label>". Labels of
    adjacent lines with the same qid are collected into one list; a qid that
    reappears later (non-adjacently) starts a new document.
    """
    with open(documents_file, 'r') as file:
        group_qid = None
        group_labels = []
        for raw in tqdm(file):
            qid, raw_label = raw.split(':', 1)
            label = raw_label.strip('\n')
            if qid != group_qid:
                # qid changed: flush the finished group (if any) and start a new one
                if group_labels:
                    yield create_es_action(group_qid, group_labels, 'label_list')
                group_qid = qid
                group_labels = []
            group_labels.append(label)

        # flush the trailing group
        if group_labels:
            yield create_es_action(group_qid, group_labels, 'label_list')
            
def label_single_documents_generator():
    """Yield one ``label_single`` bulk action per line of ``documents_file`` ("<qid>:<label>")."""
    with open(documents_file, 'r') as source:
        for entry in tqdm(source):
            qid, raw_label = entry.split(':', 1)
            yield create_es_action(qid, raw_label.strip('\n'), 'label_single')

In [121]:
# Index definition shared by the `label_list` and `label_single` indices:
# the `label` field is analyzed with a custom analyzer (whitespace tokenizer,
# lowercase filter, Russian snowball stemmer).
settings = {
    "mappings": {
        "properties": {
            # NOTE(review): documents produced by create_es_action carry a `qid` field,
            # not `id`, so this explicit mapping does not apply to any indexed field
            # shown in this notebook — confirm whether `qid` was intended.
            "id": {
                "type": "text"
            },
            "label": {
                "type": "text",
                'analyzer': 'snowball_stemmer'
#                 "fields": {
#                     "shingle": {
#                         "type": "text",
#                         "analyzer": "shingle_analyzer"
#                     }
#                 }
            }
        }
    },
    "settings": {
        "analysis" : {
            "analyzer" : {
                "snowball_stemmer" : {
                    "tokenizer" : "whitespace",
                    "filter" : ['lowercase', "snow_stem"]
                }
            },
            "filter" : {
                "snow_stem" : {
                    "type" : "snowball",
                    "language" : "Russian"
                }
            }
        }
#         "analysis": {
#             "analyzer": {
#                 "shingle_analyzer": {
#                     "tokenizer": "standard",
#                     "filter": [
#                         "custom_shingle"
#                     ]
#                 }
#             },
#             "filter": {
#                 "custom_shingle": {
#                     "type": "shingle",
#                     "min_shingle_size": "2",
#                     "max_shingle_size": "3"
#                 }
#             }
#         }
    }
}

# (Re)create both indices from scratch. `ignore=[404]` turns the delete into a
# no-op when the index does not exist yet; without it this cell raises on a
# fresh cluster and cannot be run top-to-bottom.
es.indices.delete(index='label_list', ignore=[404])
es.indices.create(index='label_list', body=settings)

es.indices.delete(index='label_single', ignore=[404])
es.indices.create(index='label_single', body=settings)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'label_single'}

In [122]:
# Bulk-index one document per label line into `label_single`.
# Only failed actions are printed; successful ones are consumed silently.
# NOTE(review): nothing in this notebook feeds label_list_documents_generator()
# into a bulk call, so `label_list` appears to stay empty — confirm that is intended.
for ok, result in parallel_bulk(es, label_single_documents_generator(), queue_size=8, thread_count=8, chunk_size=1000):
    if not ok:
        print(result)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




### Elastic query builder

In [6]:
# Read the stop-word list, one entry per line (surrounding whitespace stripped).
# NOTE(review): `stopwords` is only referenced by the commented-out TokenFilter
# class in this notebook — confirm whether it is still needed.
with open('russian_stopwords.txt', 'r') as inf:
    stopwords = []
    for line in inf:
        stopwords.append(line.strip())

In [6]:
# Part-of-speech tags to filter out of queries.
# NOTE(review): the Morpher class hard-codes the same tags in its own set and does
# not read this list; it is also re-declared in a later cell.
stop_pos_tags = [
    'ADVPRO',
    'APRO',
    'CONJ',
    'INTJ',
    'PART',
    'PR',
    'SPRO',
    'V',
    'ADV'
]

In [7]:
# class TokenFilter:
#     def __init__(self, stopwords, stop_pos_tags):
#         self.stopwords = set(stopwords)
#         self.stop_pos_tags = set(stop_pos_tags)
        
#     def is_good(self, token):
#         if self.has_first_capital(token):
#             return True
#         if self.pass_stopwords(token) and \
#            self.pass_length(token):
#             return True
#         return False
        
#     def pass_stopwords(self, token):
#         if token in self.stopwords:
#             return False
#         return True
    
#     def pass_length(self, token):
#         if len(token) < 2:
#             return False
#         return True
    
#     def has_first_capital(self, token):
#         if token[0].isupper():
#             return True
#         return False

class Morpher:
    """Thin wrapper around Mystem plus a part-of-speech stop list."""

    def __init__(self):
        # POS tags whose tokens are rejected by approve_tag()
        self.stop_pos_tags = {
            'ADVPRO',
            'APRO',
            'CONJ',
            'INTJ',
            'PART',
            'PR',
            'SPRO',
            'V',
            'ADV',
        }
        self.mystem = Mystem()

    def analyze(self, text):
        """Return Mystem's analysis of ``text`` unchanged."""
        return self.mystem.analyze(text)

    def approve_tag(self, tag):
        """Return True unless ``tag`` is one of the stop POS tags."""
        return tag not in self.stop_pos_tags

In [8]:
class Query:
    """Token-level view of a text plus builders for Elasticsearch label queries.

    On construction the text is tokenized and three derived collections are
    computed: ``filtered_tokens``, ``token_ngrams`` and ``capital_pairs``.

    Args:
        text: raw input text.
        tokenizer: callable returning a space-separated token string for a text.
        morpher: object exposing ``analyze(text)`` and ``approve_tag(tag)``.
    """

    def __init__(self, text, tokenizer, morpher):
        self.text = text
        self.tokenizer = tokenizer
        self.morpher = morpher
        # get_tokens() must run first: the other three derive from self.tokens
        self.get_tokens()
        self.get_filtered_tokens()
        self.get_token_ngrams()
        self.get_capital_pairs()

    def get_tokens(self):
        """Tokenize ``self.text`` and store the whitespace-split result in ``self.tokens``."""
        tokenized = self.tokenizer(self.text)
        self.tokens = tokenized.split()
        return self.tokens

    def get_filtered_tokens(self):
        """Select the tokens worth searching for and store them in ``self.filtered_tokens``.

        The first entry of the morphological analysis is always skipped
        (presumably because a sentence-initial word is capitalised regardless
        of being a name — TODO confirm). Of the remaining analysed entries a
        token is kept when it is capitalised and longer than one character,
        when it has no analysis at all, or when its POS tag is approved by the
        morpher.
        """
        kept = self.filtered_tokens = []
        parsed = self.morpher.analyze(' '.join(self.tokens))
        for position, item in enumerate(parsed):
            if position == 0 or 'analysis' not in item:
                continue
            word = item['text']
            looks_like_name = word[0].isupper() and len(word) > 1
            if looks_like_name or not item['analysis']:
                kept.append(word)
                continue
            # 'gr' looks like "S,муж,од=им,ед": the POS tag is whatever precedes the first ',' or '='
            pos_tag = item['analysis'][0]['gr'].split(',', 1)[0].split('=', 1)[0]
            if self.morpher.approve_tag(pos_tag):
                kept.append(word)
        return self.filtered_tokens

    def get_token_ngrams(self, n=3):
        """Store every run of ``n`` consecutive tokens (space-joined) in ``self.token_ngrams``."""
        window_count = len(self.tokens) - (n - 1)
        self.token_ngrams = [
            ' '.join(self.tokens[start:start + n]) for start in range(window_count)
        ]
        return self.token_ngrams

    def get_capital_pairs(self):
        """Store adjacent pairs of capitalised multi-character tokens in ``self.capital_pairs``.

        Pairs starting at the very first token are not considered.
        """
        found = []
        for left, right in zip(self.tokens[1:], self.tokens[2:]):
            if not (left[0].isupper() and len(left) > 1):
                continue
            if right[0].isupper() and len(right) > 1:
                found.append(left + ' ' + right)
        self.capital_pairs = found
        return self.capital_pairs

    def build_match_query(self, query, fuzziness='AUTO'):
        """Return a ``match`` clause on ``label`` with the given fuzziness."""
        label_options = {
            'query': query,
            "fuzziness": fuzziness,
            "prefix_length": 1,
            'fuzzy_transpositions': False
        }
        return {'match': {'label': label_options}}

    def build_phrase_query(self, query):
        """Return a ``match_phrase`` clause on ``label``."""
        return {'match_phrase': {'label': query}}

    def get_phrase_queries(self):
        """Phrase clauses for every token n-gram, followed by every capital pair."""
        phrases = list(self.token_ngrams) + list(self.capital_pairs)
        return [self.build_phrase_query(phrase) for phrase in phrases]

    def get_fulltext_queries(self):
        """Single fuzzy ``match`` clause over all filtered tokens."""
        joined = ' '.join(self.filtered_tokens)
        return [self.build_match_query(joined)]

    def get_single_capital_queries(self):
        """Phrase clauses for each capitalised filtered token longer than two characters."""
        return [
            self.build_phrase_query(token)
            for token in self.filtered_tokens
            if token[0].isupper() and len(token) > 2
        ]

    def get_partial_queries(self):
        """Non-fuzzy ``match`` clauses over windows of up to three filtered tokens, stepping by two."""
        total = len(self.filtered_tokens)
        clauses = []
        for start in range(0, total, 2):
            window = self.filtered_tokens[start:start + 3]
            clauses.append(self.build_match_query(' '.join(window), fuzziness=0))
        return clauses

    def build_es_query(self, queries):
        """Wrap ``queries`` as the ``should`` part of a bool query body."""
        return {'query': {'bool': {'should': queries}}}

In [9]:
class QueryTarget(Query):
    """Query variant that does not skip the first token and phrases on bigrams plus the whole text."""

    def get_filtered_tokens(self):
        """Same selection rules as ``Query.get_filtered_tokens``, but the first entry is not skipped."""
        kept = self.filtered_tokens = []
        for item in self.morpher.analyze(' '.join(self.tokens)):
            if 'analysis' not in item:
                continue
            word = item['text']
            looks_like_name = word[0].isupper() and len(word) > 1
            if looks_like_name or not item['analysis']:
                kept.append(word)
                continue
            # the POS tag is whatever precedes the first ',' or '=' in the grammar string
            pos_tag = item['analysis'][0]['gr'].split(',', 1)[0].split('=', 1)[0]
            if self.morpher.approve_tag(pos_tag):
                kept.append(word)
        return self.filtered_tokens

    def get_token_ngrams(self, n=2):
        """Store the full token string followed by every run of ``n`` consecutive tokens."""
        whole_text = ' '.join(self.tokens)
        windows = [
            ' '.join(self.tokens[start:start + n])
            for start in range(len(self.tokens) - (n - 1))
        ]
        self.token_ngrams = [whole_text] + windows
        return self.token_ngrams

In [11]:
# Smoke test: time the construction of a Query for a single-word text.
# With one token there is no 3-token n-gram and no capital pair, so the
# phrase-query list is empty.
mrph = Morpher()
with Timer():
    q = Query('Луна', tokenizer=tokenizer, morpher=mrph)
q.get_phrase_queries()

1.421 s.


[]

In [12]:
# For the single-word query above this yields a bool query whose `should` list is empty.
q.build_es_query(q.get_phrase_queries())

{'query': {'bool': {'should': []}}}

In [23]:
# Build the phrase query for a sample question and run it against the
# single-label index.
# Cleaned up: this cell used to assign an unused `query` variable and a
# hand-written `doc` (plus several commented-out variants) that was overwritten
# by the generated query before ever reaching Elasticsearch.
q = Query('Что покажут жители Таллина, если вы захотите взглянуть на Толстую Маргариту?', tokenizer=tokenizer, morpher=mrph)
print(q.filtered_tokens)
doc = q.build_es_query(q.get_phrase_queries())

res = es.search(index='label_single', body=doc, size=20)
res['hits']

['жители', 'Таллина', 'Толстую', 'Маргариту']


{'total': {'value': 1, 'relation': 'eq'},
 'max_score': 18.215916,
 'hits': [{'_index': 'label_single',
   '_type': '_doc',
   '_id': 'Jt_vpmwBg_1FN09HLgUG',
   '_score': 18.215916,
   '_source': {'qid': 'Q282994', 'label': 'Толстая Маргарита'}}]}

### Execute query

In [13]:
class Matcher:
    """Runs label queries against Elasticsearch and enriches hits with Wikidata / ru.wikipedia data.

    Args:
        es_instance: Elasticsearch client used for every search.
        index: name of the index to search; defaults to ``'label_single'``.
    """

    WIKIDATA_API = 'https://www.wikidata.org/w/api.php'
    RUWIKI_API = 'https://ru.wikipedia.org/w/api.php'

    def __init__(self, es_instance, index='label_single'):
        self.es = es_instance
        self.index = index

    def _smooth_score(self, score, d=5):
        """Truncate ``score`` to an int and round it up to the next multiple of ``d``."""
        score = int(score)
        while score % d != 0:
            score += 1
        return score

    def _parse_id(self, qid):
        """Return the integer part of an id such as ``'Q42'`` (the first character is dropped)."""
        return int(qid[1:])

    def _sorting_key(self, match):
        """Sort key for a match: ``(smoothed score, -numeric qid)``."""
        score = self._smooth_score(match['score'])
        qid = self._parse_id(match['qid'])
        return score, -qid

    def get_names_and_descriptions(self, qids):
        """Fetch Russian labels and descriptions from Wikidata.

        Args:
            qids: iterable of Wikidata ids.

        Returns:
            ``{qid: {'name': str | None, 'description': str | None}}``; an empty
            dict when ``qids`` is empty (no request is made).

        Raises:
            KeyError: if the response lacks ``entities`` or one of the ids.
        """
        qids = list(qids)
        if not qids:
            return {}
        # Let requests build and encode the query string instead of interpolating it by hand.
        params = {
            'action': 'wbgetentities',
            'format': 'json',
            'ids': '|'.join(qids),
            'languages': 'ru',
            'props': 'labels|descriptions',
        }
        resp = requests.get(self.WIKIDATA_API, params=params).json()
        result = {}
        for qid in qids:
            cur_data = resp['entities'][qid]
            result[qid] = {
                'name': cur_data.get('labels', {}).get('ru', {}).get('value'),
                'description': cur_data.get('descriptions', {}).get('ru', {}).get('value'),
            }
        return result

    def get_wikipedia_pageviews(self, titles_dict):
        """Fill ``titles_dict`` in place with 30-day ru.wikipedia page-view totals.

        Args:
            titles_dict: ``{page title: views}``; values are overwritten for every
                page returned by the API, keyed by the title the API reports.

        Returns:
            None. Nothing is requested when ``titles_dict`` is empty.
        """
        if not titles_dict:
            return
        titles_list = [title.replace(' ', '_') for title in titles_dict]
        # Passing titles via `params` URL-encodes them, so titles containing
        # characters such as '&' or '+' no longer corrupt the query string.
        params = {
            'action': 'query',
            'format': 'json',
            'prop': 'pageviews',
            'pvipdays': 30,
            'titles': '|'.join(titles_list),
        }
        # The same request is retried (at most once per title) until the response
        # is marked 'batchcomplete'; the last response is used either way.
        # NOTE(review): presumably repeated calls let the API finish computing the
        # page-view data — confirm against the API's continuation semantics.
        for _ in range(len(titles_list)):
            resp = requests.get(self.RUWIKI_API, params=params).json()
            if 'batchcomplete' in resp:
                break

        for entry in resp['query']['pages'].values():
            # Pages without a 'pageviews' entry count as zero views instead of raising KeyError.
            view_stats = entry.get('pageviews') or {}
            # Individual days may be None; filter(None, ...) drops them.
            titles_dict[entry['title']] = sum(filter(None, view_stats.values()))

    def get_wikipedia_pages(self, qids):
        """Look up the ru.wikipedia page and its page views for each Wikidata id.

        Args:
            qids: iterable of Wikidata ids.

        Returns:
            ``{qid: {'ruwiki': title | None, 'views': int}}``; ``views`` is 0 when
            the entity has no ruwiki sitelink. Empty dict for empty input.

        Raises:
            KeyError: if the response lacks ``entities`` or one of the ids.
        """
        qids = list(qids)
        if not qids:
            return {}
        params = {
            'action': 'wbgetentities',
            'format': 'json',
            'ids': '|'.join(qids),
            'sitefilter': 'ruwiki',
            'props': 'sitelinks',
        }
        resp = requests.get(self.WIKIDATA_API, params=params).json()
        result = {}
        views = {}
        for qid in qids:
            cur_data = resp['entities'][qid]
            wiki_title = cur_data.get('sitelinks', {}).get('ruwiki', {}).get('title')
            if wiki_title is not None:
                views[wiki_title] = 0
            result[qid] = {'ruwiki': wiki_title}

        self.get_wikipedia_pageviews(views)
        for qid in qids:
            cur_title = result[qid]['ruwiki']
            result[qid]['views'] = views[cur_title] if cur_title is not None else 0
        return result

    def _apply_ranking(self, matches, relative_rate=0.9):
        """Order matches by score, then re-sort runs of similar scores by page views.

        After sorting by descending score, consecutive matches are grouped
        while each score is at least ``relative_rate`` times the score of the
        match right before it; every group is then sorted by descending
        ``views``.

        Returns:
            A new list; the input list object is not reordered.
        """
        if len(matches) == 0:
            return matches

        matches = sorted(matches, key=lambda m: m['score'], reverse=True)

        group_start = 0
        cur_pos = 1
        while cur_pos < len(matches):
            while cur_pos < len(matches) and matches[cur_pos]['score'] >= relative_rate * matches[cur_pos - 1]['score']:
                cur_pos += 1
            matches[group_start:cur_pos] = sorted(matches[group_start:cur_pos], key=lambda m: m['views'], reverse=True)
            group_start = cur_pos
            cur_pos = group_start + 1

        return matches

    def get_query_matches(self, query, n_matches=30):
        """Run ``query`` against the index and return enriched, ranked matches.

        Args:
            query: Elasticsearch request body.
            n_matches: maximum number of hits to request.

        Returns:
            List of dicts, one per distinct qid (first hit wins), each holding the
            hit's ``_source`` fields plus ``score``, ``name``, ``description``,
            ``ruwiki`` and ``views``, ordered by ``_apply_ranking``.
        """
        # Use the injected client: the original read the notebook-global `es`,
        # silently ignoring the instance passed to __init__.
        es_result = self.es.search(index=self.index, body=query, size=n_matches)['hits']
        matches_dict = {}
        for entry in es_result['hits']:
            qid = entry['_source']['qid']
            if qid not in matches_dict:
                matches_dict[qid] = entry['_source']
                matches_dict[qid]['score'] = entry['_score']

        qids = list(matches_dict)
        for qid, nds in self.get_names_and_descriptions(qids).items():
            matches_dict[qid].update(nds)

        for qid, wps in self.get_wikipedia_pages(qids).items():
            matches_dict[qid].update(wps)

        return self._apply_ranking(list(matches_dict.values()))

In [24]:
# Matcher bound to the shared Elasticsearch client.
mtc = Matcher(es_instance=es)

In [27]:
# Inspect generated query bodies. Only the last expression of a cell is displayed,
# so the result of the first call is built and then discarded.
q.build_es_query(q.get_single_capital_queries()[0])
q.build_es_query(q.get_phrase_queries())
# q.build_es_query(q.get_fulltext_queries())

{'query': {'bool': {'should': [{'match_phrase': {'label': 'Что покажут жители'}},
    {'match_phrase': {'label': 'покажут жители Таллина'}},
    {'match_phrase': {'label': 'жители Таллина если'}},
    {'match_phrase': {'label': 'Таллина если вы'}},
    {'match_phrase': {'label': 'если вы захотите'}},
    {'match_phrase': {'label': 'вы захотите взглянуть'}},
    {'match_phrase': {'label': 'захотите взглянуть на'}},
    {'match_phrase': {'label': 'взглянуть на Толстую'}},
    {'match_phrase': {'label': 'на Толстую Маргариту'}},
    {'match_phrase': {'label': 'Толстую Маргариту'}}]}}}

In [36]:
# Run the full-text query for the current `q` and show the enriched, ranked matches.
# The commented lines are the single-capital and phrase alternatives.
mtc.get_query_matches(q.build_es_query(q.get_fulltext_queries()))
# mtc.get_query_matches(q.build_es_query(q.get_single_capital_queries()[0]))
# mtc.get_query_matches(q.build_es_query(q.get_phrase_queries()))

[{'qid': 'Q2813',
  'label': 'Кока кола',
  'score': 11.159221,
  'name': 'Кока-кола',
  'description': 'безалкогольный газированный напиток',
  'ruwiki': 'Кока-кола',
  'views': 32018},
 {'qid': 'Q3295867',
  'label': 'Кока кола',
  'score': 11.159221,
  'name': 'Кока-кола',
  'description': None,
  'ruwiki': 'The Coca-Cola Company',
  'views': 11994},
 {'qid': 'Q4135566',
  'label': 'Кафа город',
  'score': 13.625122,
  'name': 'Генуэзская крепость (Феодосия)',
  'description': 'Каффа',
  'ruwiki': 'Генуэзская крепость (Феодосия)',
  'views': 1610},
 {'qid': 'Q196378',
  'label': 'Со кол город',
  'score': 11.375547,
  'name': 'Сокол',
  'description': 'город в Сокольском районе Вологодской области России',
  'ruwiki': 'Сокол (город)',
  'views': 1510},
 {'qid': 'Q1104936',
  'label': 'Коба город',
  'score': 13.625122,
  'name': 'Коба (город)',
  'description': None,
  'ruwiki': 'Коба (майя)',
  'views': 737},
 {'qid': 'Q24672271',
  'label': 'Город котов',
  'score': 13.625122,
  '

In [15]:
class Suggester:
    """Combines phrase, single-token and full-text searches into one list of entity suggestions.

    Args:
        matcher: object exposing ``get_query_matches(query, n_matches=...)``.
        tokenizer: tokenizer callable handed to the query object.
        morpher: morphological analyzer handed to the query object.
    """

    def __init__(self, matcher, tokenizer, morpher):
        self.matcher = matcher
        self.tokenizer = tokenizer
        self.morpher = morpher

    def _build_query(self, text):
        """Create the query object for ``text``; subclasses may return another query type."""
        return Query(text=text, tokenizer=self.tokenizer, morpher=self.morpher)

    def _get_matches(self, q, query_type):
        """Run one kind of search for query object ``q``.

        Args:
            q: query object.
            query_type: ``'fulltext'``, ``'phrase'`` or ``'single'``.

        Returns:
            A list of matches for ``'fulltext'`` and ``'phrase'``; a list of match
            lists (one per capitalised token) for ``'single'``.

        Raises:
            ValueError: for any other ``query_type`` (previously ``None`` was
                returned silently and the caller failed later).
        """
        if query_type == 'fulltext':
            return self.matcher.get_query_matches(q.build_es_query(q.get_fulltext_queries()), n_matches=20)
        if query_type == 'phrase':
            phrase_queries = q.get_phrase_queries()
            # A bool query without any clause matches every document, so a text too
            # short to produce phrases would receive arbitrary "phrase" matches.
            if not phrase_queries:
                return []
            return self.matcher.get_query_matches(q.build_es_query(phrase_queries), n_matches=8)
        if query_type == 'single':
            return [self.matcher.get_query_matches(q.build_es_query(single_query), n_matches=10)
                    for single_query in q.get_single_capital_queries()]
        raise ValueError(f'Unknown query_type: {query_type!r}')

    def _select_matches(self, q, min_total=8, f=5, p=4, s=1):
        """Pick the final suggestions for ``q``.

        Args:
            q: query object.
            min_total: stop adding full-text matches once this many distinct qids
                were collected (and at least ``f`` full-text matches were added).
            f: minimum number of full-text matches to add before stopping early.
            p: maximum number of phrase matches to keep.
            s: currently unused; kept for interface compatibility.

        Returns:
            Matches tagged with a ``'source'`` key, ordered as: phrase matches,
            full-text matches (by descending score), then the top match of each
            single-token search (by descending views). A qid appears only once.
        """
        matches = []
        matches_qids = set()

        phrase_matches = self._get_matches(q, query_type='phrase')[:p]
        for match in phrase_matches:
            matches_qids.add(match['qid'])
            match['source'] = 'phrase'

        single_matches_result = []
        for single_match in self._get_matches(q, query_type='single'):
            if not single_match:
                continue
            # only the best hit of every single-token search is considered
            top_match = single_match[0]
            if top_match['qid'] not in matches_qids:
                matches_qids.add(top_match['qid'])
                top_match['source'] = 'single'
                single_matches_result.append(top_match)
        single_matches_result.sort(key=lambda m: m['views'], reverse=True)

        fulltext_matches_result = []
        f_cnt = 0
        for match in self._get_matches(q, query_type='fulltext'):
            if match['qid'] not in matches_qids:
                f_cnt += 1
                matches_qids.add(match['qid'])
                match['source'] = 'fulltext'
                fulltext_matches_result.append(match)
            if len(matches_qids) >= min_total and f_cnt >= f:
                break
        fulltext_matches_result.sort(key=lambda m: m['score'], reverse=True)

        matches.extend(phrase_matches)
        matches.extend(fulltext_matches_result)
        matches.extend(single_matches_result)
        return matches

    def get_suggestions(self, text, answer=''):
        """Return ``{'text': text, 'answer': answer, 'matches': [...]}`` for ``text``."""
        q = self._build_query(text)
        matches = self._select_matches(q)
        return {
            'text': text,
            'answer': answer,
            'matches': matches
        }

    @staticmethod
    def pretty_print(entry):
        """Format a ``get_suggestions`` result as human-readable text.

        Despite the name nothing is printed: the formatted string is returned.
        """
        lines = []

        matches = entry['matches']
        query = entry['text']

        lines.append(f'Query: {query}\n')

        for match in matches:
            lines.append(match['name'] if match['name'] else '*no name*')

            if match['description']:
                desc = match['description']
                lines.append(f'({desc})')

            qid = match['qid']
            lines.append(f'Link: https://www.wikidata.org/wiki/{qid}\n')

        return '\n'.join(lines)

In [16]:
class SuggesterTarget(Suggester):
    """Suggester variant built on ``QueryTarget`` with smaller result limits."""

    def _build_query(self, text):
        """Create a ``QueryTarget`` for ``text``."""
        return QueryTarget(text=text, tokenizer=self.tokenizer, morpher=self.morpher)

    def _get_matches(self, q, query_type):
        """Run one kind of search; returns None for an unrecognised ``query_type``."""
        if query_type == 'fulltext':
            body = q.build_es_query(q.get_fulltext_queries())
            return self.matcher.get_query_matches(body, n_matches=5)
        if query_type == 'phrase':
            body = q.build_es_query(q.get_phrase_queries())
            return self.matcher.get_query_matches(body, n_matches=10)
        if query_type == 'single':
            per_token_results = []
            for single_query in q.get_single_capital_queries():
                body = q.build_es_query(single_query)
                per_token_results.append(self.matcher.get_query_matches(body, n_matches=5))
            return per_token_results

    def _select_matches(self, q, min_total=5, f=3, p=4, s=1):
        """Same selection as the parent class, with lower default thresholds."""
        return super()._select_matches(q, min_total=min_total, f=f, p=p, s=s)

In [19]:
# Wire up the pipeline: morphological analyzer -> matcher -> suggester.
# NOTE(review): this re-creates `mrph` and `mtc`, which earlier cells already defined.
mrph = Morpher()
mtc = Matcher(es_instance=es)
suggester = Suggester(matcher=mtc, tokenizer=tokenizer, morpher=mrph)

In [34]:
# End-to-end check on one question: show the formatted suggestions and the raw dict.
# Both prints run inside the timed block, so their cost is included in the timing.
with Timer('Make suggestions'):
    suggestions = suggester.get_suggestions('В каком австрийском городе расположен Музей кофе?')
    print(Suggester.pretty_print(suggestions))
    print(suggestions)

Query: В каком австрийском городе расположен Музей кофе?

Генуэзская крепость (Феодосия)
(Каффа)
Link: https://www.wikidata.org/wiki/Q4135566

Коба (город)
Link: https://www.wikidata.org/wiki/Q1104936

Город кошек
(Полнометражный документальный фильм о бродячих кошках, населяющих город Стамбул)
Link: https://www.wikidata.org/wiki/Q24672271

Коро
Link: https://www.wikidata.org/wiki/Q1134454

Музей мумий
Link: https://www.wikidata.org/wiki/Q785223

Австрийский музей прикладного искусства
(музей в Вене)
Link: https://www.wikidata.org/wiki/Q478455

Сокол
(город в Сокольском районе Вологодской области России)
Link: https://www.wikidata.org/wiki/Q196378

муза
(покровительницы искусств и наук)
Link: https://www.wikidata.org/wiki/Q66016

{'text': 'В каком австрийском городе расположен Музей кофе?', 'answer': '', 'matches': [{'qid': 'Q4135566', 'label': 'Кафа город', 'score': 13.625122, 'name': 'Генуэзская крепость (Феодосия)', 'description': 'Каффа', 'ruwiki': 'Генуэзская крепость (Феодосия)',

### Get entities from questions

In [21]:
# Accumulator for the question run below; re-running this cell discards collected results.
suggestions_data = []

In [24]:
# Collect entity suggestions for every question in the dataset.
# The dataset file alternates lines: even line index -> question, odd -> its answer.
delim = '######'
dataset_num = '14001_end'
with open(f'matching/quiz_dataset_{dataset_num}.txt', 'r') as dataset, \
     open(f'matching/quiz_entities_{dataset_num}.txt', 'a') as pretty:
    for i, line in enumerate(tqdm(dataset)):
        # NOTE(review): hard-coded offset that skips the first 3426 lines — presumably
        # the point where an earlier, interrupted run stopped (the report file is
        # opened in append mode); confirm or reset before a fresh run.
        if i < 3426:
            continue
        if i % 2 == 0:
            question = line.strip('\n')
        if i % 2 == 1:
            answer = line.strip('\n')
            print(delim, file=pretty)
            # 1-based number of the question/answer pair
            print(f'{i // 2 + 1}. ', end='', file=pretty)
            suggestions = suggester.get_suggestions(question, answer)
            suggestions_data.append(suggestions)
            # The whole accumulated list is rewritten after every pair
            # (presumably as a crash-safe checkpoint; the cost grows with the list).
            with open(f'matching/quiz_entities_data_{dataset_num}.json', 'w') as results:
                json.dump(suggestions_data, results, indent=4, ensure_ascii=False)
            print(Suggester.pretty_print(suggestions), file=pretty)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

### Get entities from answers

In [37]:
# Fresh accumulator for the answer run below (drops the results of the question run).
suggestions_data = []

In [38]:
# Collect entity suggestions for every answer in the dataset.
# The dataset file alternates lines: even line index -> question, odd -> its answer.
# NOTE(review): this uses `suggester` (a plain Suggester) although a SuggesterTarget
# class is defined above — confirm which one is intended for answers.
delim = '######'
with open('matching/quiz_dataset_14001_end.txt', 'r') as dataset, \
     open('matching/quiz_answer_entities_14001_end.txt', 'a') as pretty:
    for i, line in enumerate(tqdm(dataset)):
        if i % 2 == 0:
            # remember the question; it is attached to the answer on the next line
            question = line.strip('\n')
            continue
        answer = line.strip('\n')
        print(delim, file=pretty)
        # 1-based number of the question/answer pair
        print(f'{i // 2 + 1}.', file=pretty)
        print(f'Question: {question}', file=pretty)
        # here the answer text itself is the search text
        suggestions = suggester.get_suggestions(answer)
        suggestions['question'] = question
        suggestions_data.append(suggestions)
        # The whole accumulated list is rewritten after every pair
        # (presumably as a crash-safe checkpoint; the cost grows with the list).
        with open('matching/quiz_answer_entities_data_14001_end.json', 'w') as results:
            json.dump(suggestions_data, results, indent=4, ensure_ascii=False)
        print(Suggester.pretty_print(suggestions), file=pretty)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [36]:
# Number of suggestion entries collected so far.
len(suggestions_data)

4000

In [714]:
# Inspect the most recently collected entry.
suggestions_data[-1]

{'text': 'В каком году была Хиросима и Нагасаки?',
 'matches': [{'qid': 'Q488',
   'label': 'атомные бомбардировки Хиросимы и Нагасаки',
   'score': 17.550804,
   'name': 'атомные бомбардировки Хиросимы и Нагасаки',
   'description': 'бомбардировки атомными бомбами японских городов 6 и 9 августа 1945 года',
   'ruwiki': 'Атомные бомбардировки Хиросимы и Нагасаки',
   'views': 68446,
   'source': 'phrase'},
  {'qid': 'Q169376',
   'label': 'Нагасаки',
   'score': 12.959096,
   'name': 'Нагасаки',
   'description': 'префектура Японии',
   'ruwiki': 'Нагасаки (префектура)',
   'views': 335,
   'source': 'fulltext'},
  {'qid': 'Q617375',
   'label': 'Хиросима',
   'score': 12.252915,
   'name': 'Хиросима',
   'description': 'префектура Японии',
   'ruwiki': 'Хиросима (префектура)',
   'views': 371,
   'source': 'fulltext'},
  {'qid': 'Q11276735',
   'label': 'Хиросима',
   'score': 12.252915,
   'name': 'Хиросима',
   'description': None,
   'ruwiki': 'Хиросима (фильм, 1953)',
   'views': 

### Tests

In [657]:
# Sample question, tokenized, for the analysis checks below.
t = tokenizer('Какое прозвище носил король Англии Ричард I?')

In [658]:
# Show the raw per-token morphological analysis for the sample question.
mrph.analyze(t)

[{'analysis': [{'lex': 'какой',
    'wt': 0.9940105847,
    'gr': 'APRO=(вин,ед,сред|им,ед,сред)'}],
  'text': 'Какое'},
 {'text': ' '},
 {'analysis': [{'lex': 'прозвище',
    'wt': 1,
    'gr': 'S,сред,неод=(пр,ед|вин,ед|им,ед)'}],
  'text': 'прозвище'},
 {'text': ' '},
 {'analysis': [{'lex': 'носить',
    'wt': 1,
    'gr': 'V,несов,пе=прош,ед,изъяв,муж'}],
  'text': 'носил'},
 {'text': ' '},
 {'analysis': [{'lex': 'король', 'wt': 1, 'gr': 'S,муж,од=им,ед'}],
  'text': 'король'},
 {'text': ' '},
 {'analysis': [{'lex': 'англия',
    'wt': 1,
    'gr': 'S,гео,жен,неод=(пр,ед|вин,мн|дат,ед|род,ед|им,мн)'}],
  'text': 'Англии'},
 {'text': ' '},
 {'analysis': [{'lex': 'ричард',
    'wt': 0.9724140581,
    'gr': 'S,имя,муж,од=им,ед'}],
  'text': 'Ричард'},
 {'text': ' '},
 {'analysis': [], 'text': 'I'},
 {'text': '\n'}]

In [659]:
# Ask Elasticsearch how the index-side analysis chain (whitespace tokenizer,
# lowercase + snow_stem filters) tokenizes a sample label.
body = {
    'tokenizer': 'whitespace',
    'filter': ['lowercase', 'snow_stem'],
#     'text': 'Эскиз декорации III го акт оперы А Спендиарова  Алмаст '
    'text': 'Ричард I'
}

es.indices.analyze(index='label_single', body=body)

{'tokens': [{'token': 'ричард',
   'start_offset': 0,
   'end_offset': 6,
   'type': 'word',
   'position': 0},
  {'token': 'i',
   'start_offset': 7,
   'end_offset': 8,
   'type': 'word',
   'position': 1}]}

In [647]:
# NOTE(review): rebinds the module-level `mystem` that the analyzer helpers use;
# an equivalent instance is already created in the "Apply analyzer to labels" section.
mystem = Mystem()

In [650]:
# Check how Mystem analyzes personal names.
# Only the last expression of a cell is displayed, so the first call's result is discarded.
mystem.analyze(tokenizer('Кого сыграл Жан Рено в фильме "Ее звали Никита"?'))
mystem.analyze('Максима Галкина')

[{'analysis': [{'lex': 'максим',
    'wt': 0.8066484349,
    'gr': 'S,имя,муж,од=(вин,ед|род,ед)'}],
  'text': 'Максима'},
 {'text': ' '},
 {'analysis': [{'lex': 'галкин',
    'wt': 0.7169838121,
    'gr': 'S,фам,муж,од=(вин,ед|род,ед)'}],
  'text': 'Галкина'},
 {'text': '\n'}]

In [None]:
# NOTE(review): duplicate of the `stop_pos_tags` list defined in the
# "Elastic query builder" section; Morpher keeps its own copy and does not read this one.
stop_pos_tags = [
    'ADVPRO',
    'APRO',
    'CONJ',
    'INTJ',
    'PART',
    'PR',
    'SPRO',
    'V',
    'ADV'
]

In [684]:
# Manual probe of the external APIs used by Matcher; the active request fetches
# Russian labels and descriptions for two entities, the commented ones are alternatives.
# resp = requests.get('https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&ids=Q3656114|Q17&sitefilter=ruwiki&props=sitelinks')
resp = requests.get('https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&ids=Q3656114|Q17&languages=ru&props=labels|descriptions')
# resp = requests.get('https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/ru.wikipedia/all-access/all-agents/Обама,_Барак/monthly/2019010100/2019013100')
# resp = requests.get('https://ru.wikipedia.org/w/api.php?action=query&format=json&prop=pageviews&pvipdays=20&titles=(2208)_Пушкин')
resp

<Response [200]>

In [685]:
# Show the decoded JSON body of the probe response.
# resp.json()['query']['pages'].values()
resp.json()

{'entities': {'Q3656114': {'type': 'item',
   'id': 'Q3656114',
   'labels': {'ru': {'language': 'ru',
     'value': 'История государства Российского'}},
   'descriptions': {}},
  'Q17': {'type': 'item',
   'id': 'Q17',
   'labels': {'ru': {'language': 'ru', 'value': 'Япония'}},
   'descriptions': {'ru': {'language': 'ru',
     'value': 'островное государство в Восточной Азии'}}}},
 'success': 1}

In [719]:
# Cluster diagnostics: ask Elasticsearch to explain shard allocation, including disk usage.
es.cluster.allocation_explain(include_disk_info=True)

{'index': 'label_single',
 'shard': 0,
 'primary': False,
 'current_state': 'unassigned',
 'unassigned_info': {'reason': 'INDEX_CREATED',
  'at': '2019-08-18T22:52:42.694Z',
  'last_allocation_status': 'no_attempt'},
 'cluster_info': {'nodes': {'T3rpd-0gSEePSbG0hzcyMA': {'node_name': 'Air-Vladislav.Dlink',
    'least_available': {'path': '/usr/local/var/lib/elasticsearch/nodes/0',
     'total_bytes': 121123069952,
     'used_bytes': 114139070464,
     'free_bytes': 6983999488,
     'free_disk_percent': 5.8,
     'used_disk_percent': 94.2},
    'most_available': {'path': '/usr/local/var/lib/elasticsearch/nodes/0',
     'total_bytes': 121123069952,
     'used_bytes': 114139070464,
     'free_bytes': 6983999488,
     'free_disk_percent': 5.8,
     'used_disk_percent': 94.2}}},
  'shard_sizes': {'[label_list][0][p]_bytes': 283,
   '[label_single][0][p]_bytes': 443925562,
   '[labels][0][p]_bytes': 53812},
  'shard_paths': {'[labels][0], node[T3rpd-0gSEePSbG0hzcyMA], [P], s[STARTED], a[id=D

In [71]:
# Probe the page-views API with an empty `titles` value: the response carries
# 'batchcomplete' but no 'query' section.
wikiapi_query = f'https://ru.wikipedia.org/w/api.php?action=query&format=json&prop=pageviews&pvipdays=30&titles='
resp = requests.get(wikiapi_query).json()
resp

{'batchcomplete': ''}

In [100]:
# Sanity check: load a previously saved result file and count its entries.
trtr = json_read('matching/quiz_answer_entities_data_5001_10000.json')
len(trtr)

5000

In [21]:
# Count the distinct qids in the tokenized label file (the text before the first ':').
qst = set()
with open('labels_token.txt', 'r') as inf:
    for line in inf:
        qid = line.split(':')[0]
        qst.add(qid)
len(qst)

4114595

In [19]:
# Number of documents currently stored in the `label_single` index.
es.count(index='label_single')

{'count': 5430657,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}