In [586]:
%config IPCompleter.greedy=True
import re
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time

In [4]:
# Read "<qid>:<label>" lines and collect every label under its qid.
with open('labels_new_v3.txt', 'r') as inf:
    labels = defaultdict(list)
    # Split on the first colon only, so a label may itself contain ':'.
    for qid, label in (line.strip('\n').split(':', 1) for line in tqdm(inf)):
        labels[qid].append(label)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
# Scratch cell: walks over every (qid, labels) pair without doing any work.
# The only visible effect is the progress bar, whose total shows how many qids were loaded.
for k, v in tqdm(labels.items()):
    pass

HBox(children=(IntProgress(value=0, max=4114595), HTML(value='')))




In [598]:
class Timer:
    """Context manager that prints the wall-clock time spent inside its ``with`` body.

    Parameters
    ----------
    job_name : str, optional
        Label put in front of the printed duration. When falsy, only the
        duration itself is printed.

    After the block finishes, the measured duration (in seconds) is also
    available as ``passed_time``.
    """

    def __init__(self, job_name=None):
        self.job_name = job_name
        self.start_time = None   # set on __enter__
        self.passed_time = None  # set on __exit__
    
    def __enter__(self):
        self.start_time = time()
        # Return the timer so that ``with Timer() as t:`` binds a usable object
        # instead of None.
        return self
        
    def __exit__(self, exc_type, exc_value, traceback):
        self.passed_time = time() - self.start_time
        passed_time = self.passed_time
        line = f'{self.job_name} executed in {passed_time:.3f} s.' if self.job_name else f'{passed_time:.3f} s.'
        print(line)
        # Falls through returning None, so exceptions raised in the body still propagate.

### Apply analyzer to labels

In [52]:
mystem = Mystem()
# Raw string: '\w' inside a plain literal is an invalid escape sequence and triggers a warning.
simple_tokenizer = CountVectorizer(lowercase=False, token_pattern=r'\w+').build_analyzer()
# analyzer()/analyzer2() refer to the word splitter as `simple_analyzer`, which was never
# defined in this notebook (NameError on first call).
# NOTE(review): aliased to the case-preserving tokenizer above — confirm that this is the
# splitter the analyzers were meant to use.
simple_analyzer = simple_tokenizer


def tokenizer(text):
    """Return `text` re-joined from its ``\\w+`` tokens with single spaces (case preserved)."""
    return ' '.join(simple_tokenizer(text))


def analyzer(text):
    """Lemmatize `text` with Mystem first, then split into tokens and join them with spaces."""
    return ' '.join(simple_analyzer(''.join(mystem.lemmatize(text))))


def analyzer2(text):
    """Split `text` into tokens first, then lemmatize the space-joined tokens with Mystem.

    The lemmatizer output is concatenated as-is, without re-tokenizing it.
    """
    return ''.join(mystem.lemmatize(' '.join(simple_analyzer(text))))

In [119]:
# with open('labels_new_v3_test.txt', 'r') as inf, open('labels_mystem_test.txt', 'w') as ouf:
#     for line in tqdm(inf):
#         qid, label = line.strip('\n').split(':', 1)
#         ouf.write(qid + ':' + analyzer(label) + '\n')
 
# Produce labels_token.txt: same "<qid>:<label>" layout as the raw file,
# with every label passed through tokenizer().
with open('labels_raw.txt', 'r') as inf, open('labels_token.txt', 'w') as ouf:
    for line in tqdm(inf):
        qid, label = line.strip('\n').split(':', 1)
        ouf.write(f'{qid}:{tokenizer(label)}\n')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




### Create Elasticsearch index

In [22]:
# Client for the Elasticsearch node on localhost:9200, shared by all cells below.
# NOTE(review): 'timeout' and 'maxsize' are per-host connection options; presumably
# seconds and connection-pool size respectively — confirm against the client docs.
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

In [120]:
def create_es_action(key, data, index):
    """Build one bulk action that stores ``{'qid': key, 'label': data}`` in `index`.

    `data` is stored as given (a single label string or a list of labels).
    """
    return {
        "_index": index,
        "_source": {'qid': key, 'label': data},
    }

# Input for both document generators below: one "<qid>:<label>" pair per line.
documents_file = 'labels_token.txt'

def label_list_documents_generator():
    """Yield one 'label_list' bulk action per run of consecutive lines sharing a qid.

    Lines of `documents_file` are grouped only while the qid stays the same from one
    line to the next; each group becomes a single document whose label is a list.
    """
    with open(documents_file, 'r') as file:
        pending_qid = None
        pending_labels = []
        for raw_line in tqdm(file):
            qid, label = raw_line.split(':', 1)
            label = label.strip('\n')
            if qid != pending_qid:
                # A new qid starts: flush the previous group (nothing to flush on the first line).
                if pending_labels:
                    yield create_es_action(pending_qid, pending_labels, 'label_list')
                pending_qid, pending_labels = qid, [label]
            else:
                pending_labels.append(label)

        # Flush the last group once the file is exhausted.
        if pending_labels:
            yield create_es_action(pending_qid, pending_labels, 'label_list')
            
            
def label_single_documents_generator():
    """Yield one 'label_single' bulk action per line of `documents_file`."""
    with open(documents_file, 'r') as file:
        for raw_line in tqdm(file):
            qid, label = raw_line.split(':', 1)
            label = label.strip('\n')
            yield create_es_action(qid, label, 'label_single')

In [121]:
# Index definition shared by 'label_list' and 'label_single':
# labels are split on whitespace, lowercased and stemmed with the Russian Snowball stemmer.
settings = {
    "mappings": {
        "properties": {
            # NOTE(review): the documents built by create_es_action carry the identifier
            # under 'qid', not 'id', so this explicit mapping is never populated and 'qid'
            # is presumably left to dynamic mapping — confirm which name is intended.
            "id": {
                "type": "text"
            },
            "label": {
                "type": "text",
                'analyzer': 'snowball_stemmer'
#                 "fields": {
#                     "shingle": {
#                         "type": "text",
#                         "analyzer": "shingle_analyzer"
#                     }
#                 }
            }
        }
    },
    "settings": {
        "analysis" : {
            "analyzer" : {
                "snowball_stemmer" : {
                    "tokenizer" : "whitespace",
                    "filter" : ['lowercase', "snow_stem"]
                }
            },
            "filter" : {
                "snow_stem" : {
                    "type" : "snowball",
                    "language" : "Russian"
                }
            }
        }
#         "analysis": {
#             "analyzer": {
#                 "shingle_analyzer": {
#                     "tokenizer": "standard",
#                     "filter": [
#                         "custom_shingle"
#                     ]
#                 }
#             },
#             "filter": {
#                 "custom_shingle": {
#                     "type": "shingle",
#                     "min_shingle_size": "2",
#                     "max_shingle_size": "3"
#                 }
#             }
#         }
    }
}

# Recreate both indices from scratch. The delete is guarded so that the cell also runs
# against a fresh cluster, where an unconditional delete fails because the index is missing.
if es.indices.exists(index='label_list'):
    es.indices.delete(index='label_list')
es.indices.create(index='label_list', body=settings)

if es.indices.exists(index='label_single'):
    es.indices.delete(index='label_single')
es.indices.create(index='label_single', body=settings)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'label_single'}

In [122]:
# Bulk-index one document per label line into 'label_single' (8 threads, 1000 actions per chunk)
# and print the result of any action reported as not ok.
# Only the single-label generator is consumed here; label_list_documents_generator is not used in this cell.
for ok, result in parallel_bulk(es, label_single_documents_generator(), queue_size=8, thread_count=8, chunk_size=1000):
    if not ok:
        print(result)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




### Elasticsearch query builder

In [178]:
# One stopword per line; surrounding whitespace is stripped.
with open('russian_stopwords.txt', 'r') as inf:
    stopwords = [line.strip() for line in inf]

In [202]:
# Mystem part-of-speech tags treated as uninformative for entity search.
# NOTE: the Morpher class below hard-codes the same tags and does not read this list.
stop_pos_tags = [
    'ADVPRO',  # pronominal adverb
    'APRO',    # pronominal adjective
    'CONJ',    # conjunction
    'INTJ',    # interjection
    'PART',    # particle
    'PR',      # preposition
    'SPRO',    # pronoun
    'V',       # verb
    'ADV'      # adverb
]

In [221]:
# class TokenFilter:
#     def __init__(self, stopwords, stop_pos_tags):
#         self.stopwords = set(stopwords)
#         self.stop_pos_tags = set(stop_pos_tags)
        
#     def is_good(self, token):
#         if self.has_first_capital(token):
#             return True
#         if self.pass_stopwords(token) and \
#            self.pass_length(token):
#             return True
#         return False
        
#     def pass_stopwords(self, token):
#         if token in self.stopwords:
#             return False
#         return True
    
#     def pass_length(self, token):
#         if len(token) < 2:
#             return False
#         return True
    
#     def has_first_capital(self, token):
#         if token[0].isupper():
#             return True
#         return False

class Morpher:
    """Thin wrapper around Mystem plus a part-of-speech stop list.

    Parameters
    ----------
    stop_pos_tags : iterable of str, optional
        Part-of-speech tags to reject in `approve_tag`. Defaults to
        `DEFAULT_STOP_POS_TAGS`.
    mystem : object, optional
        Analyzer exposing ``analyze(text)``. A new ``Mystem()`` is created when omitted;
        passing one in allows reusing an existing instance.
    """

    DEFAULT_STOP_POS_TAGS = frozenset([
        'ADVPRO',
        'APRO',
        'CONJ',
        'INTJ',
        'PART',
        'PR',
        'SPRO',
        'V',
        'ADV'
    ])

    def __init__(self, stop_pos_tags=None, mystem=None):
        chosen_tags = self.DEFAULT_STOP_POS_TAGS if stop_pos_tags is None else stop_pos_tags
        self.stop_pos_tags = set(chosen_tags)
        self.mystem = Mystem() if mystem is None else mystem
        
    def analyze(self, text):
        """Return the analyzer's per-token analysis of `text`, unchanged."""
        return self.mystem.analyze(text)
    
    def approve_tag(self, tag):
        """Return True unless `tag` is one of the stop part-of-speech tags."""
        return tag not in self.stop_pos_tags

In [543]:
class Query:
    """Turns a question into tokens and builds Elasticsearch queries on the 'label' field.

    Parameters
    ----------
    text : str
        The question text.
    tokenizer : callable
        Maps a string to a string of space-separated tokens.
    morpher : object
        Provides ``analyze(text)`` (list of dicts with 'text' and optionally 'analysis')
        and ``approve_tag(tag)``.

    The constructor eagerly fills `tokens`, `filtered_tokens`, `token_ngrams`
    and `capital_pairs`.
    """

    def __init__(self, text, tokenizer, morpher):
        self.text = text
        self.tokenizer = tokenizer
        self.morpher = morpher
        self.get_tokens()
        self.get_filtered_tokens()
        self.get_token_ngrams()
        self.get_capital_pairs()
        
    def get_tokens(self):
        """Tokenize the text; stores and returns the token list."""
        self.tokens = self.tokenizer(self.text).split()
        return self.tokens
    
    def get_filtered_tokens(self):
        """Select the tokens worth searching for; stores and returns them.

        The first element of the analysis (the first word of the text) is always skipped.
        Of the rest, only entries carrying an 'analysis' key are considered:
        capitalised tokens longer than one character are kept unconditionally,
        the others are kept when their part-of-speech tag is approved by the morpher.
        """
        self.filtered_tokens = []
        analysis = self.morpher.analyze(' '.join(self.tokens))
        for i, ta in enumerate(analysis):
            if i == 0 or 'analysis' not in ta:
                continue
            word = ta['text']
            if word[0].isupper() and len(word) > 1:
                self.filtered_tokens.append(word)
                continue
            variants = ta['analysis']
            if not variants:
                # The analyzer recognised a word but offered no reading for it, so there
                # is no tag to test. Indexing [0] here used to raise IndexError; such a
                # token cannot carry a stop tag, so it is kept.
                self.filtered_tokens.append(word)
                continue
            # The tag is the leading part of 'gr', before the first ',' or '='.
            pos_tag = variants[0]['gr'].split(',', 1)[0].split('=', 1)[0]
            if self.morpher.approve_tag(pos_tag):
                self.filtered_tokens.append(word)
        return self.filtered_tokens
    
    def get_token_ngrams(self, n=3):
        """Build all runs of `n` consecutive tokens (space-joined); stores and returns them."""
        self.token_ngrams = [
            ' '.join(self.tokens[i:(i + n)])
            for i in range(len(self.tokens) - (n - 1))
        ]
        return self.token_ngrams
    
    def get_capital_pairs(self):
        """Collect adjacent token pairs where both are capitalised and longer than one character.

        The scan starts at index 1, so the first token never opens a pair.
        """
        self.capital_pairs = []
        for i in range(1, len(self.tokens) - 1):
            left, right = self.tokens[i], self.tokens[i + 1]
            if left[0].isupper() and len(left) > 1 and right[0].isupper() and len(right) > 1:
                self.capital_pairs.append(' '.join((left, right)))
        return self.capital_pairs
    
    def build_match_query(self, query, fuzziness='AUTO'):
        """Return a fuzzy 'match' clause on 'label' for `query`."""
        return  { 
                    'match': {
                        'label': {
                            'query': query,
                            "fuzziness": fuzziness,
                            "prefix_length": 1,
                            'fuzzy_transpositions': False
                        }
                    }
                }
    
    def build_phrase_query(self, query):
        """Return a 'match_phrase' clause on 'label' for `query`."""
        return  { 
                    'match_phrase': {
                        'label': query
                    }
                }
    
    def get_phrase_queries(self):
        """Phrase clauses for every token n-gram, followed by every capitalised pair."""
        phrases = list(self.token_ngrams) + list(self.capital_pairs)
        return [self.build_phrase_query(phrase) for phrase in phrases]
    
    def get_fulltext_queries(self):
        """A single fuzzy match clause over all filtered tokens."""
        return [self.build_match_query(' '.join(self.filtered_tokens))]
    
    def get_single_capital_queries(self):
        """One phrase clause per filtered token that is capitalised and longer than two characters."""
        return [
            self.build_phrase_query(token)
            for token in self.filtered_tokens
            if token[0].isupper() and len(token) > 2
        ]
    
    def get_partial_queries(self):
        """Non-fuzzy match clauses over windows of up to three filtered tokens, stepping by two."""
        qs = []
        n = len(self.filtered_tokens)
        for i in range(0, n, 2):
            subtext = ' '.join(self.filtered_tokens[i:(min(i + 3, n))])
            qs.append(self.build_match_query(subtext, fuzziness=0))
        return qs
        
    def build_es_query(self, queries):
        """Wrap `queries` as the 'should' part of a bool query body."""
        return  {
                    'query': {
                        'bool': {
                            'should': queries
                        }
                    }
                }

In [603]:
# Demo: time the construction of a Query (tokenizing and morphological analysis happen
# in the constructor), then display its phrase clauses.
mrph = Morpher()
with Timer():
    q = Query('О какой битве в российской истории говорил русский поэт А . Пушкин?', tokenizer=tokenizer, morpher=mrph)
q.get_phrase_queries()

1.551 s.


[{'match_phrase': {'label': 'О какой битве'}},
 {'match_phrase': {'label': 'какой битве в'}},
 {'match_phrase': {'label': 'битве в российской'}},
 {'match_phrase': {'label': 'в российской истории'}},
 {'match_phrase': {'label': 'российской истории говорил'}},
 {'match_phrase': {'label': 'истории говорил русский'}},
 {'match_phrase': {'label': 'говорил русский поэт'}},
 {'match_phrase': {'label': 'русский поэт А'}},
 {'match_phrase': {'label': 'поэт А Пушкин'}}]

In [535]:
# Display the complete request body: the phrase clauses wrapped into bool/should.
q.build_es_query(q.get_phrase_queries())

{'query': {'bool': {'should': [{'match_phrase': {'label': 'О какой битве'}},
    {'match_phrase': {'label': 'какой битве в'}},
    {'match_phrase': {'label': 'битве в российской'}},
    {'match_phrase': {'label': 'в российской истории'}},
    {'match_phrase': {'label': 'российской истории говорил'}},
    {'match_phrase': {'label': 'истории говорил русский'}},
    {'match_phrase': {'label': 'говорил русский поэт'}},
    {'match_phrase': {'label': 'русский поэт А'}},
    {'match_phrase': {'label': 'поэт А Пушкин'}}]}}}

In [639]:
# Tokenized sample question; only the commented-out request bodies below refer to `query`.
query = tokenizer('Кого сыграл Жан Рено в фильме "Ее звали Никита"?')
# doc = {
#     'query': {
#         "fuzzy" : {
#             "labels" : {
#                 "value": "домашняя кошка",
#                 "fuzziness": 2,
#                 "prefix_length": 0,
#                 "max_expansions": 100,
#                 'transpositions': False
#             }
#         }
#     }
# }
# doc = {
#     'query': {
#         'match': {
#             'label': {
#                 'query': query,
#                 "fuzziness": 'AUTO',
#                 "prefix_length": 1,
#                 'fuzzy_transpositions': False
#             }
#         }
#     }
# }
# doc = {
#     'query': {
#         'match_phrase': {
#             'label': query
#         }
#     }
# }
# Hand-written bool/should request with a single fuzzy match on 'Пушкин'.
# NOTE: `doc` is reassigned from a Query object further down in this cell before the
# search runs, so this body is not the one that gets executed.
doc = {
    'query': {
        'bool': {
            'should': [
#                 {
#                     'match_phrase': {
#                         'label': 'Жан Рено'
#                     }
#                 },
#                 {
#                     'match_phrase': {
#                         'label': 'Ее звали Никита'
#                     }
#                 },
                { 
                    'match': {
                        'label': {
                            'query': 'Пушкин',
                            "fuzziness": 'AUTO:7,10',
                            "prefix_length": 1,
                            'fuzzy_transpositions': False
                        }
                    }
                }
            ]
        }
    }
}
# doc = {
#     'query': {
#         'match_all': {}
#     }
# }
# Build the request from a Query object; this replaces any `doc` assigned earlier in the cell.
q = Query('Какое профессиональное прозвище у Максима Галкина?', tokenizer=tokenizer, morpher=mrph)
print(q.filtered_tokens)
doc = q.build_es_query(q.get_fulltext_queries())

# Run the full-text request against the single-label index and display the 'hits' section.
res = es.search(index='label_single', body=doc, size=20)
res['hits']

['профессиональное', 'прозвище', 'Максима', 'Галкина']


{'total': {'value': 3151, 'relation': 'eq'},
 'max_score': 15.187308,
 'hits': [{'_index': 'label_single',
   '_type': '_doc',
   '_id': 'Z9rupmwBg_1FN09H97FX',
   '_score': 15.187308,
   '_source': {'qid': 'Q49614', 'label': 'прозвище'}},
  {'_index': 'label_single',
   '_type': '_doc',
   '_id': 'PhnxpmwBg_1FN09H_iAA',
   '_score': 12.729599,
   '_source': {'qid': 'Q16593872', 'label': 'Галкина Галина Анатольевна'}},
  {'_index': 'label_single',
   '_type': '_doc',
   '_id': 'PxnxpmwBg_1FN09H_iAA',
   '_score': 12.729599,
   '_source': {'qid': 'Q16593872', 'label': 'Галина Анатольевна Галкина'}},
  {'_index': 'label_single',
   '_type': '_doc',
   '_id': '3ejvpmwBg_1FN09HueDA',
   '_score': 12.183937,
   '_source': {'qid': 'Q1371427', 'label': 'национальное прозвище'}},
  {'_index': 'label_single',
   '_type': '_doc',
   '_id': '3ujvpmwBg_1FN09HueDA',
   '_score': 12.183937,
   '_source': {'qid': 'Q1371427', 'label': 'национальные прозвища'}},
  {'_index': 'label_single',
   '_type':

### Execute query

In [629]:
class Matcher:
    """Runs label queries against Elasticsearch and enriches the hits via Wikidata/Wikipedia.

    Parameters
    ----------
    es_instance : object
        Elasticsearch client; only ``search(index=..., body=..., size=...)`` is used.
    """

    WIKIDATA_API = 'https://www.wikidata.org/w/api.php'
    RUWIKI_API = 'https://ru.wikipedia.org/w/api.php'

    def __init__(self, es_instance):
        self.es = es_instance
        
    def _smooth_score(self, score, d=5):
        """Truncate `score` to an int and round it up to the next multiple of `d`."""
        score = int(score)
        while score % d != 0:
            score += 1
        return score
    
    def _parse_id(self, qid):
        """Return the numeric part of a qid such as 'Q42' (first character dropped)."""
        return int(qid[1:])
    
    def _sorting_key(self, match):
        """Sort key: (smoothed score, negated numeric qid)."""
        score = self._smooth_score(match['score'])
        qid = self._parse_id(match['qid'])
        return score, -qid
    
    def get_names_and_descriptions(self, qids):
        """Fetch Russian labels and descriptions for `qids` from the Wikidata API.

        Returns ``{qid: {'name': str or None, 'description': str or None}}``.
        An empty `qids` yields ``{}`` without any request.
        """
        qids = list(qids)
        if not qids:
            # An empty id list would produce a malformed API call.
            return {}
        # Passing `params` lets requests URL-encode the values.
        params = {
            'action': 'wbgetentities',
            'format': 'json',
            'ids': '|'.join(qids),
            'languages': 'ru',
            'props': 'labels|descriptions',
        }
        entities = requests.get(self.WIKIDATA_API, params=params).json().get('entities', {})
        result = {}
        for qid in qids:
            # Entities (or their 'labels'/'descriptions' sections) absent from the
            # response are reported as None rather than raising KeyError.
            cur_data = entities.get(qid, {})
            ru_label = cur_data.get('labels', {}).get('ru')
            ru_description = cur_data.get('descriptions', {}).get('ru')
            result[qid] = {
                'name': ru_label['value'] if ru_label else None,
                'description': ru_description['value'] if ru_description else None,
            }
        return result
    
    def get_wikipedia_pageviews(self, titles_dict):
        """Fill `titles_dict` in place with 30-day ru.wikipedia pageview totals.

        `titles_dict` maps page titles to view counts; its keys select the pages.
        Returns None. Days reported as None are ignored in the sum.
        """
        if not titles_dict:
            return
        titles = '|'.join(title.replace(' ', '_') for title in titles_dict)
        # `params` URL-encodes the titles, so characters such as '&', '#' or '+'
        # no longer corrupt the request.
        params = {
            'action': 'query',
            'format': 'json',
            'prop': 'pageviews',
            'pvipdays': 30,
            'titles': titles,
        }
        resp = {}
        # The same request is re-sent (at most once per title) until the response is
        # flagged 'batchcomplete'.
        # NOTE(review): presumably the API fills in pageviews incrementally across
        # identical requests — confirm; no continuation token is sent.
        for _ in range(len(titles_dict)):
            resp = requests.get(self.RUWIKI_API, params=params).json()
            if 'batchcomplete' in resp:
                break
        pages_data = resp.get('query', {}).get('pages', {})
        
        for entry in pages_data.values():
            # Entries without statistics count as zero views.
            view_stats = entry.get('pageviews') or {}
            titles_dict[entry['title']] = sum(filter(None, view_stats.values()))
    
    def get_wikipedia_pages(self, qids):
        """Look up the ru.wikipedia page of each qid and its 30-day pageviews.

        Returns ``{qid: {'ruwiki': title or None, 'views': int}}``; qids without a
        ruwiki sitelink get 0 views. An empty `qids` yields ``{}`` without any request.
        """
        qids = list(qids)
        if not qids:
            return {}
        params = {
            'action': 'wbgetentities',
            'format': 'json',
            'ids': '|'.join(qids),
            'sitefilter': 'ruwiki',
            'props': 'sitelinks',
        }
        entities = requests.get(self.WIKIDATA_API, params=params).json().get('entities', {})
        result = {}
        views = {}
        for qid in qids:
            sitelink = entities.get(qid, {}).get('sitelinks', {}).get('ruwiki')
            wiki_title = sitelink['title'] if sitelink else None
            if wiki_title is not None:
                views[wiki_title] = 0
            result[qid] = {'ruwiki': wiki_title}
            
        self.get_wikipedia_pageviews(views)
        for qid in qids:
            cur_title = result[qid]['ruwiki']
            result[qid]['views'] = views[cur_title] if cur_title is not None else 0
        return result

    def _apply_ranking(self, matches, relative_rate=0.9):
        """Order matches by score, then reorder near-ties by pageviews.

        After sorting by score (descending), consecutive matches stay in one group while
        each score is at least `relative_rate` times the score of the match just before
        it; every group is then sorted by 'views' (descending).
        """
        if len(matches) == 0:
            return matches
        
        matches = sorted(matches, key=lambda m: m['score'], reverse=True)
        
        group_start = 0
        cur_pos = 1
        while cur_pos < len(matches):
            # NOTE(review): the comparison is against the previous match, not the group's
            # top score, so a long chain of small drops stays in one group — confirm intent.
            while cur_pos < len(matches) and matches[cur_pos]['score'] >= relative_rate * matches[cur_pos - 1]['score']:
                cur_pos += 1
            matches[group_start:cur_pos] = sorted(matches[group_start:cur_pos], key=lambda m: m['views'], reverse=True)
            group_start = cur_pos
            cur_pos = group_start + 1
            
        return matches
        
    def get_query_matches(self, query, n_matches=30):
        """Run `query` on the 'label_single' index and return enriched, ranked matches.

        Hits are de-duplicated by qid (first hit wins). Each match is the hit's source
        dict extended with 'score', 'name', 'description', 'ruwiki' and 'views'.
        Returns an empty list when nothing matched.
        """
        # Use the injected client rather than a module-level `es`.
        es_result = self.es.search(index='label_single', body=query, size=n_matches)['hits']
        matches_dict = {}
        for entry in es_result['hits']:
            qid = entry['_source']['qid']
            if qid not in matches_dict:
                matches_dict[qid] = entry['_source']
                matches_dict[qid]['score'] = entry['_score']

        if not matches_dict:
            # Nothing to enrich; skip the API calls altogether.
            return []

        qids = list(matches_dict)
        for qid, nds in self.get_names_and_descriptions(qids).items():
            matches_dict[qid].update(nds)
            
        for qid, wps in self.get_wikipedia_pages(qids).items():
            matches_dict[qid].update(wps)
            
        return self._apply_ranking(list(matches_dict.values()))

In [630]:
# Matcher bound to the shared Elasticsearch client.
mtc = Matcher(es_instance=es)

In [640]:
# Only the last expression of a cell is displayed, so the first result is discarded.
# Note that the first call passes a single clause dict, whereas the second passes a list of clauses.
q.build_es_query(q.get_single_capital_queries()[0])
q.build_es_query(q.get_phrase_queries())

{'query': {'bool': {'should': [{'match_phrase': {'label': 'Какое профессиональное прозвище'}},
    {'match_phrase': {'label': 'профессиональное прозвище у'}},
    {'match_phrase': {'label': 'прозвище у Максима'}},
    {'match_phrase': {'label': 'у Максима Галкина'}},
    {'match_phrase': {'label': 'Максима Галкина'}}]}}}

In [641]:
# mtc.get_query_matches(q.build_es_query(q.get_fulltext_queries()))
# mtc.get_query_matches(q.build_es_query(q.get_single_capital_queries()[0]))
# Display the enriched, ranked matches for the phrase clauses of `q`.
mtc.get_query_matches(q.build_es_query(q.get_phrase_queries()))

[{'qid': 'Q32360836',
  'label': 'Кто хочет стать Максимом Галкиным',
  'score': 12.354108,
  'name': 'Кто хочет стать Максимом Галкиным?',
  'description': None,
  'ruwiki': 'Кто хочет стать Максимом Галкиным?',
  'views': 324},
 {'qid': 'Q245445',
  'label': 'Кто хочет стать Максимом Галкиным',
  'score': 12.354108,
  'name': 'Кто хочет стать Максимом Галкиным?',
  'description': None,
  'ruwiki': None,
  'views': 0}]

In [620]:
class Suggester:
    """Combines phrase, single-token and full-text searches into one list of entity suggestions.

    Parameters
    ----------
    matcher : object
        Provides ``get_query_matches(query, n_matches=...)`` returning a list of match dicts.
    tokenizer, morpher
        Passed through to `Query` when a question is parsed.
    """

    def __init__(self, matcher, tokenizer, morpher):
        self.matcher = matcher
        self.tokenizer = tokenizer
        self.morpher = morpher
        
    def _build_query(self, text):
        """Parse `text` into a Query object."""
        return Query(text=text, tokenizer=self.tokenizer, morpher=self.morpher)
    
    def _get_matches(self, q, query_type):
        """Run one kind of search for `q`.

        'fulltext' and 'phrase' return a list of matches; 'single' returns a list of
        match lists, one per capitalised token.

        Raises
        ------
        ValueError
            For any other `query_type` (previously this silently returned None).
        """
        if query_type == 'fulltext':
            return self.matcher.get_query_matches(q.build_es_query(q.get_fulltext_queries()), n_matches=20)
        if query_type == 'phrase':
            return self.matcher.get_query_matches(q.build_es_query(q.get_phrase_queries()), n_matches=8)
        if query_type == 'single':
            return [self.matcher.get_query_matches(q.build_es_query(single_query), n_matches=10)
                    for single_query in q.get_single_capital_queries()]
        raise ValueError(f'Unknown query_type: {query_type!r}')
        
    def _select_matches(self, q, min_total=8, f=5, p=4, s=1):
        """Pick the matches to suggest, tagging each with its 'source'.

        Parameters
        ----------
        q : Query
        min_total : int
            Full-text matches keep being added until this many distinct qids are collected ...
        f : int
            ... and at least this many full-text matches were added.
        p : int
            Maximum number of phrase matches kept.
        s : int
            Currently unused.

        Returns the phrase matches, then the new full-text matches, then the new
        single-token top matches; the latter two groups are each sorted by views, descending.
        """
        matches_qids = set()
        
        # Slicing already copes with lists shorter than p.
        phrase_matches = self._get_matches(q, query_type='phrase')[:p]
        for match in phrase_matches:
            matches_qids.add(match['qid'])
            match['source'] = 'phrase'
        
        single_matches_result = []
        for single_match in self._get_matches(q, query_type='single'):
            if not single_match:
                # A token with no hits has no top match; indexing [0] used to raise IndexError.
                continue
            top_match = single_match[0]
            if top_match['qid'] not in matches_qids:
                matches_qids.add(top_match['qid'])
                top_match['source'] = 'single'
                single_matches_result.append(top_match)
        single_matches_result.sort(key=lambda m: m['views'], reverse=True)
        
        fulltext_matches_result = []
        for match in self._get_matches(q, query_type='fulltext'):
            if match['qid'] not in matches_qids:
                matches_qids.add(match['qid'])
                match['source'] = 'fulltext'
                fulltext_matches_result.append(match)
            if len(matches_qids) >= min_total and len(fulltext_matches_result) >= f:
                break
        fulltext_matches_result.sort(key=lambda m: m['views'], reverse=True)
                
        return list(phrase_matches) + fulltext_matches_result + single_matches_result
    
    def get_suggestions(self, text):
        """Return ``{'text': text, 'matches': [...]}`` for the question `text`."""
        q = self._build_query(text)
        matches = self._select_matches(q)
        return {
            'text': text,
            'matches': matches
        }
    
    @staticmethod
    def pretty_print(entry):
        """Format the result of `get_suggestions` as readable multi-line text.

        Despite the name, nothing is printed; the text is returned.
        """
        lines = []
        
        matches = entry['matches']
        query = entry['text']
        
        lines.append(f'Query: {query}\n')
        
        for match in matches:
            lines.append(match['name'] if match['name'] else '*no name*')

            if match['description']:
                desc = match['description']
                lines.append(f'({desc})')

            qid = match['qid']
            lines.append(f'Link: https://www.wikidata.org/wiki/{qid}\n')
        
        return '\n'.join(lines)

In [633]:
# Wire up the full pipeline: morphology -> Elasticsearch matcher -> suggester.
mrph = Morpher()
mtc = Matcher(es_instance=es)
suggester = Suggester(matcher=mtc, tokenizer=tokenizer, morpher=mrph)

In [634]:
# Time one end-to-end suggestion run and print the formatted result.
with Timer('Make suggestions'):
    suggestions = suggester.get_suggestions('Какой титул носил белогвардейский генерал Петр Николаевич Врангель?')
    print(Suggester.pretty_print(suggestions))
#     print(suggestions)

Query: Какой титул носил белогвардейский генерал Петр Николаевич Врангель?

Пётр Николаевич Врангель
(русский военачальник, один из главных руководителей Белого движения в годы Гражданской войны)
Link: https://www.wikidata.org/wiki/Q108260

Врангель, Николай Николаевич
Link: https://www.wikidata.org/wiki/Q4126988

Пётр Николаевич
(Великий князь)
Link: https://www.wikidata.org/wiki/Q446676

Белянин, Пётр Николаевич
Link: https://www.wikidata.org/wiki/Q15064452

титул
(почётное звание)
Link: https://www.wikidata.org/wiki/Q216353

Врангель (Находка)
Link: https://www.wikidata.org/wiki/Q4126992

Врангели
Link: https://www.wikidata.org/wiki/Q689100

Врангель
(город в штате Аляска, США)
Link: https://www.wikidata.org/wiki/Q43983

почётное звание
(звание, присваиваемое как награда)
Link: https://www.wikidata.org/wiki/Q3320743

Александр II
(Император Всероссийский (1855—1881))
Link: https://www.wikidata.org/wiki/Q83171

Апостол Пётр
(один из двенадцати апостолов (учеников) Иисуса Христа)
Link

In [638]:
# Write formatted suggestions for every question of the quiz file, each entry preceded
# by a delimiter line and a running number.
delim = '######'
with open('quiz_test.txt', 'r') as inf, open('quiz_test_entities.txt', 'w') as ouf:
    for i, line in enumerate(tqdm(inf)):
        # Only even (0-based) lines are treated as questions; the odd lines are skipped.
        if i % 2 == 0:
            question = line.strip('\n')
            # Header goes out before the lookup, exactly as with the original print calls.
            ouf.write(f'{delim}\n{i // 2 + 1}. ')
            suggestions = suggester.get_suggestions(question)
            ouf.write(Suggester.pretty_print(suggestions) + '\n')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [646]:
# Scratch check: run a sample string through the same tokenizer/filter chain as the
# index's 'snowball_stemmer' analyzer to see which tokens end up indexed.
body = {
    'tokenizer': 'whitespace',
    'filter': ['lowercase', 'snow_stem'],
#     'text': 'Эскиз декорации III го акт оперы А Спендиарова  Алмаст '
    'text': 'Максим Галкин'
}

es.indices.analyze(index='label_single', body=body)

{'tokens': [{'token': 'макс',
   'start_offset': 0,
   'end_offset': 6,
   'type': 'word',
   'position': 0},
  {'token': 'галкин',
   'start_offset': 7,
   'end_offset': 13,
   'type': 'word',
   'position': 1}]}

In [647]:
# Rebinds the module-level `mystem` (also used by analyzer/analyzer2) to a fresh instance.
mystem = Mystem()

In [650]:
# Scratch check of the raw Mystem output; only the second call's result is displayed.
mystem.analyze(tokenizer('Кого сыграл Жан Рено в фильме "Ее звали Никита"?'))
mystem.analyze('Максима Галкина')

[{'analysis': [{'lex': 'максим',
    'wt': 0.8066484349,
    'gr': 'S,имя,муж,од=(вин,ед|род,ед)'}],
  'text': 'Максима'},
 {'text': ' '},
 {'analysis': [{'lex': 'галкин',
    'wt': 0.7169838121,
    'gr': 'S,фам,муж,од=(вин,ед|род,ед)'}],
  'text': 'Галкина'},
 {'text': '\n'}]

In [None]:
# Repeat of the stop part-of-speech list defined earlier in the notebook (this cell has no
# execution count). Morpher carries its own copy of these tags and does not read this variable.
stop_pos_tags = [
    'ADVPRO',
    'APRO',
    'CONJ',
    'INTJ',
    'PART',
    'PR',
    'SPRO',
    'V',
    'ADV'
]

In [216]:
# Scratch check: how Mystem tags the lone token 'в'. In the recorded output it is analysed
# as 'S,сокр' (an abbreviated noun), not as a preposition.
mystem.analyze(tokenizer('в'))

[{'analysis': [{'lex': 'в',
    'wt': 8.212235587e-06,
    'gr': 'S,сокр=(пр,мн|пр,ед|вин,мн|вин,ед|дат,мн|дат,ед|род,мн|род,ед|твор,мн|твор,ед|им,мн|им,ед)'}],
  'text': 'в'},
 {'text': '\n'}]

In [16]:
# Scratch: CountVectorizer analyzer with default settings, for comparison with simple_tokenizer.
vec = CountVectorizer()
an = vec.build_analyzer()

In [17]:
# The default analyzer lowercases and drops the one-character token 'в' (see output).
an('Бой в августе 1702 года (lol)')

['бой', 'августе', '1702', 'года', 'lol']

In [169]:
# Scratch check: str.isupper() on a whole word is False for a merely capitalised word,
# which is why the query code tests token[0].isupper() instead.
'ArithmeticErro'.isupper()

False

In [429]:
# Scratch check: summing values that include None raises TypeError (see output).
# Matcher.get_wikipedia_pageviews therefore filters the values with filter(None, ...) before summing.
d = {20: None, 3: 5}
sum(d.values())

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [483]:
# resp = requests.get('https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&ids=Q3656114|Q17&sitefilter=ruwiki&props=sitelinks')
# resp = requests.get('https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&ids=Q42|Q17&languages=ru&props=labels|descriptions')
# resp = requests.get('https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/ru.wikipedia/all-access/all-agents/Обама,_Барак/monthly/2019010100/2019013100')
# Scratch: request 20 days of pageview statistics for a single ru.wikipedia title.
resp = requests.get('https://ru.wikipedia.org/w/api.php?action=query&format=json&prop=pageviews&pvipdays=20&titles=(2208)_Пушкин')
resp

<Response [200]>

In [484]:
# resp.json()['query']['pages'].values()
# Inspect the decoded response: the title comes back normalised (underscore -> space)
# and the most recent day can be None instead of a count.
resp.json()

{'batchcomplete': '',
 'query': {'normalized': [{'from': '(2208)_Пушкин', 'to': '(2208) Пушкин'}],
  'pages': {'4326905': {'pageid': 4326905,
    'ns': 0,
    'title': '(2208) Пушкин',
    'pageviews': {'2019-08-02': 0,
     '2019-08-03': 0,
     '2019-08-04': 0,
     '2019-08-05': 2,
     '2019-08-06': 0,
     '2019-08-07': 0,
     '2019-08-08': 2,
     '2019-08-09': 1,
     '2019-08-10': 0,
     '2019-08-11': 1,
     '2019-08-12': 3,
     '2019-08-13': 1,
     '2019-08-14': 2,
     '2019-08-15': 2,
     '2019-08-16': 1,
     '2019-08-17': 2,
     '2019-08-18': 0,
     '2019-08-19': 2,
     '2019-08-20': 1,
     '2019-08-21': None}}}}}

In [599]:
# Smoke test for Timer: time a one-million-iteration summation loop.
with Timer('Lalka sasatb'):
    s = 0
    for i in range(1000000):
        s += i
    print(s)

499999500000
Lalka sasatb executed in 0.149 s.


AttributeError: 'function' object has no attribute 'tqdm_notebook'