In [1]:
''' This cell contains all constants that may differ on our machines '''

# Host and port of the Elasticsearch node; used to build the client and the _cat/indices URL.
ELASTIC_HOST = 'localhost'
ELASTIC_PORT = 9200
COLLECTION_DIRECTORY = "../byweb" # directory with .out files to process
COLLECTION_DIRECTORY_MYSTEM = "../byweb_stem" # directory with .out files after mystem processing

# Evaluation inputs; both files are opened with cp1251 encoding by read_queries() / read_relevance().
QUERIES_FILE = "../web2008_adhoc.xml"
RELEVANCE_FILE = "../relevant_table_2009.xml"

In [2]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time

In [3]:
# Notebook-wide Elasticsearch client, used for index management, bulk loading and search.
# 'timeout' and 'maxsize' are passed per host; presumably the request timeout in seconds
# and the connection-pool size — TODO confirm against the installed elasticsearch-py version.
es = Elasticsearch([{'host': ELASTIC_HOST, 'port': ELASTIC_PORT, 'timeout': 360, 'maxsize': 25}])

In [4]:
# Index definition handed to es.indices.create() as the request body by recreate_index():
# the two document fields, 'content' and 'title', are both mapped as 'text' and no
# further options are specified.
settings = {
    'mappings': {
        'properties': {
            'content': {
                'type': 'text'
            },
            'title': {
                'type' : 'text'
            }
        }
    }
}

In [7]:
def recreate_index(index_name):
    """Drop `index_name` if it exists, then create it again from `settings`.

    Args:
        index_name: name of the Elasticsearch index to (re)create.

    Raises:
        Whatever the Elasticsearch client raises (connection problems, rejected
        settings, ...). Nothing is swallowed, so a failed delete can no longer
        be mistaken for "the index did not exist".
    """
    # Ask explicitly whether the index is there instead of wrapping the delete
    # in a bare `except: pass`, which also hid connection and permission errors.
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)
    es.indices.create(index=index_name, body=settings)

In [8]:
# Start from an empty 'hw2index' (the index for the unprocessed collection).
# The stemmed index is not recreated here; the call for it is left disabled.
recreate_index('hw2index')
# recreate_index('hw2index_stem')

In [9]:
def create_es_action(index, doc_id, document):
    """Build one bulk action that stores `document` under `doc_id` in `index`.

    Returns a dict with the keys '_index', '_id' and '_source', in that order.
    """
    action = dict(_index=index, _id=doc_id)
    action['_source'] = document
    return action

In [10]:
class Document:
    """One document of the collection.

    Attributes:
        url:      document url
        id:       unique document id (str)
        sz_bytes: document size in bytes before deleting html markup
        sz_words: number of words in document before deleting html markup
        words:    list of words in document after deleting html markup
        links:    list of links in document
    """

    def __init__(self, doc_url, doc_id, sz_bytes, sz_words):
        self.url, self.id = doc_url, doc_id
        self.sz_bytes, self.sz_words = sz_bytes, sz_words
        # Every instance gets its own empty containers; they are assigned
        # after construction (e.g. by MystemProcessor.process).
        self.words, self.links = [], []

In [11]:
import os
from tqdm import tqdm
from tqdm import tqdm_notebook
import pickle

class BaseDocumentProcessor:
    """Interface of the objects that process_file / process_collection feed.

    Subclasses override `process` to consume documents one by one and `result`
    to hand back whatever they accumulated.
    """

    def process(self, document, title=None):
        """Handle one document; the base implementation does nothing.

        Args:
            document: the unpickled document object.
            title: the entry process_file looked up for this document. It is
                optional so that processors which ignore it can still be called
                with a single argument.
        """

    def result(self):
        """Return the accumulated result; the base implementation returns None."""

def process_file(d, f, processor, pbar):
    """Feed every pickled document of one collection file to `processor`.

    Args:
        d: directory that contains the file.
        f: name of the ``.out`` file. A sibling file with ".out" replaced by
            ".title" must exist in `d`; it holds one pickled mapping
            document id -> title.
        processor: object whose ``process(document, title)`` is called once
            per document, in file order.
        pbar: progress bar; ``update(1)`` is called once per processed document.

    Raises:
        KeyError: a document id has no entry in the title mapping.
        pickle.UnpicklingError (or another error raised while loading): the
            file is damaged. Only a clean end of file ends the loop.
    """
    out_path = os.path.join(d, f)
    print("processing", out_path)
    # NOTE: pickle.load can execute arbitrary code from the file; only run this on trusted data.
    with open(out_path, "rb") as fin, open(os.path.join(d, f.replace(".out", ".title")), "rb") as tfin:
        dct = pickle.load(tfin)
        while True:
            try:
                document = pickle.load(fin)
            except EOFError:
                # Regular end of the pickle stream; any other error propagates.
                break
            processor.process(document, dct[document.id])
            # Count only documents that were actually read and processed
            # (updating before the load added one phantom tick per file).
            pbar.update(1)

def process_collection(directory, processor, total=200000):
    """Run `processor` over every ``.out`` file found in `directory`.

    Args:
        directory: directory that holds the ``.out`` files (and the sibling
            ``.title`` files process_file expects).
        processor: object passed through to process_file for each file.
        total: expected number of documents, used only to size the progress
            bar. Defaults to the previously hard-coded 200000.

    Files are visited in sorted name order so that repeated runs process the
    documents in the same order; os.listdir alone gives no ordering guarantee.
    """
    pbar = tqdm(total=total)
    try:
        for file in sorted(os.listdir(directory)):
            if file.endswith(".out"):
                process_file(directory, file, processor, pbar)
    finally:
        # Close the bar even when a file fails, so no stale bar is left in the output.
        pbar.close()

In [None]:
# Normalized URLs of the collection documents. GetDocUrls.process adds a key per
# document (the value is always True) and GraphBuider.process tests membership in it.
document_urls = {}

class GetDocUrls(BaseDocumentProcessor):
    """Records the normalized URL of every processed document in the global `document_urls`."""

    def __init__(self):
        """ do all initialization here """

    def process(self, document, title=None):
        """Register the normalized URL of `document`.

        Args:
            document: Document whose `url` is recorded.
            title: accepted because process_file passes the document title as
                a second argument; it is not used here.
        """
        # str(...)[2:-1] drops the first two and the last character of the
        # url's string form — presumably the b'...' wrapper of a bytes url;
        # TODO confirm. The query string and the fragment are cut off as well.
        document_url = str(document.url)[2:-1].split("?")[0].split("#")[0]
        # endswith() is safe for an empty url, where indexing [-1] would raise IndexError.
        if document_url.endswith('/'):
            document_url = document_url[:-1]
        document_urls[document_url] = True

    def result(self):
        """Nothing to return; the URLs are collected in `document_urls`."""
        
# First pass over the raw collection: fill `document_urls` with every document's URL.
processor = GetDocUrls()        
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

In [None]:
from tqdm import tqdm_notebook
import networkx as nx
import operator
# Placeholder; replaced at the end of this cell by the value GraphBuider.result() returns.
pagerank_dict = {}

class GraphBuider(BaseDocumentProcessor):
    """Collects the links between collection documents and ranks them with PageRank.

    `process` adds one edge (source url, target url) for every distinct link of
    a document whose normalized target is a key of the global `document_urls`.
    `result` builds a directed graph from those edges and returns nx.pagerank of it.
    """

    def __init__(self):
        """ do all initialization here """
        self.edge_list = []

    @staticmethod
    def _normalize(url):
        """Cut the query string and the fragment off `url` and drop one trailing slash."""
        url = url.split("?")[0].split("#")[0]
        # endswith() also copes with an empty string, unlike indexing [-1].
        return url[:-1] if url.endswith('/') else url

    def process(self, document, title=None):
        """Add the in-collection links of `document` to the edge list.

        Args:
            document: Document providing `url` and `links`.
            title: accepted because process_file passes the document title as
                a second argument; it is not used here.
        """
        from urllib.parse import urljoin

        # str(...)[2:-1] drops the first two and the last character of the
        # url's string form — presumably the b'...' wrapper of a bytes url;
        # TODO confirm.
        raw_url = str(document.url)[2:-1]
        document_url = self._normalize(raw_url)

        # A dict keeps the first-seen order while making de-duplication O(1)
        # per link (the previous list lookup was quadratic in the link count).
        targets = {}
        for link in document.links:
            if not link:
                continue
            try:
                # Resolve against the un-normalized page url so that './x',
                # '../x', '/x' and links on directory pages ('.../dir/') end
                # up where a browser would put them.
                absolute = urljoin(raw_url, link)
            except ValueError:
                # urljoin rejects some malformed urls; such a link cannot
                # point to a collection document anyway.
                continue
            targets[self._normalize(absolute)] = True

        for target in targets:
            if target in document_urls:
                self.edge_list.append((document_url, target))

    def result(self):
        """Return nx.pagerank() of the directed graph built from the collected edges."""
        graph = nx.DiGraph(self.edge_list)
        return nx.pagerank(graph)
        
# Second pass over the raw collection: collect the link edges, then keep the
# PageRank result in `pagerank_dict`. Relies on `document_urls` filled by the previous cell.
processor = GraphBuider()        
process_collection(COLLECTION_DIRECTORY, processor)
pagerank_dict = processor.result()

In [15]:
class IndexDocs(BaseDocumentProcessor):
    """Turns documents into Elasticsearch bulk actions for one index.

    Every action stores a JSON string with two fields: 'content' (the
    document's word list) and 'title'. PageRank is not part of the indexed
    document.
    """

    def __init__(self, index_name):
        """Remember the target index and start with an empty action list."""
        self.actions = []
        self.index_name = index_name

    def process(self, document, title):
        """Append the bulk action for `document`.

        Args:
            document: Document providing `id` and `words`.
            title: title stored next to the content.
        """
        # The url normalization and the constant `pagerank = 0` that used to
        # live here were never used, and the url indexing could raise
        # IndexError on an empty url; both are gone.
        source = json.dumps({'content' : document.words, 'title' : title})
        self.actions.append(create_es_action(self.index_name, document.id, source))

    def result(self):
        """Return the list of actions accumulated so far."""
        return self.actions
        


In [13]:
def es_actions_generator(index_name, collection_directory):
    """Build the bulk actions for every document under `collection_directory`.

    Args:
        index_name: index the actions are addressed to.
        collection_directory: directory handed to process_collection.

    Returns:
        The list of actions collected by IndexDocs. Despite the name this is a
        fully materialized list, not a lazy generator.

    Prints the time spent on reading and converting the collection.
    """
    # Imported locally under a private alias: the notebook's import cell runs
    # `from time import time`, which binds the global name `time` to the
    # function, so `time.time()` only worked after a later cell had executed
    # `import time`.
    import time as _time

    processor = IndexDocs(index_name)
    start = _time.time()
    process_collection(collection_directory, processor)
    end = _time.time()
    print("preprocess time = ", end - start)
    return processor.result()

In [16]:
# Rebinds `time` to the module (the import cell bound it to the function time.time);
# es_actions_generator relies on this binding as well.
import time
start = time.time()
# All actions are built first (es_actions_generator returns a list), then sent in
# chunks of 1000 by 4 threads. parallel_bulk yields one (ok, result) pair per
# action; only failures are printed.
for ok, result in tqdm_notebook(parallel_bulk(es, es_actions_generator('hw2index', COLLECTION_DIRECTORY), queue_size=4, thread_count=4, chunk_size=1000)):
    if not ok:
        print(result)
end = time.time()
# Wall-clock seconds for building the actions plus indexing them.
print('Time=' + str(end - start))



  0%|          | 0/200000 [00:00<?, ?it/s][A
  0%|          | 377/200000 [00:00<00:53, 3746.09it/s][A


processing ../byweb/byweb.0.out


  0%|          | 876/200000 [00:00<00:49, 4047.32it/s][A
  1%|          | 1337/200000 [00:00<00:47, 4197.19it/s][A
  1%|          | 2080/200000 [00:00<00:41, 4819.62it/s][A
  1%|▏         | 2604/200000 [00:00<00:39, 4936.55it/s][A
  2%|▏         | 3239/200000 [00:00<00:37, 5285.60it/s][A
  2%|▏         | 3786/200000 [00:00<00:36, 5337.69it/s][A
  2%|▏         | 4308/200000 [00:00<00:37, 5190.22it/s][A
  2%|▏         | 4855/200000 [00:00<00:37, 5270.57it/s][A
  3%|▎         | 5377/200000 [00:01<00:37, 5213.46it/s][A
  3%|▎         | 5895/200000 [00:01<00:39, 4896.75it/s][A
  3%|▎         | 6387/200000 [00:01<00:40, 4764.79it/s][A
  3%|▎         | 6880/200000 [00:01<00:40, 4812.68it/s][A
  4%|▎         | 7363/200000 [00:01<00:43, 4387.54it/s][A
  4%|▍         | 7883/200000 [00:01<00:41, 4599.01it/s][A
  4%|▍         | 8437/200000 [00:01<00:39, 4845.21it/s][A
  4%|▍         | 8931/200000 [00:01<00:40, 4737.88it/s][A
  5%|▍         | 9412/200000 [00:01<00:42, 4531.01it/s][

processing ../byweb/byweb.9.out



 11%|█         | 21375/200000 [00:04<00:38, 4681.38it/s][A
 11%|█         | 21999/200000 [00:04<00:35, 5059.65it/s][A
 11%|█▏        | 22533/200000 [00:04<00:37, 4710.66it/s][A
 12%|█▏        | 23028/200000 [00:04<00:38, 4610.59it/s][A
 12%|█▏        | 23638/200000 [00:04<00:35, 4973.21it/s][A
 12%|█▏        | 24208/200000 [00:04<00:34, 5170.24it/s][A
 12%|█▏        | 24800/200000 [00:05<00:32, 5373.72it/s][A
 13%|█▎        | 25351/200000 [00:05<00:34, 5032.01it/s][A
 13%|█▎        | 25868/200000 [00:05<00:34, 5046.00it/s][A
 13%|█▎        | 26383/200000 [00:05<00:37, 4674.17it/s][A
 13%|█▎        | 26966/200000 [00:05<00:34, 4969.25it/s][A
 14%|█▎        | 27477/200000 [00:05<00:36, 4712.87it/s][A
 14%|█▍        | 28136/200000 [00:05<00:33, 5151.27it/s][A
 14%|█▍        | 28674/200000 [00:05<00:32, 5212.34it/s][A
 15%|█▍        | 29210/200000 [00:05<00:33, 5073.58it/s][A
 15%|█▍        | 29729/200000 [00:06<00:39, 4307.34it/s][A
 15%|█▌        | 30188/200000 [00:06<00

processing ../byweb/byweb.1.out



 21%|██        | 41175/200000 [00:08<00:36, 4331.78it/s][A
 21%|██        | 41682/200000 [00:08<00:34, 4529.56it/s][A
 21%|██        | 42208/200000 [00:08<00:33, 4721.36it/s][A
 21%|██▏       | 42692/200000 [00:08<00:33, 4703.29it/s][A
 22%|██▏       | 43248/200000 [00:08<00:31, 4931.09it/s][A
 22%|██▏       | 44056/200000 [00:08<00:27, 5583.31it/s][A
 22%|██▏       | 44650/200000 [00:09<00:29, 5259.79it/s][A
 23%|██▎       | 45205/200000 [00:09<00:29, 5210.06it/s][A
 23%|██▎       | 45746/200000 [00:09<00:30, 5139.92it/s][A
 23%|██▎       | 46275/200000 [00:09<00:31, 4907.71it/s][A
 23%|██▎       | 46778/200000 [00:09<00:31, 4829.42it/s][A
 24%|██▎       | 47400/200000 [00:09<00:29, 5176.31it/s][A
 24%|██▍       | 47957/200000 [00:09<00:28, 5281.03it/s][A
 24%|██▍       | 48495/200000 [00:09<00:29, 5171.43it/s][A
 25%|██▍       | 49107/200000 [00:09<00:27, 5423.49it/s][A
 25%|██▌       | 50016/200000 [00:10<00:24, 6165.48it/s][A
 25%|██▌       | 50690/200000 [00:10<00

processing ../byweb/byweb.5.out



 31%|███       | 61271/200000 [00:12<00:27, 5068.37it/s][A
 31%|███       | 61811/200000 [00:12<00:28, 4878.40it/s][A
 31%|███       | 62324/200000 [00:12<00:29, 4715.20it/s][A
 31%|███▏      | 62848/200000 [00:12<00:28, 4859.02it/s][A
 32%|███▏      | 63348/200000 [00:12<00:31, 4404.01it/s][A
 32%|███▏      | 63870/200000 [00:12<00:29, 4620.19it/s][A
 32%|███▏      | 64348/200000 [00:13<00:31, 4277.56it/s][A
 32%|███▏      | 64792/200000 [00:13<00:32, 4157.11it/s][A
 33%|███▎      | 65334/200000 [00:13<00:30, 4468.53it/s][A
 33%|███▎      | 65816/200000 [00:13<00:29, 4568.15it/s][A
 33%|███▎      | 66284/200000 [00:13<00:29, 4486.10it/s][A
 33%|███▎      | 66912/200000 [00:13<00:27, 4901.76it/s][A
 34%|███▎      | 67478/200000 [00:13<00:25, 5104.83it/s][A
 34%|███▍      | 68048/200000 [00:13<00:25, 5268.08it/s][A
 34%|███▍      | 68638/200000 [00:13<00:24, 5405.13it/s][A
 35%|███▍      | 69188/200000 [00:14<00:25, 5193.00it/s][A
 35%|███▍      | 69716/200000 [00:14<00

processing ../byweb/byweb.8.out



 41%|████      | 81258/200000 [00:16<00:25, 4648.76it/s][A
 41%|████      | 81732/200000 [00:16<00:26, 4514.51it/s][A
 41%|████      | 82191/200000 [00:16<00:31, 3752.43it/s][A
 41%|████▏     | 82787/200000 [00:16<00:28, 4181.17it/s][A
 42%|████▏     | 83240/200000 [00:17<00:28, 4157.79it/s][A
 42%|████▏     | 83680/200000 [00:17<00:28, 4018.09it/s][A
 42%|████▏     | 84291/200000 [00:17<00:25, 4477.52it/s][A
 42%|████▏     | 84768/200000 [00:17<00:26, 4337.17it/s][A
 43%|████▎     | 85254/200000 [00:17<00:25, 4480.93it/s][A
 43%|████▎     | 85719/200000 [00:17<00:26, 4269.08it/s][A
 43%|████▎     | 86188/200000 [00:17<00:25, 4384.98it/s][A
 43%|████▎     | 86708/200000 [00:17<00:24, 4600.07it/s][A
 44%|████▎     | 87178/200000 [00:17<00:24, 4618.50it/s][A
 44%|████▍     | 87647/200000 [00:18<00:25, 4438.20it/s][A
 44%|████▍     | 88223/200000 [00:18<00:23, 4765.76it/s][A
 44%|████▍     | 88711/200000 [00:18<00:25, 4336.21it/s][A
 45%|████▍     | 89160/200000 [00:18<00

processing ../byweb/byweb.7.out



 51%|█████     | 101130/200000 [00:21<00:20, 4728.83it/s][A
 51%|█████     | 101796/200000 [00:21<00:18, 5178.75it/s][A
 51%|█████     | 102441/200000 [00:21<00:17, 5504.02it/s][A
 52%|█████▏    | 103137/200000 [00:21<00:16, 5869.24it/s][A
 52%|█████▏    | 103961/200000 [00:21<00:15, 6390.46it/s][A
 52%|█████▏    | 104631/200000 [00:21<00:17, 5604.71it/s][A
 53%|█████▎    | 105231/200000 [00:21<00:17, 5376.67it/s][A
 53%|█████▎    | 105798/200000 [00:21<00:18, 5077.40it/s][A
 53%|█████▎    | 106330/200000 [00:21<00:19, 4802.70it/s][A
 53%|█████▎    | 106830/200000 [00:22<00:21, 4360.82it/s][A
 54%|█████▎    | 107318/200000 [00:22<00:20, 4504.05it/s][A
 54%|█████▍    | 107870/200000 [00:22<00:19, 4765.66it/s][A
 54%|█████▍    | 108453/200000 [00:22<00:18, 5041.28it/s][A
 54%|█████▍    | 108973/200000 [00:22<00:20, 4527.33it/s][A
 55%|█████▍    | 109555/200000 [00:22<00:18, 4850.23it/s][A
 55%|█████▌    | 110061/200000 [00:22<00:19, 4542.56it/s][A
 55%|█████▌    | 110622

processing ../byweb/byweb.3.out



 61%|██████    | 121220/200000 [00:24<00:16, 4691.81it/s][A
 61%|██████    | 121702/200000 [00:25<00:17, 4491.13it/s][A
 61%|██████    | 122162/200000 [00:25<00:18, 4157.75it/s][A
 61%|██████▏   | 122590/200000 [00:25<00:18, 4132.82it/s][A
 62%|██████▏   | 123121/200000 [00:25<00:17, 4426.13it/s][A
 62%|██████▏   | 123576/200000 [00:25<00:18, 4128.49it/s][A
 62%|██████▏   | 124098/200000 [00:25<00:17, 4404.44it/s][A
 62%|██████▏   | 124553/200000 [00:25<00:17, 4260.56it/s][A
 62%|██████▏   | 124990/200000 [00:25<00:18, 4135.74it/s][A
 63%|██████▎   | 125412/200000 [00:26<00:19, 3845.03it/s][A
 63%|██████▎   | 125807/200000 [00:26<00:19, 3778.37it/s][A
 63%|██████▎   | 126250/200000 [00:26<00:18, 3950.14it/s][A
 63%|██████▎   | 126700/200000 [00:26<00:17, 4100.11it/s][A
 64%|██████▎   | 127117/200000 [00:26<00:18, 3933.84it/s][A
 64%|██████▍   | 127584/200000 [00:26<00:17, 4128.58it/s][A
 64%|██████▍   | 128373/200000 [00:26<00:14, 4777.60it/s][A
 64%|██████▍   | 128894

processing ../byweb/byweb.2.out



 70%|███████   | 140944/200000 [00:29<00:14, 4047.60it/s][A
 71%|███████   | 141427/200000 [00:29<00:13, 4252.07it/s][A
 71%|███████   | 141994/200000 [00:29<00:12, 4595.97it/s][A
 71%|███████   | 142491/200000 [00:29<00:12, 4644.18it/s][A
 71%|███████▏  | 142982/200000 [00:29<00:12, 4498.38it/s][A
 72%|███████▏  | 143451/200000 [00:29<00:13, 4235.69it/s][A
 72%|███████▏  | 143891/200000 [00:29<00:13, 4120.46it/s][A
 72%|███████▏  | 144379/200000 [00:29<00:12, 4321.65it/s][A
 72%|███████▏  | 144916/200000 [00:30<00:12, 4589.66it/s][A
 73%|███████▎  | 145494/200000 [00:30<00:11, 4889.65it/s][A
 73%|███████▎  | 145997/200000 [00:30<00:11, 4694.65it/s][A
 73%|███████▎  | 146531/200000 [00:30<00:10, 4870.58it/s][A
 74%|███████▎  | 147028/200000 [00:30<00:12, 4278.82it/s][A
 74%|███████▍  | 147550/200000 [00:30<00:11, 4522.84it/s][A
 74%|███████▍  | 148020/200000 [00:30<00:11, 4384.75it/s][A
 74%|███████▍  | 148867/200000 [00:30<00:09, 5125.54it/s][A
 75%|███████▍  | 149438

processing ../byweb/byweb.6.out



 81%|████████  | 161438/200000 [00:33<00:06, 5798.05it/s][A
 81%|████████  | 162038/200000 [00:33<00:06, 5760.51it/s][A
 81%|████████▏ | 162629/200000 [00:33<00:06, 5669.80it/s][A
 82%|████████▏ | 163207/200000 [00:33<00:06, 5360.67it/s][A
 82%|████████▏ | 163777/200000 [00:33<00:06, 5455.07it/s][A
 82%|████████▏ | 164331/200000 [00:33<00:06, 5141.37it/s][A
 82%|████████▏ | 164855/200000 [00:33<00:06, 5102.69it/s][A
 83%|████████▎ | 165532/200000 [00:33<00:06, 5507.88it/s][A
 83%|████████▎ | 166097/200000 [00:34<00:06, 5189.18it/s][A
 83%|████████▎ | 166659/200000 [00:34<00:06, 5303.85it/s][A
 84%|████████▎ | 167200/200000 [00:34<00:06, 5298.90it/s][A
 84%|████████▍ | 167737/200000 [00:34<00:06, 4710.67it/s][A
 84%|████████▍ | 168226/200000 [00:34<00:07, 4367.61it/s][A
 84%|████████▍ | 168751/200000 [00:34<00:06, 4599.35it/s][A
 85%|████████▍ | 169227/200000 [00:34<00:06, 4416.38it/s][A
 85%|████████▍ | 169681/200000 [00:34<00:06, 4360.75it/s][A
 85%|████████▌ | 170406

processing ../byweb/byweb.4.out



 90%|█████████ | 180911/200000 [00:37<00:04, 4278.58it/s][A
 91%|█████████ | 181601/200000 [00:37<00:03, 4828.73it/s][A
 91%|█████████ | 182148/200000 [00:37<00:03, 4999.09it/s][A
 91%|█████████▏| 182670/200000 [00:37<00:03, 4722.97it/s][A
 92%|█████████▏| 183161/200000 [00:37<00:03, 4450.09it/s][A
 92%|█████████▏| 183638/200000 [00:37<00:03, 4540.90it/s][A
 92%|█████████▏| 184104/200000 [00:37<00:03, 4012.80it/s][A
 92%|█████████▏| 184720/200000 [00:38<00:03, 4434.86it/s][A
 93%|█████████▎| 185514/200000 [00:38<00:02, 5111.41it/s][A
 93%|█████████▎| 186210/200000 [00:38<00:02, 5551.51it/s][A
 93%|█████████▎| 186816/200000 [00:38<00:02, 4909.99it/s][A
 94%|█████████▎| 187357/200000 [00:38<00:02, 4932.30it/s][A
 94%|█████████▍| 187885/200000 [00:38<00:02, 4483.79it/s][A
 94%|█████████▍| 188366/200000 [00:38<00:02, 4328.85it/s][A
 94%|█████████▍| 188823/200000 [00:38<00:02, 4184.99it/s][A
 95%|█████████▍| 189271/200000 [00:39<00:02, 4237.89it/s][A
 95%|█████████▍| 189862

preprocess time =  41.25140118598938


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


200010it [00:53, 4605.01it/s]                            [A


Time=178.12326383590698


In [None]:
# Same bulk load as for 'hw2index', but reading the mystem-processed collection
# into 'hw2index_stem'.
import time
start = time.time()
# parallel_bulk yields one (ok, result) pair per action; only failures are printed.
for ok, result in tqdm_notebook(parallel_bulk(es, es_actions_generator('hw2index_stem', COLLECTION_DIRECTORY_MYSTEM), queue_size=4, thread_count=4, chunk_size=1000)):
    if not ok:
        print(result)
end = time.time()
# Wall-clock seconds for building the actions plus indexing them.
print('Time=' + str(end - start))

In [None]:
import requests
# Query string for the request below: a single `v` parameter with an empty value,
# i.e. the verbose switch of the _cat API (a URL parameter, not a `-v` command-line flag).
param = (('v', ''),)

# Fetch the node's index listing over plain HTTP and show the response body as the cell output.
resp = requests.get(f'http://{ELASTIC_HOST}:{ELASTIC_PORT}/_cat/indices', params=param)
resp.text

In [17]:
from bs4 import BeautifulSoup

def read_queries():
    """Load the queries from QUERIES_FILE.

    The file is read as cp1251 text and parsed with BeautifulSoup; every
    <task> element contributes one entry.

    Returns:
        dict mapping the task's "id" attribute to the task's text.
    """
    with open(QUERIES_FILE, "rt", encoding="cp1251") as file:
        markup = file.read()
    soup = BeautifulSoup(markup)
    return {task["id"]: task.text for task in soup.find_all("task")}

def read_relevance():
    """Load the relevance judgments from RELEVANCE_FILE.

    The file is read as cp1251 text and parsed with BeautifulSoup. For every
    <task> the ids of its <document> elements whose "relevance" attribute
    equals "vital" are collected; tasks without such documents are left out.

    Returns:
        dict mapping task id -> non-empty list of vital document ids.
    """
    with open(RELEVANCE_FILE, "rt", encoding="cp1251") as file:
        soup = BeautifulSoup(file.read())

    relevance = {}
    for task in soup.find_all("task"):
        vital_ids = []
        for doc in task.find_all("document"):
            if doc["relevance"] == "vital":
                vital_ids.append(doc["id"])
        if vital_ids:
            relevance[task["id"]] = vital_ids
    return relevance

In [18]:
# Evaluation data kept as notebook globals: `relevance` is read directly by
# count_metrics, `queries` is passed to it as an argument.
relevance = read_relevance()
queries = read_queries()

In [19]:
def build_query(query):
    """Build the search body for `query`.

    The body is a bool query with two `should` clauses: a match on 'content'
    with boost 1 and a match on 'title' with operator 'OR' and boost 0.3.
    """
    content_clause = {
        'match': {
            'content': {'query': query, 'boost': 1},
        },
    }
    title_clause = {
        'match': {
            'title': {'query': query, 'operator': 'OR', 'boost': 0.3},
        },
    }
    return {'query': {'bool': {'should': [content_clause, title_clause]}}}

def run_search(query, size, index_name):
    """Search `index_name` for `query` and return the ids of up to `size` hits.

    The ids are returned in the order in which the hits appear in the response.
    """
    response = es.search(index=index_name, body=build_query(query), size=size)
    doc_ids = []
    for hit in response['hits']['hits']:
        doc_ids.append(hit['_id'])
    return doc_ids

def count_metrics(index_name, queries):
    """Evaluate `index_name` on `queries` and print the averaged metrics.

    Only queries whose id has a non-empty entry in the global `relevance`
    mapping take part. For each of them max(20, number of relevant documents)
    hits are fetched through run_search and compared with the relevant ids.

    Args:
        index_name: name of the Elasticsearch index to search.
        queries: dict mapping query id -> query text.

    Returns:
        dict mapping query id -> F1 of precision@20 and recall@20
        (0 when none of the top 20 hits is relevant). Empty when no query
        could be evaluated.

    Prints p@20, r@20, r-precision and map@20 averaged over the evaluated
    queries, or a short notice when there was nothing to evaluate.
    """
    p = 0
    r = 0
    rp = 0
    ma_p = 0
    n = 0
    f1 = {}
    for qid, query in queries.items():
        rel_ids = relevance.get(qid)
        if not rel_ids:
            # No judgments for this query (or an empty list, which would
            # otherwise divide by zero below).
            continue
        n += 1
        relevant = len(rel_ids)
        rel = set(rel_ids)  # constant-time membership tests
        hits = run_search(query, max(20, relevant), index_name)
        # flags[i] tells whether the hit at rank i + 1 is relevant.
        flags = [did in rel for did in hits]
        true_positive_20 = sum(flags[:20])
        true_positive_r = sum(flags[:relevant])
        pp = true_positive_20 / 20
        rr = true_positive_20 / relevant
        p += pp
        r += rr
        rp += true_positive_r / relevant
        # NOTE(review): this averages precision@k over every k = 1..20, which
        # is not the textbook average precision (that averages precision@k
        # only at the ranks of relevant hits). Kept as it was so that the
        # reported numbers stay comparable — confirm which one is intended.
        ma_p += sum(sum(flags[:k]) / k for k in range(1, 21)) / 20
        # Harmonic mean of precision and recall; the factor 2 was missing.
        f1[qid] = 0 if pp + rr == 0 else 2 * pp * rr / (pp + rr)
    if n == 0:
        # Avoid ZeroDivisionError when nothing could be evaluated.
        print("no queries with relevance judgments")
        return f1
    print("p@20:", p / n)
    print("r@20:", r / n)
    print("r-precision:", rp / n)
    print("map@20:", ma_p / n)
    return f1

In [20]:
# Print the averaged metrics for the unprocessed index and keep the per-query scores.
f1 = count_metrics('hw2index', queries)
# Disabled: the same evaluation for the stemmed index.
# f1_stem = count_metrics('hw2index_stem', stem_queries(queries))

# Disabled: the five queries whose score improves most with stemming.
# diff = sorted([(queries[qid], f1_stem[qid], f1[qid], f1_stem[qid] - f1[qid]) for qid in f1.keys()], key=lambda tup: -tup[-1])
# print(*diff[:5], sep="\n")

p@20: 0.351717171717172
r@20: 0.22831787409824356
r-precision: 0.28625719694853574
map@20: 0.4146079284488261


In [None]:
class MystemProcessor(BaseDocumentProcessor):
    """Lemmatizes documents with Mystem and pickles the result to `fout`.

    For every document a new Document is written that has the same url, id,
    sizes and links, but whose `words` are the lemmas of the lower-cased
    original words. Only lemmas whose first character is a Latin or Cyrillic
    lower-case letter, a digit or '-' are kept.
    """

    def __init__(self, fout):
        """Create the lemmatizer and remember the binary output file `fout`."""
        self.m = Mystem(grammar_info=False, disambiguation=False)
        # Raw string: '\-' in a normal literal is an invalid escape sequence
        # (a warning today, an error in future Python versions). The compiled
        # pattern is unchanged.
        reg = re.compile(r'[a-zа-яё0-9\-]')
        # match() anchors at the start, so this tests the first character only.
        self.filterFunc = lambda w : reg.match(w)
        self.fout = fout

    def process(self, document, title=None):
        """Lemmatize `document` and append the resulting Document to `fout`.

        Args:
            document: Document to lemmatize.
            title: accepted because process_file passes the document title as
                a second argument; it is not used here.
        """
        text = ' '.join(document.words).lower()
        lemmas = self.m.lemmatize(text)
        words = list(filter(self.filterFunc, lemmas))
        doc = Document(document.url, document.id, document.sz_bytes, document.sz_words)
        doc.words = words
        doc.links = document.links
        pickle.dump(doc, self.fout)

    def result(self):
        """Nothing to return; the output is written to `fout` as it is produced."""

def process_collection_with_mystem(directory, outdirectory, total=200000):
    """Lemmatize every ``.out`` file of `directory` into `outdirectory`.

    Args:
        directory: directory with the source ``.out`` files (and the sibling
            ``.title`` files process_file expects).
        outdirectory: directory that receives one lemmatized file per source
            file, under the same name. It is created when missing.
        total: expected number of documents, used only to size the progress
            bar. Defaults to the previously hard-coded 200000.

    NOTE(review): only ``.out`` files are written. process_file expects a
    ``.title`` file next to every ``.out`` it reads, so `outdirectory` needs
    those as well before it can be processed — confirm how they get there.
    """
    # open(..., "wb") below fails with FileNotFoundError when the directory is missing.
    os.makedirs(outdirectory, exist_ok=True)
    pbar = tqdm(total=total)
    try:
        # Sorted for a reproducible processing order; os.listdir guarantees none.
        for file in sorted(os.listdir(directory)):
            if file.endswith(".out"):
                with open(os.path.join(outdirectory, file), "wb") as fout:
                    processor = MystemProcessor(fout)
                    process_file(directory, file, processor, pbar)
    finally:
        # Close the bar even when a file fails.
        pbar.close()

In [None]:
# Lemmatize the raw collection into COLLECTION_DIRECTORY_MYSTEM, the directory
# the 'hw2index_stem' bulk load reads from.
process_collection_with_mystem(COLLECTION_DIRECTORY, COLLECTION_DIRECTORY_MYSTEM)

In [None]:
def stem_queries(queries):
    """Lemmatize every query text the same way MystemProcessor treats documents.

    Args:
        queries: dict mapping query id -> query text.

    Returns:
        A new dict mapping query id -> space-joined lemmas. Only lemmas whose
        first character is a Latin or Cyrillic lower-case letter, a digit or
        '-' are kept. The input dict is not modified.
    """
    m = Mystem(grammar_info=False, disambiguation=False)
    # Raw string: '\-' in a normal literal is an invalid escape sequence.
    # The compiled pattern is unchanged.
    reg = re.compile(r'[a-zа-яё0-9\-]')
    result = {}
    for qid, text in queries.items():
        # Lower-case first, as MystemProcessor.process does for documents, so
        # that tokens the lemmatizer returns unchanged (e.g. capitalized Latin
        # words) are not dropped by the lower-case-only filter.
        lemmas = m.lemmatize(text.lower())
        result[qid] = ' '.join(lemma for lemma in lemmas if reg.match(lemma))
    return result

In [None]:
# NOTE(review): this rebinds `queries` to the lemmatized texts. Re-running the cell
# lemmatizes already-lemmatized text, and the raw queries are only available again
# after read_queries() has been called once more.
queries = stem_queries(queries)
print(queries)