In [21]:
''' This cell contains all constants that may differ on our machines '''

# Address of the Elasticsearch node used by the client and by the raw HTTP requests
ELASTIC_HOST = 'localhost'
ELASTIC_PORT = 9200
COLLECTION_DIRECTORY = "../byweb" # directory with .out files to process
COLLECTION_DIRECTORY_MYSTEM = "../byweb_stem" # directory with .out files after mystem processing

# Input files opened by read_queries() and read_relevance()
QUERIES_FILE = "web2008_adhoc.xml"
RELEVANCE_FILE = "relevant_table_2009.xml"

In [4]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time

In [None]:
# Shared client used by every indexing and search cell; the host entry carries
# a timeout of 360 and a connection pool size (maxsize) of 25.
es = Elasticsearch([{'host': ELASTIC_HOST, 'port': ELASTIC_PORT, 'timeout': 360, 'maxsize': 25}])

In [None]:
# Baseline index body: a single 'content' field of type text and no analysis
# section. recreate_index() passes whatever `settings` is currently bound as the
# body of the create call; a later cell rebinds this name to a second variant.
settings = {
    'mappings': {
        'properties': {
            'content': {
                'type': 'text'
            }
        }
    }
}

In [None]:
def recreate_index():
    """Drop the 'hw2index' index if it exists, then create it again.

    The new index is created from the module-level ``settings`` dict that is
    bound at call time, so re-running this after rebinding ``settings``
    switches the index definition. All previously indexed documents are lost.
    """
    # Deleting a missing index raises a not-found error, which used to break
    # the very first run on a fresh cluster; only delete what is really there.
    if es.indices.exists(index='hw2index'):
        es.indices.delete(index='hw2index')
    es.indices.create(index='hw2index', body=settings)

In [None]:
# (Re)build the index from the `settings` dict bound at this point of the run.
recreate_index()

In [None]:
def create_es_action(index, doc_id, document):
    """Wrap one document into an action dict for the bulk helpers.

    Args:
        index: name of the target index.
        doc_id: value stored under the '_id' key.
        document: payload stored under the '_source' key, passed through as is.

    Returns:
        dict with the keys '_index', '_id' and '_source', in that order.
    """
    action = dict(_index=index, _id=doc_id, _source=document)
    return action

In [24]:
class Document:
    """One page of the collection.

    Instances are written with pickle.dump and read back with pickle.load in
    this notebook, so the class name and the attribute names must stay stable.
    """

    def __init__(self, doc_url, doc_id, sz_bytes, sz_words):
        # document url and unique document id (str)
        self.url, self.id = doc_url, doc_id
        # size in bytes / number of words, both before deleting html markup
        self.sz_bytes, self.sz_words = sz_bytes, sz_words
        # list of words after deleting html markup, and list of links in the document
        self.words, self.links = [], []

In [25]:
import os

In [26]:
from tqdm import tqdm
from tqdm import tqdm_notebook
import pickle

class BaseDocumentProcessor:
    """Interface for objects that consume the collection one document at a time."""

    def process(self, document):
        """Handle a single document; the base implementation does nothing."""

    def result(self):
        """Return whatever was accumulated; the base implementation returns None."""

def process_file(d, f, processor, pbar):
    """Feed every pickled object stored in one file to ``processor``.

    Args:
        d: directory that contains the file.
        f: file name inside ``d``; the file holds consecutive pickle records.
        processor: object whose ``process(document)`` is called per record.
        pbar: progress object whose ``update(1)`` is called once per record
            that was actually processed.

    Only the end of the pickle stream ends the loop quietly. A corrupt record
    is reported before stopping, and any other error (a missing class
    definition, Ctrl-C, an I/O failure) propagates instead of being mistaken
    for end-of-file.
    """
    path = os.path.join(d, f)
    print("processing", path)
    with open(path, "rb") as fin:
        while True:
            try:
                # SECURITY: pickle.load can execute arbitrary code embedded in
                # the file; only run this on collection files you trust.
                document = pickle.load(fin)
            except EOFError:
                # Normal termination: no more records in this file.
                break
            except pickle.UnpicklingError as err:
                # Keep the best-effort behaviour for a damaged tail, but say so.
                print("stopping early, corrupt pickle stream in", path, "-", err)
                break
            processor.process(document)
            # Tick after the work is done so the bar counts documents, not
            # load attempts (the failed read at end-of-file used to count too).
            pbar.update(1)

def process_collection(directory, processor, total=200000):
    """Run ``processor`` over every ``.out`` file found in ``directory``.

    Args:
        directory: folder that holds the pickled collection files.
        processor: object passed on to process_file for every file.
        total: value shown as the progress bar's total; the default keeps the
            figure that was previously hard-coded.

    Files are visited in sorted name order so repeated runs are reproducible,
    and the progress bar is closed when the run ends or fails.
    """
    with tqdm(total=total) as pbar:
        for name in sorted(os.listdir(directory)):
            if name.endswith(".out"):
                process_file(directory, name, processor, pbar)

In [None]:

class IndexDocs(BaseDocumentProcessor):
    """Processor that collects one bulk action per document for 'hw2index'."""

    def __init__(self):
        """Start with an empty list of actions."""
        self.actions = list()

    def process(self, document):
        """Append the bulk action for ``document``.

        The action source is a JSON string of the form {"content": words},
        built from ``document.words``; ``document.id`` becomes the action id.
        """
        source = json.dumps({'content' : document.words})
        action = create_es_action('hw2index', document.id, source)
        self.actions.append(action)

    def result(self):
        """Return the list of actions collected so far (the same list object)."""
        return self.actions
        


In [None]:
def es_actions_generator():
    """Build the bulk actions for the whole collection.

    Despite the name this is not a generator: the complete collection in
    COLLECTION_DIRECTORY is processed first and the full list of actions is
    returned afterwards.
    """
    indexer = IndexDocs()
    process_collection(COLLECTION_DIRECTORY, indexer)
    actions = indexer.result()
    return actions

In [None]:
import time

In [None]:
# Bulk-index the collection and report the elapsed wall-clock time.
# NOTE: `time` must be the module here (bound by `import time`); the earlier
# `from time import time` binds the function, for which `time.time()` fails.
start = time.time()
# es_actions_generator() returns the complete action list before parallel_bulk starts.
for ok, result in tqdm_notebook(parallel_bulk(es, es_actions_generator(), queue_size=4, thread_count=4, chunk_size=1000)):
    if not ok:
        # print the result of every item that was not reported as ok
        print(result)
end = time.time()
print('Time=' + str(end - start))


In [None]:
import requests
param = (('v', ''),) # sent as the query string `?v=`; `v` asks the _cat API for verbose output

# Fetch the plain-text index listing from the node's _cat/indices endpoint
resp = requests.get(f'http://{ELASTIC_HOST}:{ELASTIC_PORT}/_cat/indices', params=param)


In [None]:
# Display the body of the _cat/indices response fetched in the previous cell.
resp.text

In [None]:
# Second index definition: same single 'content' text field, but analysed with
# a custom analyzer (standard tokenizer, lowercase, Russian snowball stemmer).
settings = {
    'mappings': {
        'properties': {
            'content': {
                'type': 'text',
                # Without this reference the analyzer declared below is never
                # used and the field falls back to the default analyzer, which
                # makes this index identical to the baseline one.
                'analyzer': 'my_analyzer'
            }
        }
    },
    "settings": {
        "analysis" : {
            "analyzer" : {
                "my_analyzer" : {
                    "tokenizer" : "standard",
                    "filter" : ["lowercase", "my_snow"]
                }
            },
            "filter" : {
                "my_snow" : {
                    "type" : "snowball",
                    "language" : "russian"
                }
            }
        }
    }
}

In [None]:
# Drop and re-create the index so that it picks up the `settings` rebound just above.
recreate_index()

In [None]:
# Same timed bulk-indexing run as before, now against the re-created index.
# NOTE: relies on `time` being the module (bound by `import time`).
start = time.time()
# es_actions_generator() re-reads the whole collection and returns the full action list.
for ok, result in tqdm_notebook(parallel_bulk(es, es_actions_generator(), queue_size=4, thread_count=4, chunk_size=1000)):
    if not ok:
        # print the result of every item that was not reported as ok
        print(result)
end = time.time()
print('Time=' + str(end - start))



In [None]:
# Fetch and display the _cat/indices listing again; reuses `param` from the earlier cell.
requests.get(f'http://{ELASTIC_HOST}:{ELASTIC_PORT}/_cat/indices', params=param).text

In [None]:
from bs4 import BeautifulSoup

def read_queries():
    """Load the search tasks from QUERIES_FILE (read as cp1251 text).

    Returns:
        dict mapping the ``id`` attribute of every ``task`` element to the
        text content of that element.
    """
    with open(QUERIES_FILE, "rt", encoding="cp1251") as file:
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed and warns about the guess, so results
        # could differ between machines.
        bs = BeautifulSoup(file.read(), "html.parser")
    return {task["id"]: task.text for task in bs.find_all("task")}

def read_relevance():
    """Load the relevance judgements from RELEVANCE_FILE (read as cp1251 text).

    Returns:
        dict mapping a task id to the list of ids of its ``document`` elements
        whose ``relevance`` attribute equals "vital". Tasks without any such
        document are left out.
    """
    with open(RELEVANCE_FILE, "rt", encoding="cp1251") as file:
        # Explicit parser keeps the result independent of which optional
        # parsers are installed and avoids the parser-guess warning.
        bs = BeautifulSoup(file.read(), "html.parser")
    relevance = {}
    for task in bs.find_all("task"):
        # .get(): a document element without a relevance attribute is simply
        # not vital, instead of aborting the whole load with a KeyError.
        rel = [doc["id"] for doc in task.find_all("document") if doc.get("relevance") == "vital"]
        if rel:
            relevance[task["id"]] = rel
    return relevance

In [None]:
# Module-level evaluation data read by count_metrics():
# task id -> list of vital document ids, and task id -> query text.
relevance = read_relevance()
queries = read_queries()

In [None]:
def build_query(query):
    """Return the search body: a bool query with one 'should' match clause on 'content'."""
    match_clause = {'match': {'content': query}}
    bool_clause = {'should': [match_clause]}
    return {'query': {'bool': bool_clause}}

def run_search(query, size):
    """Search 'hw2index' for ``query`` and return the '_id' of each hit, in response order.

    Args:
        query: text placed into the match clause built by build_query.
        size: value passed as the ``size`` argument of the search call.
    """
    response = es.search(index='hw2index', body=build_query(query), size=size)
    found_ids = []
    for hit in response['hits']['hits']:
        found_ids.append(hit['_id'])
    return found_ids

def _average_precision(ranked_ids, relevant_ids, norm):
    """Sum precision@k over the ranks k that hold a relevant id, divided by ``norm``."""
    found = 0
    total = 0.0
    for k, did in enumerate(ranked_ids, start=1):
        if did in relevant_ids:
            found += 1
            total += found / k
    return total / norm


def count_metrics():
    """Print p@20, r@20, R-precision and map@20 averaged over the judged queries.

    Reads the module-level ``queries`` and ``relevance`` dicts and calls
    run_search once per query that has at least one relevant document.
    Queries without judgements are skipped. Returns None; the metrics are
    only printed.
    """
    cutoff = 20
    p = r = rp = ma_p = 0
    n = 0
    for qid, query in queries.items():
        rel = relevance.get(qid)
        if not rel:
            # no (or empty) judgement list: nothing to measure against
            continue
        n += 1
        rel_ids = set(rel)  # O(1) membership; also collapses duplicate judgements
        relevant = len(rel_ids)
        hits = run_search(query, max(cutoff, relevant))
        top_hits = hits[:cutoff]
        true_positive_cut = sum(1 for did in top_hits if did in rel_ids)
        true_positive_r = sum(1 for did in hits[:relevant] if did in rel_ids)
        p += true_positive_cut / cutoff
        r += true_positive_cut / relevant
        rp += true_positive_r / relevant
        # Average precision only counts precision@k at ranks where a relevant
        # document sits; the previous code averaged precision@k over all 20
        # ranks, which is a different quantity. It is normalised by
        # min(relevant, cutoff), so a perfect top-20 scores 1.0.
        ma_p += _average_precision(top_hits, rel_ids, min(relevant, cutoff))
    if n == 0:
        # Avoid ZeroDivisionError when no query has relevance judgements.
        print("no queries with relevance judgements; metrics are undefined")
        return
    print("p@20:", p / n)
    print("r@20:", r / n)
    print("r-precision:", rp / n)
    print("map@20:", ma_p / n)

In [None]:
# Evaluate whatever is currently indexed in 'hw2index' against the loaded judgements.
count_metrics()

In [31]:
class MystemProcessor(BaseDocumentProcessor):
    """Lemmatize each document with Mystem and pickle the result to ``fout``."""

    def __init__(self, fout):
        """
        Args:
            fout: binary file object that receives one pickled Document per
                processed input document.
        """
        self.m = Mystem(grammar_info=False, disambiguation=False)
        # Raw string: '\-' in a plain string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        reg = re.compile(r'[a-zа-яё0-9\-]')
        # match() anchors at the start, so a lemma is kept when its FIRST
        # character is a lowercase latin/cyrillic letter, a digit or '-'.
        self.filterFunc = reg.match
        self.fout = fout

    def process(self, document):
        """Lemmatize ``document.words`` and dump a new Document to ``self.fout``.

        The words are joined with spaces and lowercased before lemmatization.
        url, id, sizes and links are copied over unchanged; only ``words`` is
        replaced by the filtered lemmas.
        """
        text = ' '.join(document.words).lower()
        lemmas = self.m.lemmatize(text)
        words = [lemma for lemma in lemmas if self.filterFunc(lemma)]
        doc = Document(document.url, document.id, document.sz_bytes, document.sz_words)
        doc.words = words
        doc.links = document.links
        pickle.dump(doc, self.fout)

    def result(self):
        """Nothing is accumulated in memory; the output is already in ``fout``."""
        return None

def process_collection_with_mystem(directory, outdirectory):
    """Write a lemmatized copy of every ``.out`` file of ``directory`` into ``outdirectory``.

    Args:
        directory: folder with the original pickled collection files.
        outdirectory: folder that receives files of the same names; it is
            created if it does not exist yet.

    Raises:
        ValueError: if both arguments point to the same folder. Each output
            file is opened with "wb", which would truncate the input before
            it could be read.
    """
    same_place = (os.path.normcase(os.path.realpath(directory))
                  == os.path.normcase(os.path.realpath(outdirectory)))
    if same_place:
        raise ValueError("outdirectory must differ from directory: "
                         "writing in place would erase the input files")
    # A missing output folder used to fail on the first open() call.
    os.makedirs(outdirectory, exist_ok=True)
    # Context manager closes the progress bar even if the run is interrupted.
    with tqdm(total = 200000) as pbar:
        # sorted(): deterministic processing order across runs and machines
        for name in sorted(os.listdir(directory)):
            if name.endswith(".out"):
                with open(os.path.join(outdirectory, name), "wb") as fout:
                    processor = MystemProcessor(fout)
                    process_file(directory, name, processor, pbar)

In [32]:
# Long-running step: lemmatize the whole collection into COLLECTION_DIRECTORY_MYSTEM.
# NOTE: the output saved below ends with KeyboardInterrupt at roughly 2354 documents,
# so that captured run did not process the full collection.
process_collection_with_mystem(COLLECTION_DIRECTORY, COLLECTION_DIRECTORY_MYSTEM)






  0%|                                                                                       | 0/200000 [00:00<?, ?it/s]

processing ..\byweb\byweb.0.out







  0%|                                                                             | 3/200000 [00:00<3:03:58, 18.12it/s]




  0%|                                                                             | 4/200000 [00:00<5:51:13,  9.49it/s]




  0%|                                                                             | 6/200000 [00:00<5:43:23,  9.71it/s]




  0%|                                                                             | 8/200000 [00:00<6:53:27,  8.06it/s]




  0%|                                                                             | 9/200000 [00:01<9:32:10,  5.83it/s]




  0%|                                                                            | 11/200000 [00:01<8:01:02,  6.93it/s]




  0%|                                                                            | 12/200000 [00:01<7:40:37,  7.24it/s]




  0%|                                                                            | 13/200000 [00:01<7:54:54,  7.02it/s]




  0

  0%|                                                                           | 114/200000 [00:12<4:57:25, 11.20it/s]




  0%|                                                                           | 116/200000 [00:12<4:48:03, 11.56it/s]




  0%|                                                                           | 118/200000 [00:12<5:29:21, 10.11it/s]




  0%|                                                                           | 120/200000 [00:12<5:23:01, 10.31it/s]




  0%|                                                                           | 122/200000 [00:12<5:08:51, 10.79it/s]




  0%|                                                                           | 124/200000 [00:13<4:47:39, 11.58it/s]




  0%|                                                                           | 126/200000 [00:13<4:55:54, 11.26it/s]




  0%|                                                                           | 128/200000 [00:13<4:42:09, 11.81it/s]




  0%|   

  0%|                                                                           | 240/200000 [00:25<7:14:37,  7.66it/s]




  0%|                                                                           | 241/200000 [00:25<6:51:49,  8.08it/s]




  0%|                                                                           | 242/200000 [00:25<6:40:29,  8.31it/s]




  0%|                                                                           | 243/200000 [00:25<6:22:13,  8.71it/s]




  0%|                                                                           | 244/200000 [00:25<6:21:56,  8.72it/s]




  0%|                                                                           | 246/200000 [00:26<5:44:38,  9.66it/s]




  0%|                                                                           | 248/200000 [00:26<5:21:34, 10.35it/s]




  0%|                                                                           | 250/200000 [00:26<5:13:03, 10.63it/s]




  0%|   

  0%|▏                                                                          | 364/200000 [00:38<5:57:53,  9.30it/s]




  0%|▏                                                                          | 366/200000 [00:38<6:04:56,  9.12it/s]




  0%|▏                                                                          | 368/200000 [00:38<6:07:42,  9.05it/s]




  0%|▏                                                                          | 370/200000 [00:38<5:43:59,  9.67it/s]




  0%|▏                                                                          | 372/200000 [00:39<6:50:27,  8.11it/s]




  0%|▏                                                                          | 373/200000 [00:39<6:40:55,  8.30it/s]




  0%|▏                                                                         | 374/200000 [00:39<10:16:26,  5.40it/s]




  0%|▏                                                                          | 376/200000 [00:39<8:50:44,  6.27it/s]




  0%|▏  

  0%|▏                                                                          | 479/200000 [00:50<5:47:43,  9.56it/s]




  0%|▏                                                                          | 481/200000 [00:50<5:25:30, 10.22it/s]




  0%|▏                                                                          | 483/200000 [00:50<4:54:27, 11.29it/s]




  0%|▏                                                                          | 485/200000 [00:51<6:30:59,  8.50it/s]




  0%|▏                                                                          | 487/200000 [00:51<5:54:08,  9.39it/s]




  0%|▏                                                                          | 489/200000 [00:51<5:52:34,  9.43it/s]




  0%|▏                                                                          | 491/200000 [00:51<5:40:58,  9.75it/s]




  0%|▏                                                                          | 493/200000 [00:51<6:02:24,  9.17it/s]




  0%|▏  

  0%|▏                                                                          | 605/200000 [01:02<4:35:04, 12.08it/s]




  0%|▏                                                                          | 607/200000 [01:02<4:32:24, 12.20it/s]




  0%|▏                                                                          | 609/200000 [01:02<6:05:19,  9.10it/s]




  0%|▏                                                                          | 611/200000 [01:02<5:31:00, 10.04it/s]




  0%|▏                                                                          | 613/200000 [01:03<6:06:16,  9.07it/s]




  0%|▏                                                                          | 615/200000 [01:03<6:11:08,  8.95it/s]




  0%|▏                                                                          | 617/200000 [01:03<5:43:34,  9.67it/s]




  0%|▏                                                                          | 619/200000 [01:03<5:07:17, 10.81it/s]




  0%|▏  

  0%|▎                                                                          | 722/200000 [01:14<4:51:02, 11.41it/s]




  0%|▎                                                                          | 724/200000 [01:14<6:12:21,  8.92it/s]




  0%|▎                                                                          | 725/200000 [01:14<6:04:14,  9.12it/s]




  0%|▎                                                                          | 727/200000 [01:15<5:50:09,  9.48it/s]




  0%|▎                                                                          | 729/200000 [01:15<5:41:09,  9.73it/s]




  0%|▎                                                                          | 731/200000 [01:15<6:47:34,  8.15it/s]




  0%|▎                                                                          | 733/200000 [01:15<6:03:33,  9.13it/s]




  0%|▎                                                                          | 735/200000 [01:16<6:11:14,  8.95it/s]




  0%|▎  

  0%|▎                                                                          | 840/200000 [01:28<7:01:54,  7.87it/s]




  0%|▎                                                                          | 842/200000 [01:28<6:23:01,  8.67it/s]




  0%|▎                                                                          | 844/200000 [01:28<5:55:54,  9.33it/s]




  0%|▎                                                                          | 846/200000 [01:28<5:43:19,  9.67it/s]




  0%|▎                                                                          | 848/200000 [01:28<5:29:13, 10.08it/s]




  0%|▎                                                                          | 850/200000 [01:29<5:33:02,  9.97it/s]




  0%|▎                                                                          | 852/200000 [01:29<5:15:37, 10.52it/s]




  0%|▎                                                                          | 854/200000 [01:29<5:03:11, 10.95it/s]




  0%|▎  

  0%|▎                                                                          | 971/200000 [01:40<5:09:08, 10.73it/s]




  0%|▎                                                                          | 973/200000 [01:41<5:34:13,  9.92it/s]




  0%|▎                                                                          | 975/200000 [01:41<5:32:34,  9.97it/s]




  0%|▎                                                                          | 977/200000 [01:41<5:18:20, 10.42it/s]




  0%|▎                                                                          | 979/200000 [01:41<4:57:43, 11.14it/s]




  0%|▎                                                                          | 981/200000 [01:41<5:28:11, 10.11it/s]




  0%|▎                                                                          | 983/200000 [01:42<5:43:46,  9.65it/s]




  0%|▎                                                                          | 984/200000 [01:42<5:53:56,  9.37it/s]




  0%|▎  

  1%|▍                                                                         | 1097/200000 [01:52<6:08:01,  9.01it/s]




  1%|▍                                                                         | 1099/200000 [01:53<5:28:41, 10.09it/s]




  1%|▍                                                                         | 1101/200000 [01:53<5:14:54, 10.53it/s]




  1%|▍                                                                         | 1103/200000 [01:53<5:29:17, 10.07it/s]




  1%|▍                                                                         | 1105/200000 [01:53<5:29:19, 10.07it/s]




  1%|▍                                                                         | 1107/200000 [01:53<5:28:38, 10.09it/s]




  1%|▍                                                                         | 1109/200000 [01:54<5:33:46,  9.93it/s]




  1%|▍                                                                         | 1111/200000 [01:54<5:39:27,  9.76it/s]




  1%|▍  

  1%|▍                                                                         | 1205/200000 [02:06<5:48:05,  9.52it/s]




  1%|▍                                                                         | 1207/200000 [02:06<5:30:02, 10.04it/s]




  1%|▍                                                                         | 1209/200000 [02:06<5:34:05,  9.92it/s]




  1%|▍                                                                         | 1211/200000 [02:07<5:27:38, 10.11it/s]




  1%|▍                                                                         | 1213/200000 [02:07<5:26:07, 10.16it/s]




  1%|▍                                                                         | 1215/200000 [02:07<5:36:27,  9.85it/s]




  1%|▍                                                                         | 1217/200000 [02:07<5:21:02, 10.32it/s]




  1%|▍                                                                         | 1219/200000 [02:07<5:47:48,  9.53it/s]




  1%|▍  

  1%|▍                                                                         | 1330/200000 [02:17<4:27:17, 12.39it/s]




  1%|▍                                                                         | 1332/200000 [02:18<4:30:58, 12.22it/s]




  1%|▍                                                                         | 1334/200000 [02:18<4:43:35, 11.68it/s]




  1%|▍                                                                         | 1336/200000 [02:18<5:03:35, 10.91it/s]




  1%|▍                                                                         | 1338/200000 [02:18<5:29:14, 10.06it/s]




  1%|▍                                                                         | 1340/200000 [02:18<5:01:37, 10.98it/s]




  1%|▍                                                                         | 1342/200000 [02:18<4:44:10, 11.65it/s]




  1%|▍                                                                         | 1344/200000 [02:19<4:41:02, 11.78it/s]




  1%|▍  

  1%|▌                                                                         | 1462/200000 [02:29<4:22:20, 12.61it/s]




  1%|▌                                                                         | 1464/200000 [02:29<4:36:33, 11.96it/s]




  1%|▌                                                                         | 1466/200000 [02:29<4:51:17, 11.36it/s]




  1%|▌                                                                         | 1468/200000 [02:30<5:17:45, 10.41it/s]




  1%|▌                                                                         | 1470/200000 [02:30<5:03:46, 10.89it/s]




  1%|▌                                                                         | 1472/200000 [02:30<5:08:14, 10.73it/s]




  1%|▌                                                                         | 1474/200000 [02:30<5:43:26,  9.63it/s]




  1%|▌                                                                         | 1476/200000 [02:30<5:19:40, 10.35it/s]




  1%|▌  

  1%|▌                                                                         | 1592/200000 [02:41<4:21:00, 12.67it/s]




  1%|▌                                                                         | 1594/200000 [02:41<4:37:00, 11.94it/s]




  1%|▌                                                                         | 1596/200000 [02:41<4:24:46, 12.49it/s]




  1%|▌                                                                         | 1598/200000 [02:41<4:14:45, 12.98it/s]




  1%|▌                                                                         | 1600/200000 [02:41<4:19:01, 12.77it/s]




  1%|▌                                                                         | 1602/200000 [02:42<4:20:01, 12.72it/s]




  1%|▌                                                                         | 1604/200000 [02:42<4:15:31, 12.94it/s]




  1%|▌                                                                         | 1606/200000 [02:42<4:04:52, 13.50it/s]




  1%|▌  

  1%|▋                                                                         | 1722/200000 [02:52<5:17:54, 10.39it/s]




  1%|▋                                                                         | 1724/200000 [02:52<5:08:45, 10.70it/s]




  1%|▋                                                                         | 1726/200000 [02:52<7:21:42,  7.48it/s]




  1%|▋                                                                         | 1728/200000 [02:53<6:20:06,  8.69it/s]




  1%|▋                                                                         | 1730/200000 [02:53<5:44:38,  9.59it/s]




  1%|▋                                                                         | 1732/200000 [02:53<5:12:43, 10.57it/s]




  1%|▋                                                                         | 1734/200000 [02:53<5:24:30, 10.18it/s]




  1%|▋                                                                         | 1736/200000 [02:53<5:00:12, 11.01it/s]




  1%|▋  

  1%|▋                                                                         | 1854/200000 [03:03<4:28:02, 12.32it/s]




  1%|▋                                                                         | 1856/200000 [03:03<4:17:46, 12.81it/s]




  1%|▋                                                                         | 1858/200000 [03:03<4:02:22, 13.63it/s]




  1%|▋                                                                         | 1860/200000 [03:03<4:08:04, 13.31it/s]




  1%|▋                                                                         | 1862/200000 [03:03<4:03:21, 13.57it/s]




  1%|▋                                                                         | 1864/200000 [03:04<4:00:12, 13.75it/s]




  1%|▋                                                                         | 1866/200000 [03:04<3:55:18, 14.03it/s]




  1%|▋                                                                         | 1868/200000 [03:04<4:13:34, 13.02it/s]




  1%|▋  

  1%|▋                                                                         | 1979/200000 [03:14<5:04:15, 10.85it/s]




  1%|▋                                                                         | 1981/200000 [03:14<4:59:04, 11.03it/s]




  1%|▋                                                                         | 1983/200000 [03:15<9:10:57,  5.99it/s]




  1%|▋                                                                         | 1985/200000 [03:15<8:24:58,  6.54it/s]




  1%|▋                                                                         | 1987/200000 [03:15<7:07:18,  7.72it/s]




  1%|▋                                                                         | 1989/200000 [03:15<6:18:06,  8.73it/s]




  1%|▋                                                                         | 1991/200000 [03:16<5:40:48,  9.68it/s]




  1%|▋                                                                         | 1993/200000 [03:16<5:17:06, 10.41it/s]




  1%|▋  

  1%|▊                                                                         | 2104/200000 [03:27<6:18:46,  8.71it/s]




  1%|▊                                                                         | 2106/200000 [03:27<5:44:14,  9.58it/s]




  1%|▊                                                                         | 2108/200000 [03:27<5:19:09, 10.33it/s]




  1%|▊                                                                         | 2110/200000 [03:27<5:08:20, 10.70it/s]




  1%|▊                                                                         | 2112/200000 [03:28<7:10:29,  7.66it/s]




  1%|▊                                                                         | 2114/200000 [03:28<7:08:52,  7.69it/s]




  1%|▊                                                                         | 2116/200000 [03:28<6:44:31,  8.15it/s]




  1%|▊                                                                         | 2118/200000 [03:28<6:08:12,  8.96it/s]




  1%|▊  

  1%|▊                                                                         | 2229/200000 [03:39<4:20:00, 12.68it/s]




  1%|▊                                                                         | 2231/200000 [03:39<4:11:51, 13.09it/s]




  1%|▊                                                                         | 2233/200000 [03:39<4:11:58, 13.08it/s]




  1%|▊                                                                         | 2235/200000 [03:39<4:11:52, 13.09it/s]




  1%|▊                                                                         | 2237/200000 [03:39<4:01:41, 13.64it/s]




  1%|▊                                                                         | 2239/200000 [03:39<4:40:51, 11.74it/s]




  1%|▊                                                                         | 2241/200000 [03:40<4:50:18, 11.35it/s]




  1%|▊                                                                         | 2243/200000 [03:40<4:36:41, 11.91it/s]




  1%|▊  

  1%|▊                                                                         | 2344/200000 [03:50<8:18:24,  6.61it/s]




  1%|▊                                                                         | 2345/200000 [03:51<7:57:29,  6.90it/s]




  1%|▊                                                                         | 2346/200000 [03:51<7:26:31,  7.38it/s]




  1%|▊                                                                         | 2347/200000 [03:51<7:34:48,  7.24it/s]




  1%|▊                                                                        | 2348/200000 [03:51<14:36:59,  3.76it/s]




  1%|▊                                                                        | 2350/200000 [03:52<12:17:46,  4.46it/s]




  1%|▊                                                                         | 2352/200000 [03:52<9:50:29,  5.58it/s]




  1%|▊                                                                         | 2354/200000 [03:52<8:12:22,  6.69it/s]




  1%|▊  

KeyboardInterrupt: 