In [1]:
''' This cell contains all constants that may differ between our machines '''

# --- Elasticsearch connection ---
ELASTIC_HOST = 'localhost'
ELASTIC_PORT = 9200

# --- Collection locations ---
# Directory with the .out files to process.
COLLECTION_DIRECTORY = 'byweb'
# Directory with the .out files after mystem processing.
COLLECTION_DIRECTORY_MYSTEM = '../byweb_stem'

# --- Evaluation inputs ---
# XML file the queries are read from.
QUERIES_FILE = 'web2008_adhoc.xml'
# XML file the relevance judgements are read from.
RELEVANCE_FILE = 'relevant_table_2009.xml'

In [2]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time

In [3]:
# Single Elasticsearch client shared by every cell of the notebook; host and
# port come from the constants cell.
es = Elasticsearch(
    [
        {
            'host': ELASTIC_HOST,
            'port': ELASTIC_PORT,
            'timeout': 360,
            'maxsize': 25,
        }
    ]
)

In [4]:
# Index creation body: both indexed fields ('content' and 'title') are
# declared as full-text ('text') fields.
settings = {
    'mappings': {
        'properties': {
            'content': {'type': 'text'},
            'title': {'type': 'text'},
        }
    }
}

In [5]:
def recreate_index(index_name):
    """Drop the index `index_name` if it exists, then create it from scratch.

    The new index is created with the module-level `settings` body.

    Args:
        index_name: name of the Elasticsearch index to (re)create.

    Raises:
        Any Elasticsearch client error other than "index not found" while
        deleting, and any error raised while creating the index.
    """
    # Local import keeps the function self-contained with respect to the
    # notebook's import cell.
    from elasticsearch.exceptions import NotFoundError

    try:
        es.indices.delete(index=index_name)
    except NotFoundError:
        # Nothing to delete: the index simply does not exist yet. Every other
        # failure (connection problems, interrupts, ...) is allowed to surface
        # instead of being silently swallowed.
        pass
    es.indices.create(index=index_name, body=settings)

In [6]:
# Start from an empty 'hw2index': recreate_index deletes the index (if any) and
# creates it again with the `settings` mapping.
recreate_index('hw2index')
# Disabled: the same reset for a second index named 'hw2index_stem'.
# recreate_index('hw2index_stem')

In [7]:
def create_es_action(index, doc_id, document):
    """Wrap one document into an action dictionary for the bulk helpers.

    Args:
        index: name of the target index (stored under '_index').
        doc_id: identifier of the document (stored under '_id').
        document: document body (stored under '_source').

    Returns:
        dict with the keys '_index', '_id' and '_source'.
    """
    action = dict(_index=index, _id=doc_id, _source=document)
    return action

In [8]:
class Document:
    """Container for one document of the collection.

    Attributes:
        url: document url.
        id: unique document id (str).
        sz_bytes: document size in bytes before deleting html markup.
        sz_words: number of words in document before deleting html markup.
        words: list of words in document after deleting html markup
            (starts empty).
        links: list of links in document (starts empty).
    """

    def __init__(self, doc_url, doc_id, sz_bytes, sz_words):
        self.url, self.id = doc_url, doc_id
        self.sz_bytes, self.sz_words = sz_bytes, sz_words
        # Two distinct lists, so filling one never affects the other.
        self.words, self.links = [], []

In [9]:
import os
from tqdm import tqdm
from tqdm import tqdm_notebook
import pickle
 
class BaseDocumentProcessor:
    """Interface for objects that consume the collection one document at a time.

    Subclasses override `process` (called once per document) and `result`
    (called by the notebook after the whole collection was processed).
    """

    def process(self, document, title):
        """Handle one document and its title; the base version does nothing."""
        return None

    def result(self):
        """Return whatever was accumulated; the base version returns None."""
        return None
    
def process_file(d, f, processor, pbar):
    """Feed every pickled document of one collection file to `processor`.

    The file `f` inside directory `d` holds consecutive pickled documents; a
    sibling file with the same name but a ".title" extension holds one pickled
    mapping from document id to title.

    Args:
        d: directory that contains the file.
        f: name of the ".out" file inside `d`.
        processor: object with a `process(document, title)` method.
        pbar: progress bar; `update(1)` is called once per processed document.

    Raises:
        KeyError: a document id has no entry in the title mapping.
        pickle.UnpicklingError (and other load errors): the file is corrupt.
            Only a clean end of file terminates the loop silently.
    """
    out_path = os.path.join(d, f)
    # Swap only the extension; str.replace would also rewrite ".out" occurring
    # earlier in the file name.
    title_path = os.path.join(d, os.path.splitext(f)[0] + ".title")
    print("processing", out_path)
    # NOTE(security): pickle.load can execute arbitrary code - only run this on
    # collection files from a trusted source.
    with open(out_path, "rb") as fin, open(title_path, "rb") as tfin:
        dct = pickle.load(tfin)
        while True:
            try:
                document = pickle.load(fin)
            except EOFError:
                # Normal termination: no more pickled documents in the file.
                break
            processor.process(document, dct[document.id])
            # Count a document only after it was actually read and processed,
            # so the bar is not advanced by the final end-of-file probe.
            pbar.update(1)
            
def process_collection(directory, processor, total=200000):
    """Run `processor` over every ".out" file found in `directory`.

    Args:
        directory: directory that holds the ".out" files (and their ".title"
            companions read by process_file).
        processor: object with a `process(document, title)` method.
        total: expected number of documents, used only to scale the progress
            bar. Defaults to the previously hard-coded 200000.
    """
    # os.listdir returns names in arbitrary order; sorting makes every pass
    # over the collection visit the files in the same order.
    out_files = sorted(name for name in os.listdir(directory) if name.endswith(".out"))
    # The context manager closes the bar when the pass is over, so bars of
    # later passes do not pile up on top of an unfinished one.
    with tqdm(total=total) as pbar:
        for name in out_files:
            process_file(directory, name, processor, pbar)

In [10]:
# Normalized URLs of all documents seen by GetDocUrls. Only membership is used
# (`url in document_urls`); every stored value is True.
document_urls = {}

class GetDocUrls(BaseDocumentProcessor):
    """Records the normalized URL of every processed document in `document_urls`."""

    def __init__(self):
        """Nothing to initialize: results go to the module-level `document_urls`."""

    def process(self, document, title):
        """Normalize `document.url` and remember it.

        Normalization drops the query string, the fragment and one trailing
        slash. `title` is not used.
        """
        # str(...)[2:-1] drops the first two and the last character -
        # presumably the b'...' wrapper of a bytes URL; TODO confirm the type
        # of document.url.
        document_url = str(document.url)[2:-1].split("?")[0].split("#")[0]
        # endswith() instead of indexing [-1]: an empty URL must not raise
        # IndexError.
        if document_url.endswith('/'):
            document_url = document_url[:-1]
        document_urls[document_url] = True

    def result(self):
        """Return None; the collected URLs live in `document_urls`."""
        return None
        
# First pass over the collection: fills the module-level `document_urls`.
processor = GetDocUrls()        
process_collection(COLLECTION_DIRECTORY, processor)
# GetDocUrls.result() returns nothing useful; the call is kept for symmetry
# with the other processors.
processor.result()

  0%|          | 24/200000 [00:00<14:02, 237.43it/s]

processing byweb/byweb.0.out


 10%|▉         | 19867/200000 [00:21<02:37, 1144.99it/s]

In [11]:
from tqdm import tqdm_notebook
import networkx as nx
import operator
from urllib.parse import urljoin

# Placeholder; replaced by the result of GraphBuider.result() at the end of
# this cell.
pagerank_dict = {}

class GraphBuider(BaseDocumentProcessor):
    """Collects links between documents of the collection and ranks them.

    Each processed document contributes one edge per distinct link whose
    normalized target is a key of `document_urls`. `result()` builds a
    directed graph from the edges and runs PageRank on it.
    (The class name keeps its original spelling because callers use it.)
    """

    def __init__(self):
        """Start with an empty list of (source_url, target_url) edges."""
        self.edge_list = []

    @staticmethod
    def _normalize(url):
        """Drop the query string, the fragment and one trailing slash."""
        url = url.split("?")[0].split("#")[0]
        # endswith() is safe for an empty string, unlike indexing [-1].
        return url[:-1] if url.endswith('/') else url

    def process(self, document, title):
        """Add the edges going out of `document` to the edge list.

        `title` is not used.
        """
        # str(...)[2:-1] drops the first two and the last character -
        # presumably the b'...' wrapper of a bytes URL; TODO confirm.
        raw_url = str(document.url)[2:-1]
        document_url = self._normalize(raw_url)

        # dict used as an insertion-ordered set: O(1) de-duplication instead
        # of scanning a list for every link.
        targets = {}
        for link in document.links:
            if not link:
                continue
            try:
                # Resolve against the un-normalized URL: the trailing slash
                # decides which directory a relative link belongs to. urljoin
                # also handles './', '../', root-relative and absolute links.
                absolute = urljoin(raw_url, link)
            except ValueError:
                # Malformed link that cannot be parsed as a URL.
                continue
            targets[self._normalize(absolute)] = True

        for target in targets:
            # Keep only links that point inside the collection.
            if target in document_urls:
                self.edge_list.append((document_url, target))

    def result(self):
        """Return the PageRank of the collected graph (node URL -> score)."""
        graph = nx.DiGraph(self.edge_list)
        return nx.pagerank(graph)
        
# Second pass over the collection. GraphBuider only keeps links whose target is
# in `document_urls`, so the GetDocUrls cell has to be run first.
processor = GraphBuider()        
process_collection(COLLECTION_DIRECTORY, processor)
# result() returns nx.pagerank(...) of the collected link graph.
pagerank_dict = processor.result()


  0%|          | 0/200000 [00:00<?, ?it/s][A
  0%|          | 35/200000 [00:00<09:37, 346.54it/s][A

processing byweb/byweb.0.out



  0%|          | 149/200000 [00:00<07:36, 437.99it/s][A
  0%|          | 233/200000 [00:00<06:30, 511.17it/s][A
  0%|          | 316/200000 [00:00<05:45, 577.38it/s][A
  0%|          | 390/200000 [00:00<05:23, 617.83it/s][A
  0%|          | 457/200000 [00:00<05:54, 563.27it/s][A
  0%|          | 602/200000 [00:00<04:49, 689.64it/s][A
  0%|          | 688/200000 [00:00<05:02, 659.03it/s][A
  0%|          | 777/200000 [00:00<04:39, 714.03it/s][A
  0%|          | 859/200000 [00:01<04:51, 683.23it/s][A
  0%|          | 944/200000 [00:01<04:34, 725.22it/s][A
  1%|          | 1027/200000 [00:01<04:24, 751.97it/s][A
  1%|          | 1107/200000 [00:01<04:49, 687.29it/s][A
  1%|          | 1184/200000 [00:01<04:51, 682.89it/s][A
  1%|          | 1256/200000 [00:01<05:09, 642.27it/s][A
  1%|          | 1375/200000 [00:01<04:26, 744.95it/s][A
  1%|          | 1482/200000 [00:01<04:02, 817.90it/s][A
  1%|          | 1572/200000 [00:02<04:31, 731.28it/s][A
  1%|          | 1670/2

  6%|▋         | 12837/200000 [00:17<04:42, 663.20it/s][A
  6%|▋         | 12910/200000 [00:17<04:40, 667.20it/s][A
  6%|▋         | 12998/200000 [00:17<04:21, 714.51it/s][A
  7%|▋         | 13074/200000 [00:17<04:48, 648.82it/s][A
  7%|▋         | 13176/200000 [00:17<04:18, 723.88it/s][A
  7%|▋         | 13254/200000 [00:17<04:42, 661.69it/s][A
  7%|▋         | 13353/200000 [00:17<04:14, 734.71it/s][A
  7%|▋         | 13439/200000 [00:18<04:03, 765.55it/s][A
  7%|▋         | 13521/200000 [00:18<04:02, 768.71it/s][A
 10%|█         | 20001/200000 [00:40<02:37, 1144.99it/s][A
  7%|▋         | 13726/200000 [00:18<03:57, 783.01it/s][A
  7%|▋         | 13828/200000 [00:18<03:41, 840.16it/s][A
  7%|▋         | 13917/200000 [00:18<04:11, 739.74it/s][A
  7%|▋         | 13997/200000 [00:18<04:30, 688.27it/s][A
  7%|▋         | 14105/200000 [00:18<04:01, 770.81it/s][A
  7%|▋         | 14189/200000 [00:19<04:23, 706.41it/s][A
  7%|▋         | 14278/200000 [00:19<04:07, 751.02it/s]

In [12]:
class IndexDocs(BaseDocumentProcessor):
    """Turns every document into a bulk-index action for one index."""

    def __init__(self, index_name):
        """Remember the target index and start with no actions.

        Args:
            index_name: name of the index the actions are addressed to.
        """
        self.actions = []
        self.index_name = index_name

    def process(self, document, title):
        """Append the index action for `document`.

        The action body has two fields: 'content' (the document's word list)
        and 'title'. It is handed over already serialized to a JSON string.

        PageRank is not part of the body: the experiment that looked it up by
        URL was disabled, and the URL normalization that only served it has
        been removed together with it (it could raise IndexError on an empty
        URL).
        """
        body = json.dumps({'content' : document.words, 'title' : title})
        self.actions.append(create_es_action(self.index_name, document.id, body))

    def result(self):
        """Return the list of actions collected so far."""
        return self.actions
        


In [13]:
def es_actions_generator(index_name, collection_directory):
    """Build the bulk-index actions for a whole collection directory.

    Despite the name, the actions are fully materialized before returning.

    Args:
        index_name: index the actions are addressed to.
        collection_directory: directory with the ".out" files to read.

    Returns:
        list of action dictionaries, one per document.
    """
    # Import the module under a private local name. At file level `time` is
    # bound by `from time import time` (the function) and only rebound to the
    # module by a later `import time`, so a bare `time.time()` here would work
    # or fail depending on which cells were executed.
    import time as _time

    processor = IndexDocs(index_name)
    start = _time.time()
    process_collection(collection_directory, processor)
    end = _time.time()
    print("preprocess time = ", end - start)
    return processor.result()

In [14]:
# NOTE: this rebinds the name `time` to the module; the import cell at the top
# bound it to the function via `from time import time`. The `time.time()` calls
# in this cell need the module.
import time
start = time.time()
# Bulk-index the whole collection into 'hw2index'. parallel_bulk yields one
# (ok, result) pair per action; only the failed ones are printed.
for ok, result in tqdm_notebook(parallel_bulk(es, es_actions_generator('hw2index', COLLECTION_DIRECTORY), queue_size=4, thread_count=4, chunk_size=1000)):
    if not ok:
        print(result)
end = time.time()
# Wall-clock time of preprocessing plus indexing.
print('Time=' + str(end - start))




  0%|          | 0/200000 [00:00<?, ?it/s][A[A

  0%|          | 32/200000 [00:00<10:33, 315.59it/s][A[A

processing byweb/byweb.0.out




  0%|          | 126/200000 [00:00<08:27, 394.11it/s][A[A

  0%|          | 212/200000 [00:00<07:04, 470.17it/s][A[A

  0%|          | 263/200000 [00:00<07:26, 447.28it/s][A[A

  0%|          | 348/200000 [00:00<06:24, 519.81it/s][A[A

  0%|          | 407/200000 [00:00<06:30, 511.15it/s][A[A

  0%|          | 463/200000 [00:00<06:38, 500.72it/s][A[A

  0%|          | 626/200000 [00:00<05:15, 631.68it/s][A[A

  0%|          | 712/200000 [00:01<05:33, 596.98it/s][A[A

  0%|          | 821/200000 [00:01<04:49, 687.14it/s][A[A

  0%|          | 906/200000 [00:01<05:06, 650.40it/s][A[A

  0%|          | 983/200000 [00:01<04:52, 680.26it/s][A[A

  1%|          | 1060/200000 [00:01<05:33, 595.81it/s][A[A

  1%|          | 1128/200000 [00:01<05:35, 592.44it/s][A[A

  1%|          | 1193/200000 [00:01<05:49, 568.11it/s][A[A

  1%|          | 1316/200000 [00:01<04:53, 677.11it/s][A[A

  1%|          | 1482/200000 [00:01<04:01, 822.26it/s][A[A

  1%|          | 

  7%|▋         | 13184/200000 [00:15<03:26, 902.59it/s][A[A

  7%|▋         | 13278/200000 [00:15<03:32, 879.48it/s][A[A

  7%|▋         | 13369/200000 [00:15<03:36, 860.71it/s][A[A

  7%|▋         | 13458/200000 [00:15<03:35, 866.72it/s][A[A

  7%|▋         | 13570/200000 [00:15<03:20, 929.53it/s][A[A

  7%|▋         | 13680/200000 [00:15<03:11, 970.95it/s][A[A

  7%|▋         | 13780/200000 [00:16<03:12, 964.88it/s][A[A

  7%|▋         | 13911/200000 [00:16<02:58, 1044.41it/s][A[A

  7%|▋         | 14021/200000 [00:16<02:55, 1060.07it/s][A[A

  7%|▋         | 14130/200000 [00:16<03:06, 994.90it/s] [A[A

  7%|▋         | 14232/200000 [00:16<03:24, 908.14it/s][A[A

  7%|▋         | 14326/200000 [00:16<03:56, 785.02it/s][A[A

  7%|▋         | 14434/200000 [00:16<03:37, 854.32it/s][A[A

  7%|▋         | 14525/200000 [00:16<03:45, 823.33it/s][A[A

  7%|▋         | 14690/200000 [00:16<03:11, 967.09it/s][A[A

  7%|▋         | 14800/200000 [00:17<03:17, 939.10it

preprocess time =  22.447121143341064


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



 10%|█         | 20001/200000 [00:40<03:08, 954.49it/s][A[A


Time=58.184091091156006


In [15]:
import requests
# Query parameters: `v` with an empty value. On the _cat API this presumably
# switches on the header row of the table (see the output below) - it is not a
# command-line '-v' flag.
param = (('v', ''),)

# Ask the Elasticsearch node for its index list over plain HTTP (requests, not
# the `es` client).
resp = requests.get(f'http://{ELASTIC_HOST}:{ELASTIC_PORT}/_cat/indices', params=param)
# Last expression of the cell: displays the raw text table.
resp.text

'health status index    uuid                   pri rep docs.count docs.deleted store.size pri.store.size\nyellow open   hw2index j0s2v1t-REWSjG3ytASEmw   1   1      19958            0    276.5mb        276.5mb\n'

In [16]:
from bs4 import BeautifulSoup

def read_queries(parser="html.parser"):
    """Read the queries from QUERIES_FILE.

    Args:
        parser: BeautifulSoup parser name. It is passed explicitly so the
            result does not depend on which optional parsers happen to be
            installed (and so BeautifulSoup does not warn about guessing).
            The default is the parser from the Python standard library.

    Returns:
        dict mapping the "id" attribute of every <task> element to the text
        of that element.
    """
    queries = {}
    with open(QUERIES_FILE, "rt", encoding="cp1251") as file:
        bs = BeautifulSoup(file.read(), parser)
    for task in bs.find_all("task"):
        queries[task["id"]] = task.text
    return queries

def read_relevance(parser="html.parser"):
    """Read the relevance judgements from RELEVANCE_FILE.

    Args:
        parser: BeautifulSoup parser name. It is passed explicitly so the
            result does not depend on which optional parsers happen to be
            installed (and so BeautifulSoup does not warn about guessing).
            The default is the parser from the Python standard library.

    Returns:
        dict mapping the "id" of every <task> element to the list of "id"
        attributes of its <document> elements whose "relevance" attribute is
        "vital". Tasks without such documents are left out.
    """
    relevance = {}
    with open(RELEVANCE_FILE, "rt", encoding="cp1251") as file:
        bs = BeautifulSoup(file.read(), parser)
    for task in bs.find_all("task"):
        # .get(): a <document> without a "relevance" attribute is treated as
        # not vital instead of raising KeyError.
        rel = [doc["id"] for doc in task.find_all("document") if doc.get("relevance") == "vital"]
        if rel:
            relevance[task["id"]] = rel
    return relevance

In [17]:
# Evaluation data, loaded once and read as globals by prepare_testing:
# task id -> ids of the documents judged "vital"
relevance = read_relevance()
# task id -> query text
queries = read_queries()

In [18]:
class DocById(BaseDocumentProcessor):
    """Keeps every processed document in memory, keyed by its id."""

    def __init__(self):
        """Start with an empty id -> document mapping."""
        self.docs = dict()

    def process(self, document, title):
        """Store `document` under `document.id`; `title` is not used.

        A later document with the same id replaces the earlier one.
        """
        self.docs.update({document.id: document})

    def result(self):
        """Return the id -> document mapping."""
        return self.docs
        



In [19]:
# Another pass over the collection: keeps every document in memory, keyed by
# its id. prepare_testing reads `documents_by_id` as a global.
processor = DocById()        
process_collection(COLLECTION_DIRECTORY, processor)
documents_by_id = processor.result()




  0%|          | 0/200000 [00:00<?, ?it/s][A[A[A


  0%|          | 30/200000 [00:00<11:12, 297.31it/s][A[A[A




processing byweb/byweb.0.out


  0%|          | 109/200000 [00:00<09:06, 365.67it/s][A[A[A


  0%|          | 208/200000 [00:00<07:23, 450.93it/s][A[A[A


  0%|          | 269/200000 [00:00<06:48, 489.02it/s][A[A[A


  0%|          | 348/200000 [00:00<06:07, 543.12it/s][A[A[A


  0%|          | 411/200000 [00:00<05:54, 563.79it/s][A[A[A


  0%|          | 472/200000 [00:00<06:05, 545.54it/s][A[A[A


  0%|          | 530/200000 [00:00<06:00, 553.96it/s][A[A[A


  0%|          | 650/200000 [00:00<05:04, 654.55it/s][A[A[A


  0%|          | 725/200000 [00:01<05:40, 586.02it/s][A[A[A


  0%|          | 821/200000 [00:01<05:01, 661.29it/s][A[A[A


  0%|          | 896/200000 [00:01<05:17, 626.31it/s][A[A[A


  1%|          | 1002/200000 [00:01<04:39, 712.13it/s][A[A[A


  1%|          | 1082/200000 [00:01<05:39, 586.24it/s][A[A[A


  1%|          | 1151/200000 [00:01<06:08, 539.40it/s][A[A[A


  1%|          | 1213/200000 [00:01<06:37, 499.90it/s][A[A[A


  1%|          | 1281

  6%|▌         | 11943/200000 [00:15<04:00, 780.81it/s][A[A[A


  6%|▌         | 12046/200000 [00:15<03:44, 839.06it/s][A[A[A


  6%|▌         | 12135/200000 [00:15<03:59, 783.26it/s][A[A[A


  6%|▌         | 12218/200000 [00:15<04:02, 774.63it/s][A[A[A


  6%|▌         | 12299/200000 [00:16<04:02, 773.93it/s][A[A[A


  6%|▌         | 12391/200000 [00:16<03:51, 810.58it/s][A[A[A


  6%|▌         | 12479/200000 [00:16<03:46, 828.51it/s][A[A[A


  6%|▋         | 12571/200000 [00:16<03:39, 853.45it/s][A[A[A


  6%|▋         | 12658/200000 [00:16<03:42, 841.29it/s][A[A[A


  6%|▋         | 12777/200000 [00:16<03:42, 840.13it/s][A[A[A


  6%|▋         | 12862/200000 [00:16<04:07, 756.18it/s][A[A[A


  6%|▋         | 12996/200000 [00:16<03:35, 869.15it/s][A[A[A


  7%|▋         | 13091/200000 [00:16<03:41, 843.36it/s][A[A[A


  7%|▋         | 13202/200000 [00:17<03:25, 906.95it/s][A[A[A


  7%|▋         | 13298/200000 [00:17<03:41, 843.52it/s][A[A

In [20]:
from rank_bm25 import BM25Okapi #https://pypi.org/project/rank-bm25/ examples
from rank_bm25 import BM25Plus

def build_query(query):
    """Build the search body for `query`.

    The body is a bool/should query made of two match clauses over the same
    query text: one on 'content' (boost 1) and one on 'title' (operator 'OR',
    boost 0.3).

    Args:
        query: query text placed into both clauses.

    Returns:
        dict usable as the body of a search request.
    """
    content_clause = {'match': {'content': {'query': query, 'boost': 1}}}
    title_clause = {
        'match': {'title': {'query': query, 'operator': 'OR', 'boost': 0.3}}
    }
    return {'query': {'bool': {'should': [content_clause, title_clause]}}}

def run_search(query, size, index_name):
    """Search `index_name` for `query` and return the ids of the hits.

    Args:
        query: query text, turned into a request body by build_query.
        size: value passed as the `size` argument of the search call.
        index_name: index to search.

    Returns:
        list with the '_id' of every returned hit, in response order.
    """
    response = es.search(index=index_name, body=build_query(query), size=size)
    found_ids = []
    for hit in response['hits']['hits']:
        found_ids.append(hit['_id'])
    return found_ids

def prepare_testing(index_name, queries, size=100):
    """Build parallel lists of BM25 scores and relevance labels.

    For every query the top `size` hits are fetched from `index_name`, a
    BM25Okapi model is fitted on the word lists of exactly those hits, and
    each hit gets its BM25 score for the query text.

    Reads the module-level `documents_by_id` and `relevance`.

    Args:
        index_name: index to search.
        queries: dict mapping query id -> query text.
        size: number of hits requested per query (the previously hard-coded
            100 by default).

    Returns:
        (x, y): x holds one BM25 score per retrieved document; y holds 1 when
        that document is listed in `relevance` for the query and 0 otherwise.
        Both lists follow the same order. Queries without hits add nothing.

    Raises:
        KeyError: a retrieved id is missing from `documents_by_id`.
    """
    x, y = [], []
    for query_id in tqdm_notebook(queries.keys()):
        query_text = queries[query_id]
        doc_ids = run_search(query_text, size, index_name)
        if not doc_ids:
            continue
        all_words = [documents_by_id[id_].words for id_ in doc_ids]
        bm25 = BM25Okapi(all_words)
        # Score against the words of the query TEXT. The previous code split
        # the dictionary key (the query id), so the scores had nothing to do
        # with the query. split() without an argument also drops the empty
        # tokens produced by surrounding or repeated whitespace.
        x.extend(bm25.get_scores(query_text.split()))
        relevant_ids = set(relevance.get(query_id, ()))
        y.extend(1 if id_ in relevant_ids else 0 for id_ in doc_ids)
    return x, y

In [21]:
# x: one BM25 score per retrieved document; y: the matching 0/1 relevance label.
# Runs one search per query, so this cell is slow on the full query set.
x, y = prepare_testing('hw2index', queries)

HBox(children=(IntProgress(value=0, max=29231), HTML(value='')))




 10%|█         | 20001/200000 [00:41<03:31, 852.35it/s][A[A[A

KeyboardInterrupt: 