In [None]:
''' This cell contains all constants that may differ on our machines '''

# Host and port used both by the Elasticsearch client and by the raw HTTP call to _cat/indices.
ELASTIC_HOST = 'localhost'
ELASTIC_PORT = 9200
COLLECTION_DIRECTORY = "byweb" # directory with .out files to process
COLLECTION_DIRECTORY_MYSTEM = "../byweb_stem" # directory with .out files after mystem processing

# XML inputs read by read_queries() and read_relevance(); both are opened with cp1251 encoding.
QUERIES_FILE = "web2008_adhoc.xml"
RELEVANCE_FILE = "relevant_table_2009.xml"

In [None]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time

In [None]:
# Shared Elasticsearch client used by every indexing and search cell in this notebook.
# NOTE(review): 'timeout' and 'maxsize' are passed inside the host dict — presumably a
# request timeout in seconds and a connection-pool size; confirm against the installed client.
es = Elasticsearch([{'host': ELASTIC_HOST, 'port': ELASTIC_PORT, 'timeout': 360, 'maxsize': 25}])

In [None]:
# Index body handed to es.indices.create(): the two document fields are both mapped as `text`.
settings = {
    'mappings': {
        'properties': {field: {'type': 'text'} for field in ('content', 'title')}
    }
}

In [None]:
def recreate_index(index_name):
    """Drop `index_name` if it exists, then create it again with the module-level `settings`.

    Only the "index does not exist yet" case is tolerated. Connection or
    permission errors propagate to the caller instead of being silently
    swallowed (the previous bare ``except`` also caught KeyboardInterrupt).
    """
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)
    es.indices.create(index=index_name, body=settings)

In [None]:
# Start from an empty index: recreate_index deletes the index of this name before creating it.
recreate_index('hw2index')
# recreate_index('hw2index_stem')

In [None]:
def create_es_action(index, doc_id, document):
    """Build one bulk action that stores `document` under id `doc_id` in `index`."""
    action = dict(_index=index, _id=doc_id)
    action['_source'] = document
    return action

In [None]:
class Document:
    """One crawled page of the collection.

    Attributes:
        url:      document url
        id:       unique document id (str)
        sz_bytes: document size in bytes before deleting html markup
        sz_words: number of words in document before deleting html markup
        words:    list of words in document after deleting html markup
        links:    list of links in document
    """

    def __init__(self, doc_url, doc_id, sz_bytes, sz_words):
        self.url, self.id = doc_url, doc_id
        self.sz_bytes, self.sz_words = sz_bytes, sz_words
        # Fresh lists per instance; callers fill them in after construction.
        self.words, self.links = [], []

In [None]:
import os
from tqdm import tqdm
from tqdm import tqdm_notebook
import pickle
 
class BaseDocumentProcessor:
    """No-op base class for the per-document processors driven by process_collection()."""

    def process(self, document, title):
        """Handle one document and its title; the base implementation does nothing."""
        return None

    def result(self):
        """Return what the processor accumulated; the base implementation returns None."""
        return None
    
def process_file(d, f, processor, pbar):
    """Feed every pickled document of one collection file to `processor`.

    Args:
        d: directory that contains the file.
        f: name of a ``.out`` file holding consecutively pickled documents.
           The sibling ``.title`` file (same name, ``.out`` replaced by
           ``.title``) holds one pickled dict mapping document id to title.
        processor: object whose ``process(document, title)`` is called per document.
        pbar: progress bar; ``update(1)`` is called once per document actually read.

    Raises:
        KeyError: if a document id is missing from the title dict.
        pickle.UnpicklingError: if the file is corrupt. Only a clean end of
            file (EOFError) terminates the loop; other errors are no longer
            mistaken for "no more documents".
    """
    out_path = os.path.join(d, f)
    title_path = os.path.join(d, f.replace(".out", ".title"))
    print("processing", out_path)
    # SECURITY: unpickling can execute arbitrary code; only use trusted collection files.
    with open(out_path, "rb") as fin, open(title_path, "rb") as tfin:
        dct = pickle.load(tfin)
        while True:
            try:
                document = pickle.load(fin)
            except EOFError:
                # Normal termination: no more pickled documents in this file.
                break
            # Count only documents that were really loaded (not the final EOF probe).
            pbar.update(1)
            processor.process(document, dct[document.id])
            
def process_collection(directory, processor, total=200000):
    """Run `processor` over every document of every ``.out`` file in `directory`.

    Args:
        directory: folder containing the ``.out`` / ``.title`` file pairs.
        processor: object with a ``process(document, title)`` method.
        total: expected number of documents, used only to size the progress bar.

    Files are visited in sorted name order so repeated runs see the documents
    in the same order (``os.listdir`` order is arbitrary). The progress bar is
    closed when the pass finishes or fails.
    """
    with tqdm(total=total) as pbar:
        for file in sorted(os.listdir(directory)):
            if file.endswith(".out"):
                process_file(directory, file, processor, pbar)

In [None]:
document_urls = {}

class GetDocUrls(BaseDocumentProcessor):
    """Record the normalised URL of every document in the module-level `document_urls`."""

    def __init__(self):
        """ do all initialization here """

    def process(self, document, title):
        """Normalise `document.url` and register it as a key of `document_urls`.

        Normalisation drops the query string, the fragment and one trailing
        slash. A URL that is empty after normalisation is skipped instead of
        raising IndexError.
        """
        # str(...)[2:-1] strips a b'...' wrapper — presumably document.url is bytes; TODO confirm
        document_url = str(document.url)[2:-1].split("?")[0].split("#")[0]
        if document_url.endswith('/'):
            document_url = document_url[:-1]
        if not document_url:
            return
        document_urls[document_url] = True

    def result(self):
        """Nothing to return: the URLs are collected in `document_urls`."""
        pass
        
# Pass over the collection that fills the module-level document_urls dict.
processor = GetDocUrls()        
process_collection(COLLECTION_DIRECTORY, processor)
# GetDocUrls.result() returns nothing; the collected URLs live in document_urls.
processor.result()

In [None]:
from tqdm import tqdm_notebook
import networkx as nx
import operator
from urllib.parse import urljoin

pagerank_dict = {}

class GraphBuider(BaseDocumentProcessor):
    """Collect links between documents of the collection and compute PageRank.

    An edge (source, target) is recorded when a document links to a URL that
    is a key of the module-level `document_urls`. Both ends are normalised the
    same way: query string, fragment and one trailing slash are removed.
    """

    def __init__(self):
        """ do all initialization here """
        self.edge_list = []

    @staticmethod
    def _normalize(url):
        """Drop the query string, the fragment and a single trailing slash from `url`."""
        url = url.split("?")[0].split("#")[0]
        return url[:-1] if url.endswith('/') else url

    def process(self, document, title):
        """Append one edge per distinct in-collection link target of `document`.

        Relative links are resolved with ``urljoin`` against the document's
        original (un-normalised) URL, so ``../x``, ``/x``, ``./x`` and links on
        host-only or directory URLs resolve as a browser would resolve them.
        Documents or links whose normalised URL is empty are skipped.
        """
        # str(...)[2:-1] strips a b'...' wrapper — presumably document.url is bytes; TODO confirm
        raw_url = str(document.url)[2:-1]
        document_url = self._normalize(raw_url)
        if not document_url:
            return
        # dict keys give O(1) de-duplication while keeping first-seen order.
        targets = {}
        for link in document.links:
            if not link:
                continue
            target = self._normalize(urljoin(raw_url, link))
            if target:
                targets[target] = None
        for target in targets:
            if target in document_urls:
                self.edge_list.append((document_url, target))

    def result(self):
        """Return the value of ``nx.pagerank`` for the directed graph built from the edges."""
        graph = nx.DiGraph(self.edge_list)
        return nx.pagerank(graph)
        
# Pass over the collection that builds the link graph; it reads document_urls,
# so the GetDocUrls pass must have been run first.
processor = GraphBuider()        
process_collection(COLLECTION_DIRECTORY, processor)
# result() returns the output of nx.pagerank for the collected edges.
pagerank_dict = processor.result()

In [None]:
class IndexDocs(BaseDocumentProcessor):
    """Turn every document into one bulk-index action for `index_name`."""

    def __init__(self, index_name):
        """Remember the target index and start with an empty action list."""
        self.actions = []
        self.index_name = index_name

    def process(self, document, title):
        """Append the bulk action for `document`.

        The action source is a JSON string with the fields ``content``
        (``document.words``) and ``title``. PageRank is not part of the indexed
        document (that experiment was commented out), so the URL is no longer
        normalised here: the unused computation could only raise IndexError on
        an empty URL and abort indexing.
        """
        source = json.dumps({'content': document.words, 'title': title})
        self.actions.append(create_es_action(self.index_name, document.id, source))

    def result(self):
        """Return the list of accumulated bulk actions."""
        return self.actions
        


In [None]:
def es_actions_generator(index_name, collection_directory):
    """Build and return the list of bulk actions for every document of the collection.

    Despite the name this returns the complete list produced by IndexDocs,
    not a lazy generator. The preprocessing time in seconds is printed.
    """
    # Local import: the notebook binds the name `time` to the function in one
    # cell (`from time import time`) and to the module in another
    # (`import time`), so `time.time()` only worked for one execution order.
    from time import perf_counter

    processor = IndexDocs(index_name)
    start = perf_counter()
    process_collection(collection_directory, processor)
    print("preprocess time = ", perf_counter() - start)
    return processor.result()

In [None]:
# Rebinds `time` to the module (the import cell bound it to the function time.time);
# the time.time() calls below and in es_actions_generator depend on this.
import time
# Wall-clock timing of the whole run: building the actions plus the bulk upload.
start = time.time()
# Each item yielded by parallel_bulk is unpacked as (ok, result); results of failed actions are printed.
for ok, result in tqdm_notebook(parallel_bulk(es, es_actions_generator('hw2index', COLLECTION_DIRECTORY), queue_size=4, thread_count=4, chunk_size=1000)):
    if not ok:
        print(result)
end = time.time()
print('Time=' + str(end - start))


In [None]:
import requests
# `v` is sent as a query parameter with an empty value, i.e. the URL ends in `?v=`.
# NOTE(review): presumably this asks the _cat API for verbose output (column headers) — confirm.
param = (('v', ''),)

# Ask the node for its index listing. The timeout keeps the cell from
# hanging forever when the node is unreachable.
resp = requests.get(f'http://{ELASTIC_HOST}:{ELASTIC_PORT}/_cat/indices', params=param, timeout=30)
resp.text

In [None]:
from bs4 import BeautifulSoup

def read_queries():
    """Parse QUERIES_FILE and return a dict mapping each task id to the task's text."""
    with open(QUERIES_FILE, "rt", encoding="cp1251") as handle:
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is installed.
        soup = BeautifulSoup(handle.read())
    return {node["id"]: node.text for node in soup.find_all("task")}

def read_relevance():
    """Parse RELEVANCE_FILE and return {task id: [ids of documents marked "vital"]}.

    Tasks without any "vital" document are left out of the result.
    """
    with open(RELEVANCE_FILE, "rt", encoding="cp1251") as handle:
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is installed.
        soup = BeautifulSoup(handle.read())
    vital_by_task = {}
    for node in soup.find_all("task"):
        vital_ids = []
        for entry in node.find_all("document"):
            if entry["relevance"] == "vital":
                vital_ids.append(entry["id"])
        if len(vital_ids) > 0:
            vital_by_task[node["id"]] = vital_ids
    return vital_by_task

In [None]:
# relevance: task id -> ids of "vital" documents; queries: task id -> query text.
relevance = read_relevance()
queries = read_queries()

In [None]:
class DocById(BaseDocumentProcessor):
    """Keep every visited document in memory, keyed by its id."""

    def __init__(self):
        """Start with an empty id -> document mapping."""
        self.docs = dict()

    def process(self, document, title):
        """Store `document` under `document.id`; the title is ignored."""
        self.docs.update({document.id: document})

    def result(self):
        """Return the id -> document mapping."""
        return self.docs
        



In [None]:
# Pass over the collection that loads every document into memory, keyed by document id.
processor = DocById()        
process_collection(COLLECTION_DIRECTORY, processor)
documents_by_id = processor.result()

In [None]:
from rank_bm25 import BM25Okapi #https://pypi.org/project/rank-bm25/ examples
from rank_bm25 import BM25Plus

def build_query(query):
    """Return the search body for `query`: a bool/should over content and title.

    The content match carries boost 1; the title match uses operator OR and boost 0.3.
    """
    content_clause = {
        'match': {
            'content': {'query': query, 'boost': 1}
        }
    }
    title_clause = {
        'match': {
            'title': {'query': query, 'operator': 'OR', 'boost': 0.3}
        }
    }
    return {'query': {'bool': {'should': [content_clause, title_clause]}}}

def run_search(query, size, index_name):
    """Search `index_name` for `query` and return the `_id` of each returned hit, in response order."""
    response = es.search(index=index_name, body=build_query(query), size=size)
    found_ids = []
    for hit in response['hits']['hits']:
        found_ids.append(hit['_id'])
    return found_ids

def prepare_testing(index_name, queries):
    """Run every query and return three parallel lists: document ids, query ids and labels.

    For each query the top 100 hits are fetched. A hit is labelled 1 when the
    query has an entry in the module-level `relevance` dict and the hit id is
    listed there, otherwise 0. Queries without hits contribute nothing.
    """
    did, qid, y = [], [], []
    for query_id in tqdm_notebook(queries.keys()):
        found = run_search(queries[query_id], 100, index_name)
        if not found:
            continue
        did.extend(found)
        qid.extend([query_id] * len(found))
        for doc_id in found:
            is_vital = query_id in relevance.keys() and doc_id in relevance[query_id]
            y.append(1 if is_vital else 0)
    return did, qid, y

In [None]:
# Parallel lists: did[i] is a retrieved document id, qid[i] its query id, y[i] its 0/1 label.
did, qid, y = prepare_testing('hw2index', queries)

In [None]:
import numpy as np
import pickle

def _load_pickle(path):
    """Load one pickled object from the file at `path`.

    pickle.load needs an open binary file object; passing the path string
    directly raises TypeError.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files you produced yourself.
    with open(path, "rb") as fin:
        return pickle.load(fin)

# Precomputed feature tables and id lookups.
m1 = _load_pickle("m1.dat")
m2 = _load_pickle("m2.dat")
m3 = _load_pickle("m3.dat")
m4 = _load_pickle("m4.dat")
m5 = _load_pickle("m5.dat")
m6 = _load_pickle("m6.dat")
id_to_ind = _load_pickle("id_to_ind.dat")
id_to_url = _load_pickle("id_to_url.dat")

# One 6-dimensional feature vector per (query, document) pair, in the same order as y.
x = []
for i in range(len(y)):
    v = np.zeros((6,))
    v[0] = m1[qid[i]][id_to_ind[did[i]]]      # indexed by query, then document
    v[1] = m2[id_to_ind[did[i]]]              # indexed by document only
    v[2] = m3[qid[i]][id_to_ind[did[i]]]      # indexed by query, then document
    # NOTE(review): id_to_ind is keyed by document id above but by URL here — confirm intended.
    v[3] = m4[id_to_ind[id_to_url[did[i]]]]
    v[4] = m5[qid[i]]                         # indexed by query only
    v[5] = m6[qid[i]]                         # indexed by query only
    x.append(v)

In [None]:
from catboost import Pool

# Convert the Python lists into numpy arrays (rebinding the same names).
x = np.array(x)
qid = np.array(qid)
y = np.array(y)
# Evaluation pool: features, 0/1 labels, and the query id of each row as group id.
test = Pool(data=x, label=y, group_id=qid)

In [None]:
from catboost import CatBoost
from statistics import mean

# load_model is an instance method: calling it on the class binds the file
# name to `self` and fails. Create the model object first, then load into it.
model = CatBoost()
model.load_model("byweb_model.md")
# NOTE(review): the mean is taken over the whole list returned by eval_metrics; if that list
# holds one value per iteration this is an average over iterations, not the final score — confirm.
print("test: ", mean(model.eval_metrics(test, ['NDCG:top=20'])['NDCG:top=20;type=Base']))