In [None]:
"""Configuration cell: every constant that may differ between our machines."""

# Host and port (presumably of the Elasticsearch instance — TODO confirm).
ELASTIC_HOST = "localhost"
ELASTIC_PORT = 9200

# Directory with the .out files to process.
COLLECTION_DIRECTORY = "../byweb"
# Directory with the .out files after mystem processing.
COLLECTION_DIRECTORY_MYSTEM = "../byweb_stem"

# XML inputs: the queries and the relevance table.
QUERIES_FILE = "../web2008_adhoc.xml"
RELEVANCE_FILE = "../relevant_table_2009.xml"

In [None]:
class Document:
    """A single document of the collection.

    Attributes:
        url: document url.
        id: unique document id (str).
        sz_bytes: document size in bytes before deleting html markup.
        sz_words: number of words in the document before deleting html markup.
        words: list of words in the document after deleting html markup
            (starts empty).
        links: list of links found in the document (starts empty).
    """

    def __init__(self, doc_url, doc_id, sz_bytes, sz_words):
        # Values supplied by the caller.
        self.url = doc_url
        self.id = doc_id
        self.sz_bytes = sz_bytes
        self.sz_words = sz_words
        # Per-instance containers, filled in after construction.
        self.words = list()
        self.links = list()

In [None]:
import os
from tqdm import tqdm
from tqdm import tqdm_notebook
import pickle

class BaseDocumentProcessor:
    """Base class for document processors; every hook is a no-op here.

    process_file() calls process() once per document and process_pack()
    once after each file; result() is called by the notebook cells
    themselves.
    """

    def process(self, document, title):
        """Handle one document and its title (does nothing by default)."""
        return None

    def result(self):
        """Return the accumulated result (None by default)."""
        return None

    def process_pack(self):
        """Hook invoked after a whole file was processed (does nothing by default)."""
        return None

def process_file(d, f, processor, pbar):
    """Feed every pickled document of one collection file to ``processor``.

    The file ``d/f`` is read as a sequence of pickled objects. The
    companion file (same name with ".out" replaced by ".title") holds one
    pickled mapping that is indexed by ``document.id`` to obtain the title.

    Args:
        d: directory containing the file.
        f: file name inside ``d``.
        processor: object with ``process(document, title)`` and
            ``process_pack()``; the latter is called once after the file.
        pbar: progress bar; ``pbar.update(1)`` is called once per
            processed document.

    Raises:
        KeyError: if a document id has no entry in the title mapping.
        Exception: any unpickling error other than end-of-file propagates
            instead of being silently treated as the end of the file.
    """
    documents_path = os.path.join(d, f)
    titles_path = os.path.join(d, f.replace(".out", ".title"))
    print("processing", documents_path)
    # NOTE(review): pickle.load can execute arbitrary code; only use on
    # trusted collection files.
    with open(documents_path, "rb") as fin, open(titles_path, "rb") as tfin:
        dct = pickle.load(tfin)
        while True:
            try:
                document = pickle.load(fin)
            except EOFError:
                # Regular end of the stream of pickled documents. Only EOF
                # ends the loop: a bare except would also hide corrupt
                # files or a missing Document class.
                break
            processor.process(document, dct[document.id])
            # Tick after the document was handled, so the bar counts
            # documents rather than load attempts.
            pbar.update(1)
    processor.process_pack()

def process_collection(directory, processor, total=200000):
    """Run ``processor`` over every ``.out`` file found in ``directory``.

    Args:
        directory: directory that is listed for ``.out`` files.
        processor: passed through to process_file() for each file.
        total: value given to the progress bar as its expected number of
            updates (presumably the approximate collection size — TODO
            confirm); defaults to the previously hard-coded 200000.
    """
    # The context manager closes the bar even if processing fails.
    with tqdm(total=total) as pbar:
        # os.listdir() returns names in arbitrary order; sorting makes the
        # processing order, and every index derived from it, reproducible.
        for file in sorted(os.listdir(directory)):
            if file.endswith(".out"):
                process_file(directory, file, processor, pbar)

In [None]:
from pymystem3 import Mystem
import re

def stem_queries(queries):
    """Lemmatize every query with Mystem and drop non-word tokens.

    Args:
        queries: mapping of query id to raw query text.

    Returns:
        dict mapping each query id to the kept lemmas joined by a single
        space.
    """
    m = Mystem(grammar_info=False, disambiguation=False)
    # Raw string: in a normal literal '\-' is an invalid escape sequence,
    # which modern Python reports as a warning. The compiled pattern is
    # unchanged.
    # match() anchors at the start of the token, so a token is kept when
    # its FIRST character is a lowercase latin/cyrillic letter, a digit or
    # a hyphen.
    token_start = re.compile(r'[a-zа-яё0-9\-]')
    return {
        qid: ' '.join(token for token in m.lemmatize(text) if token_start.match(token))
        for qid, text in queries.items()
    }

In [None]:
from rank_bm25 import BM25Okapi #https://pypi.org/project/rank-bm25/ examples
from rank_bm25 import BM25Plus

def __feature_bm25(documents, queries, BM, result):
    """Score every document against every query with the ranker class ``BM``.

    Args:
        documents: iterable of documents; it is materialized into a list
            and handed to ``BM`` as the corpus.
        queries: mapping of query id to query text; the text is split on
            single spaces before scoring.
        BM: ranker class, called as ``BM(corpus)``; its instance must offer
            ``get_scores(tokens)``.
        result: mapping of query id to list of scores; scores are appended
            in place, so repeated calls accumulate.

    Returns:
        The same ``result`` mapping.
    """
    corpus = list(documents)
    ranker = BM(corpus)
    for query_id, query_text in queries.items():
        print(query_id)
        bucket = result.setdefault(query_id, [])
        bucket.extend(ranker.get_scores(query_text.split(" ")))
    return result


def feature_bm25Plus(documents, queries, result):
    """Accumulate BM25Plus scores of ``documents`` for ``queries`` into ``result``."""
    return __feature_bm25(documents, queries, BM=BM25Plus, result=result)


def feature_bm25(documents, queries, result):
    """Accumulate BM25Okapi scores of ``documents`` for ``queries`` into ``result``."""
    return __feature_bm25(documents, queries, BM=BM25Okapi, result=result)

In [None]:
class QWord:
    """Node of a doubly linked list of word occurrences.

    Attributes:
        word: the word at this occurrence.
        pos: position of the occurrence.
        prev / next: neighbouring nodes (None at the ends).
        step: ``pos`` minus the previous node's ``pos``; 0 for the first node.
    """

    def __init__(self, word, pos, prev = None):
        self.prev = prev
        self.next = None
        self.word = word
        self.pos = pos
        if prev is None:
            # First node of a list: nothing to link back to.
            self.step = 0
        else:
            # Hook the new node behind its predecessor.
            prev.next = self
            self.step = pos - prev.pos

def check(d):
    """Return True when no value of the mapping ``d`` equals 0 (True for an empty mapping)."""
    return all(count != 0 for count in d.values())
        
def feature_window(document, query, title):
    """Length of the shortest span of the document covering all query terms.

    The span length is the difference between the word positions of its
    last and first query-term occurrence, so a span made of a single
    occurrence has length 0.

    Args:
        document: object with ``words`` (sequence of words) and ``sz_bytes``.
        query: query text; its terms are obtained by splitting on single
            spaces (duplicates are ignored).
        title: unused; kept so that all feature functions share a signature.

    Returns:
        * ``document.sz_bytes`` when no query term occurs in the document
          (NOTE(review): this sentinel differs from the next one — kept
          as is, confirm it is intended);
        * ``len(document.words)`` when some, but not all, terms occur;
        * otherwise the minimal span length described above.
    """
    query_terms = set(query.split(' '))
    # (position, word) of every query-term occurrence, in document order.
    hits = [(pos, word) for pos, word in enumerate(document.words) if word in query_terms]
    if not hits:
        return document.sz_bytes

    best = len(document.words)
    counts = dict.fromkeys(query_terms, 0)
    missing = len(query_terms)  # number of terms absent from the current window
    left = 0
    for right_pos, right_word in hits:
        if counts[right_word] == 0:
            missing -= 1
        counts[right_word] += 1
        # While the window hits[left..right] covers every term, record it
        # and shrink it from the left. This includes the window made of a
        # single occurrence, which matters for single-term queries.
        while missing == 0:
            left_pos, left_word = hits[left]
            best = min(best, right_pos - left_pos)
            counts[left_word] -= 1
            if counts[left_word] == 0:
                missing += 1
            left += 1
    return best

In [None]:
def feature_query_len(document, query, title):
    """Return the length of ``query`` as given by ``len()``; ``document`` and ``title`` are ignored."""
    query_length = len(query)
    return query_length

def feature_query_list_len(document, query, title):
    """Return the number of pieces obtained by splitting ``query`` on single spaces.

    ``document`` and ``title`` are ignored.
    """
    # Splitting on ' ' always yields one more piece than there are spaces
    # (empty pieces included), so counting separators gives the same number.
    return query.count(' ') + 1

def feature_doc_len(document, query, title):
    """Return the number of entries in ``document.words``; ``query`` and ``title`` are ignored."""
    word_list = document.words
    return len(word_list)

In [None]:
# Normalized URL -> True for every document of the collection (used as a set).
document_urls = {}

class GetDocUrls(BaseDocumentProcessor):
    """Collects the normalized URL of every processed document into ``document_urls``."""

    def __init__(self):
        """ do all initialization here """

    def process(self, document, title):
        """Register the normalized URL of ``document``; ``title`` is unused.

        The URL is normalized by dropping everything from the first "?" or
        "#" on and then removing one trailing slash.
        """
        # str(...)[2:-1] strips the b'...' wrapper from the repr of the URL
        # (assumes document.url is bytes; elsewhere in this notebook it is
        # decoded with .decode("cp1251")).
        document_url = str(document.url)[2:-1].split("?")[0].split("#")[0]
        # endswith() rather than [-1]: an empty URL must not raise IndexError.
        if document_url.endswith('/'):
            document_url = document_url[:-1]
        document_urls[document_url] = True

    def result(self):
        """Nothing to return; the URLs live in the module-level ``document_urls``."""
        pass

processor = GetDocUrls()
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

In [None]:
import networkx as nx
import operator

from urllib.parse import urljoin

# Normalized URL -> PageRank value; filled at the end of this cell.
pagerank_dict = {}

def _normalize_url(url):
    """Drop the query string, the fragment and one trailing slash from ``url``."""
    url = url.split("?")[0].split("#")[0]
    # endswith() rather than [-1]: an empty URL must not raise IndexError.
    return url[:-1] if url.endswith('/') else url

class GraphBuider(BaseDocumentProcessor):
    """Builds the link graph between documents of the collection and ranks it.

    Only links whose normalized target is a key of ``document_urls`` become
    edges; ``result()`` runs PageRank on the resulting directed graph.
    """

    def __init__(self):
        """ do all initialization here """
        self.edge_list = []

    def process(self, document, title):
        """Add one edge per distinct in-collection link target of ``document``.

        ``title`` is unused.
        """
        # str(...)[2:-1] strips the b'...' wrapper from the repr of the URL
        # (assumes document.url is bytes — it is decoded with
        # .decode("cp1251") elsewhere in this notebook).
        raw_url = str(document.url)[2:-1]
        document_url = _normalize_url(raw_url)

        # A dict keeps first-seen order and gives O(1) duplicate detection.
        targets = {}
        for link in document.links:
            if not link:
                continue
            try:
                # Resolve against the raw URL, before its trailing slash is
                # stripped, so that "page", "./page", "../page" and "/page"
                # are all resolved the standard way. Absolute links pass
                # through.
                absolute = urljoin(raw_url, link)
            except ValueError:
                # Malformed link (e.g. a broken IPv6 host): ignore it.
                continue
            targets[_normalize_url(absolute)] = True

        for target in targets:
            # Self-links (e.g. "#top" anchors) are skipped: they carry no
            # ranking information and would only inflate the page's own rank.
            if target != document_url and target in document_urls:
                self.edge_list.append((document_url, target))

    def result(self):
        """Return the PageRank of every node, as computed by ``nx.pagerank``."""
        graph = nx.DiGraph(self.edge_list)
        return nx.pagerank(graph)

processor = GraphBuider()
process_collection(COLLECTION_DIRECTORY, processor)
pagerank_dict = processor.result()

In [None]:
# Decoded document URL -> document id.
url_to_id = {}

class GetDocUrlsToIds(BaseDocumentProcessor):
    """Fills the module-level ``url_to_id`` while the collection is processed."""

    def __init__(self):
        """ do all initialization here """

    def process(self, document, title):
        """Map the cp1251-decoded URL of ``document`` to its id; ``title`` is unused."""
        decoded_url = document.url.decode("cp1251")
        url_to_id[decoded_url] = document.id

    def result(self):
        """Nothing to return; the mapping lives in ``url_to_id``."""
        return None

processor = GetDocUrlsToIds()
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

In [None]:
from bs4 import BeautifulSoup

def read_relevance(file, r_type, parser="html.parser"):
    """Read relevance judgements from a cp1251-encoded XML table.

    Every ``<document>`` inside a ``<task>`` yields one judgement.

    Args:
        file: path of the relevance table.
        r_type: 2008 means the document "id" attribute is a URL that is
            translated through ``url_to_id`` (documents with an unknown URL
            are skipped); any other value means the attribute is an
            integer id.
        parser: BeautifulSoup parser name. It is passed explicitly so that
            the result does not depend on which parsers happen to be
            installed.

    Returns:
        Three parallel lists ``(did, qid, y)``: document id, query id (the
        task id with its first three characters removed, as int) and the
        label (1 when relevance is "vital", else 0).
    """
    did = []
    qid = []
    y = []
    # Separate name for the handle so the ``file`` argument is not shadowed.
    with open(file, "rt", encoding="cp1251") as fin:
        bs = BeautifulSoup(fin.read(), parser)
    for task in bs.find_all("task"):
        for doc in task.find_all("document"):
            if r_type == 2008:
                if doc["id"] not in url_to_id:
                    continue
                did.append(url_to_id[doc["id"]])
            else:
                did.append(int(doc["id"]))
            qid.append(int(task["id"][3:]))
            y.append(1 if doc["relevance"] == "vital" else 0)
    return did, qid, y

def read_queries(file, parser="html.parser"):
    """Read the queries from a cp1251-encoded XML file.

    Args:
        file: path of the query file.
        parser: BeautifulSoup parser name. It is passed explicitly so that
            the result does not depend on which parsers happen to be
            installed.

    Returns:
        dict mapping the query id (the task id with its first three
        characters removed, as int) to the text content of the ``<task>``.
    """
    # Separate name for the handle so the ``file`` argument is not shadowed.
    with open(file, "rt", encoding="cp1251") as fin:
        bs = BeautifulSoup(fin.read(), parser)
    return {int(task["id"][3:]): task.text for task in bs.find_all("task")}

In [None]:
# Load the judgements (r_type 2008: documents are identified by URL) and the
# queries, then lemmatize the queries.
# NOTE(review): the relevance path is hard-coded here, while RELEVANCE_FILE
# from the configuration cell names the 2009 table and is not used in this
# part of the notebook — confirm which table is intended.
did, qid, y = read_relevance("../relevant_table_2008.xml", 2008)
queries = read_queries(QUERIES_FILE)
queries = stem_queries(queries)

In [None]:
class GetAllMetricsProcessor(BaseDocumentProcessor):
    """Computes the per-document features while the collection is processed.

    Collected data (all indexed by processing order of the documents):
        bm25_plus: query id -> list of BM25Plus scores, one per document.
        doc_len:   list of document lengths.
        window:    query id -> list of feature_window() values.
        id_to_ind: document id -> index into the lists above.
    """

    def __init__(self, queries):
        self.queries = queries
        self.docs = {}        # documents of the current pack: id -> words
        self.bm25_plus = {}
        self.doc_len = []
        self.window = {}
        self.id_to_ind = {}

    def process(self, document, title):
        """Record the features of one document."""
        self.docs[document.id] = document.words
        self.doc_len.append(feature_doc_len(document, None, title))
        self.id_to_ind[document.id] = len(self.id_to_ind)
        # Use the queries given to the constructor, not the notebook-level
        # ``queries`` variable, so the processor does not depend on global
        # state.
        for query_id, query_text in self.queries.items():
            self.window.setdefault(query_id, []).append(
                feature_window(document, query_text, title)
            )

    def process_pack(self):
        """Score the documents gathered since the previous pack with BM25Plus."""
        if not self.docs:
            # Nothing was read from this file; skip fitting BM25 on an
            # empty corpus.
            return
        feature_bm25Plus(self.docs.values(), self.queries, self.bm25_plus)
        self.docs = {}

    def result(self):
        """Return ``(bm25_plus, doc_len, window, id_to_ind)``."""
        return self.bm25_plus, self.doc_len, self.window, self.id_to_ind

processor = GetAllMetricsProcessor(queries)
process_collection(COLLECTION_DIRECTORY, processor)
# m1: BM25Plus scores, m2: document lengths, m3: query windows,
# m4: PageRank, m5: query length, m6: number of query terms.
m1, m2, m3, id_to_ind = processor.result()
m4 = pagerank_dict
m5 = {}
m6 = {}

for q in queries.keys():
    m5[q] = feature_query_len(None, queries[q], None)
    m6[q] = feature_query_list_len(None, queries[q], None)

In [None]:
import numpy as np

# Inverse of url_to_id: document id -> decoded URL.
id_to_url = {}
for uurl, iid in url_to_id.items():
    id_to_url[iid] = uurl

def _pagerank_key(url):
    """Rebuild the key under which ``url`` is stored in the PageRank dict.

    The graph cell keys its nodes by the repr of the raw URL bytes without
    the b'...' wrapper, cut at the first "?" or "#" and without one
    trailing slash. ``url`` was decoded from cp1251, so encoding it back
    restores those bytes.
    """
    key = str(url.encode("cp1251"))[2:-1].split("?")[0].split("#")[0]
    return key[:-1] if key.endswith('/') else key

# One row per judged (query, document) pair, one column per feature.
x = np.zeros((len(y), 6))
for i in range(len(y)):
    doc_ind = id_to_ind[did[i]]
    x[i, 0] = m1[qid[i]][doc_ind]
    x[i, 1] = m2[doc_ind]
    x[i, 2] = m3[qid[i]][doc_ind]
    # PageRank is keyed by normalized URL, not by document index. Documents
    # without any in-collection link are not nodes of the graph and get 0.
    x[i, 3] = m4.get(_pagerank_key(id_to_url[did[i]]), 0.0)
    x[i, 4] = m5[qid[i]]
    x[i, 5] = m6[qid[i]]

In [None]:
def split(x, y, qid, train_ratio, seed=None):
    """Randomly split the rows into a train and a test part.

    Row order is preserved inside both parts.

    Args:
        x, y, qid: numpy arrays with the same first dimension (features,
            labels, query ids).
        train_ratio: fraction of rows that goes to the train part
            (rounded down).
        seed: optional seed for a reproducible split. With None the global
            ``np.random`` state is used, exactly as before.

    Returns:
        ``(train_x, train_y, train_qid, test_x, test_y, test_qid)``.

    NOTE(review): rows are sampled individually, so documents of one query
    can land in both parts — confirm this is intended for a ranking task.
    """
    n = qid.shape[0]
    rng = np.random if seed is None else np.random.RandomState(seed)
    train_inds = np.sort(rng.choice(n, int(n * train_ratio), replace=False))
    # setdiff1d already returns the remaining indices sorted.
    test_inds = np.setdiff1d(np.arange(n), train_inds)
    return (
        x[train_inds], y[train_inds], qid[train_inds],
        x[test_inds], y[test_inds], qid[test_inds],
    )

In [None]:
# split() indexes its inputs with index arrays, so the query ids and labels
# are converted to numpy arrays first.
qid = np.array(qid)
y = np.array(y)
# 80% of the rows go to the train part, the rest to the test part.
train_x, train_y, train_qid, test_x, test_y, test_qid = split(x, y, qid, 0.8)

In [None]:
from catboost import Pool
# CatBoost pools for both parts; the query ids are passed as group_id.
train = Pool(data=train_x, label=train_y, group_id=train_qid)
test = Pool(data=test_x, label=test_y, group_id=test_qid)

In [None]:
from catboost import CatBoost
from statistics import mean

parameters = { 'custom_metric': ['NDCG:top=20'], 'iterations': 2000, 'loss_function': 'PairLogitPairwise' }

model = CatBoost(parameters)
# NOTE(review): the test pool is also the eval_set, so the reported test
# metric is not from fully held-out data — confirm this is acceptable.
model.fit(train, eval_set=test)
# eval_metrics() returns one value per iteration. The last value is the
# metric of the final model; averaging over all iterations would mix in the
# early, weak models.
print("train: ", model.eval_metrics(train, ['NDCG:top=20'])['NDCG:top=20;type=Base'][-1])
print("test: ", model.eval_metrics(test, ['NDCG:top=20'])['NDCG:top=20;type=Base'][-1])
# The default importance type for ranking losses needs a dataset, so the
# train pool is passed explicitly.
print(model.get_feature_importance(train))