In [15]:
from tqdm.notebook import tqdm

In [2]:
from __future__ import print_function

import sys

import time

from pyclick.click_models.Evaluation import LogLikelihood, Perplexity
from pyclick.click_models.UBM import UBM
from pyclick.click_models.DBN import DBN
from pyclick.click_models.SDBN import SDBN
from pyclick.click_models.DCM import DCM
from pyclick.click_models.CCM import CCM
from pyclick.click_models.CTR import DCTR, RCTR, GCTR
from pyclick.click_models.CM import CM
from pyclick.click_models.PBM import PBM
from pyclick.utils.Utils import Utils

In [3]:
from pyclick.click_models.task_centric.TaskCentricSearchSession import TaskCentricSearchSession
from pyclick.search_session.SearchResult import SearchResult

In [8]:
import os
import bz2
import numpy as np

In [None]:
def parse_log_line(line):
    """Split one click-log line into query, geo id, shown URLs and clicked URLs.

    The line is read as ``<query>@<geo id><tab><shown urls><tab><clicked urls>
    <tab><click keys>``. The three list fields are comma-separated. The last
    field holds one integer per clicked URL (named click times in the code);
    the clicked URLs are returned sorted by that integer in ascending order.

    :param line: One log line as ``str``.
    :returns: A tuple ``(query, geoId, show_urls, ordered_click_url)``.
        ``ordered_click_url`` is an empty list when the clicked-URL field is
        empty, i.e. the line records no clicks.
    :raises ValueError: If a click key is not an integer.
    :raises IndexError: If the line has too few tab-separated fields, or more
        click keys than clicked URLs.
    """
    def strip_scheme(url):
        # Keep only what follows the last "://". find() > 0 means a URL that
        # starts with "://" is returned unchanged.
        return url.split("://")[-1] if url.find('://') > 0 else url

    # Everything before the first "@" is the query text.
    query = line.split("@")[0]
    # Skip the query and the "@" separator; the remainder is tab-separated.
    data = line[len(query) + 1:].split('\t')
    geoId = data[0]
    show_urls = [strip_scheme(url) for url in data[1].split(',')]

    # A line without clicks has an empty click field; int('') on the matching
    # empty key field would raise ValueError, so report "no clicks" instead.
    if not data[2].strip():
        return query, geoId, show_urls, []

    click_urls = data[2].split(',')
    time_click_urls = [int(value) for value in data[3].split(',')]
    # argsort yields the positions of the clicks ordered by ascending key.
    click_order = np.array(time_click_urls).argsort()
    ordered_click_url = [strip_scheme(click_urls[i]) for i in click_order]
    return query, geoId, show_urls, ordered_click_url

In [6]:
class ClickDataParser:
    """Builds TaskCentricSearchSession objects from a bz2-compressed click log."""

    @staticmethod
    def _strip_scheme(url):
        """Return ``url`` without the text up to and including its last "://"."""
        # find() > 0 means a URL that starts with "://" is returned unchanged.
        return url.split("://")[-1] if url.find('://') > 0 else url

    @staticmethod
    def parse(sessions_filename, log_id_start, sessions_max=None):
        """
        Parses search sessions stored one per line in a bz2-compressed,
        UTF-8 encoded file, formatted according to:
        Query Text @ Query Geo <tab> List of shown urls <tab> List of clicked urls <tab> ...
        Both url lists are comma-separated; fields after the clicked urls are ignored.
        Empty lines are skipped.

        :param sessions_filename: The name of the bz2 file with search sessions.
        :param log_id_start: Offset added to the zero-based index of each parsed
        session to form the task id passed to TaskCentricSearchSession.
        :param sessions_max: The maximum number of search sessions to return.
        If not set (or 0), all search sessions are parsed and returned.
        :returns: A list of parsed search sessions, wrapped into
        TaskCentricSearchSession objects. Each shown url appears once, in
        first-seen order, with click=1 if it is among the clicked urls, else 0.
        :raises IndexError: If a non-empty line has fewer than three
        tab-separated fields after the query.
        """
        strip_scheme = ClickDataParser._strip_scheme
        sessions = []

        # Iterate over the compressed file lazily so that a small sessions_max
        # does not force the whole log to be decompressed into memory.
        with bz2.open(sessions_filename, "rb") as f:
            for raw_line in f:
                if sessions_max and len(sessions) >= sessions_max:
                    break
                line = raw_line.decode('utf-8').rstrip('\n')
                if not line:
                    # A blank line carries no session; indexing its fields
                    # below would raise IndexError.
                    continue

                # Everything before the first "@" is the query text.
                query = line.split("@")[0]
                data = line[len(query) + 1:].split('\t')

                # One session is appended per parsed line, so len(sessions)
                # is the zero-based index of the session being built.
                task = log_id_start + len(sessions)
                # dict.fromkeys de-duplicates shown urls and keeps their order.
                shown = dict.fromkeys(strip_scheme(url) for url in data[1].split(','))
                clicked = {strip_scheme(url) for url in data[2].split(',')}

                session = TaskCentricSearchSession(task, query)
                for url in shown:
                    click = 1 if url in clicked else 0
                    session.web_results.append(SearchResult(url, click))
                sessions.append(session)
        return sessions

In [120]:
def predict_relevance(click_model, query, search_result):
    """Return the value of the model's ``attr`` parameter for one query/result pair.

    :param click_model: Object exposing ``params`` (a mapping) and
        ``param_names.attr`` (the key of the parameter container to use).
    :param query: Query passed to the container's ``get``.
    :param search_result: Search result passed to the container's ``get``.
    :returns: Whatever ``value()`` of the looked-up parameter returns.
    """
    params = click_model.params
    attr_key = click_model.param_names.attr
    attr_param = params[attr_key].get(query, search_result)
    return attr_param.value()

In [7]:
# Relative path to one bz2-compressed click-log file (read by ClickDataParser.parse).
search_sessions_path = os.path.join("2017", 'part-m-00015.bz2')
# NOTE(review): not referenced by any cell visible in this part of the notebook — confirm it is still needed.
search_sessions_num = 5

In [8]:
# parse is a static method, so no parser instance is needed; task ids start at 0.
search_sessions = ClickDataParser.parse(search_sessions_path, log_id_start=0)

In [11]:
from pyclick.click_models.Inference import EMInference, MLEInference

In [16]:
# Build truncated copies of the sessions: same task and query, but only the
# first 10 web results. The original session objects are left untouched.
search_sessions_cut = []
for s in tqdm(search_sessions):  # tqdm only adds a progress bar
    s_cut = TaskCentricSearchSession(s.task, s.query)
    # Slicing creates a new list, so the copy does not share the original list.
    s_cut.web_results = s.web_results[:10]
    search_sessions_cut.append(s_cut)

HBox(children=(FloatProgress(value=0.0, max=60370.0), HTML(value='')))




In [104]:
# CCM click model using EM inference; iter_num=1 presumably limits EM to a
# single iteration — confirm against the pyclick EMInference signature.
click_model = CCM(EMInference(iter_num=1))

In [87]:
# NOTE(review): when cells run top to bottom this rebinding replaces the CCM
# instance created in the previous cell, yet the recorded training output
# below reports CCM — the cells were executed out of order. Keep only the
# model that is actually meant to be trained.
click_model = SDBN()

In [106]:
# Time the training of the current click_model on the first 1000 sessions.
# NOTE(review): this trains on search_sessions, not on the truncated
# search_sessions_cut built above — confirm which one is intended.
start = time.time()
click_model.train(search_sessions[:1000])
end = time.time()
# %i formats the elapsed time as a whole number of seconds.
print("\tTrained %s click model in %i secs:\n" % (click_model.__class__.__name__, end - start))

	Trained CCM click model in 11 secs:



In [86]:
# Spot-check the model's relevance estimate for one query / result-id pair
# (the query is Russian for "19th-century Easter postcards with inscriptions").
click_model.predict_relevance('пасхальные открытки 19 века с надписями', 'images')

0.5

In [204]:
# Re-prints the timing summary from whatever click_model/start/end are currently bound.
# NOTE(review): this repeats the print at the end of the training cell, and the
# recorded output ("UBM") comes from a model no visible cell creates — stale output.
print("\tTrained %s click model in %i secs:\n" % (click_model.__class__.__name__, end - start))

	Trained UBM click model in 0 secs:



### Загрузка сессий

In [208]:
def cut_session_to_10(search_sessions, max_results=10):
    """Return truncated copies of the given sessions.

    Each copy is a new TaskCentricSearchSession with the same ``task`` and
    ``query`` as its source and only the leading ``max_results`` entries of
    ``web_results``. The input sessions are not modified.

    :param search_sessions: Iterable of sessions exposing ``task``, ``query``
        and a sliceable ``web_results``.
    :param max_results: How many leading web results to keep per session
        (10 by default, matching the function name).
    :returns: A list of new sessions, in the same order as the input.
    """
    search_sessions_cut = []
    for session in search_sessions:
        trimmed = TaskCentricSearchSession(session.task, session.query)
        # Slicing yields a new list, so the copy does not share the source list.
        trimmed.web_results = session.web_results[:max_results]
        search_sessions_cut.append(trimmed)
    return search_sessions_cut

In [2]:
import helpers
import importlib
# Re-execute helpers.py so edits made after the first import are picked up
# without restarting the kernel.
importlib.reload(helpers)

<module 'helpers' from 'C:\\Users\\taras\\Documents\\Аспирантура МГУ\\Техносфера mail.ru\\IR\\Итоговый конкурс\\helpers.py'>

In [3]:
# Rebinds search_sessions from the project's helpers module instead of re-parsing the log.
# NOTE(review): presumably load_obj deserializes a stored object by name (possibly
# a pickle) — confirm, and only load files from a trusted source.
search_sessions = helpers.load_obj('search_sessions')

In [None]:
# Truncated copies of the loaded sessions (first 10 web results each).
search_sessions_cut = cut_session_to_10(search_sessions)

In [None]:
def predict_relevance(click_model, query, search_result):
    """Return the value of the model's ``attr`` parameter for one query/result pair.

    NOTE(review): an identical definition appears earlier in this notebook;
    keep only one of them.

    :param click_model: Object exposing ``params`` (a mapping) and
        ``param_names.attr`` (the key of the parameter container to use).
    :param query: Query passed to the container's ``get``.
    :param search_result: Search result passed to the container's ``get``.
    :returns: Whatever ``value()`` of the looked-up parameter returns.
    """
    container = click_model.params[click_model.param_names.attr]
    parameter = container.get(query, search_result)
    return parameter.value()

In [None]:
# No earlier cell imports pandas, so bring it into scope here; without it this
# cell raises NameError for `pd` on a fresh kernel.
import pandas as pd

# Train and test tables, both tab-separated.
train_df = pd.read_csv('train_use_qa_df.csv', sep='\t')
test_df = pd.read_csv('test_use_qa_df.csv', sep='\t')

**CM**

In [None]:
# Train a CM click model on all loaded sessions and record wall-clock
# timestamps around the call; click_model, start and end are read by the next cell.
click_model = CM()
start = time.time()
click_model.train(search_sessions)
end = time.time()

In [None]:
# Report the class name of the model trained in the preceding cell and the
# elapsed time in whole seconds. The model is bound to `click_model`; no
# `CM_click_model` name exists in the notebook.
print("\tTrained %s click model in %i secs" % (click_model.__class__.__name__, end - start))

**CCM**