<a href="https://colab.research.google.com/github/tilacyn/ir-itmo/blob/master/pagerank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# This step is interactive: it asks for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········


In [0]:
def dcg(relevance, k=None):
  """Discounted cumulative gain of a relevance list given in ranked order.

  Each item at 0-based rank ``r`` contributes ``(2**rel - 1) / log2(r + 2)``.
  Only the first ``k`` items are summed; ``k=None`` means all of them.

  Returns the summed gain, or ``1`` when the sum is not positive so the
  result can safely be used as a divisor.
  """
  if k is None:
    k = len(relevance)

  gains = np.power(2, relevance) - 1
  discounts = np.log2(2 + np.arange(len(relevance)))
  total = np.sum((gains / discounts)[:k])

  if total > 0:
    return total
  return 1

In [0]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from scipy.special import expit
from tqdm import tqdm


def preprocess(labels, query_ids):
    """Build the per-query lookup tables used by the LambdaMART training loop.

    Args:
        labels: array of relevance labels, indexable with a list of positions.
        query_ids: iterable with one query id per sample, aligned with ``labels``.

    Returns:
        A tuple ``(query_groups, query_idcg, query_permutations)`` of dicts
        keyed by query id:
          * ``query_groups``: list of sample positions belonging to the query;
          * ``query_idcg``: ``dcg`` of the query's labels sorted descending;
          * ``query_permutations``: an ``(n, n)`` integer matrix whose every
            row is ``0..n-1``, where ``n`` is the number of query documents.
    """
    groups = {}
    for position, qid in enumerate(query_ids):
        groups.setdefault(qid, []).append(position)

    idcg_by_query = {}
    permutations_by_query = {}
    for qid, members in groups.items():
        # Ideal ordering: most relevant documents first.
        ideal_order = sorted(labels[members], reverse=True)
        idcg_by_query[qid] = dcg(ideal_order)

        n_docs = len(members)
        permutations_by_query[qid] = np.tile(np.arange(n_docs), (n_docs, 1))

    return groups, idcg_by_query, permutations_by_query


class LambdaMartModel:
    """Gradient-boosted regression trees trained with LambdaRank gradients.

    Each boosting round fits a ``DecisionTreeRegressor`` to the negative
    lambdas, then overwrites the leaf values with a Newton step computed
    from the lambdas and their second-order weights.

    Attributes:
        lr: shrinkage applied to every tree's output.
        n_trees: number of boosting rounds performed by ``train``.
        models: fitted trees, in boosting order.
        train_loss: one value per boosting round, the ratio of the summed
            DCG of the current ranking to the summed ideal DCG (higher is
            better, despite the name).
    """

    def __init__(self, lr, n_trees):
        self.lr = lr
        self.n_trees = n_trees
        self.models = []
        self.train_loss = []


    def newton(self, objects, tree, lambdas, hess):
        """Replace the leaf values of ``tree`` with per-leaf Newton steps.

        Args:
            objects: feature matrix the tree was fitted on.
            tree: fitted tree exposing ``tree_.apply`` and ``tree_.value``.
            lambdas: first-order terms, one per sample.
            hess: second-order terms, one per sample.

        Returns:
            The same ``tree`` object, modified in place.
        """
        leaf_index_dct = {}
        # The low-level tree_.apply is fed float32 input.
        objects = objects.astype(np.float32)
        for sample_index, leaf_index in enumerate(tree.tree_.apply(objects)):
            leaf_index_dct.setdefault(leaf_index, []).append(sample_index)

        for leaf_index, sample_indexes in leaf_index_dct.items():
            nom = -lambdas[sample_indexes].sum()
            denom = hess[sample_indexes].sum()
            if nom == 0 or denom == 0:
                # No gradient signal (or no curvature): leave scores untouched.
                tree.tree_.value[leaf_index] = 0.
            else:
                tree.tree_.value[leaf_index] = nom / denom

        return tree


    def train(self, objects, labels, predictions):
        """Fit ``n_trees`` boosting rounds.

        Args:
            objects: feature matrix, one row per document.
            labels: relevance labels aligned with ``objects``.
            predictions: despite its name, this is the array of query ids
                (one per document); it is only used to group documents by
                query. The array is not modified.
        """
        labels = np.asarray(labels)
        query_ids = np.asarray(predictions)
        self.query_groups, self.query_idcg, self.query_permutations = preprocess(labels, query_ids)

        # Running ensemble scores. They live in a separate float array so
        # that the shrunken tree outputs are accumulated without integer
        # truncation and the caller's query-id array is left untouched.
        # Starting from zero matches ``predict``, which sums tree outputs only.
        scores = np.zeros(len(labels), dtype=np.float64)

        for _ in tqdm(range(self.n_trees)):
            model = DecisionTreeRegressor(max_depth=10, max_features='sqrt')
            lambdas, hess = self.calc_lambdas(labels, scores)
            # Trees regress on the negative gradient.
            model.fit(objects, -lambdas)
            model = self.newton(objects, model, lambdas, hess)
            self.models.append(model)
            scores += self.lr * model.predict(objects)

            # Prints the mean of the metric over all rounds so far.
            print('train loss: {}'.format(np.mean(self.train_loss)))


    def calc_lambdas(self, y_true, y_pred):
        """Compute LambdaRank first- and second-order terms for every sample.

        Requires ``self.query_groups``, ``self.query_idcg`` and
        ``self.query_permutations`` (set by ``train``). Appends the current
        DCG / ideal-DCG ratio to ``self.train_loss``.

        Args:
            y_true: relevance labels, one per sample.
            y_pred: current model scores, one per sample.

        Returns:
            ``(lambdas, hess)``: two float64 arrays of length ``len(y_true)``.
        """
        y_true = np.asarray(y_true, dtype=np.float64)
        y_pred = np.asarray(y_pred, dtype=np.float64)
        n_samples = len(y_true)
        # Explicit float64 so gradients are never truncated to the label dtype.
        lambdas = np.zeros(n_samples, dtype=np.float64)
        hess = np.zeros(n_samples, dtype=np.float64)

        loss = 0
        idcg = 0
        for query_id, query_indexes in self.query_groups.items():
            n_docs = len(query_indexes)
            query_y_true = y_true[query_indexes]
            query_y_pred = y_pred[query_indexes]
            i_j = self.query_permutations[query_id]
            i_j_preds = query_y_pred[i_j]
            i_j_true = query_y_true[i_j]

            # Stable sort so tied scores keep their original order; the same
            # ranking is used for the positions and for the logged DCG.
            ranking = np.argsort(-query_y_pred, kind='mergesort')
            document_positions = np.empty(n_docs, dtype=np.int64)
            document_positions[ranking] = np.arange(1, n_docs + 1)
            doc_pos_matrix = np.tile(document_positions, (n_docs, 1))

            # |delta NDCG| obtained by swapping documents i and j.
            delta_ndcg = (((np.power(2, query_y_true.reshape(-1, 1)) - np.power(2, i_j_true))
                           * (1 / np.log2(1 + document_positions.reshape(-1, 1)) - 1 / np.log2(1 + doc_pos_matrix)))
                          / self.query_idcg[query_id])
            abs_delta = np.abs(delta_ndcg)

            delta_preds = query_y_pred.reshape(-1, 1) - i_j_preds
            perm_mask = query_y_true.reshape(-1, 1) - i_j_true
            more_relevant = perm_mask > 0
            less_relevant = perm_mask < 0

            # Pairwise sigmoid term; zero for pairs with equal labels.
            p_ij = np.zeros_like(delta_preds, dtype=np.float64)
            p_ij += expit(-delta_preds) * more_relevant
            p_ij += expit(delta_preds) * less_relevant

            lambda_ij = -abs_delta * p_ij
            lambdas[query_indexes] = np.sum(
                lambda_ij * more_relevant - lambda_ij * less_relevant, axis=1)

            hess[query_indexes] = np.sum(
                abs_delta * p_ij * (1 - p_ij) * (perm_mask != 0), axis=1)

            loss += dcg(query_y_true[ranking], None)
            idcg += dcg(sorted(query_y_true, reverse=True), None)

        # Guard against an empty set of query groups.
        self.train_loss.append(loss / idcg if idcg else 0.0)

        return lambdas, hess


    def predict(self, objects):
        """Score documents as the shrunken sum of all fitted trees.

        Args:
            objects: feature matrix, one row per document.

        Returns:
            A 1-D array with one score per row; all zeros when no tree has
            been fitted yet.
        """
        if not self.models:
            return np.zeros(len(objects), dtype=np.float64)
        preds = np.sum([self.lr * tree.predict(objects) for tree in self.models], axis=0)
        return preds


In [0]:
import os
from os.path import join as opjoin

# Project directory on the mounted Google Drive; later cells chdir here and
# use paths relative to it.
base_path = '/content/drive/My Drive/ir-itmo/pagerank-ir-itmo'

In [0]:
os.chdir(base_path)

from sklearn.datasets import load_svmlight_file

# Load the training set together with the per-row query ids.
objects, labels, query_ids = load_svmlight_file('l2r/train.txt', query_id=True)

# Densify the sparse feature matrix into a plain ndarray.
objects = objects.toarray()

# Keep only the feature columns whose sum over the training rows is non-zero;
# the same mask is reused for the test set.
non_zero_columns_mask = objects.sum(0) != 0
objects = objects[:, non_zero_columns_mask].astype(np.float32)

In [0]:
# Build the ranker: 100 boosting rounds with shrinkage 0.05.
lambda_mart_model = LambdaMartModel(n_trees=100, lr=0.05)

# The third argument of train() is named `predictions` but receives the query
# ids: train() hands it to preprocess() to group documents by query.
lambda_mart_model.train(objects, labels, query_ids)

In [0]:
# Load the test set with the same feature width as the training matrix.
# Without n_features the width is inferred from the test file alone, and the
# boolean column mask built on the training data may then not line up.
test_set, test_labels, query_ids_test = load_svmlight_file(
    'l2r/test.txt', query_id=True, n_features=non_zero_columns_mask.shape[0])
test_set = test_set.toarray()

# Apply the training-time column selection and dtype.
test_set = test_set[:, non_zero_columns_mask].astype(np.float32)

# One score per test document; higher means ranked earlier within its query.
test_output = lambda_mart_model.predict(test_set)

In [0]:
import pandas as pd

# Use the sample submission as a template frame.
# NOTE(review): the assignments below assume it has exactly one row per test
# document — confirm against the file.
submission = pd.read_csv('l2r/sample.made.fall.2019')

submission['QueryId'] = query_ids_test
submission['pred'] = test_output
# Document ids are 1-based positions in the test file.
submission['DocumentId'] = np.arange(1, submission.shape[0] + 1)

# Sort documents by (QueryId, pred) descending, then merge onto the list of
# distinct query ids in their original order.
# NOTE(review): this presumably relies on pandas' default inner merge keeping
# the left key order and the right row order within each key, so queries stay
# in file order while documents go from highest to lowest score — verify.
submission = submission[['QueryId']].drop_duplicates().merge(submission.sort_values(by=['QueryId', 'pred'], ascending=False))
submission = submission[['QueryId', 'DocumentId']]

submission.to_csv('submission.csv', index=False)