In [2]:
import numpy as np
import numpy as np
from sklearn.datasets import load_svmlight_file
from datetime import datetime
from xgboost import XGBRegressor

## LambdaMART and utilities

In [3]:
# Scale factor applied to pairwise score differences inside the sigmoid computed by
# Query.update_scores. Original author's note: taken from an article; any sigma would work.
SIGMA = 1.0
# Upper bound on the magnitude of the value handed to np.exp in Query.update_scores,
# so the exponential stays finite for very large score gaps.
LIMIT_DELTA = 50

class Query:
    """State of a single query: document ranks, |delta NDCG| and LambdaRank gradients.

    Parameters
    ----------
    asessor_scores : array-like
        Relevance labels of the query's documents. Only its length is used when
        ``test_mode`` is true.
    test_mode : bool
        When true, no labels are stored and only ``positions`` / ``scores`` are
        maintained (``get_ndcg`` and the gradients are unavailable).

    Attributes set in training mode
    -------------------------------
    dcg_norm      : DCG of the ideal ordering (replaced by 1.0 when it is 0).
    delta_ndcg    : (n, n) matrix, |change in NDCG| if documents i and j swapped ranks.
    ro_ij         : (n, n) matrix of pairwise sigmoid weights.
    numerators    : per-document first-order gradient.
    denominators  : per-document second-order gradient (zeros replaced by 1).
    """

    def __init__(self, asessor_scores, test_mode=False):
        self.test_mode = test_mode
        self.count_docs = len(asessor_scores)
        if self.test_mode:
            self.update_scores(np.ones((self.count_docs, )))
            return

        self.asessor_scores = np.copy(asessor_scores)
        ideal_order = np.sort(self.asessor_scores)[::-1]
        self.dcg_norm = np.sum((2.0 ** ideal_order - 1) / np.log(np.arange(2, self.count_docs + 2)))
        if self.dcg_norm == 0:
            # No relevant document at all: avoid dividing by zero later on.
            self.dcg_norm = 1.0

        # Row i is [0, 1, ..., n-1]. The computations below use broadcasting instead;
        # the attribute is kept so code that reads it keeps working.
        self.permutations = np.tile(np.arange(0, self.count_docs), (self.count_docs, 1))
        self.update_scores(np.zeros((self.count_docs, )))

    def update_delta_ndcg(self):
        """Recompute ``delta_ndcg`` from the current ``positions`` and the labels."""
        discount = 1.0 / np.log(self.positions + 1)
        gain = 2.0 ** self.asessor_scores - 1
        swap_effect = ((discount.reshape(1, -1) - discount.reshape(-1, 1))
                       * (gain.reshape(-1, 1) - gain.reshape(1, -1)))
        self.delta_ndcg = np.abs(swap_effect) / self.dcg_norm

    def get_ndcg(self):
        """Return NDCG of the current ranking (training mode only; needs the labels)."""
        return np.sum((2.0 ** self.asessor_scores - 1) / np.log(self.positions + 1)) / self.dcg_norm

    def update_scores(self, new_scores):
        """Store new model scores, re-rank the documents and refresh the gradients.

        ``positions[i]`` becomes the 1-based rank of document i when sorting by
        descending score. In test mode the method stops after the ranking.
        """
        self.positions = np.zeros((self.count_docs, ), dtype=np.int32)
        self.positions[np.argsort(new_scores)[::-1].astype(np.int32)] = np.arange(1, self.count_docs + 1)
        self.scores = np.copy(new_scores)
        if self.test_mode:
            return

        self.update_delta_ndcg()

        labels_col = self.asessor_scores.reshape((-1, 1))
        labels_row = self.asessor_scores.reshape((1, -1))
        # correct[i, j] == 1: document i is more relevant than j; incorrect: the opposite.
        correct_permutations = (labels_col > labels_row).astype(np.int8)
        incorrect_permutations = (labels_col < labels_row).astype(np.int8)
        valid_permutations = correct_permutations + incorrect_permutations
        direction = correct_permutations - incorrect_permutations  # +1 / 0 / -1

        # rho = 1 / (1 + exp(sigma * (s_better - s_worse))). The sign of the score gap
        # must be kept: a pair the model orders wrongly has a negative gap and therefore
        # rho close to 1 (strong push), a well separated pair has rho close to 0.
        score_gap = self.scores.reshape((-1, 1)) - self.scores.reshape((1, -1))
        exponent = np.clip(SIGMA * direction * score_gap, -LIMIT_DELTA, LIMIT_DELTA)
        self.ro_ij = 1.0 / (1.0 + np.exp(exponent))

        # update gradient: d/ds_i of sum over pairs of |dNDCG| * log(1 + exp(-sigma * gap))
        self.numerators = -SIGMA * np.sum(self.delta_ndcg * self.ro_ij * direction, axis=1)

        # update hessian
        self.denominators = (SIGMA ** 2) * np.sum(
            self.delta_ndcg * self.ro_ij * (1.0 - self.ro_ij) * valid_permutations, axis=1)
        # Documents without any valid pair would otherwise yield a zero hessian.
        self.denominators[self.denominators == 0] = 1

        
class Data:
    """A ranking dataset in svmlight format, split into per-query ``Query`` objects.

    After ``load_data`` the instance exposes:
      X, y                    - feature matrix and labels returned by load_svmlight_file
      unique_query_indices    - sorted unique query ids
      query_document_indices  - list of row-index arrays, one per unique query id
      queries                 - list of Query objects aligned with the two lists above
    """

    def __init__(self, test_mode=False):
        self.test_mode = test_mode

    def load_data(self, filename):
        """Read ``filename`` and build one Query per query id. Returns ``self``."""
        print("Loading data "+ datetime.now().isoformat())
        self.X, self.y, documents_query = load_svmlight_file(filename, query_id=True)
        print("Data has been loaded! "+ datetime.now().isoformat())

        print("Creating queries... "+ datetime.now().isoformat())
        # Group rows by query id with a single stable sort instead of scanning the
        # whole id array once per query. Stability keeps each group's row indices in
        # ascending order, i.e. exactly what np.where(documents_query == qid) yields.
        self.unique_query_indices, group_sizes = np.unique(documents_query, return_counts=True)
        rows_by_query = np.argsort(documents_query, kind="stable")
        group_ends = np.cumsum(group_sizes)
        group_starts = group_ends - group_sizes

        self.queries = []
        self.query_document_indices = []
        for query_id, start, end in zip(self.unique_query_indices, group_starts, group_ends):
            doc_indices = rows_by_query[start:end]
            self.query_document_indices.append(doc_indices)
            self.queries.append(Query(self.y[doc_indices], self.test_mode))
            if query_id % 1000 == 0:
                # Coarse progress indicator.
                print(query_id)
        print("Queries have been created "+ datetime.now().isoformat())
        return self

# Number of times the objective has been evaluated (one call per boosting round).
EPOCH = 0
def ObjectiveFunction(data):
    """Build a custom objective ``f(y_true, y_pred) -> (grad, hess)`` bound to ``data``.

    ``data`` must expose ``queries`` and ``query_document_indices`` as aligned
    sequences: ``query_document_indices[k]`` holds the row indices (into ``y_pred``)
    of the documents belonging to ``queries[k]``.
    """
    def _objective_function(y_true, y_pred):
        global EPOCH
        print("Epoch = " + str(EPOCH) + "; " + datetime.now().isoformat())
        EPOCH += 1

        grad = np.zeros(len(y_pred), dtype=np.float64)
        hess = np.ones(len(y_pred), dtype=np.float64)
        # Use the dataset captured by the closure (not a notebook global) and scatter
        # each query's result back through its row indices, so grad/hess line up with
        # y_pred whatever order the documents are stored in.
        for query, indices in zip(data.queries, data.query_document_indices):
            query.update_scores(y_pred[indices])
            grad[indices] = query.numerators
            hess[indices] = query.denominators
        return grad, hess
    return _objective_function


class LambdaMART:
    """Gradient-boosted ranker: an XGBRegressor driven by a per-query custom objective.

    Parameters
    ----------
    train_data : object passed to ObjectiveFunction; the objective stays bound to it,
        so ``fit`` is expected to be called with the same dataset.
    **kwargs : forwarded to XGBRegressor. ``n_jobs`` defaults to 4 but may be overridden.
    """

    def __init__(self, train_data, **kwargs):
        objective_func = ObjectiveFunction(train_data)
        # Treat n_jobs as a default rather than a fixed keyword, so callers may pass
        # their own value without triggering a duplicate-keyword TypeError.
        regressor_params = dict(kwargs)
        regressor_params.setdefault("n_jobs", 4)
        self.xgb_classifier = XGBRegressor(objective=objective_func, **regressor_params)

    def fit(self, train_data):
        """Fit the underlying regressor on ``train_data.X`` / ``train_data.y``; returns self."""
        self.xgb_classifier.fit(train_data.X, train_data.y)
        return self

    def predict(self, test_data):
        """Score ``test_data.X`` and push the scores into every query of ``test_data``.

        Each query's ``update_scores`` is called with its own slice of the
        predictions. The raw prediction array is also returned.
        """
        results = self.xgb_classifier.predict(test_data.X)
        for query_id, indices in enumerate(test_data.query_document_indices):
            test_data.queries[query_id].update_scores(results[indices])
        return results

def create_submission(filename, test_data):
    """Write the ranking stored in ``test_data`` to ``filename`` as CSV.

    The file starts with the header ``QueryId,DocumentId``. For every query, in the
    order of ``test_data.unique_query_indices``, one line per document follows, best
    rank first. Document ids are 1-based and run continuously across queries: the
    first query owns ids 1..n1, the second n1+1..n1+n2, and so on.
    """
    with open(filename, 'w') as out:
        out.write("QueryId,DocumentId\n")
        ids_before = 0  # number of documents belonging to the queries already written
        for slot, query_label in enumerate(test_data.unique_query_indices):
            ranks = test_data.queries[slot].positions
            n_docs = len(ranks)
            # Invert the rank array: the cell for rank r receives the id of the
            # document holding that rank.
            ids_by_rank = np.full((n_docs, ), ids_before)
            ids_by_rank[ranks - 1] += np.arange(1, n_docs + 1)
            out.writelines(str(query_label) + "," + str(doc_id) + "\n"
                           for doc_id in ids_by_rank)
            ids_before += n_docs

## Run ranking

In [None]:
# Tag of this run; it is echoed and reused as the suffix of the submission file name.
filename_base = "sm_xgb_0.2-10-8-hist.txt"
print(filename_base)

# Load the labelled set and train the ranker on it.
# NOTE(review): absolute, machine-specific paths; adjust before re-running elsewhere.
train_data = Data()
train_data.load_data("/home/emil/Rank/train.txt")
params = dict(
    max_depth=8,
    silent=1,
    learning_rate=0.2,
    tree_method='hist',
    n_estimators=3,
)
model = LambdaMART(train_data, **params)
model.fit(train_data)

# Score the unlabelled set (test_mode skips label-dependent state) and export the ranking.
test_data = Data(test_mode=True)
test_data.load_data("/home/emil/Rank/test.txt")
model.predict(test_data)
create_submission("submission_" + filename_base, test_data)

sm_xgb_0.2-10-8-hist.txt
Loading data 2019-02-10T22:07:32.057997
Data has been loaded! 2019-02-10T22:09:31.776998
Creating queries... 2019-02-10T22:09:31.777150
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
Queries have been created 2019-02-10T22:09:47.640583
[22:10:21] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
Epoch = 0; 2019-02-10T22:10:22.075735
Epoch = 1; 2019-02-10T22:10:40.481070
Epoch = 2; 2019-02-10T22:10:48.446199
Loading data 2019-02-10T22:10:57.001198
Data has been loaded! 2019-02-10T22:11:49.603243
Creating queries... 2019-02-10T22:11:49.603449
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
Queries have been created 2019-02-10T22:11:51.556833
