Implement a learning-to-rank method with the following minimum requirements: 
        - Consider document-query matching in minimum 3 fields (title, content and anchors) 
          and at least two different retrieval models (e.g., BM25 and LM). That is, 6 document-query features minimum.

Perform baseline (BM25) retrieval on a separate anchor text index. (THIS IS DONE IN 1_baseline.ipynb)
        - The anchor text index (called clueweb12b_anchors) can be accessed the same way as the regular document index. 
        - Note that the anchor text index covers the entire ClueWeb collection, not just the Category B subset. 
          I.e., you need to ignore documents that are not present in the regular index.

Test your model using 5-fold cross-validation on the given training data (queries and relevance judgments, i.e., data/queries.txt and data/qrels.csv).

--------------------------------------------------------------------------------------------------------------------------------
Load queries and qrels:

In [19]:
import urllib
import requests
import json


# Base URL of the remote search service that all retrieval requests go to.
API = "http://gustav1.ux.uis.no:5002"

# Inputs: the query list and the relevance judgments.
QUERY_FILE, QRELS_FILE = "data/queries.txt", "data/qrels.csv"

# Outputs: the extracted feature vectors and the final ranking.
FEATURES_FILE, OUTPUT_FILE = "data/features.txt", "data/ltr.txt"

def load_queries(query_file):
    """Load the queries from a text file.

    Each non-empty line is expected to hold a query ID followed by a single
    space and the query text.

    :param query_file: path of the query file
    :return: dict mapping query ID (str) to query text (str)
    """
    queries = {}
    # Read from the path that was passed in, not from the module-level constant.
    with open(query_file, "r") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)
            qid, query = line.split(" ", 1)
            queries[qid] = query
    return queries
        
def load_qrels(qrels_file):
    """Load the ground-truth relevance judgments from a CSV file.

    Lines are expected to have the form ``qid,docid,rel``; any line starting
    with ``QueryId`` is treated as the header and skipped.

    :param qrels_file: path of the qrels file
    :return: nested dict ``gtruth[qid][docid] = rel`` with ``rel`` as int
    """
    gtruth = {}
    # Read from the path that was passed in, not from the module-level constant.
    with open(qrels_file, "r") as qr:
        for line in qr:
            line = line.strip()
            if not line or line.startswith("QueryId"):
                continue  # skip blank lines and the header
            qid, did, rel = line.split(",")
            gtruth.setdefault(qid, {})[did] = int(rel)
    return gtruth

# Load the inputs once at module level; the later cells read these two dicts.
# queries: qid -> query text; qrels: qid -> {docid -> relevance}
queries = load_queries(QUERY_FILE)
qrels = load_qrels(QRELS_FILE)


--------------------------------------------------------------------------------------------------------------------------------
API:

In [20]:
def search(indexname, query, field, size=10):
    """Run a query against an index through the remote API.

    :param indexname: name of the index to search
    :param query: query string, sent as the ``q`` URL parameter
    :param field: default field to match against, sent as ``df``
    :param size: number of hits to request
    :return: the JSON reply decoded into Python objects
    """
    endpoint = "/".join((API, indexname, "_search"))
    query_string = urllib.parse.urlencode({"q": query, "df": field, "size": size})
    reply = requests.get(endpoint + "?" + query_string)
    return json.loads(reply.text)

def termvectors(indexname, docid, term_statistics="true"):
    """Fetch the term vector of one document through the remote API.

    :param indexname: name of the index holding the document
    :param docid: ID of the document
    :param term_statistics: value of the ``term_statistics`` URL parameter
        (the string "true" or "false")
    :return: the JSON reply decoded into Python objects
    """
    endpoint = "/".join((API, indexname, docid, "_termvectors"))
    query_string = urllib.parse.urlencode({"term_statistics": term_statistics})
    reply = requests.get(endpoint + "?" + query_string)
    return json.loads(reply.text)

# Smoke test of the API helper: request one document's term vector and print
# the top-level items of the decoded reply.
print(termvectors("clueweb12b", "clueweb12-0000tw-07-01629").items())

dict_items([('_id', 'clueweb12-0000tw-07-01629'), ('_index', 'clueweb12b'), ('_type', 'doc'), ('_version', 0), ('found', False), ('took', 0)])


--------------------------------------------------------------------------------------------------------------------------------
Collection LM:

In [21]:
class CollectionLM(object):
    """Collection (background) language model for a fixed set of query terms.

    On construction, P(t|C_field) is computed for every field listed in
    ``FIELD_MOD`` and every query term; ``prob`` then serves those values
    from memory.
    """

    def __init__(self, qterms):
        """
        :param qterms: iterable of query terms to compute probabilities for
        """
        self._probs = {}
        # computing P(t|C_i) for each field and for each query term
        for field, _model in FIELD_MOD.values():
            if field in self._probs:
                # Each field is listed once per retrieval model; the values are
                # the same, so avoid repeating the remote requests.
                continue
            field_probs = {}
            for t in qterms:
                if t not in field_probs:  # repeated query terms need one lookup only
                    field_probs[t] = self.__get_prob(field, t)
            self._probs[field] = field_probs

    def __get_prob(self, field, term):
        """Return ttf(term) / sum_ttf for ``field``, or 0 if it cannot be determined."""
        # Search for the term itself to find a document that contains it
        # (the original passed an undefined local name `query` here).
        hits = search("clueweb12b", term, field, size=1).get("hits", {}).get("hits", [])
        if not hits:
            return 0  # this only happens if none of the documents contain that term

        # ask for global term statistics when requesting the term vector of
        # that doc (`term_statistics="true"` is the default of termvectors())
        tv = termvectors("clueweb12b", hits[0]["_id"])
        # NOTE(review): assumes the reply is keyed by field, optionally wrapped
        # in a "term_vectors" object as in the Elasticsearch term vectors API
        # -- confirm against the actual service reply.
        field_tv = tv.get("term_vectors", tv).get(field, {})
        # total term count in the collection (in that field)
        ttf = field_tv.get("terms", {}).get(term, {}).get("ttf", 0)
        sum_ttf = field_tv.get("field_statistics", {}).get("sum_ttf", 0)
        return ttf / sum_ttf if sum_ttf > 0 else 0

    def prob(self, field, term):
        """Return the stored P(term|C_field); 0 for an unknown field or term."""
        return self._probs.get(field, {}).get(term, 0)

In [22]:
def lm(clm, qterms, doc_id, field, smoothing=None):
    """Score a document field against a query with a Jelinek-Mercer smoothed LM.

    Computes ``sum_t log((1 - lambda) * tf(t, d_field) / |d_field|
    + lambda * P(t|C_field))`` over the query terms.

    :param clm: collection language model exposing ``prob(field, term)``
    :param qterms: list of query terms
    :param doc_id: ID of the document to score
    :param field: document field to score
    :param smoothing: Jelinek-Mercer lambda (weight of the collection model);
        when None, the module-level ``LAMBDA`` is used if it exists, else 0.1
    :return: log P(q|d) for the given field
    """
    import math  # local import: the notebook's import cell does not import math

    if smoothing is None:
        # LAMBDA is not defined in any visible cell, so fall back to a default
        # instead of failing with a NameError.
        smoothing = globals().get("LAMBDA", 0.1)

    # Getting term frequency statistics for the given document from the API.
    # Global term statistics are not needed here (term_statistics="false").
    tv = termvectors("clueweb12b", doc_id, term_statistics="false")
    # A document field might be empty, in which case it is missing from the reply.
    # NOTE(review): assumes the reply is keyed by field, optionally wrapped in
    # a "term_vectors" object as in the Elasticsearch term vectors API -- confirm.
    field_terms = tv.get("term_vectors", tv).get(field, {}).get("terms", {})
    # Field length = sum of the term frequencies; it does not depend on the
    # query term, so compute it once.
    doc_len = sum(stats["term_freq"] for stats in field_terms.values())

    score = 0  # log P(q|d)
    for t in qterms:
        # The term vector has no entry for terms the document does not
        # contain; those get probability mass from the collection model only.
        tf = field_terms.get(t, {}).get("term_freq", 0)
        p_doc = tf / doc_len if doc_len > 0 else 0
        # This function scores a single field, so no per-field mixture weight
        # is applied (the original referenced an undefined FIELD_WEIGHTS[i]).
        p_t = (1 - smoothing) * p_doc + smoothing * clm.prob(field, t)
        if p_t > 0:
            score += math.log(p_t)
        # else: the term has zero probability in both the document and the
        # collection model; log(0) is undefined, so the term is skipped.

    return score

--------------------------------------------------------------------------------------------------------------------------------
Collecting feature values in the features dict. It has the structure features[qid][docid][fid] = value, where fid is a feature ID

In [23]:
# Feature ID -> [document field, retrieval model]. IDs start at 1 and cover
# every field/model combination, with the model varying fastest.
FIELD_MOD = dict(enumerate(
    [
        [field_name, model_name]
        for field_name in ("title", "content", "anchors")
        for model_name in ("BM25", "LM")
    ],
    start=1,
))

# Per-field weights (they sum to 1).
FIELDS_WEIGHTS = dict(title=0.2, content=0.6, anchors=0.2)

# Filled by the feature extraction loop: features[qid][docid][fid] = value
features = dict()


# Fill features[qid][docid][fid] for every (field, retrieval model) feature.
# NOTE(review): the task text says anchor text lives in a separate index
# (clueweb12b_anchors); this loop queries "clueweb12b" for every field, as the
# original did -- confirm which index/field the "anchors" features should use.
for fid, (field, model) in FIELD_MOD.items():
    print("Computing values for feature '%s & %s'" % (field, model))

    for qid, query in queries.items():
        doc_features = features.setdefault(qid, {})

        if model == "BM25":
            # The score returned by the search API is used directly as the
            # feature value.
            res = search("clueweb12b", query, field, size=100)
            for hit in res.get("hits", {}).get("hits", []):
                doc_features.setdefault(hit["_id"], {})[fid] = hit["_score"]
        else:  # LM
            # size=1000 because of later re-ranking
            res = search("clueweb12b", query, field, size=1000)
            # Simple tokenizer: lowercase and split on whitespace (split()
            # without an argument drops empty tokens from repeated spaces).
            qterms = query.lower().split()
            # get collection LM
            clm = CollectionLM(qterms)
            for hit in res.get("hits", {}).get("hits", []):
                docid = hit["_id"]
                # Store the LM score as the feature value (the original
                # computed it into a throw-away dict and indexed an undefined
                # name `r`).
                doc_features.setdefault(docid, {})[fid] = lm(clm, qterms, docid, field)

Computing values for feature 'title & BM25'
Computing values for feature 'title & LM'


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Looking up relevance labels and writing training data to file:

In [None]:
# Write one line per query-document pair: "<label> <qid> <docid> <fid>:<value> ..."
with open(FEATURES_FILE, "w") as fout:
    # Always emit the features in ascending feature-ID order: the reader
    # treats them positionally, and dict insertion order differs per document.
    feature_ids = sorted(FIELD_MOD)

    for qid in queries:
        judged = qrels.get(qid, {})
        # .get(): a query may have no retrieved documents at all
        for docid, ft in features.get(qid, {}).items():
            # Note that docid will not have a feature value for feature ID i
            # if it was not retrieved for that feature.
            # Here, we use -1 as the value for "missing" features
            # (setdefault also records the -1 in `features`, as before).
            values = [ft.setdefault(fid, -1) for fid in feature_ids]

            # relevance label is determined based on the ground truth (qrels)
            # file; a document judged with relevance 0 is NOT relevant.
            label = 1 if judged.get(docid, 0) > 0 else 0

            feat_str = ['{}:{}'.format(fid, v) for fid, v in zip(feature_ids, values)]
            fout.write(" ".join([str(label), qid, docid] + feat_str) + "\n")

--------------------------------------------------------------------------------------------------------------------------------

In [None]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

A class for a pointwise learning-to-rank model:

In [None]:
class PointWiseLTRModel(object):
    """Pointwise learning-to-rank model wrapping a scikit-learn style regressor."""

    def __init__(self, regressor):
        """
        :param regressor: an object with ``fit(X, y)`` and ``predict(X)``
            methods, e.g. an instance of a scikit-learn regressor
        """
        self.regressor = regressor
        # Set by _train(); initialised here so that rank() fails its assertion
        # (instead of raising AttributeError) when the model is untrained.
        self.model = None

    def _train(self, X, y):
        """
        Trains an LTR model.
        :param X: features of training instances
        :param y: relevance assessments of training instances
        :return:
        """
        assert self.regressor is not None
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """
        Predicts relevance labels and ranks documents for a given query.
        :param ft: a list of feature vectors, one per query-doc pair
        :param doc_ids: a list of document ids, aligned with ``ft``
        :return: list of (doc_id, predicted score) tuples, highest score first
        """
        assert self.model is not None
        if len(doc_ids) == 0:
            return []  # nothing to rank; avoid calling predict() on empty input
        rel_labels = self.model.predict(ft)
        # argsort is ascending; reverse it to get the best document first
        sort_indices = np.argsort(rel_labels)[::-1]
        return [(doc_ids[i], rel_labels[i]) for i in sort_indices]

Read training data from file:

In [None]:
def read_data_from_file(path):
    """
    Reads training instances from a feature file whose lines have the form
    "<label> <qid> <docid> <fid>:<value> ...".

    :param path: path of file
    :return: X list of feature vectors (numpy arrays), y list of int labels,
        qids list of query IDs, doc_ids list of document IDs; all four lists
        are aligned by instance
    """
    X, y, qids, doc_ids = [], [], [], []
    with open(path, "r") as f:
        for line in f:
            items = line.split()
            if not items:
                continue  # skip blank lines
            # Order the values by feature ID so that every column of X holds
            # the same feature, whatever order the file lists them in.
            pairs = [token.split(":", 1) for token in items[3:]]
            pairs.sort(key=lambda pair: int(pair[0]))
            X.append(np.array([float(value) for _fid, value in pairs]))
            y.append(int(items[0]))
            qids.append(items[1])
            doc_ids.append(items[2])

    return X, y, qids, doc_ids

Loading training data:

In [None]:
X, y, qids, doc_ids = read_data_from_file(path=FEATURES_FILE)
# Sort the distinct query IDs: the iteration order of a set of strings can
# differ between interpreter runs, which would make the assignment of queries
# to folds non-reproducible.
qids_unique = sorted(set(qids))

print("#queries: ", len(qids_unique))
print("#query-doc pairs: ", len(y))

Applying 5-fold cross-validation:

In [None]:
FOLDS = 5

# Index the instances of every query once, so that collecting the test
# instances of a query does not require rescanning all of X.
instances_by_qid = {}
for idx, instance_qid in enumerate(qids):
    instances_by_qid.setdefault(instance_qid, []).append(idx)

# `with` guarantees the output file is closed even if a fold fails.
with open(OUTPUT_FILE, "w") as fout:
    # write header
    fout.write("QueryId,DocumentId\n")

    for f in range(FOLDS):
        print("Fold #{}".format(f + 1))

        # Query i is a test query in fold f when i % FOLDS == f; every other
        # query is used for training.
        test_qids = [qid for i, qid in enumerate(qids_unique) if i % FOLDS == f]
        held_out = set(test_qids)  # set for O(1) membership tests

        # training feature values and target labels
        train_X, train_y = [], []
        for row, label, instance_qid in zip(X, y, qids):
            if instance_qid not in held_out:
                train_X.append(row)
                train_y.append(label)

        if not test_qids or not train_X:
            # Happens when there are fewer queries than folds, or no data.
            print("\tSkipping fold: no test queries or no training data")
            continue

        # Create and train LTR model
        print("\tTraining model ...")
        clf = RandomForestRegressor(max_depth=3, random_state=0)
        ltr = PointWiseLTRModel(clf)
        ltr._train(train_X, train_y)

        # Apply LTR model on the remaining fold (test queries)
        print("\tApplying model ...")

        for qid in test_qids:
            print("\t\tRanking docs for queryID {}".format(qid))
            # Collect the features and docids for that (test) query `qid`
            indices = instances_by_qid.get(qid, [])
            test_ft = [X[i] for i in indices]
            test_docids = [doc_ids[i] for i in indices]

            # Get ranking and write the results to file, best document first
            for doc, score in ltr.rank(test_ft, test_docids):
                fout.write(qid + "," + doc + "\n")