Implement a learning-to-rank method with the following minimum requirements: 
        - Consider document-query matching in minimum 3 fields (title, content and anchors) 
          and at least two different retrieval models (e.g., BM25 and LM). That is, 6 document-query features minimum.

Perform baseline (BM25) retrieval on a separate anchor text index. 
        - The anchor text index (called clueweb12b_anchors) can be accessed the same way as the regular document index. 
        - Note that the anchor text index covers the entire ClueWeb collection, not just the Category B subset. 
          I.e., you need to ignore documents that are not present in the regular index.

Test your model using 5-fold cross-validation on the given training data (queries and relevance judments, i.e., data/queries.txt and data/qrels.csv).

--------------------------------------------------------------------------------------------------------------------------------
Load queries and qrels:

In [43]:
import urllib
import requests
import json
import math


API = "http://gustav1.ux.uis.no:5002"

QUERY_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"

FEATURES_FILE = "data/features.txt"
OUTPUT_FILE = "data/ltr.txt"  # output the ranking

FIELDS = ["title", "content", "anchors"]

# load queries
def load_queries(query_file):
    queries = {}
    with open(QUERY_FILE, "r") as fin:
        for line in fin.readlines():
            qid, query = line.strip().split(" ", 1)
            queries[qid] = query
    return queries
        
# load given ground truth
def load_qrels(qrels_file):
    gtruth = {}
    with open(QRELS_FILE, 'r') as qr:
        for line in qr.readlines():
            if line.startswith('QueryId'):
                continue
            qid, did, rel = line.strip().split(',')
            if qid not in gtruth:
                gtruth[qid] = {}
            gtruth[qid][did] = int(rel)
    return gtruth

queries = load_queries(QUERY_FILE)
qrels = load_qrels(QRELS_FILE)


--------------------------------------------------------------------------------------------------------------------------------
Issuing a search query againt the API:

In [33]:
def search(indexname, query, field, size=10):
    url = "/".join([API, indexname, "_search"]) + "?" \
          + urllib.parse.urlencode({"q": query, "df": field, "size": size})
    response = requests.get(url).text
    return json.loads(response)

Printing results for a given search query:

In [34]:
res = search("clueweb12b", "united states", "content", size=5)
for r in res.get("hits", {}).get("hits", {}):
    print("{} {}".format(r["_id"], r["_score"]))

clueweb12-0209wb-65-17913 8.233204
clueweb12-1601wb-49-06848 8.229019
clueweb12-0802wb-81-18218 8.220643
clueweb12-0905wb-66-27014 8.211074
clueweb12-0008wb-13-28303 8.209314


--------------------------------------------------------------------------------------------------------------------------------
Collecting feature values in the features dict. It has the structure features[qid][docid][fid] = value, where fid is a feature ID

In [35]:
features = {}

# TODO: adapt this to the API

for f in FIELDS:
    print("Computing values for feature...")
    
    for qid, query in queries.items():
        if qid not in features:
            features[qid] = {}
            
        res = search("clueweb12b", query, "title", size=1000)  # size=1000 because of later re-ranking
        # add two other retrieval model features here
        
        for r in res.get("hits", {}).get("hits", {}):
            docid = r["_id"]
            if docid not in features[qid]:
                features[qid][docid] = {}
            features[qid][docid][f] = r["_score"]

Computing values for feature...
Computing values for feature...
Computing values for feature...


Looking up relevance labels and writing training data to file:

In [36]:
with open(FEATURES_FILE, "w") as fout:
    for qid, query in queries.items():
        for docid, ft in features[qid].items():
            # Note that docid will not have a feature value for feature ID i
            # if it was not retrieved in the top-1000 positions for that feature
            # Here, we use -1 as the value for "missing" features
            
            # CHANGE range() PARAMETER WHEN READY
            for fid in range(1, len(FIELDS) + 1):
                if fid not in ft:
                    ft[fid] = -1
            
            # relevance label is determined based on the ground truth (qrels) file
            label = 1 if docid in qrels.get(qid, []) else 0
                        
            feat_str = ['{}:{}'.format(k,v) for k,v in ft.items()]
            fout.write(" ".join([str(label), qid, docid] + feat_str) + "\n")

--------------------------------------------------------------------------------------------------------------------------------

In [37]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

A class for pointwise-based learning to rank model:

In [38]:
class PointWiseLTRModel(object):
    def __init__(self, regressor):
        """
        :param classifier: an instance of scikit-learn regressor
        """
        self.regressor = regressor

    def _train(self, X, y):
        """
        Trains and LTR model.
        :param X: features of training instances
        :param y: relevance assessments of training instances
        :return:
        """
        assert self.regressor is not None
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """
        Predicts relevance labels and rank documents for a given query
        :param ft: a list of features for query-doc pairs
        :param ft: a list of document ids
        :return:
        """
        assert self.model is not None
        rel_labels = self.model.predict(ft)
        sort_indices = np.argsort(rel_labels)[::-1]

        results = []
        for i in sort_indices:
            results.append((doc_ids[i], rel_labels[i]))
        return results

Read training data from file:

In [39]:
def read_data_from_file(path):
    """
    :param path: path of file
    :return: X features of data, y labels of data, group a list of numbers indicate how many instances for each query
    """
    X, y, qids, doc_ids = [], [], [], []
    with open(path, "r") as f:
        i, s_qid = 0, None
        for line in f:
            items = line.strip().split()
            label = int(items[0])
            qid = items[1]
            doc_id = items[2]
            features = np.array([float(i.split(":")[1]) for i in items[3:]])
            X.append(features)
            y.append(label)
            qids.append(qid)
            doc_ids.append(doc_id)

    return X, y, qids, doc_ids

Loading training data:

In [40]:
X, y, qids, doc_ids = read_data_from_file(path=FEATURES_FILE)
qids_unique= list(set(qids))

print("#queries: ", len(qids_unique))
print("#query-doc pairs: ", len(y))

#queries:  50
#query-doc pairs:  50112


Applying 5-fold cross-validation:

In [45]:
FOLDS = 5

fout = open(OUTPUT_FILE, "w")
# write header
fout.write("QueryId,DocumentId\n")
    
for f in range(FOLDS):
    print("Fold #{}".format(f + 1))
    
    train_qids, test_qids = [], []  # holds the IDs of train and test queries
    train_ids, test_ids = [], []  # holds the instance IDs (indices in X )

    for i in range(len(qids_unique)):
        qid = qids_unique[i]
        if i % FOLDS == f:  # test query
            test_qids.append(qid)
        else:  # train query
            train_qids.append(qid)

    train_X, train_y = [], []  # training feature values and target labels
    test_X = []  # for testing we only have feature values

    for i in range(len(X)):
        if qids[i] in train_qids:
            train_X.append(X[i])
            train_y.append(y[i])
        else:
            test_X.append(X[i])

    # Create and train LTR model
    print("\tTraining model ...")
    clf = RandomForestRegressor(max_depth=3, random_state=0)
    ltr = PointWiseLTRModel(clf)
    ltr._train(train_X, train_y)
    
    # Apply LTR model on the remaining fold (test queries)
    print("\tApplying model ...")
    
    for qid in set(test_qids):
        print("\t\tRanking docs for queryID {}".format(qid))
        # Collect the features and docids for that (test) query `qid`
        test_ft, test_docids = [], []
        for i in range(len(X)):
            if qids[i] == qid:
                test_ft.append(X[i])
                test_docids.append(doc_ids[i])
        
        # Get ranking
        r = ltr.rank(test_ft, test_docids)    
        # Write the results to file
        for doc, score in r:
            fout.write(qid + "," + doc + "\n")
        
fout.close()

Fold #1
	Training model ...


ValueError: setting an array element with a sequence.