Implement a learning-to-rank method with the following minimum requirements: 
        - Consider document-query matching in minimum 3 fields (title, content and anchors) 
          and at least two different retrieval models (e.g., BM25 and LM). That is, 6 document-query features minimum.

Perform baseline (BM25) retrieval on a separate anchor text index. (THIS IS DONE IN 1_baseline.ipynb)
        - The anchor text index (called clueweb12b_anchors) can be accessed the same way as the regular document index. 
        - Note that the anchor text index covers the entire ClueWeb collection, not just the Category B subset. 
          I.e., you need to ignore documents that are not present in the regular index.

Test your model using 5-fold cross-validation on the given training data (queries and relevance judments, i.e., data/queries.txt and data/qrels.csv).

--------------------------------------------------------------------------------------------------------------------------------
Load queries and qrels:

In [20]:
import urllib
import requests
import json
import math


API = "http://gustav1.ux.uis.no:5002"

QUERY_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"

FEATURES_FILE = "data/features.txt"
OUTPUT_FILE = "data/ltr.txt"  # output the ranking

# load queries
def load_queries(query_file):
    queries = {}
    with open(QUERY_FILE, "r") as fin:
        for line in fin.readlines():
            qid, query = line.strip().split(" ", 1)
            queries[qid] = query
    return queries
        
# load given ground truth
def load_qrels(qrels_file):
    gtruth = {}
    with open(QRELS_FILE, 'r') as qr:
        for line in qr.readlines():
            if line.startswith('QueryId'):
                continue
            qid, did, rel = line.strip().split(',')
            if qid not in gtruth:
                gtruth[qid] = {}
            gtruth[qid][did] = int(rel)
    return gtruth

queries = load_queries(QUERY_FILE)
gtruth = load_qrels(QRELS_FILE)


--------------------------------------------------------------------------------------------------------------------------------
API:

In [8]:
def search(indexname, query, field, size=10):
    url = "/".join([API, indexname, "_search"]) + "?" \
          + urllib.parse.urlencode({"q": query, "df": field, "size": size})
    response = requests.get(url).text
    return json.loads(response)


def termvectors(indexname, docid, term_statistics="true"): 
    url = "/".join([API, indexname, docid, "_termvectors"]) + "?" \
          + urllib.parse.urlencode({"term_statistics": term_statistics})
    response = requests.get(url).text
    return json.loads(response)

def exists(indexname, docid): 
    url = "/".join([API, indexname, docid, "_exists"])
    response = requests.get(url).text
    return json.loads(response)

def analyze_query(indexname, query):
    url = "/".join([API, indexname, "_analyze"]) + "?" \
          + urllib.parse.urlencode({"text": query})
    response = json.loads(requests.get(url).text)
    tokens = response["tokens"]
    query_terms = []
    for t in sorted(tokens, key=lambda x: x["position"]):
        query_terms.append(t["token"])
    return query_terms

# print(termvectors("clueweb12b", "clueweb12-0000tw-07-01629").items())
# print(analyze_query("clueweb12b", "raspberry pi"))

--------------------------------------------------------------------------------------------------------------------------------
Collecting feature values in the features dict. It has the structure features[qid][docid][fid] = value, where fid is a feature ID

In [9]:
FIELD_MOD = {
    1: ["title", "BM25"],
    2: ["title", "LM"],
    3: ["content", "BM25"],
    4: ["content", "LM"],
    5: ["anchors", "BM25"],
    6: ["anchors", "LM"]
}

features = {}

def add_to_features(docid):
    if docid not in features[qid]:
        features[qid][docid] = {}
    features[qid][docid][fid] = r["_score"]

for fid in range(1, len(FIELD_MOD) + 1):
    print("Computing values for feature '%s & %s'" % (FIELD_MOD[fid][0], FIELD_MOD[fid][1]))

    for qid, query in queries.items():
        if qid not in features:
            features[qid] = {}

        if FIELD_MOD[fid][1] == "BM25":
            if FIELD_MOD[fid][0] == "anchors":
                res = search("clueweb12b_anchors", query, FIELD_MOD[fid][0], size=20)
                for r in res.get("hits", {}).get("hits", {}):
                    docid = r["_id"]
                    if exists("clueweb12b", r["_id"])["exists"] == True:
                        add_to_features(docid)
            else: 
                res = search("clueweb12b", query, FIELD_MOD[fid][0], size=20)
                for r in res.get("hits", {}).get("hits", {}):
                    docid = r["_id"]
                    add_to_features(docid)
            
        else:
            # TODO TODO TODO
#             if FIELD_MOD[fid][0] == "anchors":
#                 res = search("clueweb12b_anchors", query, FIELD_MOD[fid][0], size=100)
#             else: 
#                 res = search("clueweb12b", query, FIELD_MOD[fid][0], size=100)
#             # reranking must be implemented here
#             qterms = analyze_query("clueweb12b", query)
            pass


        

Computing values for feature 'title & BM25'
Computing values for feature 'title & LM'
Computing values for feature 'content & BM25'
Computing values for feature 'content & LM'
Computing values for feature 'anchors & BM25'
Computing values for feature 'anchors & LM'


Looking up relevance labels and writing training data to file:

In [11]:
with open(FEATURES_FILE, "w") as fout:
    for qid, query in queries.items():
        for docid, ft in features[qid].items():
            # Note that docid will not have a feature value for feature ID i
            # if it was not retrieved in the top-1000 positions for that feature
            # Here, we use -1 as the value for "missing" features
            
            # CHANGE range() PARAMETER WHEN READY
            for fid in range(1, len(FIELD_MOD) + 1):
                if fid not in ft:
                    ft[fid] = -1
            
            # relevance label is determined based on the ground truth (qrels) file
            label = 1 if docid in gtruth.get(qid, []) else 0
                        
            feat_str = ['{}:{}'.format(k,v) for k,v in ft.items()]
            fout.write(" ".join([str(label), qid, docid] + feat_str) + "\n")

--------------------------------------------------------------------------------------------------------------------------------

In [12]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

A class for pointwise-based learning to rank model:

In [13]:
class PointWiseLTRModel(object):
    def __init__(self, regressor):
        """
        :param classifier: an instance of scikit-learn regressor
        """
        self.regressor = regressor

    def _train(self, X, y):
        """
        Trains and LTR model.
        :param X: features of training instances
        :param y: relevance assessments of training instances
        :return:
        """
        assert self.regressor is not None
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """
        Predicts relevance labels and rank documents for a given query
        :param ft: a list of features for query-doc pairs
        :param ft: a list of document ids
        :return:
        """
        assert self.model is not None
        rel_labels = self.model.predict(ft)
        sort_indices = np.argsort(rel_labels)[::-1]

        results = []
        for i in sort_indices:
            results.append((doc_ids[i], rel_labels[i]))
        return results

Read training data from file:

In [14]:
def read_data_from_file(path):
    """
    :param path: path of file
    :return: X features of data, y labels of data, group a list of numbers indicate how many instances for each query
    """
    X, y, qids, doc_ids = [], [], [], []
    with open(path, "r") as f:
        i, s_qid = 0, None
        for line in f:
            items = line.strip().split()
            label = int(items[0])
            qid = items[1]
            doc_id = items[2]
            features = np.array([float(i.split(":")[1]) for i in items[3:]])
            X.append(features)
            y.append(label)
            qids.append(qid)
            doc_ids.append(doc_id)

    return X, y, qids, doc_ids

Loading training data:

In [15]:
X, y, qids, doc_ids = read_data_from_file(path=FEATURES_FILE)
qids_unique= list(set(qids))

print("#queries: ", len(qids_unique))
print("#query-doc pairs: ", len(y))

#queries:  50
#query-doc pairs:  1912


Applying 5-fold cross-validation:

In [16]:
FOLDS = 5

fout = open(OUTPUT_FILE, "w")
# write header
fout.write("QueryId,DocumentId\n")
    
for f in range(FOLDS):
    print("Fold #{}".format(f + 1))
    
    train_qids, test_qids = [], []  # holds the IDs of train and test queries
    train_ids, test_ids = [], []  # holds the instance IDs (indices in X )

    for i in range(len(qids_unique)):
        qid = qids_unique[i]
        if i % FOLDS == f:  # test query
            test_qids.append(qid)
        else:  # train query
            train_qids.append(qid)

    train_X, train_y = [], []  # training feature values and target labels
    test_X = []  # for testing we only have feature values

    for i in range(len(X)):
        if qids[i] in train_qids:
            train_X.append(X[i])
            train_y.append(y[i])
        else:
            test_X.append(X[i])

    # Create and train LTR model
    print("\tTraining model ...")
    clf = RandomForestRegressor(max_depth=3, random_state=0)
    ltr = PointWiseLTRModel(clf)
    ltr._train(train_X, train_y)
    
    # Apply LTR model on the remaining fold (test queries)
    print("\tApplying model ...")
    
    for qid in set(test_qids):
        print("\t\tRanking docs for queryID {}".format(qid))
        # Collect the features and docids for that (test) query `qid`
        test_ft, test_docids = [], []
        for i in range(len(X)):
            if qids[i] == qid:
                test_ft.append(X[i])
                test_docids.append(doc_ids[i])
        
        # Get ranking
        r = ltr.rank(test_ft, test_docids)    
        # Write the results to file
        for doc, score in r:
            fout.write(qid + "," + doc + "\n")
        
fout.close()

Fold #1
	Training model ...
	Applying model ...
		Ranking docs for queryID 245
		Ranking docs for queryID 221
		Ranking docs for queryID 240
		Ranking docs for queryID 246
		Ranking docs for queryID 212
		Ranking docs for queryID 213
		Ranking docs for queryID 210
		Ranking docs for queryID 248
		Ranking docs for queryID 230
		Ranking docs for queryID 203
Fold #2
	Training model ...
	Applying model ...
		Ranking docs for queryID 239
		Ranking docs for queryID 231
		Ranking docs for queryID 225
		Ranking docs for queryID 202
		Ranking docs for queryID 229
		Ranking docs for queryID 234
		Ranking docs for queryID 238
		Ranking docs for queryID 216
		Ranking docs for queryID 237
		Ranking docs for queryID 214
Fold #3
	Training model ...
	Applying model ...
		Ranking docs for queryID 233
		Ranking docs for queryID 215
		Ranking docs for queryID 206
		Ranking docs for queryID 224
		Ranking docs for queryID 236
		Ranking docs for queryID 227
		Ranking docs for queryID 208
		Ranking docs for 

Evaluation:

In [21]:
def dcg(rel, p):
    dcg = rel[0]
    for i in range(1, min(p, len(rel))): 
        dcg += rel[i] / math.log(i + 1, 2)  # rank position is indexed from 1..
    return dcg


def evaluate(rankings, gtruth, df):
    sum_ndcg10 = 0
    sum_ndcg20 = 0
    
    for qid, ranking in sorted(rankings.items()):
        gt = gtruth[qid]    

        # relevance levels of our ranking
        gains = []
        for doc_id in ranking: 
            if gt.get(doc_id, 0) >= 0:
                gains.append(gt.get(doc_id, 0))
            else: 
                gains.append(0)
        
        # relevance levels of the idealized ranking
        gain_ideal = sorted([v for _, v in gt.items()], reverse=True)

        ndcg10 = dcg(gains, 10) / dcg(gain_ideal, 10)
        ndcg20 = dcg(gains, 20) / dcg(gain_ideal, 20)
        sum_ndcg10 += ndcg10
        sum_ndcg20 += ndcg20

        # print("NDCG@10:", round(ndcg10, 3), "\nNDCG@20:", round(ndcg20, 3))

    print("\nAverage (%s):" % df)
    print("\tNDCG@10:", round(sum_ndcg10 / len(rankings), 3), "\n\tNDCG@20:", round(sum_ndcg20 / len(rankings), 3), "\n")
    
# load rankings for title field search
rankings_ltr = {}
with open(OUTPUT_FILE, "r") as fin:
    docs = []
    for line in fin.readlines():
        if line.startswith('QueryId'):
            continue
        qid, doc_id = line.strip().split(",")
        if qid not in rankings_ltr: 
            rankings_ltr[qid] = []
        rankings_ltr[qid].append(doc_id)
# evaluate
evaluate(rankings_ltr, gtruth, "ltr")


Average (ltr):
	NDCG@10: 0.144 
	NDCG@20: 0.131 

