Implement a learning-to-rank method with the following minimum requirements: 
        - Consider document-query matching in minimum 3 fields (title, content and anchors) 
          and at least two different retrieval models (e.g., BM25 and LM). That is, 6 document-query features minimum.

Perform baseline (BM25) retrieval on a separate anchor text index. (THIS IS DONE IN 1_baseline.ipynb)
        - The anchor text index (called clueweb12b_anchors) can be accessed the same way as the regular document index. 
        - Note that the anchor text index covers the entire ClueWeb collection, not just the Category B subset. 
          I.e., you need to ignore documents that are not present in the regular index.

Test your model using 5-fold cross-validation on the given training data (queries and relevance judments, i.e., data/queries.txt and data/qrels.csv).

--------------------------------------------------------------------------------------------------------------------------------
Load queries and qrels:

In [1]:
import urllib
import requests
import json
import math
import os
from pprint import pprint


API = "http://gustav1.ux.uis.no:5002"

QUERY_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"

FEATURES_FILE = "data/features_qd_q_d.txt"
OUTPUT_FILE = "data/ltr_qd_q_d.txt"  # output the ranking, only QD features

FIELD_MOD = {
    1: ["title", "BM25"],
    2: ["title", "LM"],
    3: ["content", "BM25"],
    4: ["content", "LM"],
    5: ["anchors", "BM25"],
    6: ["anchors", "LM"]
}

BASIC_INDEX_NAME = "clueweb12b"
ANCHORS_INDEX_NAME = "clueweb12b_anchors"

CACHE_DIR = "cache"
CACHE_DIR_SEARCH = CACHE_DIR + "/search"
CACHE_DIR_TERMVECTORS = CACHE_DIR + "/termvectors"

LAMBDA = 0.1

def get_index_name(field):
    return ANCHORS_INDEX_NAME if field == "anchors" else BASIC_INDEX_NAME

# load queries
def load_queries(query_file):
    queries = {}
    with open(QUERY_FILE, "r") as fin:
        for line in fin.readlines():
            qid, query = line.strip().split(" ", 1)
            queries[qid] = query
    return queries
        
# load given ground truth
def load_qrels(qrels_file):
    gtruth = {}
    with open(QRELS_FILE, 'r') as qr:
        for line in qr.readlines():
            if line.startswith('QueryId'):
                continue
            qid, did, rel = line.strip().split(',')
            if qid not in gtruth:
                gtruth[qid] = {}
            gtruth[qid][did] = int(rel)
    return gtruth

queries = load_queries(QUERY_FILE)
gtruth = load_qrels(QRELS_FILE)


--------------------------------------------------------------------------------------------------------------------------------
API:

In [2]:
def search(indexname, query, field, size=10):
    cache_file = CACHE_DIR_SEARCH + "/" + indexname + "_" + query + "_" + field + "_" + str(size)
    url = "/".join([API, indexname, "_search"]) + "?" \
          + urllib.parse.urlencode({"q": query, "df": field, "size": size})
    if os.path.exists(cache_file):  # return from cache
        with open(cache_file) as infile:
            response = json.load(infile)
            return json.loads(response)
    else:
        with open(cache_file, "w") as outfile:
            response = requests.get(url).text
            json.dump(response, outfile)
            return json.loads(response)

def termvectors(indexname, docid, term_statistics="true"): 
    cache_file = CACHE_DIR_TERMVECTORS + "/" + indexname + "_" + docid + "_" + term_statistics
    url = "/".join([API, indexname, docid, "_termvectors"]) + "?" \
          + urllib.parse.urlencode({"term_statistics": term_statistics})
    if os.path.exists(cache_file):  # return from cache
        with open(cache_file) as infile:
            response = json.load(infile)
            return json.loads(response)
    else:
        with open(cache_file, "w") as outfile:
            response = requests.get(url).text
            json.dump(response, outfile)
            return json.loads(response)

def analyze_query(indexname, query):
    url = "/".join([API, indexname, "_analyze"]) + "?" \
          + urllib.parse.urlencode({"text": query})
    response = json.loads(requests.get(url).text)
    tokens = response["tokens"]
    query_terms = []
    for t in sorted(tokens, key=lambda x: x["position"]):
        query_terms.append(t["token"])
    return query_terms

def exists(indexname, docid): 
    url = "/".join([API, indexname, docid, "_exists"])
    response = requests.get(url).text
    return json.loads(response)


# print(termvectors("clueweb12b", "clueweb12-0000tw-07-01629").items())
# print(analyze_query("clueweb12b", "raspberry pi"))

--------------------------------------------------------------------------------------------------------------------------------
Collection LM:

In [3]:
class CollectionLM(object):
    def __init__(self, qterms):
        self._probs = {}
        # computing P(t|C_i) for each field and for each query term
        for fid in FIELD_MOD:
            self._probs[FIELD_MOD[fid][0]] = {}
            for t in qterms:
                self._probs[FIELD_MOD[fid][0]][t] = self.__get_prob(FIELD_MOD[fid][0], t)

    def __get_prob(self, field, term):
        # use a boolean query to find a document that contains the term
        index_name = get_index_name(field)
        hits = search(index_name, term, field, size=1).get("hits", {}).get("hits", {})
        doc_id = hits[0]["_id"] if len(hits) > 0 else None
        if doc_id is not None:
            # ask for global term statistics when requesting the term vector of that doc (`term_statistics=True` by default)
            if termvectors("clueweb12b", doc_id)["found"] == True:
                index_name = get_index_name(field)
                tv = termvectors(index_name, doc_id)
                ttf = tv["term_vectors"][field]["terms"].get(term, {}).get("ttf", 0)  # total term count in the collection (in that field)
                sum_ttf = tv["term_vectors"][field]["field_statistics"]["sum_ttf"]
                return ttf / sum_ttf

        return 0  # this only happens if none of the documents contain that term

    def prob(self, field, term):
        return self._probs.get(field, {}).get(term, 0)

In [4]:
def lm(clm, qterms, doc_id, field):
    score = 0  # log P(q|d)
    
    # Getting term frequency statistics for the given document field from Elasticsearch
    # Note that global term statistics are not needed
    index_name = get_index_name(field)
    tv = termvectors(index_name, doc_id, term_statistics="false")["term_vectors"]

    # compute field length $|d|$
    len_d = 0  # document field length initialization
    if field in tv:  # that document field may be NOT empty
        len_d = sum([s["term_freq"] for t, s in tv[field]["terms"].items()])
        
    # scoring the query
    for t in qterms:
        Pt_theta_d = 0  # P(t|\theta_d)
        if field in tv:
            Pt_d = tv[field]["terms"].get(t, {}).get("term_freq", 0) / len_d  # $P(t|d)$
        else:  # that document field is empty
            Pt_d = 0
        Pt_C = clm.prob(field, t)  # $P(t|C)$
        Pt_theta_d = (1 - LAMBDA) * Pt_d + LAMBDA * Pt_C  # $P(t|\theta_{d})$ with J-M smoothing
        # Pt_theta_d is 0 if t doesn't occur in any doc for that field, even with smoothing:
        score += math.log(Pt_theta_d) if Pt_theta_d > 0 else 0  
    
    return score

--------------------------------------------------------------------------------------------------------------------------------
Collecting feature values in the features dict. It has the structure features[qid][docid][fid] = value, where fid is a feature ID

In [5]:
features = {}

def add_to_features(docid):
    if docid not in features[qid]:
        features[qid][docid] = {}
    features[qid][docid][fid] = r["_score"]

for fid in range(1, len(FIELD_MOD) + 1):
    print("\nComputing values for feature '%s & %s' . . ." % (FIELD_MOD[fid][0], FIELD_MOD[fid][1]))

    for qid, query in queries.items():
        if qid not in features:
            features[qid] = {}

        if FIELD_MOD[fid][1] == "BM25":
            print("Query '{0}'".format(query))
            if FIELD_MOD[fid][0] == "anchors":
                res = search("clueweb12b_anchors", query, FIELD_MOD[fid][0], size=20)
                for r in res["hits"]["hits"]:
                    docid = r["_id"]
                    if exists("clueweb12b", r["_id"])["exists"] == True:
                        add_to_features(docid)
            else: 
                res = search("clueweb12b", query, FIELD_MOD[fid][0], size=20)                
                for r in res["hits"]["hits"]:
                    docid = r["_id"]
                    add_to_features(docid)
        else:  
            # LM
            print("Query '{0}'".format(query))
            if FIELD_MOD[fid][0] == "anchors":
                res = search("clueweb12b_anchors", query, FIELD_MOD[fid][0], size=100)
                for r in res["hits"]["hits"]:
                    docid = r["_id"]
                    if exists("clueweb12b", r["_id"])["exists"] == True:
                        qterms = analyze_query("clueweb12b", query)
                        clm = CollectionLM(qterms)
                        scores = {}
                        for r in res["hits"]["hits"]:
                            docid = r["_id"]
                            scores[docid] = lm(clm, qterms, docid, FIELD_MOD[fid][0])
                        i = 1
                        for docid in scores:
                            # return top 20
                            if i > 20:
                                break
                            if docid not in features[qid]:
                                features[qid][docid] = {}
                            features[qid][docid][fid] = scores[docid]
                            i += 1
            else:
                res = search("clueweb12b", query, FIELD_MOD[fid][0], size=100)  # size=100 because of later re-ranking
                qterms = analyze_query("clueweb12b", query)
                clm = CollectionLM(qterms)
                scores = {}
                for r in res["hits"]["hits"]:
                    docid = r["_id"]
                    scores[docid] = lm(clm, qterms, docid, FIELD_MOD[fid][0])
                i = 1
                for docid in scores:
                    # return top 20
                    if i > 20:
                        break
                    if docid not in features[qid]:
                        features[qid][docid] = {}
                    features[qid][docid][fid] = scores[docid]
                    i += 1

# additional query features
print("Computing values for additional query features . . .")
for qid, query in queries.items():
    avg_arr = []
    for e in features[qid]:
        if 1 in features[qid][e].keys():
            avg_arr.append(features[qid][e][1])
    
    for entry in features[qid]:
        ql = len(query.split(' '))
        # hardcoded feature IDs
        features[qid][entry][7] = ql
        features[qid][entry][8] = sum(avg_arr) / len(avg_arr)

# additional document features
print("Computing values for additional document features . . .")
for qid in queries.keys():
    for doc_id in features[qid]:
        for field in ['title', 'content']:
            index_name = get_index_name(field)
            try:
                tv = termvectors(index_name, doc_id, term_statistics="false")["term_vectors"]
                if field in tv:
                    len_d = sum([s["term_freq"] for t, s in tv[field]["terms"].items()])
                    if field == 'title':
                        features[qid][doc_id][9] = len_d
                    else:
                        features[qid][doc_id][10] = len_d
            except KeyError:
                pass


Computing values for feature 'title & BM25' . . .
Query 'raspberry pi'
Query 'uss carl vinson'
Query 'reviews of les miserables'
Query 'rules of golf'
Query 'average charitable donation'
Query 'wind power'
Query 'bph treatment'
Query 'doctor zhivago'
Query 'land surveyor'
Query 'golf gps'
Query 'what is madagascar known for'
Query 'home theater systems'
Query 'carpal tunnel syndrome'
Query 'capital gains tax rate'
Query 'maryland department of natural resources'
Query 'nicolas cage movies'
Query 'kids earth day activities'
Query 'solar water fountains'
Query 'what was the name of elvis presley's home'
Query 'nba records'
Query 'electoral college 2008 results'
Query 'male menopause'
Query 'usda food pyramid'
Query 'making chicken soup from scratch'
Query 'black and gold'
Query 'traverse city'
Query 'i will survive lyrics'
Query 'hawaiian volcano observatories'
Query 'beef stroganoff recipe'
Query 'world's biggest dog'
Query 'what are the seven deadly sins'
Query 'hurricane Irene floodi

Query 'what was the name of elvis presley's home'
Query 'nba records'
Query 'electoral college 2008 results'
Query 'male menopause'
Query 'usda food pyramid'
Query 'making chicken soup from scratch'
Query 'black and gold'
Query 'traverse city'
Query 'i will survive lyrics'
Query 'hawaiian volcano observatories'
Query 'beef stroganoff recipe'
Query 'world's biggest dog'
Query 'what are the seven deadly sins'
Query 'hurricane Irene flooding in manville nj'
Query 'hair dye'
Query 'dark chocolate health benefits'
Query 'ham radio'
Query 'symptoms of mad cow disease in humans'
Query 'lump in throat'
Query 'george bush sr bio'
Query 'frank lloyd wright biography'
Query 'presidential middle names'
Query 'what is a wiki'
Query 'cannellini beans'
Query 'afghanistan flag'
Query 'old town scottsdale'
Query 'roosevelt island'
Query 'civil war battles in South Carolina'
Query 'rain man'
Query 'eggs shelf life'
Query 'occupational therapist'
Query 'ford edge problems'
Computing values for additional

Looking up relevance labels and writing training data to file:

In [6]:
with open(FEATURES_FILE, "w") as fout:
    for qid, query in queries.items():
        for docid, ft in features[qid].items():
            # Note that docid will not have a feature value for feature ID i
            # if it was not retrieved in the top-1000 positions for that feature
            # Here, we use -1 as the value for "missing" features
            
            # CHANGE range() PARAMETER WHEN READY
            for fid in range(1, 11):
                if fid not in ft:
                    ft[fid] = -1
            
            # relevance label is determined based on the ground truth (qrels) file
            label = 1 if docid in gtruth.get(qid, []) else 0
                        
            feat_str = ['{}:{}'.format(k,v) for k,v in ft.items()]
            fout.write(" ".join([str(label), qid, docid] + feat_str) + "\n")

--------------------------------------------------------------------------------------------------------------------------------

In [7]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

A class for pointwise-based learning to rank model:

In [8]:
class PointWiseLTRModel(object):
    def __init__(self, regressor):
        """
        :param classifier: an instance of scikit-learn regressor
        """
        self.regressor = regressor

    def _train(self, X, y):
        """
        Trains and LTR model.
        :param X: features of training instances
        :param y: relevance assessments of training instances
        :return:
        """
        assert self.regressor is not None
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """
        Predicts relevance labels and rank documents for a given query
        :param ft: a list of features for query-doc pairs
        :param ft: a list of document ids
        :return:
        """
        assert self.model is not None
        rel_labels = self.model.predict(ft)
        sort_indices = np.argsort(rel_labels)[::-1]

        results = []
        for i in sort_indices:
            results.append((doc_ids[i], rel_labels[i]))
        return results

Read training data from file:

In [9]:
def read_data_from_file(path):
    """
    :param path: path of file
    :return: X features of data, y labels of data, group a list of numbers indicate how many instances for each query
    """
    X, y, qids, doc_ids = [], [], [], []
    with open(path, "r") as f:
        i, s_qid = 0, None
        for line in f:
            items = line.strip().split()
            label = int(items[0])
            qid = items[1]
            doc_id = items[2]
            features = np.array([float(i.split(":")[1]) for i in items[3:]])
            X.append(features)
            y.append(label)
            qids.append(qid)
            doc_ids.append(doc_id)

    return X, y, qids, doc_ids

Loading training data:

In [10]:
X, y, qids, doc_ids = read_data_from_file(path=FEATURES_FILE)
qids_unique= list(set(qids))

print("#queries: ", len(qids_unique))
print("#query-doc pairs: ", len(y))

#queries:  50
#query-doc pairs:  2865


Applying 5-fold cross-validation:

In [11]:
FOLDS = 5

fout = open(OUTPUT_FILE, "w")
# write header
fout.write("QueryId,DocumentId\n")
    
for f in range(FOLDS):
    print("Fold #{}".format(f + 1))
    
    train_qids, test_qids = [], []  # holds the IDs of train and test queries
    train_ids, test_ids = [], []  # holds the instance IDs (indices in X )

    for i in range(len(qids_unique)):
        qid = qids_unique[i]
        if i % FOLDS == f:  # test query
            test_qids.append(qid)
        else:  # train query
            train_qids.append(qid)

    train_X, train_y = [], []  # training feature values and target labels
    test_X = []  # for testing we only have feature values

    for i in range(len(X)):
        if qids[i] in train_qids:
            train_X.append(X[i])
            train_y.append(y[i])
        else:
            test_X.append(X[i])

    # Create and train LTR model
    print("\tTraining model ...")
    clf = RandomForestRegressor(max_depth=3, random_state=0)
    ltr = PointWiseLTRModel(clf)
    ltr._train(train_X, train_y)
    
    # Apply LTR model on the remaining fold (test queries)
    print("\tApplying model ...")
    
    for qid in set(test_qids):
        print("\t\tRanking docs for queryID {}".format(qid))
        # Collect the features and docids for that (test) query `qid`
        test_ft, test_docids = [], []
        for i in range(len(X)):
            if qids[i] == qid:
                test_ft.append(X[i])
                test_docids.append(doc_ids[i])
        
        # Get ranking
        r = ltr.rank(test_ft, test_docids)    
        # Write the results to file
        for doc, score in r:
            fout.write(qid + "," + doc + "\n")
        
fout.close()

Fold #1
	Training model ...
	Applying model ...
		Ranking docs for queryID 237
		Ranking docs for queryID 230
		Ranking docs for queryID 245
		Ranking docs for queryID 216
		Ranking docs for queryID 213
		Ranking docs for queryID 201
		Ranking docs for queryID 225
		Ranking docs for queryID 220
		Ranking docs for queryID 218
		Ranking docs for queryID 236
Fold #2
	Training model ...
	Applying model ...
		Ranking docs for queryID 242
		Ranking docs for queryID 211
		Ranking docs for queryID 222
		Ranking docs for queryID 247
		Ranking docs for queryID 234
		Ranking docs for queryID 238
		Ranking docs for queryID 210
		Ranking docs for queryID 250
		Ranking docs for queryID 224
		Ranking docs for queryID 226
Fold #3
	Training model ...
	Applying model ...
		Ranking docs for queryID 248
		Ranking docs for queryID 231
		Ranking docs for queryID 239
		Ranking docs for queryID 249
		Ranking docs for queryID 228
		Ranking docs for queryID 205
		Ranking docs for queryID 206
		Ranking docs for 

--------------------------------------------------------------------------------------------------------------------------------
Evaluation:

In [12]:
def dcg(rel, p):
    dcg = rel[0]
    for i in range(1, min(p, len(rel))): 
        dcg += rel[i] / math.log(i + 1, 2)  # rank position is indexed from 1..
    return dcg


def evaluate(rankings, gtruth, df):
    sum_ndcg10 = 0
    sum_ndcg20 = 0
    
    for qid, ranking in sorted(rankings.items()):
        gt = gtruth[qid]    

        # relevance levels of our ranking
        gains = []
        for doc_id in ranking: 
            if gt.get(doc_id, 0) >= 0:
                gains.append(gt.get(doc_id, 0))
            else: 
                gains.append(0)
        
        # relevance levels of the idealized ranking
        gain_ideal = sorted([v for _, v in gt.items()], reverse=True)

        ndcg10 = dcg(gains, 10) / dcg(gain_ideal, 10)
        ndcg20 = dcg(gains, 20) / dcg(gain_ideal, 20)
        sum_ndcg10 += ndcg10
        sum_ndcg20 += ndcg20

        # print("NDCG@10:", round(ndcg10, 3), "\nNDCG@20:", round(ndcg20, 3))

    print("\nAverage (%s):" % df)
    print("\tNDCG@10:", round(sum_ndcg10 / len(rankings), 3), "\n\tNDCG@20:", round(sum_ndcg20 / len(rankings), 3), "\n")
    
# load rankings for LTR QD+Q+D
rankings_ltr = {}
with open(OUTPUT_FILE, "r") as fin:
    docs = []
    for line in fin.readlines():
        if line.startswith('QueryId'):
            continue
        qid, doc_id = line.strip().split(",")
        if qid not in rankings_ltr: 
            rankings_ltr[qid] = []
        rankings_ltr[qid].append(doc_id)
# evaluate
evaluate(rankings_ltr, gtruth, "LTR QD+Q+D")


Average (LTR QD+Q+D):
	NDCG@10: 0.151 
	NDCG@20: 0.147 

