# Assignment 3 - Part 1

In [1]:
import urllib
import requests
import json
import math
import time

In [2]:
# Base URL that the search / termvectors / exists request URLs are built on.
API = "http://gustav1.ux.uis.no:5002"

# Input files.
QUERY_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"
# Output files, one per searched field.
TITLE_OUTPUT_FILE = "data/baseline_title.txt"
CONTENT_OUTPUT_FILE = "data/baseline_content.txt"
ANCHORS_OUTPUT_FILE = "data/baseline_anchors.txt"

# load queries: every line is "<query id> <query text>", split on the first space
queries = {}
with open(QUERY_FILE, "r") as fin:
    for line in fin:
        line = line.strip()
        if not line:
            # a blank (e.g. trailing) line would otherwise break the unpacking below
            continue
        qid, query = line.split(" ", 1)
        queries[qid] = query

# load given ground truth: gtruth[query id][document id] = integer relevance
gtruth = {}
with open(QRELS_FILE, 'r') as qr:
    for line in qr:
        line = line.strip()
        # skip blank lines and the "QueryId,..." header row
        if not line or line.startswith('QueryId'):
            continue
        qid, did, rel = line.split(',')
        gtruth.setdefault(qid, {})[did] = int(rel)


API:

In [3]:
def search(indexname, query, field, size=10):
    """Send a GET request to the `_search` endpoint of an index.

    Args:
        indexname: name of the index; becomes a path segment of the URL.
        query: query string, sent as the `q` URL parameter.
        field: field name, sent as the `df` URL parameter.
        size: value sent as the `size` URL parameter (default 10).

    Returns:
        The response body decoded from JSON.
    """
    params = {"q": query, "df": field, "size": size}
    endpoint = "/".join([API, indexname, "_search"])
    url = endpoint + "?" + urllib.parse.urlencode(params)
    body = requests.get(url).text
    return json.loads(body)


def termvectors(indexname, docid, term_statistics="true"):
    """Send a GET request to the `_termvectors` endpoint of a document.

    Args:
        indexname: name of the index; becomes a path segment of the URL.
        docid: document id as a string; becomes a path segment of the URL.
        term_statistics: value sent as the `term_statistics` URL parameter
            (default "true").

    Returns:
        The response body decoded from JSON.
    """
    params = {"term_statistics": term_statistics}
    endpoint = "/".join([API, indexname, docid, "_termvectors"])
    url = endpoint + "?" + urllib.parse.urlencode(params)
    body = requests.get(url).text
    return json.loads(body)

def exists(indexname, docid):
    """Send a GET request to the `_exists` endpoint of a document.

    Args:
        indexname: name of the index; becomes a path segment of the URL.
        docid: document id as a string; becomes a path segment of the URL.

    Returns:
        The response body decoded from JSON (the caller in this notebook
        reads its "exists" key).
    """
    endpoint = "/".join([API, indexname, docid, "_exists"])
    body = requests.get(endpoint).text
    return json.loads(body)

--------------------------------------------------------------------------------------------------------------------------------

Evaluation:

In [4]:
def dcg(rel, p):
    """Discounted cumulative gain of a ranked list of gains, cut off at rank p.

    The first gain counts undiscounted; the gain at rank r >= 2 is divided
    by log2(r).

    Args:
        rel: list of gains in rank order (index 0 is rank 1).
        p: rank cutoff; only the first min(p, len(rel)) gains are used.

    Returns:
        The DCG value, or 0 when rel is empty or p <= 0.
    """
    cutoff = min(p, len(rel))
    if cutoff <= 0:
        # nothing to accumulate: empty ranking or non-positive cutoff
        return 0
    score = rel[0]
    for idx in range(1, cutoff):
        # idx is 0-based, so the rank position is idx + 1
        score += rel[idx] / math.log2(idx + 1)
    return score


def _ndcg_at(gains, gain_ideal, p):
    """NDCG@p; 0.0 when there is nothing ranked or the ideal DCG is not positive."""
    if not gains or not gain_ideal:
        return 0.0
    ideal = dcg(gain_ideal, p)
    if ideal <= 0:
        # no positively judged document for this query: avoid dividing by zero
        return 0.0
    return dcg(gains, p) / ideal


def evaluate(rankings, gtruth, df):
    """Print and return the mean NDCG@10 and NDCG@20 over all ranked queries.

    Args:
        rankings: dict mapping query id -> list of document ids in rank order.
        gtruth: dict mapping query id -> {document id: integer relevance}.
        df: label printed in the summary header (callers pass the field name).

    Returns:
        Tuple (mean NDCG@10, mean NDCG@20); (0.0, 0.0) when rankings is empty.

    Raises:
        KeyError: if a query id in rankings has no entry in gtruth.
    """
    sum_ndcg10 = 0
    sum_ndcg20 = 0

    for qid, ranking in sorted(rankings.items()):
        gt = gtruth[qid]

        # relevance levels of our ranking; unjudged and negatively judged
        # documents both count as gain 0
        gains = [max(gt.get(doc_id, 0), 0) for doc_id in ranking]

        # relevance levels of the idealized ranking; negative judgments are
        # clamped to 0 here as well, so the ideal DCG is computed on the same
        # gain scale as the system ranking
        gain_ideal = sorted((max(v, 0) for v in gt.values()), reverse=True)

        sum_ndcg10 += _ndcg_at(gains, gain_ideal, 10)
        sum_ndcg20 += _ndcg_at(gains, gain_ideal, 20)

    # the mean is taken over the queries present in rankings only
    n_queries = len(rankings)
    avg_ndcg10 = sum_ndcg10 / n_queries if n_queries else 0.0
    avg_ndcg20 = sum_ndcg20 / n_queries if n_queries else 0.0

    print("\nAverage (%s):" % df)
    print("\tNDCG@10:", round(avg_ndcg10, 3), "\n\tNDCG@20:", round(avg_ndcg20, 3), "\n")

    return avg_ndcg10, avg_ndcg20


--------------------------------------------------------------------------------------------------------------------------------
Search, load the given ground truth, evaluate:

In [5]:
def title_search_evaluate():
    """Search the title field for every query, save the hits, and print NDCG.

    Writes one "QueryId,DocumentId" row per hit to TITLE_OUTPUT_FILE, reads
    that file back into per-query rankings, and passes them to evaluate().
    A query without hits writes no row and so is absent from the rankings.
    """
    # search title field
    print("Searching title field . . . ")
    with open(TITLE_OUTPUT_FILE, "w") as out:
        out.write("QueryId,DocumentId")
        for qid, query in queries.items():
            response = search("clueweb12b", query, "title", size=20)
            hits = response.get("hits", {}).get("hits", {})
            for hit in hits:
                out.write("\n{},{}".format(qid, hit["_id"]))
    print("Done searching title field!")

    # load rankings for title field search
    rankings_title = {}
    with open(TITLE_OUTPUT_FILE, "r") as saved:
        for row in saved:
            if row.startswith('QueryId'):
                # header row
                continue
            qid, doc_id = row.strip().split(",")
            rankings_title.setdefault(qid, []).append(doc_id)

    # evaluate
    evaluate(rankings_title, gtruth, "title")

    
def content_search_evaluate():
    """Search the content field for every query, save the hits, and print NDCG.

    Writes one "QueryId,DocumentId" row per hit to CONTENT_OUTPUT_FILE, reads
    that file back into per-query rankings, and passes them to evaluate().
    A query without hits writes no row and so is absent from the rankings.
    """
    # search content field
    print("Searching content field . . . ")
    with open(CONTENT_OUTPUT_FILE, "w") as out:
        out.write("QueryId,DocumentId")
        for qid, query in queries.items():
            response = search("clueweb12b", query, "content", size=20)
            hits = response.get("hits", {}).get("hits", {})
            for hit in hits:
                out.write("\n{},{}".format(qid, hit["_id"]))
    print("Done searching content field!")

    # load rankings for content field search
    rankings_content = {}
    with open(CONTENT_OUTPUT_FILE, "r") as saved:
        for row in saved:
            if row.startswith('QueryId'):
                # header row
                continue
            qid, doc_id = row.strip().split(",")
            rankings_content.setdefault(qid, []).append(doc_id)

    # evaluate
    evaluate(rankings_content, gtruth, "content")
    

def anchors_search_evaluate():
    """Search the anchors field for every query, save the hits, and print NDCG.

    Hits come from the "clueweb12b_anchors" index; only those whose id is
    reported as existing in "clueweb12b" are written, one "QueryId,DocumentId"
    row each, to ANCHORS_OUTPUT_FILE. The file is then read back into
    per-query rankings, which are passed to evaluate(). A query with no
    surviving hits writes no row and so is absent from the rankings.
    """
    # search anchors field
    print("Searching anchors field . . . ")
    with open(ANCHORS_OUTPUT_FILE, "w") as out:
        out.write("QueryId,DocumentId")
        for qid, query in queries.items():
            response = search("clueweb12b_anchors", query, "anchors", size=20)
            hits = response.get("hits", {}).get("hits", {})
            for hit in hits:
                # pause before every existence check
                time.sleep(0.25)
                # check if present in clueweb12b
                present = exists("clueweb12b", hit["_id"])["exists"]
                if present == True:
                    out.write("\n{},{}".format(qid, hit["_id"]))
    print("Done searching anchors field!")

    # load rankings for anchors field search
    rankings_anchors = {}
    with open(ANCHORS_OUTPUT_FILE, "r") as saved:
        for row in saved:
            if row.startswith('QueryId'):
                # header row
                continue
            qid, doc_id = row.strip().split(",")
            rankings_anchors.setdefault(qid, []).append(doc_id)

    # evaluate
    evaluate(rankings_anchors, gtruth, "anchors")

# Run the three baselines in order: title, content, then anchors.
for run_baseline in (title_search_evaluate, content_search_evaluate, anchors_search_evaluate):
    run_baseline()

Searching title field . . . 
Done searching title field!

Average (title):
	NDCG@10: 0.128 
	NDCG@20: 0.114 

Searching content field . . . 
Done searching content field!

Average (content):
	NDCG@10: 0.138 
	NDCG@20: 0.128 

Searching anchors field . . . 
Done searching anchors field!

Average (anchors):
	NDCG@10: 0.057 
	NDCG@20: 0.042 



## TODOs:

- Retrieve results for all queries (in `data/queries.txt`)
- Do it for both `title` and `content` fields (separately) and write the output to files
- Evaluate the results against the relevance judgments (in `data/qrels.csv`) in terms of NDCG@10 and NDCG@20

- NB! `anchors` field retrieval is also done in this notebook