# Assignment 3 - Part 1

In [1]:
import urllib
import requests
import json
import math
import threading

In [2]:
API = "http://gustav1.ux.uis.no:5002"

QUERY_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"
TITLE_OUTPUT_FILE = "data/baseline_title.txt"
CONTENT_OUTPUT_FILE = "data/baseline_content.txt"

# load queries: each line is "<query id> <query text>", split on the first space
queries = {}
with open(QUERY_FILE, "r") as fin:
    for line in fin:
        line = line.strip()
        if not line:  # tolerate blank lines instead of failing on unpacking
            continue
        qid, query = line.split(" ", 1)
        queries[qid] = query

# load given ground truth: three comma-separated fields per row
# (query id, document id, relevance); the header row starts with "QueryId"
gtruth = {}
with open(QRELS_FILE, "r") as qr:
    for line in qr:
        line = line.strip()
        if not line or line.startswith("QueryId"):
            continue
        qid, did, rel = line.split(",")
        # relevance is stored exactly as read (a string); consumers convert it
        gtruth.setdefault(qid, {})[did] = rel


Issuing a search query against the API

In [3]:
def search(indexname, query, field, size=10, timeout=30):
    """Run a query-string search against one index of the search API.

    Args:
        indexname: Name of the index to search; used as a URL path segment.
        query: Query text, sent as the ``q`` parameter.
        field: Default field to search in, sent as the ``df`` parameter.
        size: Maximum number of hits requested, sent as the ``size`` parameter.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook forever.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.exceptions.RequestException: on connection problems,
            timeouts, or a non-2xx HTTP status.
    """
    url = "/".join([API, indexname, "_search"])
    # Let requests build the query string; this avoids relying on
    # ``urllib.parse`` having been loaded as a side effect of another import.
    response = requests.get(
        url,
        params={"q": query, "df": field, "size": size},
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of treating an error body as results.
    response.raise_for_status()
    return response.json()

Printing results for a given search query

In [4]:
# res = search("clueweb12b", "united states", "content", size=5)
# for r in res.get("hits", {}).get("hits", {}):
#     print("{} {}".format(r["_id"], r["_score"]))

--------------------------------------------------------------------------------------------------------------------------------

Evaluation:

In [5]:
def dcg(rel, p):
    """Compute the discounted cumulative gain of a ranking at cutoff ``p``.

    The gain at rank 1 is taken as-is; the gain at every later rank ``r``
    (1-indexed) is divided by ``log2(r)``.

    Args:
        rel: Sequence of numeric relevance levels in rank order.
        p: Rank cutoff; only the first ``p`` entries of ``rel`` are used.

    Returns:
        The DCG value; 0 when ``rel`` is empty or ``p`` is not positive.
    """
    total = 0.0
    for rank, gain in enumerate(rel[:max(p, 0)], start=1):
        if rank == 1:
            total += gain  # log2(1) == 0, so the top rank is not discounted
        else:
            total += gain / math.log2(rank)
    return total


def _ndcg(gains, gain_ideal, p):
    """Return NDCG@p, or 0.0 when there is nothing ranked or no ideal gain."""
    if not gains or not gain_ideal:
        return 0.0
    ideal = dcg(gain_ideal, p)
    # A non-positive ideal DCG means no relevant document is known for the query.
    return dcg(gains, p) / ideal if ideal > 0 else 0.0


def evaluate(rankings, gtruth, df):
    """Print per-query and average NDCG@10 / NDCG@20 for a set of rankings.

    Args:
        rankings: dict mapping a query id to its list of document ids in
            rank order.
        gtruth: dict mapping a query id to ``{document id: relevance}``.
            Relevance values may be ints or numeric strings; they are
            converted with ``int`` before use.
        df: Label of the searched field; only used in the printed output.

    Queries without any judgments, with an empty ranking, or whose ideal DCG
    is not positive are scored 0 rather than raising.
    """
    sum_ndcg10 = 0
    sum_ndcg20 = 0

    print("--------------------------------------------------%s-----------------------------------------------------------" % df)
    for qid, ranking in sorted(rankings.items()):
        gt = gtruth.get(qid, {})
        print("Query", qid)

        # relevance level of each ranked doc; unjudged docs count as 0.
        # int() is required because the qrels store relevance as strings.
        gains = [int(gt.get(doc_id, 0)) for doc_id in ranking]

        # relevance levels of the idealized ranking
        gain_ideal = sorted((int(v) for v in gt.values()), reverse=True)

        ndcg10 = _ndcg(gains, gain_ideal, 10)
        ndcg20 = _ndcg(gains, gain_ideal, 20)
        sum_ndcg10 += ndcg10
        sum_ndcg20 += ndcg20

        print("\tNDCG@10:", round(ndcg10, 3), "\n\tNDCG@20:", round(ndcg20, 3))

    # avoid dividing by zero when no query produced a ranking
    n_queries = len(rankings) or 1
    print("Average (%s):" % df)
    print("\tNDCG@10:", round(sum_ndcg10 / n_queries, 3), "\n\tNDCG@20:", round(sum_ndcg20 / n_queries, 3))


--------------------------------------------------------------------------------------------------------------------------------
Search, load the given ground truth, evaluate:

In [6]:
def load_rankings(path):
    """Read a ``QueryId,DocumentId`` results file back into memory.

    Args:
        path: Path of a text file with an optional header line starting with
            ``QueryId`` followed by one ``qid,doc_id`` pair per line.

    Returns:
        dict mapping each query id to the list of its document ids, in the
        order the rows appear in the file.
    """
    rankings = {}
    with open(path, "r") as fin:
        for line in fin:
            line = line.strip()
            if not line or line.startswith("QueryId"):
                continue
            qid, doc_id = line.split(",", 1)
            # setdefault + append keeps the first row of every query as well;
            # creating the list without appending would drop the top result.
            rankings.setdefault(qid, []).append(doc_id)
    return rankings


def search_and_evaluate(field, output_file, size=100):
    """Search every loaded query on one field, save the hits, and evaluate.

    Args:
        field: Name of the document field to search; also used as the label
            in the printed output.
        output_file: Path the ``QueryId,DocumentId`` rows are written to and
            then read back from.
        size: Number of hits requested per query.
    """
    print("Searching %s field . . . " % field)
    with open(output_file, "w") as fout:
        fout.write("QueryId,DocumentId")
        for qid, query in queries.items():
            res = search("clueweb12b", query, field, size=size)
            for hit in res.get("hits", {}).get("hits", []):
                fout.write("\n{},{}".format(qid, hit["_id"]))
    print("\n\nDone searching %s field!" % field)

    # evaluate what was actually written to disk
    evaluate(load_rankings(output_file), gtruth, field)


def title_search_evaluate():
    """Search the ``title`` field for all queries and print NDCG scores."""
    search_and_evaluate("title", TITLE_OUTPUT_FILE)


def content_search_evaluate():
    """Search the ``content`` field for all queries and print NDCG scores."""
    search_and_evaluate("content", CONTENT_OUTPUT_FILE)


# using threading for the job: both field searches run concurrently
# console printing may be messed up, but file writing stays intact
workers = [
    threading.Thread(target=title_search_evaluate),    # search title field and evaluate
    threading.Thread(target=content_search_evaluate),  # search content field and evaluate
]
for worker in workers:
    worker.start()
# Wait for both jobs so the cell only finishes once the results exist and
# their output cannot spill into cells executed afterwards.
for worker in workers:
    worker.join()


Searching title field . . . 
Searching content field . . . 


Done searching title field!
--------------------------------------------------title-----------------------------------------------------------
Query 201
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 202
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 203
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 204
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 205
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 206
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 207
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 208
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 209
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 210
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 211
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 212
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 213
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 214
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 215
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 216
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 217
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 218
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 219
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 220
	NDCG@10: 0.0 
	NDCG@20: 0.0
Query 221
	NDCG

## TODOs:

- Retrieve results for all queries (in `data/queries.txt`)
- Do it for both `title` and `content` fields (separately) and write the output to files
- Evaluate the results against the relevance judgments (in `data/qrels.csv`) in terms of NDCG@10 and NDCG@20