# Assignment 3 - Part 1

In [6]:
import urllib
import requests
import json
import math
import threading

In [7]:
API = "http://gustav1.ux.uis.no:5002"

QUERY_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"
TITLE_OUTPUT_FILE = "data/baseline_title.txt"
CONTENT_OUTPUT_FILE = "data/baseline_content.txt"
ANCHORS_OUTPUT_FILE = "data/baseline_anchors.txt"

# load queries
queries = {}
with open(QUERY_FILE, "r") as fin:
    for line in fin.readlines():
        qid, query = line.strip().split(" ", 1)
        queries[qid] = query
        
# load given ground truth
gtruth = {}
with open(QRELS_FILE, 'r') as qr:
    for line in qr.readlines():
        if line.startswith('QueryId'):
            continue
        qid, did, rel = line.strip().split(',')
        if qid not in gtruth:
            gtruth[qid] = {}
        gtruth[qid][did] = int(rel)


Issuing a search query againt the API

In [8]:
def search(indexname, query, field, size=10):
    url = "/".join([API, indexname, "_search"]) + "?" \
          + urllib.parse.urlencode({"q": query, "df": field, "size": size})
    response = requests.get(url).text
    return json.loads(response)

--------------------------------------------------------------------------------------------------------------------------------

Evaluation:

In [9]:
def dcg(rel, p):
    dcg = rel[0]
    for i in range(1, min(p, len(rel))): 
        dcg += rel[i] / math.log(i + 1, 2)  # rank position is indexed from 1..
    return dcg


def evaluate(rankings, gtruth, df):
    sum_ndcg10 = 0
    sum_ndcg20 = 0
    
    print("-----------------------------------------%s-------------------------------------------------" % df)
    for qid, ranking in sorted(rankings.items()):
        gt = gtruth[qid]    
        print("\nQuery", qid)

        # relevance levels of our ranking
        gains = [gt.get(doc_id, 0) for doc_id in ranking]
        
        print("Gains:", gains)

        # relevance levels of the idealized ranking
        gain_ideal = sorted([v for _, v in gt.items()], reverse=True)

        ndcg10 = dcg(gains, 10) / dcg(gain_ideal, 10)
        ndcg20 = dcg(gains, 20) / dcg(gain_ideal, 20)
        sum_ndcg10 += ndcg10
        sum_ndcg20 += ndcg20

        print("NDCG@10:", round(ndcg10, 3), "\nNDCG@20:", round(ndcg20, 3))

    print("\nAverage (%s):" % df)
    print("NDCG@10:", round(sum_ndcg10 / len(rankings), 3), "\nNDCG@20:", round(sum_ndcg20 / len(rankings), 3))


--------------------------------------------------------------------------------------------------------------------------------
Search, load given ground trouth, evaluate: 

In [10]:
def title_search_evaluate():    
    # search title field
    print("Searching title field . . . ")
    with open(TITLE_OUTPUT_FILE, "w") as fin:
        fin.write("QueryId,DocumentId")
        for qid, query in queries.items():
            res = search("clueweb12b", query, "title", size=100)
            for r in res.get("hits", {}).get("hits", {}):
                fin.write("\n{},{}".format(qid, r["_id"]))
    print("\n\nDone searching title field!")
    # load rankings for title field search
    rankings_title = {}
    with open(TITLE_OUTPUT_FILE, "r") as fin:
        docs = []
        for line in fin.readlines():
            if line.startswith('QueryId'):
                continue
            qid, doc_id = line.strip().split(",")
            if qid in rankings_title: 
                rankings_title[qid].append(doc_id)
                continue
            else:
                rankings_title[qid] = []
    # evaluate
    evaluate(rankings_title, gtruth, "title")

    
def content_search_evaluate():
    # search content field
    print("Searching content field . . . ")
    with open(CONTENT_OUTPUT_FILE, "w") as fin:
        fin.write("QueryId,DocumentId")
        for qid, query in queries.items():
            res = search("clueweb12b", query, "content", size=100)
            for r in res.get("hits", {}).get("hits", {}):
                fin.write("\n{},{}".format(qid, r["_id"]))
    print("\n\nDone searching content field!")
    # load rankings for content field search
    rankings_content = {}
    with open(CONTENT_OUTPUT_FILE, "r") as fin:
        docs = []
        for line in fin.readlines():
            if line.startswith('QueryId'):
                continue
            qid, doc_id = line.strip().split(",")
            if qid in rankings_content: 
                rankings_content[qid].append(doc_id)
                continue
            else:
                rankings_content[qid] = []
    # evaluate
    evaluate(rankings_content, gtruth, "content")
    

# TODO: limit to clueweb12b
def anchors_search_evaluate():
    # search anchors field
    print("Searching anchors field . . . ")
    with open(ANCHORS_OUTPUT_FILE, "w") as fin:
        fin.write("QueryId,DocumentId")
        for qid, query in queries.items():
            res = search("clueweb12b_anchors", query, "anchors", size=100)
            for r in res.get("hits", {}).get("hits", {}):
                fin.write("\n{},{}".format(qid, r["_id"]))
    print("\n\nDone searching anchors field!")
    # load rankings for anchors field search
    rankings_anchors = {}
    with open(ANCHORS_OUTPUT_FILE, "r") as fin:
        docs = []
        for line in fin.readlines():
            if line.startswith('QueryId'):
                continue
            qid, doc_id = line.strip().split(",")
            if qid in rankings_anchors: 
                rankings_anchors[qid].append(doc_id)
                continue
            else:
                rankings_anchors[qid] = []
    # evaluate
    evaluate(rankings_anchors, gtruth, "anchors")


# using threads to speed up the search. console printing may get messed up, but file writing stays intact
print("The notebook kernel may appear idle immediately after starting the threads,\n"
      "but the threads are running in the background. Give it time:\n")
threading.Thread(target=title_search_evaluate).start()  # search title field and evaluate
threading.Thread(target=content_search_evaluate).start()  # search content field and evaluate
threading.Thread(target=anchors_search_evaluate).start()  # search anchors field and evaluate

# single thread execution: 
# title_search_evaluate()
# content_search_evaluate()
# anchors_search_evaluate()


It may appear that the notebook kernel is idle immediately after starting the threads,
but the threads are running in the background. Give it time:

Searching title field . . . 
Searching content field . . . 
Searching anchors field . . . 


Done searching title field!
-----------------------------------------title-------------------------------------------------

Query 201
Gains: [0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0]
NDCG@10: 0.111 
NDCG@20: 0.149

Query 202
Gains: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0



Done searching content field!
-----------------------------------------content-------------------------------------------------

Query 201
Gains: [0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
NDCG@10: 0.204 
NDCG@20: 0.259

Query 202
Gains: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
NDCG@10: 0.0 
NDCG@20: -0.122

Query 203
Gains: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1



Done searching anchors field!
-----------------------------------------anchors-------------------------------------------------

Query 201
Gains: [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]
NDCG@10: 0.035 
NDCG@20: 0.07

Query 202
Gains: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
NDCG@10: 0.0 
NDCG@20: 0.0

Query 203
Gains: [3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## TODOs:

- Retrieve results for all queries (in `data/queries.txt`)
- Do it for both `title` and `content` fields (separately) and write the output to files
- Evaluate the results against the relevance judgments (in `data/qrels.csv`) in terms of NDCG@10 and NDCG@20