
# Assignment 3 - Part 3

In [None]:
import urllib
import requests
import json
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Base URL of the search API; the request helpers in this notebook build their URLs from it.
API = "http://gustav1.ux.uis.no:5002"

## Utility functions

Loading queries

In [None]:
def load_queries(query_file):
    """
    Loads queries from a text file.

    Each non-empty line is expected to hold a query ID, a single space and
    the query text (the text itself may contain further spaces).

    :param query_file: path of the query file
    :return: dict mapping query ID (str) to query text (str)
    :raises ValueError: if a non-empty line contains no space separator
    """
    queries = {}
    with open(query_file, "r") as fin:
        # Iterate the file lazily; there is no need to load all lines at once.
        for line in fin:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no query.
                continue
            qid, query = line.split(" ", 1)
            queries[qid] = query
    return queries

Loading qrels

In [None]:
def load_qrels(qrels_file):
    """
    Loads relevance judgments from a comma-separated file.

    The first line is treated as a header and skipped. Every other non-empty
    line is expected to have the form ``qid,doc_id,rel``.

    :param qrels_file: path of the qrels file
    :return: nested dict ``{qid: {doc_id: rel}}``; ``rel`` is kept as the
        string read from the file
    :raises ValueError: if a non-empty data line has fewer than three fields
    """
    qrels = {}
    with open(qrels_file, "r") as fin:
        next(fin, None)  # skip header line (no-op for an empty file)
        for line in fin:
            line = line.strip()
            if not line:
                # Blank lines carry no judgment.
                continue
            qid, doc_id, rel = line.split(",", 2)
            qrels.setdefault(qid, {})[doc_id] = rel
    return qrels

Loading features file

In [None]:
def load_features(features_file):
    """
    Loads labelled feature vectors from a whitespace-separated file.

    Each non-empty line is expected to have the form
    ``label qid doc_id fid:value fid:value ...``; only the part after the
    colon of each feature item is used, in the order given on the line.

    :param features_file: path of the features file
    :return: tuple ``(X, y, qids, doc_ids)`` of parallel lists, where ``X``
        holds one numpy array of floats per line, ``y`` the integer labels,
        and ``qids`` / ``doc_ids`` the corresponding string identifiers
    """
    X, y, qids, doc_ids = [], [], [], []
    with open(features_file, "r") as f:
        for line in f:
            items = line.split()
            if not items:
                # Blank lines carry no training instance.
                continue
            label = int(items[0])
            qid = items[1]
            doc_id = items[2]
            features = np.array([float(item.split(":")[1]) for item in items[3:]])
            X.append(features)
            y.append(label)
            qids.append(qid)
            doc_ids.append(doc_id)

    return X, y, qids, doc_ids

## API functions

Issuing a search query against the API

In [None]:
def search(indexname, query, field, size=10):
    """
    Sends a query to the ``_search`` endpoint of an index and returns the reply.

    :param indexname: name of the index, used as a path segment of the URL
    :param query: query string, sent as the ``q`` URL parameter
    :param field: field name, sent as the ``df`` URL parameter
    :param size: value sent as the ``size`` URL parameter
    :return: the JSON response body decoded into Python objects
    """
    endpoint = "/".join([API, indexname, "_search"])
    params = urllib.parse.urlencode({"q": query, "df": field, "size": size})
    reply = requests.get(endpoint + "?" + params)
    return json.loads(reply.text)

Get term vector

In [None]:
def term_vectors(indexname, doc_id, term_statistics=False):
    """
    Requests the ``_termvectors`` endpoint for one document and returns the reply.

    :param indexname: name of the index, used as a path segment of the URL
    :param doc_id: document ID (str), used as a path segment of the URL
    :param term_statistics: Boolean; True iff term_statistics are required.
        It is sent as the lowercase string ``true`` / ``false``.
    :return: the JSON response body decoded into Python objects
    """
    endpoint = "/".join([API, indexname, doc_id, "_termvectors"])
    flag = str(term_statistics).lower()
    params = urllib.parse.urlencode({"term_statistics": flag})
    reply = requests.get(endpoint + "?" + params)
    return json.loads(reply.text)

Analyze query (return a list of index terms)

In [None]:
def analyze_query(indexname, query):
    """
    Passes a query through the ``_analyze`` endpoint of an index.

    :param indexname: name of the index, used as a path segment of the URL
    :param query: query text, sent as the ``text`` URL parameter
    :return: list with the ``token`` value of every entry under ``tokens``
        in the response
    """
    endpoint = "/".join([API, indexname, "_analyze"])
    params = urllib.parse.urlencode({"text": query})
    payload = json.loads(requests.get(endpoint + "?" + params).text)
    tokens = []
    for entry in payload["tokens"]:
        tokens.append(entry["token"])
    return tokens

## Pointwise LTR class

In [None]:
class PointWiseLTRModel(object):
    """Pointwise learning-to-rank model built around a scikit-learn style regressor."""

    def __init__(self, regressor):
        """
        :param regressor: an instance of scikit-learn regressor
        """
        self.regressor = regressor
        # Set by _train(). Initialised here so that rank() on an untrained
        # model hits its assertion instead of an AttributeError.
        self.model = None

    def _train(self, X, y):
        """
        Trains an LTR model.
        :param X: features of training instances
        :param y: relevance assessments of training instances
        :return: None; the fitted model is stored in ``self.model``
        """
        assert self.regressor is not None, "no regressor was provided"
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """
        Predicts relevance labels and ranks documents for a given query.
        :param ft: a list of features for query-doc pairs
        :param doc_ids: a list of document ids, parallel to ``ft``
        :return: list of ``(doc_id, predicted_label)`` tuples ordered by
            decreasing predicted label; empty if ``ft`` is empty
        """
        assert self.model is not None, "model has not been trained; call _train() first"
        if len(ft) == 0:
            # Nothing to score; avoid calling predict() on an empty input.
            return []
        rel_labels = self.model.predict(ft)
        # argsort is ascending, so reverse it to get the highest score first.
        sort_indices = np.argsort(rel_labels)[::-1]
        return [(doc_ids[i], rel_labels[i]) for i in sort_indices]

## Feature computation

The total number of candidate documents to retrieve from Elasticsearch. Should not be set higher than 200 (otherwise things get unreasonably slow).

In [None]:
# Passed as `size` to the search calls in get_features() (the anchors query asks for ten times as many).
NUM_DOCS = 200

Total number of features (features will be indexed 1..NUM_FEAT)

In [None]:
# Number of features per query-document pair; feature ids run from 1 to NUM_FEAT inclusive.
NUM_FEAT = 3

#### Computing the feature vectors for a given query

  - We retrieve top `NUM_DOCS` documents for each field (title, content, anchors).
  - We ignore those docs that don't have a score in the content field. This also serves as a simple and pragmatic way of filtering out docs that are not in ClueWeb Category B. 
  
**IMPORTANT NOTE** If you compute the BM25 title and anchor scores this way, you will end up with a lot of missing features, which will likely hurt your performance. For IR, there should be no missing features, as matching the query against a field always produces a retrieval score (which might be 0). Therefore, instead of what is done below for Features #2 and #3, you should get the top 200 docs using BM25 content, then compute the BM25 title and anchors based on the termvectors.  Similarly, for the additional features (LM, TFIDF, etc.), compute the retrieval scores for the content, title, and anchors fields yourself based on the termvectors.
  
This function is used both when training and when applying the model. When training, the target relevance labels will need to be assigned to each document. That is done in `get_training_data()`.

In [None]:
def get_features(qid, query):
    """
    Collects the retrieval-score features of the candidate documents of a query.

    The candidate set consists of the documents returned by the content
    search (feature 1). Title (feature 2) and anchors (feature 3) scores are
    only attached to documents already in that set, so these two features
    may be absent from a document's feature dict.

    :param qid: query ID (only used in the progress message)
    :param query: query text
    :return: dict ``{doc_id: {feature_id: score}}``
    """
    def _hits(response):
        # Tolerant lookup shared by all three searches: a response without
        # a "hits" entry simply yields nothing to iterate over.
        return response.get('hits', {}).get("hits", {})

    print("Getting features for query #{} '{}'".format(qid, query))

    # Analyze query (will be needed for some features)
    qterms = analyze_query("clueweb12b", query)

    # Feature 1: BM25 content score; these hits define the candidate documents
    content_res = search("clueweb12b", query, "content", size=NUM_DOCS)
    print("\tElasticsearch content field ...")
    feats = {hit.get("_id"): {1: hit.get("_score")} for hit in _hits(content_res)}

    # Feature 2: BM25 title score
    # Feature 3: BM25 anchors score
    # NOTE: we retrieve more candidate documents for the anchors field
    secondary = (
        (2, "clueweb12b", "title", NUM_DOCS, "\tElasticsearch title field ..."),
        (3, "clueweb12b_anchors", "anchors", NUM_DOCS*10, "\tElasticsearch anchors field ..."),
    )
    for fid, index, field, size, message in secondary:
        print(message)
        for hit in _hits(search(index, query, field, size=size)):
            entry = feats.get(hit.get("_id"))
            if entry is not None:
                entry[fid] = hit.get("_score")

    # TODO: computation of additional features comes here

    # TODO: we can apply feature normalization here

    return feats

## Main

### Training model

Queries and qrels for training

In [None]:
# Training inputs (queries, relevance judgments) and the file that the
# training feature vectors are written to and later read back from.
QUERY_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"
FEATURES_FILE = "data/features.txt"

In [None]:
# Training queries as {qid: query} and relevance judgments as {qid: {doc_id: rel}}.
queries = load_queries(QUERY_FILE)
qrels = load_qrels(QRELS_FILE)

Create the complete training data set (feature vectors and corresponding labels) and write it to a file

In [1]:
def get_training_data(queries, qrels, output_file):
    """
    Computes feature vectors for the given queries and writes the labelled
    ones to a file.

    One line is written per query-document pair that has a relevance
    judgment, in the form ``rel qid doc_id 1:v1 2:v2 ...``. Documents without
    a judgment, and queries without any judgments, produce no output.

    :param queries: dict ``{qid: query}``
    :param qrels: nested dict ``{qid: {doc_id: rel}}``
    :param output_file: path of the features file to (over)write
    :return: None
    """
    with open(output_file, "w") as fout:
        for qid, query in sorted(queries.items()):
            # A query without judgments cannot contribute training instances;
            # skip it instead of failing with KeyError on qrels[qid].
            judged = qrels.get(qid, {})
            if not judged:
                continue
            # get feature vectors
            feats = get_features(qid, query)
            # assign target labels and write to file
            for doc_id, feat in feats.items():
                if doc_id not in judged:  # we only consider docs where we have the target label
                    continue
                rel = judged[doc_id]
                # NOTE: there shouldn't be "missing" features
                for fid in range(1, NUM_FEAT + 1):
                    feat.setdefault(fid, 0)  # default value for "missing" features
                # write to file
                feat_str = ['{}:{}'.format(k, v) for k, v in sorted(feat.items())]
                fout.write(" ".join([str(rel), qid, doc_id] + feat_str) + "\n")
In [None]:
# Build the training set and write it to FEATURES_FILE (computes features for the training queries).
get_training_data(queries, qrels, FEATURES_FILE)

Load training data from file

In [None]:
# Read the feature vectors, labels, query ids and document ids back from the features file.
train_X, train_y, qids, doc_ids = load_features(FEATURES_FILE)

#### Train model

Set `max_depth` roughly to the square root of the number of features

In [None]:
# max_depth=2 follows the rule of thumb of roughly sqrt(NUM_FEAT); random_state is fixed at 0.
clf = RandomForestRegressor(max_depth=2, random_state=0)
ltr = PointWiseLTRModel(clf)
# Fit the regressor on the training instances loaded from the features file.
ltr._train(train_X, train_y)

### Applying model on unseen queries

In [None]:
# Files used when applying the trained model to the unseen queries.
QUERY2_FILE = "data/queries2.txt"
FEATURES2_FILE = "data/features2.txt"  # NOTE(review): not referenced in the visible code
OUTPUT_FILE = "data/ltr2.txt"
TOP_DOCS = 20  # this many top docs to write to output file

In [None]:
# Unseen queries as {qid: query}; these are ranked with the trained model.
queries2 = load_queries(QUERY2_FILE)

Apply model and write results to output file

In [None]:
# "trec" writes tab-separated TREC run lines; any other value writes "qid,doc_id" lines.
output_format = "trec"

with open(OUTPUT_FILE, "w") as fout:
    for qid, query in sorted(queries2.items()):
        # Get feature vectors
        feats = get_features(qid, query)

        # Convert into the format required by the `PointWiseLTRModel` class
        # and deal with missing feature values
        doc_fts = []
        doc_ids = []

        for doc_id, feat in feats.items():
            for fid in range(1, NUM_FEAT + 1):
                # Use the same default (0) as get_training_data() so that the
                # model sees missing features encoded as they were in training.
                feat.setdefault(fid, 0)
            doc_fts.append(np.array([float(val) for _, val in sorted(feat.items())]))
            doc_ids.append(doc_id)

        if not doc_fts:
            # No candidate documents were retrieved; there is nothing to rank
            # and the regressor cannot predict on an empty input.
            continue

        # Get ranking
        r = ltr.rank(doc_fts, doc_ids)
        # Write the top TOP_DOCS results to file, with ranks starting at 1
        for rank, (doc_id, score) in enumerate(r[:TOP_DOCS], start=1):
            if output_format == "trec":
                fout.write(("\t".join(["{}"] * 6) + "\n").format(qid, "Q0", doc_id, str(rank),
                                                                 str(score), "A3_3_Baseline"))
            else:
                fout.write(qid + "," + doc_id + "\n")