
# Assignment 3 - Part 3

In [895]:
import urllib
import requests
import json
import os
import math
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [896]:
# Base URL of the search API that every request in this notebook is sent to.
API = "http://gustav1.ux.uis.no:5002"

# On-disk locations where raw API responses are cached, one sub-folder per endpoint.
CACHE_DIR = "cache"
CACHE_DIR_SEARCH = "{}/search".format(CACHE_DIR)
CACHE_DIR_TERMVECTORS = "{}/termvectors".format(CACHE_DIR)

# Index holding the "content" and "title" fields, and the separate index holding "anchors".
BASIC_INDEX_NAME = "clueweb12b"
ANCHORS_INDEX_NAME = "clueweb12b_anchors"

# Document fields for which collection language models are built.
FIELDS = ["content", "title", "anchors"]

# Weight of the collection model in Jelinek-Mercer smoothing (see `lm`).
LAMBDA = 0.2

## Utility functions

Loading queries

In [897]:
def load_queries(query_file):
    """Load queries from a text file with one "<qid> <query text>" entry per line.

    :param query_file: path of the query file
    :return: dict mapping query id (str) to query text (str)
    :raises ValueError: if a non-blank line has no space separating id and text
    """
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin:
            line = line.strip()
            if not line:  # tolerate blank lines (e.g. an extra newline at EOF)
                continue
            # Only the first space separates the id; the query itself may contain spaces.
            qid, query = line.split(" ", 1)
            queries[qid] = query
    return queries

Loading qrels

In [898]:
def load_qrels(qrels_file):
    """Load relevance judgments from a CSV file with a header line.

    Each data line has the form "<qid>,<doc_id>,<rel>". The relevance value is
    kept as the string read from the file.

    :param qrels_file: path of the qrels CSV file
    :return: nested dict {qid: {doc_id: rel}}
    :raises ValueError: if a non-blank data line has fewer than three comma-separated parts
    """
    qrels = {}
    with open(qrels_file, "r") as fin:
        next(fin, None)  # skip header line (no-op on an empty file)
        for line in fin:
            line = line.strip()
            if not line:  # tolerate blank lines
                continue
            qid, doc_id, rel = line.split(",", 2)
            qrels.setdefault(qid, {})[doc_id] = rel
    return qrels

Loading features file

In [899]:
def load_features(features_file):
    """Load labelled feature vectors from a whitespace-separated features file.

    Each line has the form "<label> <qid> <doc_id> <fid>:<value> ...". Feature
    values are taken in the order they appear on the line.

    :param features_file: path of the features file
    :return: tuple (X, y, qids, doc_ids) of parallel lists, where X holds one
        numpy array of feature values per line and y the integer labels
    """
    X, y, qids, doc_ids = [], [], [], []
    with open(features_file, "r") as f:
        for line in f:
            items = line.split()
            if not items:  # tolerate blank lines
                continue
            y.append(int(items[0]))
            qids.append(items[1])
            doc_ids.append(items[2])
            # Keep only the value part of every "<fid>:<value>" token.
            X.append(np.array([float(item.split(":")[1]) for item in items[3:]]))

    return X, y, qids, doc_ids

## API functions

Issuing a search query against the API

In [900]:
def search(indexname, query, field, size=10):
    """Run a query against one field of an index via the API's `_search` endpoint.

    Responses are cached on disk under `CACHE_DIR_SEARCH`, keyed by index, query,
    field and size, so repeated calls do not hit the network.

    :param indexname: name of the index to search
    :param query: query string (sent as the `q` parameter)
    :param field: default field to search in (sent as the `df` parameter)
    :param size: number of hits to request
    :return: the decoded JSON response
    """
    cache_file = CACHE_DIR_SEARCH + "/" + indexname + "_" + query + "_" + field + "_" + str(size)
    if os.path.exists(cache_file):  # return from cache
        with open(cache_file) as infile:
            # The cache stores the raw response text as a JSON string, hence the double decode.
            return json.loads(json.load(infile))

    url = "/".join([API, indexname, "_search"]) + "?" \
          + urllib.parse.urlencode({"q": query, "df": field, "size": size})
    response = requests.get(url).text
    # Decode before touching the cache: a failed request or a non-JSON reply
    # must not leave behind a cache file that breaks every later call.
    result = json.loads(response)
    # Create the cache folder (and any sub-folder implied by a "/" in the query) on first use.
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    with open(cache_file, "w") as outfile:
        json.dump(response, outfile)
    return result
        
def exists(indexname, docid):
    """Query the API's `_exists` endpoint for a document in an index.

    :param indexname: name of the index
    :param docid: document id
    :return: the decoded JSON response
    """
    endpoint = "/".join((API, indexname, docid, "_exists"))
    reply = requests.get(endpoint)
    return json.loads(reply.text)
        
def get_index_name(field):
    """Return the index that holds `field`: the anchors index for "anchors", else the basic index."""
    if field == "anchors":
        return ANCHORS_INDEX_NAME
    return BASIC_INDEX_NAME

Get term vector

In [901]:
def term_vectors(indexname, docid, term_statistics=False):
    """Fetch the term vectors of a document via the API's `_termvectors` endpoint.

    Responses are cached on disk under `CACHE_DIR_TERMVECTORS`, keyed by index,
    document id and the `term_statistics` flag.

    :param indexname: name of the index
    :param docid: document id
    :param term_statistics: value of the `term_statistics` request parameter
        (callers pass True when they need collection-level counts such as `ttf`)
    :return: the decoded JSON response
    """
    stats_flag = str(term_statistics).lower()
    cache_file = CACHE_DIR_TERMVECTORS + "/" + indexname + "_" + docid + "_" + stats_flag
    if os.path.exists(cache_file):  # return from cache
        with open(cache_file) as infile:
            # The cache stores the raw response text as a JSON string, hence the double decode.
            return json.loads(json.load(infile))

    url = "/".join([API, indexname, docid, "_termvectors"]) + "?" \
          + urllib.parse.urlencode({"term_statistics": stats_flag})
    response = requests.get(url).text
    # Decode before touching the cache: a failed request or a non-JSON reply
    # must not leave behind a cache file that breaks every later call.
    result = json.loads(response)
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)  # create the cache folder on first use
    with open(cache_file, "w") as outfile:
        json.dump(response, outfile)
    return result

Analyze query (return a list of index terms)

In [902]:
def analyze_query(indexname, query):
    """Analyze a query with the index's `_analyze` endpoint.

    :param indexname: name of the index whose analyzer is used
    :param query: raw query text
    :return: list of the `token` values from the response, in response order
    """
    params = urllib.parse.urlencode({"text": query})
    url = "/".join([API, indexname, "_analyze"]) + "?" + params
    parsed = json.loads(requests.get(url).text)
    tokens = []
    for entry in parsed["tokens"]:
        tokens.append(entry["token"])
    return tokens

In [903]:
class CollectionLM(object):
    """Collection language model: P(t|C) for a fixed set of query terms, per field."""

    def __init__(self, qterms):
        """
        Precomputes P(t|C_i) for every field in `FIELDS` and every query term.
        :param qterms: list of (analyzed) query terms
        """
        self._probs = {}
        for field in FIELDS:
            field_probs = {}
            for t in qterms:
                field_probs[t] = self.__get_prob(field, t)
            self._probs[field] = field_probs

    def __get_prob(self, field, term):
        """
        Estimates P(term|C) for a field as ttf / sum_ttf, read from the term
        vector of one document that contains the term.
        :return: the probability, or 0 if it cannot be determined
        """
        # Search for a single document that contains the term in this field.
        index_name = get_index_name(field)
        hits = search(index_name, term, field, size=1).get("hits", {}).get("hits", {})
        if len(hits) == 0:
            return 0  # none of the documents contain that term
        doc_id = hits[0]["_id"]

        # NOTE(review): the document is always checked against the basic index,
        # even for the anchors field; kept as in the original - confirm this is intended.
        if not term_vectors(BASIC_INDEX_NAME, doc_id, True)["found"]:
            return 0

        # Term vector requested with term statistics, which carry the collection-level counts.
        field_tv = term_vectors(index_name, doc_id, True)["term_vectors"][field]
        ttf = field_tv["terms"].get(term, {}).get("ttf", 0)  # total term count in the collection (in that field)
        sum_ttf = field_tv["field_statistics"]["sum_ttf"]
        if sum_ttf <= 0:  # avoid dividing by zero / returning a negative "probability"
            return 0
        return ttf / sum_ttf

    def prob(self, field, term):
        """
        :return: the precomputed P(term|C) for the field, or 0 for an unknown field or term
        """
        return self._probs.get(field, {}).get(term, 0)

def lm(clm, qterms, doc_id, field):
    """Score a document field against a query with a Jelinek-Mercer smoothed language model.

    :param clm: collection language model exposing `prob(field, term)`
    :param qterms: list of (analyzed) query terms
    :param doc_id: id of the document to score
    :param field: document field to score
    :return: log P(q|d); terms whose smoothed probability is 0 contribute nothing
    """
    # Term frequencies of this document; global term statistics are not needed here.
    index_name = get_index_name(field)
    tv = term_vectors(index_name, doc_id)["term_vectors"]
    terms = tv[field]["terms"] if field in tv else {}  # the document field may be empty

    # Field length |d|
    len_d = sum(stats["term_freq"] for stats in terms.values())

    score = 0  # log P(q|d)
    for t in qterms:
        # P(t|d); guarded so a field without any terms cannot divide by zero
        Pt_d = terms.get(t, {}).get("term_freq", 0) / len_d if len_d > 0 else 0
        Pt_C = clm.prob(field, t)  # P(t|C)
        Pt_theta_d = (1 - LAMBDA) * Pt_d + LAMBDA * Pt_C  # P(t|theta_d) with J-M smoothing
        # Pt_theta_d is 0 if t doesn't occur in any doc for that field, even with smoothing
        if Pt_theta_d > 0:
            score += math.log(Pt_theta_d)

    return score

## Pointwise LTR class

In [904]:
class PointWiseLTRModel(object):
    """Pointwise learning-to-rank model wrapping a scikit-learn style regressor."""

    def __init__(self, regressor):
        """
        :param regressor: an instance of scikit-learn regressor
        """
        self.regressor = regressor
        # Set by `_train`; initialised here so `rank` can detect an untrained model.
        self.model = None

    def _train(self, X, y):
        """
        Trains an LTR model.
        :param X: features of training instances
        :param y: relevance assessments of training instances
        :return:
        """
        assert self.regressor is not None
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """
        Predicts relevance labels and ranks documents for a given query.
        :param ft: a list of features for query-doc pairs
        :param doc_ids: a list of document ids, parallel to `ft`
        :return: list of (doc_id, predicted label) tuples, highest prediction first
        """
        assert self.model is not None, "model has not been trained"
        rel_labels = self.model.predict(ft)
        sort_indices = np.argsort(rel_labels)[::-1]  # descending by predicted label
        return [(doc_ids[i], rel_labels[i]) for i in sort_indices]

## Feature computation

The total number of candidate documents to retrieve from Elasticsearch. Should not be set higher than 200 (otherwise things get unreasonably slow).

In [905]:
# Size of the candidate set requested per search; some searches in `get_features` request 10x this.
NUM_DOCS = 200

Total number of features (features will be indexed 1..NUM_FEAT)

In [906]:
# Feature ids 1..NUM_FEAT are filled with a default value when missing from a feature vector.
NUM_FEAT = 9

#### Computing the feature vectors for a given query

  - We retrieve top `NUM_DOCS` documents for each field (title, content, anchors).
  - We ignore those docs that don't have a score in the content field. This also serves as a simple and pragmatic way of filtering out docs that are not in ClueWeb Category B. 
  
**IMPORTANT NOTE** If you compute the BM25 title and anchor scores this way, you will end up with a lot of missing features, which will likely hurt your performance. For IR, there should be no missing features, as matching the query against a field always produces a retrieval score (which might be 0). Therefore, instead of what is done below for Features #2 and #3, you should get the top 200 docs using BM25 content, then compute the BM25 title and anchors based on the termvectors.  Similarly, for the additional features (LM, TFIDF, etc.), compute the retrieval scores for the content, title, and anchors fields yourself based on the termvectors.
  
This function is used both when training and when applying the model. When training, the target relevance labels will need to be assigned to each document. That is done in `get_training_data()`.

In [907]:
def minmax_norm(features, fid):
    """Min-max normalizes one feature in place, across the documents of a single query.

    Documents where the feature is missing or falsy (None or 0) are left
    untouched and do not take part in the min/max computation. If all
    remaining values are equal, they are set to 0.

    :param features: dict {doc_id: {feature id: value}}; modified in place
    :param fid: id of the feature to normalize
    """
    # this is to be done for each query separately
    present = [docid for docid in features if features[docid].get(fid)]
    if not present:
        return

    # Use the real extremes rather than fixed sentinels, so values of any magnitude work.
    values = [features[docid][fid] for docid in present]
    min_x = min(values)
    span = max(values) - min_x

    for docid in present:
        if span == 0:  # all values identical: nothing to scale
            features[docid][fid] = 0
        else:
            features[docid][fid] = (features[docid][fid] - min_x) / span

In [908]:
def _add_bm25_feature(feats, query, field, fid, size):
    """Store the search score of `field` as feature `fid` for docs already in `feats`."""
    res = search(get_index_name(field), query, field, size=size)
    for doc in res.get("hits", {}).get("hits", {}):
        doc_id = doc.get("_id")
        if doc_id in feats:
            feats[doc_id][fid] = doc.get("_score")


def _add_lm_feature(feats, clm, qterms, query, field, fid, top_n=20):
    """Store the LM score of `field` as feature `fid` for the first `top_n` retrieved candidates.

    NOTE(review): "first" means retrieval order of the field search, not LM score
    order; documents beyond `top_n` get no value for this feature. Kept as in the
    original implementation - confirm this is the intended selection.
    """
    res = search(get_index_name(field), query, field, size=NUM_DOCS * 10)
    selected = []
    for doc in res.get("hits", {}).get("hits", {}):
        doc_id = doc.get("_id")
        if doc_id in feats and doc_id not in selected:
            selected.append(doc_id)
            if len(selected) == top_n:
                break
    # Only the selected documents are scored; scores of the others would be discarded anyway.
    for doc_id in selected:
        feats[doc_id][fid] = lm(clm, qterms, doc_id, field)


def get_features(qid, query):
    """Compute the feature vectors of the candidate documents for one query.

    Candidates are the documents returned by the content-field search; documents
    found only through other fields are ignored. Features 1-6 are min-max
    normalized per query.

    Feature ids: 1-3 search score (content, title, anchors), 4-6 LM score
    (content, title, anchors), 7 title length, 8 content length, 9 query length.

    :param qid: query id (used for progress output only)
    :param query: raw query text
    :return: dict {doc_id: {feature id: value}}; features may be missing for a document
    """
    feats = {}
    print("Getting features for query #{} '{}'".format(qid, query))

    # Analyze query (needed for the LM features)
    qterms = analyze_query(BASIC_INDEX_NAME, query)

    # Feature 1: BM25 content score; this search also defines the candidate set
    res1 = search(get_index_name("content"), query, "content", size=NUM_DOCS)
    print("\tElasticsearch content field ...")
    for doc in res1.get("hits", {}).get("hits", {}):
        feats[doc.get("_id")] = {1: doc.get("_score")}

    # Feature 2: BM25 title score
    print("\tElasticsearch title field ...")
    _add_bm25_feature(feats, query, "title", 2, NUM_DOCS)

    # Feature 3: BM25 anchors score
    # NOTE: we retrieve more candidate documents here
    print("\tElasticsearch anchors field ...")
    _add_bm25_feature(feats, query, "anchors", 3, NUM_DOCS * 10)

    # Features 4-6: LM scores; the collection model depends only on the query terms,
    # so it is built once and shared by all three fields.
    clm = CollectionLM(qterms)
    print("\tLM content field ...")
    _add_lm_feature(feats, clm, qterms, query, "content", 4)
    print("\tLM title field ...")
    _add_lm_feature(feats, clm, qterms, query, "title", 5)
    print("\tLM anchors field ...")
    _add_lm_feature(feats, clm, qterms, query, "anchors", 6)

    # Feature 7 & 8: document features (title length and content length, in terms)
    print("\tDocument features ...")
    for doc_id in feats:
        for field, fid in (("content", 8), ("title", 7)):
            try:
                tv = term_vectors(get_index_name(field), doc_id)["term_vectors"]
                if field in tv:
                    feats[doc_id][fid] = sum(s["term_freq"] for s in tv[field]["terms"].values())
            except KeyError:
                # Response lacks the expected keys: leave the feature missing.
                pass

    # Feature 9: query feature (number of space-separated tokens in the raw query)
    print("\tQuery features ...")
    ql = len(query.split(' '))
    for doc_id in feats:
        feats[doc_id][9] = ql

    # Normalize the retrieval scores (features 1-6) per query
    for fid in range(1, 7):
        minmax_norm(feats, fid)

    return feats


# CHANGE NUM_FEATS AND ROOT TREE

## Main

### Training model

Queries and qrels for training

In [909]:
# Training inputs (queries and relevance judgments) and the file the training feature vectors are written to.
QUERY_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"
FEATURES_FILE = "data/features_kaggle.txt"

In [910]:
# queries: {qid: query text}; qrels: {qid: {doc_id: relevance label as string}}
queries = load_queries(QUERY_FILE)
qrels = load_qrels(QRELS_FILE)

Create the complete training data set (feature vectors and corresponding labels) and write it to a file

In [911]:
def get_training_data(queries, qrels, output_file):
    """Compute feature vectors for all training queries and write the labelled ones to a file.

    One line is written per judged candidate document, in the form
    "<rel> <qid> <doc_id> <fid>:<value> ...", with features sorted by id.
    Queries without any relevance judgments are skipped.

    :param queries: dict {qid: query text}
    :param qrels: dict {qid: {doc_id: relevance label}}
    :param output_file: path of the features file to (over)write
    """
    with open(output_file, "w") as fout:
        for qid, query in sorted(queries.items()):
            judged = qrels.get(qid, {})
            if not judged:  # no labels for this query: nothing could be written
                continue
            # get feature vectors
            feats = get_features(qid, query)
            # assign target labels and write to file
            for doc_id, feat in feats.items():
                if doc_id not in judged:  # we only consider docs where we have the target label
                    continue
                rel = judged[doc_id]
                # NOTE: there shouldn't be "missing" features
                for fid in range(1, NUM_FEAT + 1):
                    feat.setdefault(fid, 0)  # default value for "missing" features
                # write to file
                feat_str = ['{}:{}'.format(k, v) for k, v in sorted(feat.items())]
                fout.write(" ".join([str(rel), qid, doc_id] + feat_str) + "\n")

In [912]:
# Computes features for every training query and (over)writes FEATURES_FILE.
get_training_data(queries, qrels, FEATURES_FILE)

Getting features for query #201 'raspberry pi'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #202 'uss carl vinson'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #203 'reviews of les miserables'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #204 'rules of golf'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...

	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #233 'hair dye'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #234 'dark chocolate health benefits'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #235 'ham radio'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #236 'symptoms of mad cow disease in humans'
	Elasticsearch content field ...
	Elasticsearch title fie

Load training data from file

In [913]:
# Parallel lists: feature vectors, integer labels, query ids and document ids, one entry per file line.
train_X, train_y, qids, doc_ids = load_features(FEATURES_FILE)

#### Train model

Set `max_depth` roughly to the square root of the number of features

In [914]:
# Tree depth is derived from the feature count (roughly its square root) so it
# stays in sync when NUM_FEAT changes; with NUM_FEAT = 9 this is 3.
clf = RandomForestRegressor(max_depth=max(1, round(math.sqrt(NUM_FEAT))), random_state=0)
ltr = PointWiseLTRModel(clf)
ltr._train(train_X, train_y)

### Applying model on unseen queries

In [915]:
# Unseen queries to rank, and the file the rankings are written to.
QUERY2_FILE = "data/queries2.txt"
OUTPUT_FILE = "data/ltr_kaggle.txt"
TOP_DOCS = 20  # this many top docs to write to output file

In [916]:
# {qid: query text} for the queries the trained model is applied to.
queries2 = load_queries(QUERY2_FILE)

Apply model and write results to output file

In [917]:
# Output format: "trec" writes tab-separated TREC run lines; any other value
# writes "QueryId,DocumentId" CSV lines.
# output_format = "trec"
output_format = "not_trec"

with open(OUTPUT_FILE, "w") as fout:
    if output_format != "trec":  # the CSV header does not belong in a TREC run file
        fout.write("QueryId,DocumentId\n")
    for qid, query in sorted(queries2.items()):
        # Get feature vectors
        feats = get_features(qid, query)
        if not feats:  # no candidates retrieved: nothing to rank for this query
            continue

        # Convert into the format required by the `PointWiseLTRModel` class
        # and deal with missing feature values
        doc_fts = []
        doc_ids = []

        for doc_id, feat in feats.items():
            for fid in range(1, NUM_FEAT + 1):
                # Same default as used when writing the training data (0), so the
                # model sees missing features encoded the way it was trained on.
                feat.setdefault(fid, 0)
            doc_fts.append(np.array([float(val) for fid, val in sorted(feat.items())]))
            doc_ids.append(doc_id)

        # Get ranking
        r = ltr.rank(doc_fts, doc_ids)
        # Write the top results to file
        for rank, (doc_id, score) in enumerate(r[:TOP_DOCS], start=1):
            if output_format == "trec":
                fout.write(("\t".join(["{}"] * 6) + "\n").format(qid, "Q0", doc_id, str(rank),
                                                             str(score), "A3_3_Baseline"))
            else:
                fout.write(qid + "," + doc_id + "\n")

Getting features for query #251 'identifying spider bites'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #252 'history of orcas island'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #253 'tooth abscess'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #254 'barrett's esophagus'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Documen

	Query features ...
Getting features for query #283 'hayrides in pa'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #284 'where to find morel mushrooms'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #285 'magnesium rich foods'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
	LM anchors field ...
	Document features ...
	Query features ...
Getting features for query #286 'common schizophrenia drugs'
	Elasticsearch content field ...
	Elasticsearch title field ...
	Elasticsearch anchors field ...
	LM content field ...
	LM title field ...
