
# Assignment 3 - Part 3

In [1]:
import json
import math
import os
import urllib
import urllib.parse

import numpy as np
import requests
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Base URL that every API helper in this notebook builds its request URLs from.
API = "http://gustav1.ux.uis.no:5002"

# Index names; `get_index_name` picks between them based on the field.
BASIC_INDEX_NAME = "clueweb12b"
ANCHORS_INDEX_NAME = "clueweb12b_anchors"

# Feature id -> [document field, retrieval model]. Ids are 1-based and run
# field by field, with BM25 before LM for each field.
FIELD_MOD = dict(enumerate(
    ([field, model]
     for field in ("title", "content", "anchors")
     for model in ("BM25", "LM")),
    start=1,
))

# Locations of the on-disk caches used by `search` and `termvectors`.
CACHE_DIR = "cache"
CACHE_DIR_SEARCH = "/".join([CACHE_DIR, "search"])
CACHE_DIR_TERMVECTORS = "/".join([CACHE_DIR, "termvectors"])

# Weight of the collection model in the Jelinek-Mercer smoothing done by `lm`.
LAMBDA = 0.1

## Utility functions

Loading queries

In [3]:
def load_queries(query_file):
    """Read a query file into a ``{query id: query text}`` dictionary.

    Every non-empty line is expected to hold a query id, a single space and
    then the query text. Blank lines (for instance an empty line at the end
    of the file) are skipped instead of crashing the tuple unpacking.

    :param query_file: path of the query file
    :return: dict mapping query id (str) to query text (str); when an id
        occurs more than once, the last occurrence wins
    :raises ValueError: if a non-empty line contains no space separator
    """
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin:  # iterate lazily; no need to materialise all lines
            line = line.strip()
            if not line:
                continue
            qid, query = line.split(" ", 1)
            queries[qid] = query
    return queries

Loading features file

In [4]:
def load_features(features_file):
    """Load training instances from a whitespace-separated feature file.

    Each non-empty line has the form
    ``<label> <qid> <doc_id> <fid>:<value> <fid>:<value> ...``.
    Only the part after the colon of each feature token is used; the values
    are kept in the order in which they appear on the line. Blank lines are
    skipped.

    :param features_file: path of the feature file
    :return: tuple ``(X, y, qids, doc_ids)`` of parallel lists, where ``X``
        holds one numpy array of feature values per instance, ``y`` the
        integer labels, and ``qids`` / ``doc_ids`` the ids as strings
    """
    X, y, qids, doc_ids = [], [], [], []
    with open(features_file, "r") as f:
        for line in f:
            items = line.strip().split()
            if not items:  # blank line
                continue
            # Parse the whole line before appending so the four lists cannot
            # get out of step if a line is malformed.
            label = int(items[0])
            qid = items[1]
            doc_id = items[2]
            values = np.array([float(token.split(":")[1]) for token in items[3:]])
            X.append(values)
            y.append(label)
            qids.append(qid)
            doc_ids.append(doc_id)

    return X, y, qids, doc_ids

API:

In [5]:
def _cached_get_json(cache_file, url):
    """Return the decoded JSON response of a GET on `url`, using a file cache.

    A cache entry stores the raw response text as a JSON string (this is the
    format the notebook has always written), so reading it back takes two
    decoding steps.

    The response is fetched and decoded *before* the cache file is opened for
    writing, so a failed request or a non-JSON response no longer leaves an
    empty cache entry behind. Unreadable entries are fetched again, and the
    cache directory is created on demand.

    :param cache_file: path of the cache entry for this request
    :param url: full request URL
    :return: the decoded JSON response
    """
    if os.path.exists(cache_file):  # return from cache
        try:
            with open(cache_file) as infile:
                return json.loads(json.load(infile))
        except (ValueError, TypeError):
            # Empty, truncated or otherwise undecodable entry: fall through
            # and replace it with a fresh response.
            pass

    response = requests.get(url).text
    decoded = json.loads(response)  # raises before anything is cached

    cache_dir = os.path.dirname(cache_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_file, "w") as outfile:
        json.dump(response, outfile)
    return decoded


def search(indexname, query, field, size=10):
    """Run `query` against `field` of index `indexname` (cached).

    :param indexname: name of the index to search
    :param query: query string, sent as the ``q`` parameter
    :param field: default field, sent as the ``df`` parameter
    :param size: number of hits to request
    :return: the decoded JSON response of the ``_search`` endpoint
    """
    cache_file = CACHE_DIR_SEARCH + "/" + indexname + "_" + query + "_" + field + "_" + str(size)
    url = "/".join([API, indexname, "_search"]) + "?" \
          + urllib.parse.urlencode({"q": query, "df": field, "size": size})
    return _cached_get_json(cache_file, url)


def termvectors(indexname, docid, term_statistics="true"):
    """Fetch the term vectors of document `docid` in index `indexname` (cached).

    :param indexname: name of the index holding the document
    :param docid: document id
    :param term_statistics: ``"true"`` or ``"false"`` (as a string); sent as
        the ``term_statistics`` parameter and used in the cache file name
    :return: the decoded JSON response of the ``_termvectors`` endpoint
    """
    cache_file = CACHE_DIR_TERMVECTORS + "/" + indexname + "_" + docid + "_" + term_statistics
    url = "/".join([API, indexname, docid, "_termvectors"]) + "?" \
          + urllib.parse.urlencode({"term_statistics": term_statistics})
    return _cached_get_json(cache_file, url)

def analyze_query(indexname, query):
    """Analyze `query` with the ``_analyze`` endpoint of index `indexname`.

    :param indexname: name of the index whose analyzer is used
    :param query: raw query text, sent as the ``text`` parameter
    :return: list of token strings, ordered by their ``position`` in the query
    """
    endpoint = "/".join([API, indexname, "_analyze"])
    params = urllib.parse.urlencode({"text": query})
    payload = json.loads(requests.get(endpoint + "?" + params).text)
    in_order = sorted(payload["tokens"], key=lambda tok: tok["position"])
    return [tok["token"] for tok in in_order]

def exists(indexname, docid):
    """Query the ``_exists`` endpoint for document `docid` in index `indexname`.

    :param indexname: name of the index to look in
    :param docid: document id
    :return: the decoded JSON response (callers read its ``"exists"`` entry)
    """
    endpoint = "/".join([API, indexname, docid, "_exists"])
    return json.loads(requests.get(endpoint).text)

def get_index_name(field):
    """Return the name of the index that holds `field`.

    :param field: document field name
    :return: the anchors index for the ``"anchors"`` field, the basic index
        for every other field
    """
    if field == "anchors":
        return ANCHORS_INDEX_NAME
    return BASIC_INDEX_NAME

LM:

In [6]:
class CollectionLM(object):
    """Collection language model: P(t|C) per field for a fixed set of query terms."""

    def __init__(self, qterms):
        """
        Computes P(t|C_i) for each field and for each query term.

        :param qterms: list of (analyzed) query terms
        """
        self._probs = {}
        # FIELD_MOD lists every field once per retrieval model; the collection
        # statistics depend on the field only, so compute each field once.
        for field, _model in FIELD_MOD.values():
            if field in self._probs:
                continue
            self._probs[field] = {t: self.__get_prob(field, t) for t in qterms}

    def __get_prob(self, field, term):
        """
        Estimates P(term|C) for `field` as ttf / sum_ttf.

        The global statistics are read from the term vector of one document
        that matches the term. Both the search and the term vector request go
        to the index that holds `field`, so the anchors field is no longer
        zeroed out when its top hit is missing from the basic index.

        :return: the probability, or 0 when no document matches the term or
            the statistics are unavailable
        """
        index_name = get_index_name(field)
        # use a query on the term to find a document that contains it
        hits = search(index_name, term, field, size=1).get("hits", {}).get("hits", [])
        if not hits:
            return 0  # none of the documents contain that term

        # global term statistics are requested by default (term_statistics="true")
        tv = termvectors(index_name, hits[0]["_id"])
        field_tv = tv.get("term_vectors", {}).get(field)
        if not field_tv:
            return 0  # document not found, or it has no term vector for this field

        sum_ttf = field_tv.get("field_statistics", {}).get("sum_ttf", 0)
        if sum_ttf <= 0:
            # NOTE(review): presumably a non-positive sum_ttf means the
            # statistic is unavailable - confirm against the API in use.
            return 0
        # total term count in the collection (in that field)
        ttf = field_tv.get("terms", {}).get(term, {}).get("ttf", 0)
        return ttf / sum_ttf

    def prob(self, field, term):
        """
        Looks up the precomputed P(term|C) for `field`.

        :return: the stored probability, or 0 for an unknown field or term
        """
        return self._probs.get(field, {}).get(term, 0)

def lm(clm, qterms, doc_id, field):
    """Score a document field against a query with a smoothed language model.

    Computes log P(q|d) = sum over query terms of
    log((1 - LAMBDA) * P(t|d) + LAMBDA * P(t|C)), i.e. Jelinek-Mercer
    smoothing. Terms whose smoothed probability is 0 contribute nothing.

    A document without term vectors, without the requested field, or with an
    empty field is treated as having P(t|d) = 0 for every term, instead of
    raising ``KeyError`` or ``ZeroDivisionError``.

    :param clm: collection language model offering ``prob(field, term)``
    :param qterms: list of (analyzed) query terms
    :param doc_id: id of the document to score
    :param field: document field to score
    :return: the log-probability score (0 if no term contributes)
    """
    score = 0  # log P(q|d)

    # Term frequencies of the document field; global statistics are not needed.
    index_name = get_index_name(field)
    tv = termvectors(index_name, doc_id, term_statistics="false").get("term_vectors", {})
    terms = tv.get(field, {}).get("terms", {})

    # field length |d|
    len_d = sum(stats["term_freq"] for stats in terms.values())

    # scoring the query
    for t in qterms:
        if len_d > 0:
            Pt_d = terms.get(t, {}).get("term_freq", 0) / len_d  # P(t|d)
        else:  # that document field is empty or missing
            Pt_d = 0
        Pt_C = clm.prob(field, t)  # P(t|C)
        Pt_theta_d = (1 - LAMBDA) * Pt_d + LAMBDA * Pt_C  # P(t|theta_d) with J-M smoothing
        # Pt_theta_d is 0 if t doesn't occur in any doc for that field, even with smoothing
        if Pt_theta_d > 0:
            score += math.log(Pt_theta_d)

    return score

## Pointwise LTR class

In [7]:
class PointWiseLTRModel(object):
    """Pointwise learning-to-rank model wrapped around a regressor."""

    def __init__(self, regressor):
        """
        :param regressor: an instance of a scikit-learn style regressor
            (``fit(X, y)`` returning the fitted model, and ``predict(X)``)
        """
        self.regressor = regressor
        # Set by `_train`. Initialised here so that `rank` fails its own
        # assertion, not an AttributeError, when called before training.
        self.model = None

    def _train(self, X, y):
        """
        Trains an LTR model.
        :param X: features of training instances
        :param y: relevance assessments of training instances
        :return:
        """
        assert self.regressor is not None
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """
        Predicts relevance labels and ranks documents for a given query.
        :param ft: a list of feature vectors, one per query-doc pair
        :param doc_ids: a list of document ids, parallel to `ft`
        :return: list of (doc_id, predicted label) tuples, ordered by
            decreasing predicted label
        """
        assert self.model is not None
        rel_labels = self.model.predict(ft)
        sort_indices = np.argsort(rel_labels)[::-1]

        return [(doc_ids[i], rel_labels[i]) for i in sort_indices]

## Main

In [8]:
def _retrieve_candidates(field, query, size):
    """Retrieve up to `size` hits for `query` on `field` as (doc id, retrieval score) pairs.

    Hits for the anchors field are kept only if the document also exists in
    the basic index.
    """
    hits = search(get_index_name(field), query, field, size=size)["hits"]["hits"]
    if field == "anchors":
        hits = [hit for hit in hits
                if exists(BASIC_INDEX_NAME, hit["_id"])["exists"] == True]
    return [(hit["_id"], hit["_score"]) for hit in hits]


def get_features(queries):
    """Compute the feature values of the candidate documents of every query.

    For every feature id in ``FIELD_MOD`` (field + retrieval model):

    * BM25: the retrieval scores of the top 20 hits of the query on that field.
    * LM: the top 100 hits are re-scored with `lm` and the 20 documents with
      the highest LM score are kept.

    For the anchors field, only documents that also exist in the basic index
    are used. In addition, features 7 and 8 hold the length (sum of term
    frequencies) of the title and content field of every candidate document,
    when that field has a term vector. Query-level features (query length,
    mean title-BM25 score) are intentionally not computed.

    :param queries: dict mapping query id to query text
    :return: ``{qid: {doc_id: {feature_id: value}}}``; a feature id is absent
        for documents that were not retrieved for that feature
    """
    top_docs = 20        # documents kept per query and feature
    lm_pool_size = 100   # candidates retrieved before LM re-ranking
    length_feature_ids = {"title": 7, "content": 8}

    features = {}
    # qid -> (analyzed query terms, collection LM). Neither depends on the
    # field being scored, so they are built once per query, not per feature.
    lm_inputs = {}

    for fid in range(1, len(FIELD_MOD) + 1):
        field, model = FIELD_MOD[fid]
        print("\nComputing values for feature '%s & %s' . . ." % (field, model))

        for qid, query in queries.items():
            query_features = features.setdefault(qid, {})
            print("Query '{0}'".format(query))

            if model == "BM25":
                ranked = _retrieve_candidates(field, query, top_docs)
            else:
                # LM: re-score a larger candidate pool, then keep the best.
                ranked = []
                pool = _retrieve_candidates(field, query, lm_pool_size)
                if pool:
                    if qid not in lm_inputs:
                        qterms = analyze_query(BASIC_INDEX_NAME, query)
                        lm_inputs[qid] = (qterms, CollectionLM(qterms))
                    qterms, clm = lm_inputs[qid]
                    rescored = [(doc_id, lm(clm, qterms, doc_id, field))
                                for doc_id, _ in pool]
                    # Highest LM score first; ties keep the retrieval order.
                    rescored.sort(key=lambda pair: pair[1], reverse=True)
                    ranked = rescored[:top_docs]

            for doc_id, score in ranked:
                query_features.setdefault(doc_id, {})[fid] = score

    # additional document features
    print("Computing values for additional document features . . .")
    for qid in queries:
        for doc_id, doc_features in features.get(qid, {}).items():
            for field, length_fid in length_feature_ids.items():
                response = termvectors(get_index_name(field), doc_id, term_statistics="false")
                tv = response.get("term_vectors")
                if not tv or field not in tv:
                    continue  # no term vector for this document field
                try:
                    doc_features[length_fid] = sum(
                        stats["term_freq"] for stats in tv[field]["terms"].values())
                except KeyError:
                    # Best effort, as before: leave the length feature unset
                    # when the term vector lacks the expected entries.
                    continue

    return features

Load training data from file

In [9]:
# Path of the training feature file that is parsed by `load_features` below.
FEATURES_FILE = "data/features_qd_d.txt"  # CHANGE TO WANTED FILE AND CHANGE get_features() IF NEEDED

# Training instances as parallel lists: feature vectors, relevance labels,
# and the query id / document id of each instance.
train_X, train_y, qids, doc_ids = load_features(FEATURES_FILE)

def ftread(fpath):
    """Read a feature file into ``{doc id: {feature id: value}}``.

    Each non-empty line has the form
    ``<label> <qid> <doc_id> <fid>:<value> <fid>:<value> ...``; the label and
    the query id are ignored. Lines are split on any whitespace, so blank
    lines, repeated spaces and trailing spaces no longer crash the parser.

    :param fpath: path of the feature file
    :return: dict mapping document id to a dict of feature id (float) to
        feature value (float); when a document id occurs on several lines,
        the last line wins
    """
    features = {}
    with open(fpath, 'r') as ft:
        for line in ft:
            linedata = line.split()
            if not linedata:  # blank line
                continue
            docid = linedata[2]
            features[docid] = {}
            for feat in linedata[3:]:
                ftdata = feat.split(':')
                features[docid][float(ftdata[0])] = float(ftdata[1])
    return features

# Feature dicts of FEATURES_FILE keyed by document id (see `ftread`).
features = ftread(FEATURES_FILE)

#### Train model

Set `max_depth` roughly to the square root of the number of features

In [10]:
# Random forest regressor with shallow trees and a fixed seed.
clf = RandomForestRegressor(max_depth=3, random_state=0)
ltr = PointWiseLTRModel(clf)
# Fit the pointwise model on the training instances loaded from FEATURES_FILE.
ltr._train(train_X, train_y)

### Applying model on unseen queries

In [11]:
# Queries to rank and the file the rankings are written to.
QUERY2_FILE = "data/queries2.txt"
OUTPUT_FILE = "data/ltr_kaggle.txt"
TOP_DOCS = 20  # this many top docs to write to output file
# Feature ids 1..NUM_FEAT make up each feature vector passed to the model.
NUM_FEAT = 8   # change this if needed

In [12]:
# Unseen queries as a {query id: query text} dict.
queries2 = load_queries(QUERY2_FILE)

Apply model and write results to output file

In [13]:
# Output format: "trec" writes tab-separated TREC run lines; any other value
# writes "QueryId,DocumentId" CSV rows preceded by a header line.
# output_format = "trec"
output_format = "whydoweneedthis"
# Get feature vectors
feats = get_features(queries2)

with open(OUTPUT_FILE, "w") as fout:
    if output_format != "trec":
        fout.write("QueryId,DocumentId\n")  # header belongs to the CSV format only
    for qid in sorted(queries2):
        # Rank the candidates retrieved for THIS query (not the documents of
        # the training feature file).
        query_feats = feats.get(qid, {})
        if not query_feats:
            continue  # nothing was retrieved for this query

        # Convert into the format required by the `PointWiseLTRModel` class.
        # Every vector holds feature ids 1..NUM_FEAT in order; a missing
        # feature value is encoded as -1. Building both lists from the same
        # sequence keeps vectors and ids aligned.
        doc_ids = list(query_feats)
        doc_fts = [
            np.array([float(query_feats[doc_id].get(fid, -1))
                      for fid in range(1, NUM_FEAT + 1)])
            for doc_id in doc_ids
        ]

        # Get ranking
        r = ltr.rank(doc_fts, doc_ids)
        # Write the top results to file
        for rank, (doc_id, score) in enumerate(r[:TOP_DOCS], start=1):
            if output_format == "trec":
                fout.write(("\t".join(["{}"] * 6) + "\n").format(qid, "Q0", doc_id, str(rank),
                                                             str(score), "A3_3_Baseline"))
            else:
                fout.write(qid + "," + doc_id + "\n")


Computing values for feature 'title & BM25' . . .
Query 'identifying spider bites'
Query 'history of orcas island'
Query 'tooth abscess'
Query 'barrett's esophagus'
Query 'teddy bears'
Query 'patron saint of mental illness'
Query 'holes by louis sachar'
Query 'hip roof'
Query 'carpenter bee'
Query 'the american revolutionary'
Query 'folk remedies sore throat'
Query 'balding cure'
Query 'evidence for evolution'
Query 'tribe formerly living in alabama'
Query 'F5 tornado'
Query 'symptoms of heart attack'
Query 'feliz navidad lyrics'
Query 'benefits of running'
Query 'marshall county schools'
Query 'sun tzu'
Query 'halloween activities for middle school'
Query 'dreams interpretation'
Query 'wilson's disease'
Query 'golf instruction'
Query 'uss cole'
Query 'how has african american music influence history'
Query 'bewitched cast'
Query 'mister rogers'
Query 'game theory'
Query 'view my internet history'
Query 'ketogenic diet'
Query 'nasa interplanetary missions'
Query 'hayrides in pa'
Query

Query 'the american revolutionary'
Query 'folk remedies sore throat'
Query 'balding cure'
Query 'evidence for evolution'
Query 'tribe formerly living in alabama'
Query 'F5 tornado'
Query 'symptoms of heart attack'
Query 'feliz navidad lyrics'
Query 'benefits of running'
Query 'marshall county schools'
Query 'sun tzu'
Query 'halloween activities for middle school'
Query 'dreams interpretation'
Query 'wilson's disease'
Query 'golf instruction'
Query 'uss cole'
Query 'how has african american music influence history'
Query 'bewitched cast'
Query 'mister rogers'
Query 'game theory'
Query 'view my internet history'
Query 'ketogenic diet'
Query 'nasa interplanetary missions'
Query 'hayrides in pa'
Query 'where to find morel mushrooms'
Query 'magnesium rich foods'
Query 'common schizophrenia drugs'
Query 'carotid cavernous fistula treatment'
Query 'fidel castro'
Query 'benefits of yoga'
Query 'norway spruce'
Query 'sangre de cristo mountains'
Query 'history of the electronic medical record'
Q