# Assignment 2A, Part 3: Multifield retrieval

Implement BM25F and the Mixture of Language Models (MLM). Use two fields: title and content.

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import pickle
import math
import itertools

from collections import Counter
from tqdm import tqdm
import numpy as np

from hashedindex import textparser
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

N_GRAMS = 1

## Load index

In [4]:
def load_file(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

In [5]:
index = load_file('data/basic_index_new_idf.dat')

In [6]:
len(index['content_doc_length'])

1030547

### Load the queries from the file

See the assignment description for the format of the query file [here](https://github.com/kbalog/uis-dat640-fall2019/tree/master/assignments/assignment-2a#queries).

In [7]:
QUERY_FILE = "data/queries.txt"  # make sure the query file exists on this location
BM25_OUTPUT_FILE = "data/bm25_multifield.txt"  # output the ranking
MLM_OUTPUT_FILE = "data/lm_multifield.txt"  # output the ranking

In [8]:
def load_queries(query_file):
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin.readlines():
            qid, query = line.strip().split(" ", 1)
            queries[qid] = query
    return queries

In [9]:
queries = load_queries(QUERY_FILE)

## Retrieval models

In [10]:
class BM25F():
    
    def __init__(self, index, w_title = 0.2, k1 = 1.2, b_title = 0.75, b_content = 0.75):
        self.k1 = k1
        self.b_title = b_title
        self.b_content = b_content
        self.w_title = w_title
        self.w_content = 1-w_title
        
        self.content_index = index['content_index']
        self.title_index = index['title_index']
        
        # combine different idf ?
        self.content_idf = index['content_idf']
        self.title_idf = index['title_idf']
        
        self.content_doc_length = index['content_doc_length']
        self.title_doc_length = index['title_doc_length']
        self.average_content_length = index['average_content_length']
        self.average_title_length = index['average_title_length']
        

    
    def rank_query(self, query):
        
        # tokenize with nltk tokenizer because it works well
        query = ' '.join([word for word in word_tokenize(query)])
        
        ranking = Counter()
        
        # tokenize a second time with hashedindex tokenizer to have tuple tokens
        for token in textparser.word_tokenize(query, stopwords=stopwords, ngrams=N_GRAMS):
            if token in self.content_index or token in self.title_index:
                
                documents = self.content_index.get_documents(token)
                titles = self.title_index.get_documents(token)
                #print('"{}" appears in {} documents'.format(token, len(documents)))   
                
                ranking += Counter(self.rank_term(token, documents, titles))
            else:
                print(token, 'is not in the index')   
        
        return ranking.most_common(100)
    
        
    def rank_term(self, term, documents, titles):
        # here we use a dictionnary because adding counters is a slow operation compared to dictionnary access
        pseudo_content = {}
        pseudo_title = {}
        
        for doc in documents:
            content_tf = self.content_index.get_term_frequency(term, doc) # not normalized ! 
            content_smoothing = 1 - self.b_content + self.b_content * (self.content_doc_length[doc]/self.average_content_length)         
            content_score = (content_tf*(1+self.k1)) / (content_tf + self.k1 * content_smoothing)
            pseudo_content[doc] = self.w_content * content_score
            
        for title in titles:
            title_tf = self.title_index.get_term_frequency(term, title) # not normalized !
            title_smoothing = 1 - self.b_title + self.b_title * (self.title_doc_length[title]/self.average_title_length)
            title_score = (title_tf*(1+self.k1)) / (title_tf + self.k1 * content_smoothing)
            pseudo_title[title] = self.w_title * title_score
        
        pseudo_all = self.merge_dicts(pseudo_content, pseudo_title)
            
        document_scores = {doc_id:score * (self.content_idf[term] + self.title_idf[term]) for doc_id, score in pseudo_all.items()}
            
        return document_scores
    
    def merge_dicts(self, dictA, dictB):
        for key, item in dictB.items():
            dictA[key] = dictA.get(key, 0) + item
        return dictA
        

In [11]:
class MLM():
    
    def __init__(self, index, smoothing='jelinek', w_title=0.5, lambda_param=0.1, mu_param=1000):
        
        if smoothing == 'jelinek':
            self.lambda_param = lambda_param
        elif smoothing == 'dirichlet':
            self.mu_param = mu_param
        else:
            raise ValueError('smoothing should in [jelinek, dirichlet]')
            
        self.smoothing = smoothing
        self.w_title = w_title
        self.w_content = 1-w_title
        
        self.content_index = index['content_index']
        self.title_index = index['title_index']
        
        self.content_doc_length = index['content_doc_length']
        self.title_doc_length = index['title_doc_length']
        
        self.content_sum_tf = index['content_sum_tf']
        self.title_sum_tf = index['title_sum_tf']
        
        self.content_sum_length = index['content_sum_length']
        self.title_sum_length = index['title_sum_length']
        
        self.content_collection_probability = index['content_collection_probability']
        self.title_collection_probability = index['title_collection_probability']
        
    
    def rank_query(self, query):
        query = ' '.join([word for word in word_tokenize(query)])
        
        # we can't use counter objects to track scores because it does not support negative addition
        ranking = {}
        
        for token in textparser.word_tokenize(query, stopwords=stopwords, ngrams=N_GRAMS):
            if token in self.content_index or token in self.title_index:
                
                documents = self.content_index.get_documents(token)
                titles = self.title_index.get_documents(token)
                #print('"{}" appears in {} documents'.format(token, len(documents)))
                
                if self.smoothing == 'jelinek':
                    ranking = self.merge_rankings(ranking, self.rank_term_jelinek(token, documents, titles))
                elif self.smoothing == 'dirichlet':
                    ranking = self.merge_rankings(ranking, self.rank_term_dirichlet(token, documents, titles))
                
            else:
                print(token, 'is not in the index')
                
        return sorted(ranking.items(), key=lambda x: x[1])[:100]
        
    def rank_term_jelinek(self, term, documents, titles):
        title_scores = {}
        document_scores = {}
        
        for doc in documents:
            ptd = self.content_index.get_term_frequency(term, doc, normalized=True)
            ptc = self.content_collection_probability[term]
            score = ((1 - self.lambda_param) * ptd) + (self.lambda_param * ptc)
            document_scores[doc] = self.w_content * math.log(score)
        
        for title in titles:
            ptd = self.title_index.get_term_frequency(term, title, normalized=True)
            ptc = self.title_collection_probability[term]
            score = ((1 - self.lambda_param) * ptd) + (self.lambda_param * ptc)
            title_scores[title] = self. w_title * math.log(score)

        return self.merge_rankings(title_scores, document_scores)
    
    def rank_term_dirichlet(self, term, documents, titles):
        title_scores = {}
        document_scores = {}
        
        for doc in documents:
            tf = self.content_index.get_term_frequency(term, doc, normalized=True)
            ptc = self.content_collection_probability[term]
            score = (tf + self.mu_param*ptc) / (self.content_doc_length[doc] + self.mu_param)
            document_scores[doc] = self.w_content * score
            
        for title in titles:
            tf = self.title_index.get_term_frequency(term, title, normalized=True)
            ptc = self.title_collection_probability[term]
            score = (tf + self.mu_param*ptc) / (self.title_doc_length[title] + self.mu_param)
            title_scores[title] = self.w_title * score
            
        for doc_id, score in title_scores:
            document_scores[doc_id] = math.log(document_scores.get(doc_id, 0) + score)
            
        return self.merge_rankings(title_scores, document_scores)

    def merge_rankings(self, base_ranking, to_add):
        for doc, score in to_add.items():
            base_ranking[doc] = base_ranking.get(doc, 0) + score
        return base_ranking

### Perform retrieval

**TODO** Generate a ranking for each query and output the results to `OUTPUT_FILE`

See the assignment description for the format of the output file [here](https://github.com/kbalog/uis-dat640-fall2019/tree/master/assignments/assignment-2a#output-file-format).

## BM25F

In [12]:
bm25 = BM25F(index, w_title=0.1, k1=1, b_content=0.29, b_title=0.2)

with open(BM25_OUTPUT_FILE, 'w') as f:
    f.write('QueryId,DocumentId\n')
    
    for q_id, query in queries.items():
        
        print("Ranking documents for [%s] '%s'" % (q_id, query))
        ranking = bm25.rank_query(query)
        f.writelines(['{},{}\n'.format(q_id, document[0], '\n') for document in ranking])
        f.flush()

Ranking documents for [303] 'Hubble Telescope Achievements'
Ranking documents for [307] 'New Hydroelectric Projects'
Ranking documents for [310] 'Radio Waves and Brain Cancer'
Ranking documents for [314] 'Marine Vegetation'
Ranking documents for [322] 'International Art Crime'
Ranking documents for [325] 'Cult Lifestyles'
Ranking documents for [330] 'Iran-Iraq Cooperation'
Ranking documents for [336] 'Black Bear Attacks'
Ranking documents for [341] 'Airport Security'
Ranking documents for [344] 'Abuses of E-Mail'
Ranking documents for [345] 'Overseas Tobacco Sales'
Ranking documents for [347] 'Wildlife Extinction'
Ranking documents for [353] 'Antarctica exploration'
Ranking documents for [354] 'journalist risks'
Ranking documents for [362] 'human smuggling'
Ranking documents for [363] 'transportation tunnel disasters'
Ranking documents for [367] 'piracy'
Ranking documents for [372] 'Native American casino'
Ranking documents for [374] 'Nobel prize winners'
Ranking documents for [375] 'h

In [42]:
def BM25F_parameters_gridsearch(w_start = 0.2, w_stop = 0.3, w_step = 0.1, k_start=1.0, k_stop=1.3, k_step=0.1, b_content_start=0.0, b_content_stop=1.1, b_title_start=0.0, b_title_stop=1.1, b_step=0.1):
    w_range = np.arange(w_start, w_stop, w_step)
    k_range = np.arange(k_start, k_stop, k_step)
    b_content_range = np.arange(b_content_start, b_content_stop, b_step)
    b_title_range = np.arange(b_title_start, b_title_stop, b_step)
    grid = itertools.product(w_range, k_range, b_content_range, b_title_range)
    
    for w, k, bc, bt in grid:
        print(w, k, bc, bt)
        model = BM25F(index, w_title=w, k1=k, b_content=bc, b_title=bt)
        with open('data/gridsearch_BM25F_w{:.3}_k{:.3}_bc{:.3}_bt{:.3}.txt'.format(w, k, bc, bt), 'w') as f:
            f.write('QueryId,DocumentId\n')

            for q_id, query in queries.items():
                ranking = model.rank_query(query)
                f.writelines(['{},{}\n'.format(q_id, document[0], '\n') for document in ranking])
                f.flush()

In [None]:
BM25F_parameters_gridsearch()

0.2 1.0 0.0 0.0


## MLM

In [13]:
mlm = MLM(index, smoothing='jelinek', w_title=0.2, lambda_param=1)

with open(MLM_OUTPUT_FILE, 'w') as f:
    f.write('QueryId,DocumentId\n')
    
    for q_id, query in queries.items():
        
        print("Ranking documents for [%s] '%s'" % (q_id, query))
        ranking = mlm.rank_query(query)
        f.writelines(['{},{}\n'.format(q_id, document[0], '\n') for document in ranking])
        f.flush()

Ranking documents for [303] 'Hubble Telescope Achievements'
Ranking documents for [307] 'New Hydroelectric Projects'
Ranking documents for [310] 'Radio Waves and Brain Cancer'
Ranking documents for [314] 'Marine Vegetation'
Ranking documents for [322] 'International Art Crime'
Ranking documents for [325] 'Cult Lifestyles'
Ranking documents for [330] 'Iran-Iraq Cooperation'
Ranking documents for [336] 'Black Bear Attacks'
Ranking documents for [341] 'Airport Security'
Ranking documents for [344] 'Abuses of E-Mail'
Ranking documents for [345] 'Overseas Tobacco Sales'
Ranking documents for [347] 'Wildlife Extinction'
Ranking documents for [353] 'Antarctica exploration'
Ranking documents for [354] 'journalist risks'
Ranking documents for [362] 'human smuggling'
Ranking documents for [363] 'transportation tunnel disasters'
Ranking documents for [367] 'piracy'
Ranking documents for [372] 'Native American casino'
Ranking documents for [374] 'Nobel prize winners'
Ranking documents for [375] 'h

In [16]:
def MLM_parameters_gridsearch(w_start = 0.0, w_stop = 1.1, w_step = 0.1, lambda_start = 0.0, lambda_stop = 1.1, lambda_step = 0.1, mu_start=1, mu_stop=10000, n_mu=8):
    smoothings = ['jelinek', 'dirichlet']
    lambda_range = np.arange(lambda_start, lambda_stop, lambda_step)
    w_range = np.arange(w_start, w_stop, w_step)
    mu_range = np.linspace(mu_start, mu_stop, n_mu)
    grid = [[smoothings[0], param] for param in lambda_range] + [[smoothings[1], param] for param in mu_range]
    grid = itertools.product(w_range, grid)
    
    for w, params in grid:
        print(w, params[0], params[1])
        model = MLM(index, w_title=w, smoothing=params[0], lambda_param=params[1], mu_param=params[1])
        with open('data/gridsearch_mlm_w{:.3}_s{:.3}_param{:.3}.txt'.format(w, params[0], params[1]), 'w') as f:
            f.write('QueryId,DocumentId\n')

            for q_id, query in queries.items():
                ranking = model.rank_query(query)
                f.writelines(['{},{}\n'.format(q_id, document[0], '\n') for document in ranking])
                f.flush()

In [17]:
MLM_parameters_gridsearch()

0.0 jelinek 0.0
0.0 jelinek 0.1
0.0 jelinek 0.2
0.0 jelinek 0.30000000000000004
0.0 jelinek 0.4
0.0 jelinek 0.5
0.0 jelinek 0.6000000000000001
0.0 jelinek 0.7000000000000001
0.0 jelinek 0.8
0.0 jelinek 0.9
0.0 jelinek 1.0
0.0 dirichlet 1.0
0.0 dirichlet 1429.4285714285713
0.0 dirichlet 2857.8571428571427
0.0 dirichlet 4286.285714285714
0.0 dirichlet 5714.714285714285
0.0 dirichlet 7143.142857142857
0.0 dirichlet 8571.571428571428
0.0 dirichlet 10000.0
0.1 jelinek 0.0
0.1 jelinek 0.1
0.1 jelinek 0.2
0.1 jelinek 0.30000000000000004
0.1 jelinek 0.4
0.1 jelinek 0.5
0.1 jelinek 0.6000000000000001
0.1 jelinek 0.7000000000000001
0.1 jelinek 0.8
0.1 jelinek 0.9
0.1 jelinek 1.0
0.1 dirichlet 1.0
0.1 dirichlet 1429.4285714285713
0.1 dirichlet 2857.8571428571427
0.1 dirichlet 4286.285714285714
0.1 dirichlet 5714.714285714285
0.1 dirichlet 7143.142857142857
0.1 dirichlet 8571.571428571428
0.1 dirichlet 10000.0
0.2 jelinek 0.0
0.2 jelinek 0.1
0.2 jelinek 0.2
0.2 jelinek 0.30000000000000004
0.2 jeli

## Evaluation

Report on the evaluation results (using the [Evaluation notebook](1_Evaluation.ipynb)) here.

Describe the parameter settings used for the two methods and the method you used for exploring the parameter space.


The models parameters and the field weights were decided using a grid search over the reasonable values, ie $weights \in [0, 1]$, $k1 \in [1, 2]$, $b \in [0, 1]$ for BM25F and $smoothing \in [jelinek, dirichlet]$, $ \lambda \in [1, 2]$, $ \mu \in [10, 10000]$ for MLM. Then, the values around the parameters yielding the best results were manually tested to check if some fine-tuning was possible. The parameters achieveing the highest mean accuracy precision were then tested on Kaggle.

Report only the best performing setting for each model in the table below. The corresponding result files should be pushed to your repository.


| **Method** | **Parameter settings** | **Output file** | **P@10** | **MAP** | **MRR** |
| -- | -- | -- | -- | -- | -- |
| BM25F | k1: 1.0, $b_{title}$: 0.2, $b_{content}$: 0.29, $w_{title}$: 0.1, $w_{content}$: 0.9 | `data/bm25_multifield.txt` | 0.2156 | 0.0802 | 0.3636 |
| MLM | Smoothing method: jelinek, smoothing param: $\lambda = 1.0$, $w_{title}$: 0.2, $w_{content}$: 0.8 | `data/lm_multifield.txt` | 0.1689 | 0.0542 | 0.3551 |
