### BM25 DEV_0, DEV_1

In [2]:
import os
from rank_bm25 import BM25Okapi
import pandas as pd
import numpy as np

raw_dir = 'raw'
processed_dir = 'data'
BATCH_SIZE = 10
if not os.path.exists(raw_dir):
    os.mkdir(raw_dir)

if not os.path.exists(processed_dir):
    os.mkdir(processed_dir)

In [3]:
raw_doc = 'docdev-stopstem.xml_1.out'
raw_qrels = 'msmarco-docdev-qrels.tsv'
raw_queries = 'queries.docdev.tsv'

preproc_doc = 'docs.tsv'
preproc_qrels = 'qrels.tsv'
preproc_queries = 'queries.tsv'

In [4]:
def read_local_files(raw_dir=raw_dir):
    df = pd.read_csv(os.path.join(raw_dir, 'docdev-stopstem.xml_1.out'), names=['query_id', 'q0', 'doc_id', 'rank', 'score', 'text'],  sep=' ')
    df_qrels = pd.read_csv(os.path.join(raw_dir, 'msmarco-docdev-qrels.tsv'), sep=' ', names=['query_id', 'rank_0', 'doc_id', 'rank_1'])
    df_queries = pd.read_csv(os.path.join(raw_dir, 'queries.docdev.tsv'), sep='\t', names=['query_id', 'data'])
    df_docs = pd.read_csv(os.path.join(raw_dir, 'docs.tsv'), sep='\t', names=['id', 'doc_id', 'data'])

    df_qrels['id'] = df_qrels.index
    df_qrels = df_qrels[['id', 'query_id', 'doc_id']]

    df_queries['id'] = df_queries.index
    df_queries = df_queries[['id', 'query_id', 'data']]
    return df_docs, df_qrels, df_queries

In [5]:
def create_joined_file(df_docs, df_qrels, df_queries, path_processed_joined=None):
    joined_df = df_qrels.merge(df_queries, on='query_id').merge(df_docs, on='doc_id', how='left')[['query_id', 'data_x', 'doc_id', 'data_y']]
    joined_df.rename(columns={'data_x':'query_data', 'data_y':'doc_data'}, inplace=True)
    if path_processed_joined:
        joined_df.to_csv(path_processed_joined, sep='\t', index=None, header=None)
    return joined_df

### Reading files from raw dir. Creating first joined view for batch iterator

In [6]:
df_docs, df_qrels, df_queries = read_local_files()
joined_df = create_joined_file(df_docs, df_qrels, df_queries)

In [14]:
joined_df.head()

Unnamed: 0,query_id,query_data,doc_id,doc_data
0,2,androgen receptor define,D1650436,"""From Wikipedia, the free encyclopedianavigati..."
1,1215,3 levels of government in canada and their res...,D1202771,Immigration & Citizenship Canadian Government ...
2,1288,3/5 of 60,D1547717,Science & Mathematics Mathematics What is 3/5 ...
3,1576,60x40 slab cost,D1313702,"""Forum Dock Side (Discussion) PB Open Water Co..."
4,2235,bethel university was founded in what year,D2113408,73% of our students get into one of their top ...


### Baseline DEV_0. We will use bm25 to predict mrr100 on documents only
Queries in this this case are empty strings

In [15]:
df_docs, df_qrels, df_queries = read_local_files()
print(len(df_docs), len(df_qrels), len(df_queries))
joined_df = create_joined_file(df_docs, df_qrels, df_queries)
joined_df.head(3)

5185 5193 5193


Unnamed: 0,query_id,query_data,doc_id,doc_data
0,2,androgen receptor define,D1650436,"""From Wikipedia, the free encyclopedianavigati..."
1,1215,3 levels of government in canada and their res...,D1202771,Immigration & Citizenship Canadian Government ...
2,1288,3/5 of 60,D1547717,Science & Mathematics Mathematics What is 3/5 ...


In [16]:
corpus = list(joined_df['doc_data'].values)
tokenized_corpus = [corp.split(" ") for corp in corpus]

bm25 = BM25Okapi(tokenized_corpus)

In [11]:
def predicting_100_top_doc_id(joined_df, bm25):
    predict_on_query = []
    predicted_scores = []
    for query_id, query in zip(list(joined_df['query_id']), list(joined_df['query_data'])):
        query = query.split()
        score = bm25.get_scores(query)
        top_100_scores = np.round(score[score >= pd.Series(score).nlargest(100).values[-1]], 1)# score[score >= pd.Series(score).nlargest(100).values[-1]]
        top_100_doc_id = joined_df['doc_id'][score >= pd.Series(score).nlargest(100).values[-1]].values
        score_dicts = [{doc:score} for (doc, score) in zip(top_100_doc_id, top_100_scores)]
        predict_on_query.append(list(top_100_doc_id))
        predicted_scores.append(list(top_100_scores))
    return predict_on_query, predicted_scores

In [18]:
joined_df['predict_100'], joined_df['predict_100_score'] = predicting_100_top_doc_id(joined_df, bm25)
joined_df.head(3)

Unnamed: 0,query_id,query_data,doc_id,doc_data,predict_100,predict_100_score
0,2,androgen receptor define,D1650436,"""From Wikipedia, the free encyclopedianavigati...","[D1650436, D344906, D993875, D2339274, D802481...","[26.0, 6.1, 4.1, 7.7, 10.0, 5.3, 4.9, 5.8, 5.3..."
1,1215,3 levels of government in canada and their res...,D1202771,Immigration & Citizenship Canadian Government ...,"[D1202771, D1361055, D22461, D2339274, D217536...","[24.7, 20.8, 19.4, 20.4, 19.5, 19.2, 21.2, 20...."
2,1288,3/5 of 60,D1547717,Science & Mathematics Mathematics What is 3/5 ...,"[D1547717, D815091, D855092, D3461202, D628872...","[26.6, 10.1, 8.5, 9.7, 9.1, 9.6, 8.1, 9.0, 8.8..."


In [19]:
from rank_eval import Qrels, Run, evaluate
qrels = Qrels()
qrels.add_multi(q_ids=[str(i) for i in list(joined_df['query_id'].values)], 
                doc_ids=[[i] for i in joined_df['doc_id']],
                scores=[[1.0] for i in range(len(joined_df))])

run = Run()
run.add_multi(q_ids=[str(i) for i in list(joined_df['query_id'].values)], 
                doc_ids=[i for i in joined_df['predict_100']],
                scores=[i for i in joined_df['predict_100_score']])

evaluate(qrels, run, ["mrr@100"])

0.5366385958226686

### Baseline DEV_1. We will use bm25 to predict mrr100 of docs + ideal queries

In [7]:
df_docs, df_qrels, df_queries = read_local_files()
print(len(df_docs), len(df_qrels), len(df_queries))
joined_df = create_joined_file(df_docs, df_qrels, df_queries)
joined_df.head(3)

5185 5193 5193


Unnamed: 0,query_id,query_data,doc_id,doc_data
0,2,androgen receptor define,D1650436,"""From Wikipedia, the free encyclopedianavigati..."
1,1215,3 levels of government in canada and their res...,D1202771,Immigration & Citizenship Canadian Government ...
2,1288,3/5 of 60,D1547717,Science & Mathematics Mathematics What is 3/5 ...


In [8]:
joined_df['old_doc_data'] = joined_df['doc_data']
joined_df['doc_data'] = joined_df['query_data'] + ' ' + joined_df['doc_data']
joined_df.head(3)

Unnamed: 0,query_id,query_data,doc_id,doc_data,old_doc_data
0,2,androgen receptor define,D1650436,"androgen receptor define ""From Wikipedia, the...","""From Wikipedia, the free encyclopedianavigati..."
1,1215,3 levels of government in canada and their res...,D1202771,3 levels of government in canada and their res...,Immigration & Citizenship Canadian Government ...
2,1288,3/5 of 60,D1547717,3/5 of 60 Science & Mathematics Mathematics Wh...,Science & Mathematics Mathematics What is 3/5 ...


In [9]:
corpus = list(joined_df['doc_data'].values)
tokenized_corpus = [corp.split(" ") for corp in corpus]

bm25 = BM25Okapi(tokenized_corpus)

In [12]:
# predict 100 docs on the added queries and docs
joined_df['predict_100'], joined_df['predict_100_score'] = predicting_100_top_doc_id(joined_df, bm25)
joined_df.head(3)

Unnamed: 0,query_id,query_data,doc_id,doc_data,old_doc_data,predict_100,predict_100_score
0,2,androgen receptor define,D1650436,"androgen receptor define ""From Wikipedia, the...","""From Wikipedia, the free encyclopedianavigati...","[D1650436, D344906, D2339274, D802481, D104368...","[27.0, 6.0, 7.7, 10.0, 5.3, 4.8, 4.6, 5.3, 4.6..."
1,1215,3 levels of government in canada and their res...,D1202771,3 levels of government in canada and their res...,Immigration & Citizenship Canadian Government ...,"[D1202771, D1361055, D22461, D2339274, D217536...","[39.7, 20.8, 19.4, 20.4, 19.5, 19.2, 21.2, 20...."
2,1288,3/5 of 60,D1547717,3/5 of 60 Science & Mathematics Mathematics Wh...,Science & Mathematics Mathematics What is 3/5 ...,"[D1547717, D815091, D855092, D3461202, D628872...","[27.2, 10.1, 8.5, 9.7, 9.1, 9.6, 8.1, 9.0, 8.8..."


In [13]:
from rank_eval import Qrels, Run, evaluate
qrels = Qrels()
qrels.add_multi(q_ids=[str(i) for i in list(joined_df['query_id'].values)], 
                doc_ids=[[i] for i in joined_df['doc_id']],
                scores=[[1.0] for i in range(len(joined_df))])

run = Run()
run.add_multi(q_ids=[str(i) for i in list(joined_df['query_id'].values)], 
                doc_ids=[i for i in joined_df['predict_100']],
                scores=[i for i in joined_df['predict_100_score']]) # [[1.0] * 100 for i in range(len(joined_df))])

In [14]:
evaluate(qrels, run, ["mrr@100"])

0.9534587917203479