# Neural information retrieval. Query expansion & reranking.

In [None]:
import pandas as pd
import os
import json

In [None]:
import pyterrier as pt
# RM3 comes from the terrier-prf plugin, so that package is requested when
# PyTerrier is initialised (only if it has not been started already).
prf_boot_packages = ["com.github.terrierteam:terrier-prf:-SNAPSHOT"]
if not pt.started():
    pt.init(boot_packages=prf_boot_packages)

In [None]:
# Open the three indexes stored under ./indexes_p: resolve every IndexRef
# first, then build the matching Index object for each of them.
indexref_np = pt.IndexRef.of('./indexes_p/iterindex_noprocess/data.properties')
indexref1 = pt.IndexRef.of('./indexes_p/iterindex/data.properties')
indexref2 = pt.IndexRef.of('./indexes_p/iterindex_opt/data.properties')
index_np, index1, index2 = (
    pt.IndexFactory.of(ref) for ref in (indexref_np, indexref1, indexref2)
)

In [None]:
# Training topics; dtype=str makes pandas read every column as text.
# Later cells access the 'qid' and 'query' columns of this frame.
topics_df = pd.read_csv("NIR2022 dataset/train_query.csv", dtype=str)
print(topics_df.shape)
topics_df
# Relevance judgements for the training topics (column dtypes inferred by pandas).
qrels_df = pd.read_csv("NIR2022 dataset/train_qrel.csv")
print(qrels_df.shape)
# Preview of the first judgement rows.
qrels_df.head()

### word2vec rewriting

In [None]:
# word embedding
import gensim
import gensim.downloader as api

In [None]:
# Pre-trained 'word2vec-google-news-300' vectors, fetched through gensim's downloader.
w2v_model = api.load('word2vec-google-news-300')

# Number of nearest neighbours requested for every query term.
k = 2

# Build an expanded copy of the topics. Each whitespace-separated query term
# is preceded by its purely alphanumeric word2vec neighbours, and stop words
# are stripped from the resulting string.
topics_qe_df = topics_df.copy()
expanded_queries = []
for q in topics_qe_df['query']:
    qe = []
    for word in q.split(' '):
        try:
            neighbours = w2v_model.most_similar(word, topn=k)
        except KeyError:
            # Out-of-vocabulary term: keep the term itself, add no neighbours.
            neighbours = []
        qe.extend(term for term, _ in neighbours if term.isalnum())
        qe.append(word)
    expanded_queries.append(
        gensim.parsing.preprocessing.remove_stopwords(" ".join(qe))
    )

# Assign the whole column in one step. Writing through
# ``topics_qe_df.iloc[i]['query'] = ...`` is chained assignment: it can update
# a temporary row object and leave the frame (and so the experiment) unchanged.
topics_qe_df['query'] = expanded_queries

In [None]:
# First-stage retrievers, all reading index2.
# BM25 uses the tuned controls below; DirichletLM uses c=275; DPH sets no controls.
bm25 = pt.BatchRetrieve(
    index2,
    wmodel='BM25',
    controls={'c': 0.3, 'bm25.k_1': 0.8, 'bm25.k_3': 0.5},
)
DLM = pt.BatchRetrieve(index2, wmodel='DirichletLM', controls={'c': 275})
DPH = pt.BatchRetrieve(index2, wmodel="DPH")

# Query-rewriting stages built on the same index.
rm3 = pt.rewrite.RM3(index2)
bo1 = pt.rewrite.Bo1QueryExpansion(index2)

# retrieve >> rewrite >> retrieve pipelines.
rm3_pipe = bm25 >> rm3 >> bm25
bm25_bo1 = bm25 >> bo1 >> bm25
dph_bo1 = DPH >> bo1 >> DPH

In [None]:
# Evaluate the three retrievers and the three expansion pipelines on the
# original training topics.
pt.Experiment(
    [bm25, DLM, DPH, bm25_bo1, dph_bo1, rm3_pipe],
    topics_df,
    qrels_df,
    ["map", "ndcg", "ndcg_cut_10", "P_10"],
    names=['bm25', 'DLM', 'DPH', 'bm25_bo1', 'DPH_bo1', 'bm25_rm3'],
)

In [None]:
# Same metrics, but BM25 and DirichletLM are run on the word2vec-expanded
# topics (topics_qe_df).
pt.Experiment(
    [bm25, DLM],
    topics_qe_df,
    qrels_df,
    ["map", "ndcg", "ndcg_cut_10", "P_10"],
    names=['bm25', 'DLM'],
)

In [None]:
# Save the top-K BM25+RM3 candidates of every training topic as a run file,
# one "qid 0 docno rank score tag" line per retrieved document.
K = 1000
rm3_pipe_topk = rm3_pipe % K
bm25_run = []
for qid, query in zip(topics_df['qid'], topics_df['query']):
    res_df = rm3_pipe_topk.search(query)
    # Read result columns by name: positional unpacking of a result row breaks
    # as soon as the pipeline adds, drops or reorders a column, and it also
    # overwrote the `query` variable of the outer loop.
    for docno, rank, score in zip(res_df['docno'], res_df['rank'], res_df['score']):
        # NOTE(review): the run tag says "tfidf" although this run comes from
        # the BM25+RM3 pipeline; kept as-is so the output file is unchanged.
        bm25_run.append(f"{qid} 0 {docno} {rank} {score} tfidf")

# The output directory may not exist yet on a fresh checkout.
os.makedirs("outputs", exist_ok=True)
with open("outputs/bm25_rm3_top1000.run", "w") as f:
    f.writelines(line + "\n" for line in bm25_run)

### BERT encoder

In [None]:
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import CrossEncoder
import tqdm
# Cross-encoder built from the 'cross-encoder/ms-marco-MiniLM-L-12-v2' checkpoint;
# the reranking cell further down calls model.predict on [query, passage] pairs.
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

In [None]:
import jsonlines
from tqdm import tqdm
# Load the collection into memory as {document id: document text}.
# Only the "_id" and "text" fields are read. The title was previously read
# into an unused variable, which made every record require a "title" key for
# no benefit.
with jsonlines.open("NIR2022 dataset/corpus.jsonl", mode="r") as reader:
    corpus = {row["_id"]: row["text"] for row in tqdm(reader)}

print("corpus num",len(corpus))

In [None]:
import collections
from tqdm import tqdm
def load_run(path):
    """Load a run file into an ordered mapping of query id -> candidate doc ids.

    Every non-blank line must consist of six whitespace-separated fields:
    ``query_id <ignored> doc_id rank <ignored> <ignored>``.

    Args:
        path: Location of the run file to read.

    Returns:
        A ``collections.OrderedDict`` whose keys are the query ids (as
        strings, in order of first appearance) and whose values are lists of
        doc ids sorted by ascending rank.

    Raises:
        ValueError: If a non-blank line does not have exactly six fields or
            its rank field is not an integer.
    """
    print('Loading run...')
    run = collections.OrderedDict()
    with open(path) as f:
        for line in tqdm(f):
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            query_id, _, doc_title, rank, _, _ = fields
            run.setdefault(query_id, []).append((doc_title, int(rank)))

    # Sort candidate docs by rank.
    sorted_run = collections.OrderedDict()
    for query_id, doc_titles_ranks in run.items():
        # Sort in place: a bare ``sorted(...)`` call returns a new list, so
        # discarding its result leaves the candidates in file order.
        doc_titles_ranks.sort(key=lambda pair: pair[1])
        sorted_run[query_id] = [doc_title for doc_title, _ in doc_titles_ranks]

    return sorted_run

# Read the BM25+RM3 candidate lists back from disk (query id -> list of doc ids).
run = load_run("outputs/bm25_rm3_top1000.run")

In [None]:
# Rerank the first-stage candidates with the cross-encoder.
# Each candidate document is cut into overlapping sentence windows, every
# window is scored against the query, and a document's score is the maximum
# over its windows.
test_query = topics_df
# test_query = topics_test_df

# Pipeline
import spacy
# Sentencizer on a blank English pipeline (sentence splitting only).
nlp = spacy.blank("en")
if int(spacy.__version__.split(".")[0]) >= 3:
    # spaCy v3 adds built-in components by their registered name.
    nlp.add_pipe('sentencizer')
else:
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

stride = 5        # a new window starts every `stride` sentences
max_length = 10   # maximum number of sentences per window
queries = dict(zip(test_query['qid'].to_list(), test_query['query'].to_list()))

# Bookkeeping counters, updated while segmenting.
n_segments = 0
n_docs = 0
n_doc_ids_not_found = 0

# Run tag written into the output lines (and reused for the output file name).
# Mean and sum aggregation over the windows were alternative variants
# ("cross_avg" / "cross_sum"): swap `.max()` below for `.mean()` / `.sum()`.
model_name = "cross_max1"
cross_run = []

for query_id, doc_ids in tqdm(run.items(), total=len(run)):

    print(f'{query_id}: Converting to segments...')
    query_text = queries[query_id]
    passages = []
    for doc_id in doc_ids:
        if doc_id not in corpus:
            n_doc_ids_not_found += 1
            continue
        n_docs += 1
        # Only the first 10000 characters of a document are segmented.
        doc = nlp(corpus[doc_id][:10000])
        sentences = [str(sent).strip() for sent in doc.sents]
        for i in range(0, len(sentences), stride):
            passages.append([doc_id, ' '.join(sentences[i:i + max_length])])
            n_segments += 1
            # Stop once the current window already reaches the last sentence.
            if i + max_length >= len(sentences):
                break

    if not passages:
        # No candidate produced a segment (all ids unknown or texts empty);
        # the model cannot be asked to score an empty batch.
        print(f'{query_id}: No passages to rerank, skipping.')
        continue

    print(f'{query_id}: Reranking...')

    # Score every (query, segment) pair.
    model_doc_inputs = pd.DataFrame(
        [[query_text, docno, doc] for docno, doc in passages],
        columns=['query', 'docno', 'doc'],
    )
    scores = model.predict(model_doc_inputs[['query', 'doc']].values.tolist())
    model_doc_inputs['scores'] = scores

    # Document score = best segment score; rank documents by decreasing score.
    max_scores = model_doc_inputs['scores'].groupby(model_doc_inputs['docno']).max().to_dict()
    ranked_docs = sorted(max_scores.items(), key=lambda x: x[1], reverse=True)

    # Collect the results in TREC format (ranks start at 1).
    for rank, (docno, score) in enumerate(ranked_docs, start=1):
        cross_run.append(f"{query_id} Q0 {docno} {rank} {score} {model_name}")

In [None]:
# Persist the reranked run: every entry of cross_run becomes one line of
# outputs/<model_name>.run.
with open("outputs/" + f"{model_name}.run", "w") as f:
    f.writelines(entry + "\n" for entry in cross_run)