# Import All Libraries

In [54]:
# Ensure necessary libraries are imported
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pandas as pd
import pyterrier as pt
from tqdm import tqdm
from jnius import autoclass
import gzip
import json
import re
import nltk
from nltk.stem import WordNetLemmatizer

# Must run before any `pt.*` call below (helper shipped with the TIRA client).
ensure_pyterrier_is_loaded()

# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
tira = Client()


# Load the Dataset and the Index

In [55]:
# The dataset identifier used throughout this notebook (an ANTIQUE split as named
# on ir-benchmarks), not the IR/ACL Anthology corpus of the original tutorial.
# `pt.get_dataset` creates the dataset object for the given `irds:` name.
dataset = 'antique-test-20230107-training'
pt_dataset = pt.get_dataset(f'irds:ir-benchmarks/{dataset}')
# NOTE(review): this `bm25` is rebound to a `pt.BatchRetrieve` in the recall cell
# below before it is ever used, so this submission is loaded but never evaluated.
bm25 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset)

# A (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

## Document Expansion by Query Prediction with docT5query
The basic idea is to train a model that, given an input document, generates questions that the document might answer (or, more broadly, queries for which the document might be relevant). These predicted questions (or queries) are then appended to the original documents, which are then indexed as before. The docT5query model gets its name from the use of T5 as the expansion model.

The primary advantage of this approach is that expensive neural inference is pushed to indexing time, which means that "bag of words" queries against an inverted index built on the augmented document collection are only slightly slower (due to longer documents) — but the retrieval results are much better.

First, we check whether BM25 already achieves a high recall on our corpus. Our corpus in this case is the one behind the `antique-test-20230107-training` dataset loaded above. The recall may change if we use another corpus.

In [56]:
# Plain BM25 over the pre-built TIRA index. This rebinds `bm25` (replacing the
# submission loaded in the previous cell) and is the retriever all later
# pipelines in this notebook end with.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# Baseline check: only recall@1000 is reported here.
pt.Experiment(
    retr_systems=[bm25],
    topics=pt_dataset.get_topics('text'),
    qrels=pt_dataset.get_qrels(),
    names=['BM25'],
    eval_metrics=['recall_1000']
)

Unnamed: 0,name,recall_1000
0,BM25,0.788732


As we can see, we already have a high recall. This is important for the way we implement docT5query.
More information about the implementation can be found in the [Tutorial](https://github.com/tira-io/teaching-ir-with-shared-tasks/blob/main/tutorials/tutorial-doc-t5-query.ipynb).

In [57]:
def doc_t5_query(dataset):
    """Stream the docT5query expansions for `dataset` as indexable records.

    The run output of the 'ir-benchmarks/seanmacavaney/DocT5Query' submission
    is resolved through the TIRA client and its `documents.jsonl.gz` file is
    read one JSON line at a time.

    Args:
        dataset: dataset identifier passed on to `tira.get_run_output`.

    Yields:
        dict: the parsed JSON record, with the value of `querygen` moved to
        `text` and the value of `doc_id` moved to `docno`.
    """
    documents_path = tira.get_run_output('ir-benchmarks/seanmacavaney/DocT5Query', dataset) + '/documents.jsonl.gz'
    with gzip.open(documents_path, 'rt') as lines:
        for raw_line in tqdm(lines):
            record = json.loads(raw_line)
            # Only the generated queries are kept as the text to index.
            record['text'] = record.pop('querygen')
            record['docno'] = record.pop('doc_id')
            yield record

def doc_t5_query_index(dataset):
    """Index the docT5query records of `dataset` and return the opened index.

    The index is written to /tmp/index2; an existing index at that path is
    overwritten. `docno` and `text` are stored as metadata.
    """
    terrier_indexer = pt.IterDictIndexer(
        "/tmp/index2",
        overwrite=True,
        meta={'docno': 100, 'text': 20480},
    )
    return pt.IndexFactory.of(terrier_indexer.index(doc_t5_query(dataset)))


In [58]:
# Build the docT5query index; this streams every expanded document once.
indexD = doc_t5_query_index(dataset)

928it [00:00, 9101.49it/s]

3302it [00:00, 11282.39it/s]



403666it [00:31, 12870.13it/s]


12:45:33.502 [ForkJoinPool-7-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 113 empty documents


In [59]:
# Maps each query id (as str) to the set of document ids (as str) that plain
# BM25 already retrieved; used later to drop those documents from the
# docT5query results.
docs_retrieved_by_bm25 = {}

bm25_result = bm25(pt_dataset.get_topics('title'))

# Walk the two needed columns directly: `iterrows()` would build a full Series
# for every one of the result rows, which is needlessly slow.
for qid, docno in tqdm(zip(bm25_result['qid'], bm25_result['docno']), total=len(bm25_result)):
    docs_retrieved_by_bm25.setdefault(str(qid), set()).add(str(docno))

188633it [00:07, 24258.53it/s]


In [60]:
def _keep_docs_unseen_by_bm25(results):
    """Drop result rows whose document plain BM25 already retrieved for that query.

    Args:
        results: retrieval results with at least `qid` and `docno` columns.

    Returns:
        The rows of `results` whose (qid, docno) pair is not recorded in
        `docs_retrieved_by_bm25`, with all columns preserved.
    """
    keep = pd.Series(
        [
            # `.get` keeps every row of a query for which BM25 returned nothing
            # instead of failing with a KeyError.
            str(docno) not in docs_retrieved_by_bm25.get(str(qid), ())
            for qid, docno in zip(results['qid'], results['docno'])
        ],
        index=results.index,
        # An explicit bool dtype keeps this a valid row mask for empty input.
        dtype=bool,
    )
    return results[keep]


# Wrapped as a PyTerrier transformer so it can be chained with `>>`.
omit_already_retrieved_docs = pt.apply.generic(_keep_docs_unseen_by_bm25)

In [61]:
# BM25 over the docT5query index (generated queries only).
bm25_doct5query = pt.BatchRetrieve(indexD, wmodel="BM25")
# Same retriever, keeping only documents that plain BM25 did not already return.
bm25_doct5query_new = bm25_doct5query >> omit_already_retrieved_docs

## Stopwords

In [62]:
def create_index(documents, stopwords):
    """Index `documents` with a custom stopword list and return the opened index.

    The index is written to /tmp/index; an existing index at that path is
    overwritten. `docno` and `text` are stored as metadata.

    Args:
        documents: iterable of document dicts handed to the indexer.
        stopwords: list of terms to treat as stopwords during indexing.
    """
    # Use the caller's list: previously the global `customStopwords` was passed
    # here, so the `stopwords` argument was silently ignored.
    indexer = pt.IterDictIndexer("/tmp/index", overwrite=True, meta={'docno': 100, 'text': 20480}, stopwords=stopwords)
    index_ref = indexer.index(documents)
    return pt.IndexFactory.of(index_ref)

# Custom English stopword list handed to the indexer in `create_index`.
customStopwords =[
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 
    'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 
    'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 
    'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 
    'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 
    'will', 'just', 'don', 'should', 'now'
]

# NOTE(review): this rebinds `index`, which until here was the pre-built TIRA
# index. `bm25` was created from the old index and is unaffected, but every later
# use of `index` (RM3, KL and Bo1 query expansion) now sees this stopword index.
# Confirm that this is intended; a distinct name would avoid the hidden state.
index = create_index(pt_dataset.get_corpus_iter(), customStopwords)

# NOTE(review): `bm25_stopwords` is not part of the evaluation at the end of the
# visible notebook.
bm25_stopwords = pt.BatchRetrieve(index, wmodel="BM25")

ir-benchmarks/antique-test-20230107-training documents:   0%|          | 1654/403666 [00:00<00:46, 8728.45it/s]



ir-benchmarks/antique-test-20230107-training documents: 100%|██████████| 403666/403666 [00:36<00:00, 11165.33it/s]


12:46:27.345 [ForkJoinPool-8-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 1570 empty documents


In [63]:
# NLTK resources fetched for the lemmatisation helper defined in this cell
# (it uses `WordNetLemmatizer` and `nltk.word_tokenize`).
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

# Shared lemmatizer instance, created once and reused for every query.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize `text` with NLTK and return its lemmatized tokens joined by single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Query Expansion with Large Language Models

In [64]:
# Pseudo-relevance-feedback baselines: retrieve with BM25, rewrite the query,
# retrieve again with BM25.
# NOTE(review): `bm25` retrieves from the pre-built TIRA index, while `index`
# was rebound to the custom-stopword index in the Stopwords cell, so retrieval
# and expansion read from different indices — confirm this is intended.
# Neither pipeline is part of the evaluation at the end of the visible notebook.
bm25_rm3 = bm25 >> pt.rewrite.RM3(index) >> bm25
bm25_kl = bm25 >> pt.rewrite.KLQueryExpansion(index) >> bm25

In [65]:
# llm expansions with gpt
gpt_cot = tira.pt.transform_queries('workshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-cot', dataset, prefix='llm_expansion_')
gpt_sq_fs = tira.pt.transform_queries('workshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-sq-fs', dataset, prefix='llm_expansion_')
gpt_sq_zs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-gpt3.5-sq-zs', dataset, prefix='llm_expansion_')

# llm expansions with llama
llama_cot = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-llama-cot', dataset, prefix='llm_expansion_')
llama_sq_fs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-llama-sq-fs', dataset, prefix='llm_expansion_')
llama_sq_zs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-llama-sq-zs', dataset, prefix='llm_expansion_')

# llm expansions with flan-ul2
flan_cot = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-cot', dataset, prefix='llm_expansion_')
flan_sq_fs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-sq-fs', dataset, prefix='llm_expansion_')
flan_sq_zs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-sq-zs', dataset, prefix='llm_expansion_')

In [66]:
# Terrier tokeniser instance shared by the query-expansion helpers below.
tokeniser = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()


def pt_tokenize(text):
    """Run `text` through the Terrier tokeniser and join the tokens with single spaces."""
    tokens = tokeniser.getTokens(text)
    return ' '.join(tokens)

def expand_query(topic):
    """Build the expanded query string for one topic.

    The original query is repeated five times and followed by the LLM
    expansion, then the whole string is passed through `pt_tokenize`.

    Args:
        topic: a topic row providing `query` and `llm_expansion_query`.
    """
    original_query = topic['query']
    parts = [original_query] * 5 + [topic['llm_expansion_query']]
    return pt_tokenize(' '.join(parts))

# Wrap the function into a PyTerrier query-rewriting transformer.
# Documentation: https://pyterrier.readthedocs.io/en/latest/apply.html
# NOTE(review): the pipelines defined later in the visible notebook use the
# lemmatizing variant instead, so this transformer appears to be unused.
pt_expand_query = pt.apply.query(expand_query)

In [67]:
# `pt_tokenize` is defined in the previous cell; the identical second definition
# that shadowed it here has been removed.

def expand_and_lemmatize_query(topic):
    """Build the expanded, lemmatized query string for one topic.

    The original query is repeated five times and followed by the LLM
    expansion; the result is lemmatized with `lemmatize_text` and finally
    passed through `pt_tokenize`.

    Args:
        topic: a topic row providing `query` and `llm_expansion_query`.
    """
    original_query = topic['query']
    expanded_query = ' '.join([original_query] * 5 + [topic['llm_expansion_query']])
    lemmatized_query = lemmatize_text(expanded_query)
    return pt_tokenize(lemmatized_query)

# Query-rewriting transformer around the expand-and-lemmatize function.
pt_expand_and_lemmatize_query = pt.apply.query(expand_and_lemmatize_query)


def _llm_bm25_pipeline(llm_expansion):
    """Chain one LLM expansion loader with query rewriting and BM25 retrieval."""
    return llm_expansion >> pt_expand_and_lemmatize_query >> bm25


pipeline_gpt_cot = _llm_bm25_pipeline(gpt_cot)
pipeline_gpt_sq_fs = _llm_bm25_pipeline(gpt_sq_fs)
pipeline_gpt_sq_zs = _llm_bm25_pipeline(gpt_sq_zs)

pipeline_llama_cot = _llm_bm25_pipeline(llama_cot)
pipeline_llama_sq_fs = _llm_bm25_pipeline(llama_sq_fs)
pipeline_llama_sq_zs = _llm_bm25_pipeline(llama_sq_zs)

pipeline_flan_cot = _llm_bm25_pipeline(flan_cot)
pipeline_flan_sq_fs = _llm_bm25_pipeline(flan_sq_fs)
pipeline_flan_sq_zs = _llm_bm25_pipeline(flan_sq_zs)


### Bo1 Query Expansion

In [68]:
# Feedback documents are the docT5query results that plain BM25 did not
# retrieve; Bo1 then rewrites the query from them.
# NOTE(review): the feedback results come from `indexD`, but Bo1 reads term
# statistics from `index` (the custom-stopword index at this point). Confirm
# that the document ids of the two indices line up.
bo1_expansion = bm25_doct5query_new >> pt.rewrite.Bo1QueryExpansion(index)
# build final pipeline for retrieval
bm25_bo1 = bo1_expansion >> bm25

### Evaluation

In [69]:
# Compare plain BM25 against the LLM-expansion pipelines (lemmatized queries,
# same BM25 retriever) and the Bo1 pipeline. Only the queries are lemmatized;
# no lemmatized index is built in this notebook.
pt.Experiment(
    retr_systems=[bm25, pipeline_gpt_cot, pipeline_gpt_sq_fs, pipeline_gpt_sq_zs,
                  pipeline_llama_cot, pipeline_llama_sq_fs, pipeline_llama_sq_zs,
                  pipeline_flan_cot, pipeline_flan_sq_fs, pipeline_flan_sq_zs, bm25_bo1],
    topics=pt_dataset.get_topics('text'),
    qrels=pt_dataset.get_qrels(),
    names=['BM25', 'GPT-3.5 CoT', 'GPT-3.5 SQ FS', 'GPT-3.5 SQ ZS',
           'LLAMA CoT', 'LLAMA SQ FS', 'LLAMA SQ ZS',
           'FLAN-UL2 CoT', 'FLAN-UL2 SQ FS', 'FLAN-UL2 SQ ZS', 'BM25 + Bo1'],
    eval_metrics=['recall_1000', 'ndcg_cut_5', 'ndcg_cut_10', 'recip_rank']
)


Unnamed: 0,name,recall_1000,ndcg_cut_5,ndcg_cut_10,recip_rank
0,BM25,0.788732,0.529428,0.510402,0.934803
1,GPT-3.5 CoT,0.803464,0.535214,0.505071,0.866274
2,GPT-3.5 SQ FS,0.794062,0.536776,0.512088,0.923202
3,GPT-3.5 SQ ZS,0.786726,0.530052,0.496447,0.915797
4,LLAMA CoT,0.789377,0.518131,0.489208,0.857129
5,LLAMA SQ FS,0.793214,0.543543,0.509856,0.92337
6,LLAMA SQ ZS,0.799414,0.557042,0.522821,0.921519
7,FLAN-UL2 CoT,0.792891,0.513493,0.489407,0.886051
8,FLAN-UL2 SQ FS,0.787538,0.525658,0.503038,0.917737
9,FLAN-UL2 SQ ZS,0.795934,0.522575,0.49966,0.895438
