# Import All Libaries

In [18]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
ensure_pyterrier_is_loaded()
import pandas as pd
import pyterrier as pt
from tqdm import tqdm
from jnius import autoclass
import gzip
import json
import re

# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
tira = Client()

# Load the Dataset and the Index

In [19]:
# The dataset: the union of the IR Anthology and the ACL Anthology
# This line creates an IRDSDataset object and registers it under the name provided as an argument.
dataset = 'antique-test-20230107-training'
pt_dataset = pt.get_dataset(f'irds:ir-benchmarks/{dataset}')
bm25 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset)

# A (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

## Document Expansion by Query Prediction with docT5query
The basic idea is to train a model, that when given an input document, generates questions that the document might answer (or more broadly, queries for which the document might be relevant). These predicted questions (or queries) are then appended to the original documents, which are then indexed as before. The docT5query model gets its name from the use of T5 as the expansion model.

The primary advantage of this approach is that expensive neural inference is pushed to indexing time, which means that "bag of words" queries against an inverted index built on the augmented document collection are only slightly slower (due to longer documents) — but the retrieval results are much better.

First we check, if our corpus has a high recall or a lower. Our Corpus in this case is the union of the IR Anthology and the ACL Anthology. The recall may change if we use another corpus.

In [20]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

pt.Experiment(
    retr_systems=[bm25],
    topics=pt_dataset.get_topics('text'),
    qrels=pt_dataset.get_qrels(),
    names=['BM25'],
    eval_metrics=['recall_1000']
)

Unnamed: 0,name,recall_1000
0,BM25,0.788732


As we can see, we have already a high recall. This is important for the way we implement the docT5query.
More information about the implementation in the [Tutorial](https://github.com/tira-io/teaching-ir-with-shared-tasks/blob/main/tutorials/tutorial-doc-t5-query.ipynb).

In [21]:
def doc_t5_query(dataset):
    docs = tira.get_run_output('ir-benchmarks/seanmacavaney/DocT5Query', dataset) + '/documents.jsonl.gz'
    with gzip.open(docs, 'rt') as f:
        for l in tqdm(f):
            l = json.loads(l)
            l['text'] = l['querygen']
            l['docno'] = l['doc_id']
            del l['doc_id']
            del l['querygen']
            yield l

def doc_t5_query_index(dataset):
    indexer = pt.IterDictIndexer("/tmp/index2", overwrite=True, meta={'docno': 100, 'text': 20480})
    index_ref = indexer.index(doc_t5_query(dataset))
    return pt.IndexFactory.of(index_ref)


In [22]:
indexD = doc_t5_query_index(dataset)

0it [00:00, ?it/s]

3724it [00:00, 9544.60it/s]



403666it [00:28, 14180.32it/s]


09:16:52.558 [ForkJoinPool-5-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 113 empty documents


In [23]:
docs_retrieved_by_bm25 = {}

bm25_result = bm25(pt_dataset.get_topics('title'))

for _, i in tqdm(bm25_result.iterrows()):
    qid, docno = str(i['qid']), str(i['docno'])

    if qid not in docs_retrieved_by_bm25:
        docs_retrieved_by_bm25[qid] = set()
    
    docs_retrieved_by_bm25[qid].add(docno)

188633it [00:07, 23755.07it/s]


In [24]:
omit_already_retrieved_docs = lambda i: i[i.apply(lambda j: str(j['docno']) not in docs_retrieved_by_bm25[str(j['qid'])], axis=1)]
omit_already_retrieved_docs = pt.apply.generic(omit_already_retrieved_docs)

In [25]:
bm25_doct5query = pt.BatchRetrieve(indexD, wmodel="BM25")
bm25_doct5query_new = bm25_doct5query >> omit_already_retrieved_docs

## Stopwords

In [34]:
def create_index(documents, stopwords):
    indexer = pt.IterDictIndexer("/tmp/index", overwrite=True, meta={'docno': 100, 'text': 20480}, stopwords=stopwords)
    index_ref = indexer.index(documents)
    return pt.IndexFactory.of(index_ref)

chatGPTStopwords =[
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 
    'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 
    'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 
    'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 
    'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 
    'will', 'just', 'don', 'should', 'now'
]
##Von Github https://github.com/igorbrigadir/stopwords/tree/master
##Natural Language Toolkit (NLTK)
ntlkStopwords = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", 
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", 
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", 
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that", 
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", 
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", 
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", 
    "at", "by", "for", "with", "about", "against", "between", "into", "through", 
    "during", "before", "after", "above", "below", "to", "from", "up", "down", 
    "in", "out", "on", "off", "over", "under", "again", "further", "then", 
    "once", "here", "there", "when", "where", "why", "how", "all", "any", 
    "both", "each", "few", "more", "most", "other", "some", "such", "no", 
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", 
    "t", "can", "will", "just", "don", "should", "now", "d", "ll", "m", "o", 
    "re", "ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn", 
    "haven", "isn", "ma", "mightn", "mustn", "needn", "shan", "shouldn", 
    "wasn", "weren", "won", "wouldn"
]
##spaCy 
spacyStopwords = [
    "'d", "'ll", "'m", "'re", "'s", "'ve", "a", "about", "above", "across", "after", 
    "afterwards", "again", "against", "all", "almost", "alone", "along", "already", 
    "also", "although", "always", "am", "among", "amongst", "amount", "an", "and", 
    "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", 
    "around", "as", "at", "back", "be", "became", "because", "become", "becomes", 
    "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", 
    "besides", "between", "beyond", "both", "bottom", "but", "by", "ca", "call", 
    "can", "cannot", "could", "did", "do", "does", "doing", "done", "down", "due", 
    "during", "each", "eight", "either", "eleven", "else", "elsewhere", "empty", 
    "enough", "even", "ever", "every", "everyone", "everything", "everywhere", 
    "except", "few", "fifteen", "fifty", "first", "five", "for", "former", "formerly", 
    "forty", "four", "from", "front", "full", "further", "get", "give", "go", "had", 
    "has", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", 
    "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", 
    "hundred", "i", "if", "in", "indeed", "into", "is", "it", "its", "itself", "just", 
    "keep", "last", "latter", "latterly", "least", "less", "made", "make", "many", 
    "may", "me", "meanwhile", "might", "mine", "more", "moreover", "most", "mostly", 
    "move", "much", "must", "my", "myself", "n't", "name", "namely", "neither", 
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", 
    "not", "nothing", "now", "nowhere", "n‘t", "n’t", "of", "off", "often", "on", 
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", 
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", 
    "put", "quite", "rather", "re", "really", "regarding", "same", "say", "see", 
    "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", 
    "show", "side", "since", "six", "sixty", "so", "some", "somehow", "someone", 
    "something", "sometime", "sometimes", "somewhere", "still", "such", "take", 
    "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", 
    "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", 
    "they", "third", "this", "those", "though", "three", "through", "throughout", 
    "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", 
    "twenty", "two", "under", "unless", "until", "up", "upon", "us", "used", "using", 
    "various", "very", "via", "was", "we", "well", "were", "what", "whatever", 
    "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", 
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", 
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", 
    "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", 
    "‘d", "‘ll", "‘m", "‘re", "‘s", "‘ve", "’d", "’ll", "’m", "’re", "’s", "’ve"
]

index = create_index(pt_dataset.get_corpus_iter(), chatGPTStopwords)

bm25_chatGPTStopwords = pt.BatchRetrieve(index, wmodel="BM25")

index2 = create_index(pt_dataset.get_corpus_iter(),ntlkStopwords)
bm25_ntlkStopwords = pt.BatchRetrieve(index2, wmodel="BM25")

index3 = create_index(pt_dataset.get_corpus_iter(),spacyStopwords)
bm25_spacyStopwords = pt.BatchRetrieve(index3, wmodel="BM25")

ir-benchmarks/antique-test-20230107-training documents:   0%|          | 1768/403666 [00:00<00:43, 9194.72it/s]



ir-benchmarks/antique-test-20230107-training documents: 100%|██████████| 403666/403666 [00:38<00:00, 10483.08it/s]


09:26:30.261 [ForkJoinPool-10-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 1570 empty documents


ir-benchmarks/antique-test-20230107-training documents:   0%|          | 1276/403666 [00:00<01:02, 6455.10it/s]



ir-benchmarks/antique-test-20230107-training documents: 100%|██████████| 403666/403666 [00:43<00:00, 9364.59it/s] 


09:27:23.807 [ForkJoinPool-11-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 1701 empty documents


ir-benchmarks/antique-test-20230107-training documents:   0%|          | 527/403666 [00:00<01:16, 5269.34it/s]



ir-benchmarks/antique-test-20230107-training documents: 100%|██████████| 403666/403666 [00:42<00:00, 9494.14it/s] 


09:28:16.652 [ForkJoinPool-12-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 2133 empty documents


## Query Expansion with Large Language Models

In [28]:
bm25_rm3 = bm25 >> pt.rewrite.RM3(index) >> bm25
bm25_kl = bm25 >> pt.rewrite.KLQueryExpansion(index) >> bm25

In [29]:
# llm expansions with gpt
gpt_cot = tira.pt.transform_queries('workshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-cot', dataset, prefix='llm_expansion_')
gpt_sq_fs = tira.pt.transform_queries('workshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-sq-fs', dataset, prefix='llm_expansion_')
gpt_sq_zs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-gpt3.5-sq-zs', dataset, prefix='llm_expansion_')

# llm expansions with llama
llama_cot = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-llama-cot', dataset, prefix='llm_expansion_')
llama_sq_fs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-llama-sq-fs', dataset, prefix='llm_expansion_')
llama_sq_zs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-llama-sq-zs', dataset, prefix='llm_expansion_')

# llm expansions with flan-ul2
flan_cot = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-cot', dataset, prefix='llm_expansion_')
flan_sq_fs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-sq-fs', dataset, prefix='llm_expansion_')
flan_sq_zs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-sq-zs', dataset, prefix='llm_expansion_')
     

In [30]:
tokeniser = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()

def pt_tokenize(text):
    return ' '.join(tokeniser.getTokens(text))

def expand_query(topic):
  ret = ' '.join([topic['query'], topic['query'], topic['query'],  topic['query'],  topic['query'], topic['llm_expansion_query']])

  # apply the tokenization
  return pt_tokenize(ret)

# we wrap this into an pyterrier transformer
# Documentation: https://pyterrier.readthedocs.io/en/latest/apply.html
pt_expand_query = pt.apply.query(expand_query)

In [40]:
pipeline_gpt_cot = (gpt_cot >> pt_expand_query) >> bm25
pipeline_gpt_cot_doct5query = (gpt_cot >> pt_expand_query) >> bm25_doct5query
pipeline_gpt_cot_chatgptstopwords = (gpt_cot >> pt_expand_query) >> bm25_chatGPTStopwords
pipeline_gpt_cot_ntlkstopwords = (gpt_cot >> pt_expand_query) >> bm25_ntlkStopwords
pipeline_gpt_cot_spacystopwords = (gpt_cot >> pt_expand_query) >> bm25_spacyStopwords
pipeline_gpt_sq_fs = (gpt_sq_fs >> pt_expand_query) >> bm25
pipeline_gpt_sq_zs = (gpt_sq_zs >> pt_expand_query) >> bm25

pipeline_llama_cot = (llama_cot >> pt_expand_query) >> bm25
pipeline_llama_sq_fs = (llama_sq_fs >> pt_expand_query) >> bm25
pipeline_llama_sq_zs = (llama_sq_zs >> pt_expand_query) >> bm25

pipeline_flan_cot = (flan_cot >> pt_expand_query) >> bm25
pipeline_flan_sq_fs = (flan_sq_fs >> pt_expand_query) >> bm25
pipeline_flan_sq_zs = (flan_sq_zs >> pt_expand_query) >> bm25


### Bo1 Query Expansion

In [38]:
bo1_expansion = bm25_doct5query_new >> pt.rewrite.Bo1QueryExpansion(index)
# build final pipeline for retrieval
bm25_bo1 = bo1_expansion >> bm25



### Evaluation

In [41]:
pt.Experiment(
    retr_systems=[bm25, bm25_doct5query, bm25_chatGPTStopwords,bm25_ntlkStopwords,bm25_spacyStopwords, bm25_bo1, bm25_rm3, bm25_kl, pipeline_gpt_cot,pipeline_gpt_cot_doct5query, pipeline_gpt_cot_chatgptstopwords, pipeline_gpt_cot_ntlkstopwords, pipeline_gpt_cot_spacystopwords, pipeline_gpt_sq_fs, pipeline_gpt_sq_zs, pipeline_llama_cot, pipeline_llama_sq_fs, pipeline_llama_sq_zs, pipeline_flan_cot, pipeline_flan_sq_fs, pipeline_flan_sq_zs],
    topics=pt_dataset.get_topics('text'),
    qrels=pt_dataset.get_qrels(),
    names=['BM25', 'DocT5Query >> BM25', 'BM25_chatGPTStopwords','BM25_ntlkStopwords','BM25_spacyStopwords','BM25_Bo1', 'BM25+RM3', 'BM25+KL', 'BM25+GPT-COT','DocT5Query >> BM25+GPT-COT','BM25_chatgptstopwords+GPT-COT','BM25_ntlkstopwords+GPT-COT','BM25_spacystopwords+GPT-COT', 'BM25+GPT-SQ-FS', 'BM25+GPT-SQ-ZS', 'BM25+Llama-COT', 'BM25+Llama-SQ-FS', 'BM25+Llama-SQ-ZS', 'BM25+Flan-COT', 'BM25+Flan-SQ-FS', 'BM25+Flan-SQ-ZS'],
    eval_metrics=['recall_1000', 'ndcg_cut_5', 'ndcg_cut.10', 'recip_rank']
)

Unnamed: 0,name,recall_1000,ndcg_cut_5,ndcg_cut.10,recip_rank
0,BM25,0.788732,0.529428,0.510402,0.934803
1,DocT5Query >> BM25,0.534685,0.399011,0.348678,0.793546
2,BM25_chatGPTStopwords,0.794154,0.536473,0.516196,0.948333
3,BM25_ntlkStopwords,0.794738,0.535913,0.516055,0.948027
4,BM25_spacyStopwords,0.79294,0.534672,0.506493,0.944375
5,BM25_Bo1,0.781723,0.519123,0.501171,0.913488
6,BM25+RM3,0.779805,0.509865,0.489359,0.934375
7,BM25+KL,0.788479,0.532529,0.503925,0.949333
8,BM25+GPT-COT,0.806138,0.541391,0.506853,0.875463
9,BM25+GPT-COT,0.565041,0.391847,0.351724,0.74785
