# Import All Libaries

In [1]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
ensure_pyterrier_is_loaded()

import pyterrier as pt
from jnius import autoclass

import math

# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


# Load the Dataset and the Index

In [2]:
# The dataset: the union of the IR Anthology and the ACL Anthology
# This line creates an IRDSDataset object and registers it under the name provided as an argument.
dataset = 'ir-acl-anthology-20240504-training'
pt_dataset = pt.get_dataset(f'irds:ir-lab-sose-2024/{dataset}')
#bm25 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset)

# A (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

## Stopwords

In [3]:
def create_index(documents, stopwords):
    indexer = pt.IterDictIndexer("/tmp/index", overwrite=True, meta={'docno': 100, 'text': 20480}, stopwords=customStopwords)
    index_ref = indexer.index(documents)
    return pt.IndexFactory.of(index_ref)

customStopwords =[
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 
    'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 
    'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 
    'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 
    'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 
    'will', 'just', 'don', 'should', 'now', 'e', 'n', 'd', 'o', 'c', 'r'
]

index = create_index(pt_dataset.get_corpus_iter(), customStopwords)

bm25_stopwords = pt.BatchRetrieve(index, wmodel="BM25")

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:  71%|███████   | 89806/126958 [00:21<00:05, 6597.26it/s]



ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:27<00:00, 4677.43it/s] 


15:27:20.119 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 3 empty documents


## Query Expansion with Large Language Models

In [4]:
bm25_rm3 = bm25 >> pt.rewrite.RM3(index) >> bm25
bm25_kl = bm25 >> pt.rewrite.KLQueryExpansion(index) >> bm25

In [5]:
# llm expansions with gpt
gpt_cot = tira.pt.transform_queries('workshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-cot', dataset, prefix='llm_expansion_')
gpt_sq_fs = tira.pt.transform_queries('workshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-sq-fs', dataset, prefix='llm_expansion_')
gpt_sq_zs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-gpt3.5-sq-zs', dataset, prefix='llm_expansion_')

# llm expansions with llama
llama_cot = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-llama-cot', dataset, prefix='llm_expansion_')
llama_sq_fs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-llama-sq-fs', dataset, prefix='llm_expansion_')
llama_sq_zs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-llama-sq-zs', dataset, prefix='llm_expansion_')

# llm expansions with flan-ul2
flan_cot = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-cot', dataset, prefix='llm_expansion_')
flan_sq_fs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-sq-fs', dataset, prefix='llm_expansion_')
flan_sq_zs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-sq-zs', dataset, prefix='llm_expansion_')

In [6]:
tokeniser = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()

documents = []
for doc in pt_dataset.get_corpus_iter():
        documents.append({
        'docno': doc['docno'],
        'text': doc['text'],
})   

def pt_tokenize(text):
    return ' '.join(tokeniser.getTokens(text))

# we wrap this into an pyterrier transformer
# Documentation: https://pyterrier.readthedocs.io/en/latest/apply.html

def calculate_tf_idf(term, doc_text, index):
    # Implement your TF-IDF calculation logic here
    # Example placeholder implementation:
    tf = doc_text.lower().count(term.lower()) / len(doc_text.split())
    #print(tf)
    num_docs = index.getCollectionStatistics().getNumberOfDocuments()
    lexicon = index.getLexicon()
    entry = lexicon.getLexiconEntry(term.lower())
    doc_freq = entry.getDocumentFrequency() if entry is not None else 0
    idf = math.log((num_docs + 1) / (doc_freq + 1)) + 1

    #print(tf * idf)
    return tf * idf

# Define a function to retrieve top documents and extract terms with scores
def retrieve_top_docs_terms(query, num_docs=10, num_terms=5):
    # Annahme: bm25 ist ein BatchRetrieve-Objekt
    from jnius import autoclass
    tokeniser = autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
    query = " ".join(tokeniser.getTokens(query))
    query_results = bm25.search(query)
    top_docs = query_results.head(num_docs)['docid'].tolist()
    terms_scores = {}
    
    for docid in top_docs:
        
        doc_text = documents[docid]['text']
        doc_terms = pt_tokenize(doc_text).split()
        doc_score = query_results[query_results['docid'] == docid]['score'].values[0]
        for x in range(10):
            for term in doc_terms:
                if term in customStopwords:
                    doc_terms.remove(term)
            x += 1

        for term in doc_terms:
            if term not in terms_scores:
                terms_scores[term] = 0
            terms_scores[term] += doc_score * calculate_tf_idf(term, doc_text, index)
    
    sorted_terms = sorted(terms_scores.items(), key=lambda x: x[1], reverse=True)
   
    
    return [term for term, score in sorted_terms[:num_terms]]
        


# Function to expand query
def expand_query(topic):
    original_query = topic['query']
    #expanded_terms = retrieve_top_docs_terms(original_query)
    llm_query = topic['llm_expansion_query']
    #print("Test")
   
    
    #print(llm_query)
    
    llm_terms = retrieve_top_docs_terms(llm_query)
    #print(llm_terms)
    
    expanded_query = ' '.join([original_query] * 6 + llm_terms)

    return pt_tokenize(expanded_query)

# Wrapper for PyTerrier query expansion
pt_expand_query = pt.apply.query(expand_query)


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:02<00:00, 53580.96it/s]


In [7]:
pipeline_gpt_cot = (gpt_cot >> pt_expand_query) >> bm25
pipeline_gpt_sq_fs = (gpt_sq_fs >> pt_expand_query) >> bm25
pipeline_gpt_sq_zs = (gpt_sq_zs >> pt_expand_query) >> bm25

pipeline_llama_cot = (llama_cot >> pt_expand_query) >> bm25
pipeline_llama_sq_fs = (llama_sq_fs >> pt_expand_query) >> bm25
pipeline_llama_sq_zs = (llama_sq_zs >> pt_expand_query) >> bm25



### Evaluation

In [10]:
pt.Experiment(
    retr_systems=[bm25, bm25_rm3, bm25_kl, pipeline_gpt_cot, pipeline_gpt_sq_fs, pipeline_gpt_sq_zs, pipeline_llama_cot, pipeline_llama_sq_fs, pipeline_llama_sq_zs],
    topics=pt_dataset.get_topics('text'),
    qrels=pt_dataset.get_qrels(),
    names=['BM25', 'BM25+RM3', 'BM25+KL', 'BM25+GPT-COT', 'BM25+GPT-SQ-FS', 'BM25+GPT-SQ-ZS', 'BM25+Llama-COT', 'BM25+Llama-SQ-FS', 'BM25+Llama-SQ-ZS'],
    eval_metrics=['recall_1000', 'ndcg_cut_5', 'ndcg_cut.10', 'recip_rank']
)

15:28:08.760 [main] WARN org.terrier.querying.RM1 - Did not identify any usable candidate expansion terms from docid 125137 among 6 possibilities
15:28:09.058 [main] WARN org.terrier.querying.RM1 - Did not identify any usable candidate expansion terms from docid 116910 among 5 possibilities


Unnamed: 0,name,recall_1000,ndcg_cut_5,ndcg_cut.10,recip_rank
0,BM25,0.825376,0.39365,0.374041,0.579877
1,BM25+RM3,0.807109,0.340194,0.324716,0.543215
2,BM25+KL,0.821885,0.351864,0.347616,0.549747
3,BM25+GPT-COT,0.835668,0.390734,0.381299,0.578489
4,BM25+GPT-SQ-FS,0.839683,0.360956,0.351081,0.563426
5,BM25+GPT-SQ-ZS,0.841613,0.37777,0.350431,0.555526
6,BM25+Llama-COT,0.834232,0.369012,0.352517,0.557993
7,BM25+Llama-SQ-FS,0.842011,0.368341,0.357767,0.532263
8,BM25+Llama-SQ-ZS,0.838754,0.389072,0.370304,0.589386
