In [15]:

def q2d_few_shot_prompt(query, examples):
    """Build a few-shot prompt asking for a passage that answers ``query``.

    Args:
        query: Query text placed in the final, unanswered slot.
        examples: Iterable of mappings, each providing a ``'query_text'`` and
            a ``'doc_text'`` entry; they are rendered in iteration order.

    Returns:
        The prompt string: the instruction line, one Query/Passage pair per
        example, and a final Query line followed by an open ``Passage: ``
        stub left for completion.
    """
    header = "Write a passage that answers the given query:\n\n"
    demonstrations = "".join(
        f"Query: {shot['query_text']}\nPassage: {shot['doc_text']}\n\n"
        for shot in examples
    )
    return f"{header}{demonstrations}Query: {query}\nPassage: "

def q2e_few_shot_prompt(query, examples):
    """Build a few-shot prompt asking for a keyword list for ``query``.

    Args:
        query: Query text placed in the final, unanswered slot.
        examples: Mapping from example query text to its keywords; entries
            are rendered in the mapping's iteration order.

    Returns:
        The prompt string: the instruction line, one Query/Keywords pair per
        example, and a final Query line followed by an open ``Keywords: ``
        stub left for completion.
    """
    header = "Write a list of keywords for the given query:\n\n"
    demonstrations = "".join(
        f"Query: {shot_query}\nKeywords: {shot_keywords}\n\n"
        for shot_query, shot_keywords in examples.items()
    )
    return f"{header}{demonstrations}Query: {query}\nKeywords: "

In [4]:
import json
import random
query = "What is your name?"

# Load the preprocessed mapping; each key is a query text and each value is
# indexed below as a collection of candidate documents for that query.
with open('query_doc_dict.json', 'r') as file:
    query_doc_dict = json.load(file)

# Every key is a distinct query text, so sampling keys yields unique queries.
unique_queries = list(query_doc_dict.keys())

# Draw num_samples distinct queries to serve as few-shot examples.
num_samples = 3
selected_queries = random.sample(unique_queries, num_samples)

# Pair each sampled query with one randomly chosen document of its own.
selected_entries = []
for query_text in selected_queries:
    docs_for_query = query_doc_dict.get(query_text, [])
    if not docs_for_query:
        # Not expected for keys taken from the dictionary itself; fail loudly
        # instead of emitting an example without a passage.
        raise ValueError(f"No documents found for query_text: {query_text}")
    doc_text = random.choice(docs_for_query)
    selected_entries.append({"query_text": query_text, "doc_text": doc_text})

# Render the few-shot prompt from the sampled examples and show it.
prompt = q2d_few_shot_prompt(query, selected_entries)

print(prompt)

Write a passage that answers the given query:

Query: Algorithm acceleration with Nvidia CUDA
Passage: Extraction of topic evolutions from references in scientific articles and its GPU acceleration. This paper provides a topic model for extracting topic evolutions as a corpus-wide transition matrix among latent topics. Recent trends in text mining point to a high demand for exploiting metadata. Especially, exploitation of reference relationships among documents induced by hyperlinking Web pages, citing scientific articles, tumblring blog posts, retweeting tweets, etc., is put in the foreground of the effort for an effective mining. We focus on scholarly activities and propose a topic model for obtaining a corpus-wide view on how research topics evolve along citation relationships. Our model, called TERESA, extends latent Dirichlet allocation (LDA) by introducing a corpus-wide topic transition probability matrix, which models reference relationships as transitions among topics. Our appr

In [7]:

# Load the examples stored in 'ms-marco_query_doc.json'; each sampled entry is
# later indexed by 'query_text' and 'doc_text' inside q2d_few_shot_prompt.
with open('ms-marco_query_doc.json', 'r') as file:
    ms_marco_query_doc = json.load(file)

# Randomly select num_samples examples for the prompt. This runs after the
# file has been closed, since only the parsed data is needed from here on.
selected_entries = random.sample(ms_marco_query_doc, num_samples)
prompt = q2d_few_shot_prompt(query, selected_entries)

print(prompt)

Write a passage that answers the given query:

Query: what company is primark owned by
Passage: High street clothes retailer Primark, which is owned by Associated British Foods, is always in and out of the news. Every year we hear stories of abuse at supplier factories, usually on the Indian sub continent. Despite selling own brand cosmetics Primark has no animal testing policy. The company says:  Primark is against animal testing. Primark and our own label manufacturers do not commission animal testing on any Primark own brand products or ingredients.

Query: what age should a child start wearing deodorant
Passage: deodorant but what deodorant to use. I have done a lot of research on when kids should start using deodorant. In my opinion if the kids are using a safe natural deodorant it should be whenever the children begin to develop odor that showering once a day will not contain. This can be 5 years old or earlier in some cases. 10 most extreme places on Earth. Children begin to sta

In [17]:
# Load the query -> keywords mapping that supplies the few-shot examples.
with open('query_keywords_dict.json', 'r') as file:
    query_keywords_dict = json.load(file)

# random.sample needs a sequence, so materialise the (query, keywords) pairs.
items = list(query_keywords_dict.items())

# Draw num_samples pairs without replacement and rebuild a dict from them,
# which is the shape q2e_few_shot_prompt iterates over.
sampled_pairs = dict(random.sample(items, k=num_samples))

prompt = q2e_few_shot_prompt(query, sampled_pairs)
print(prompt)


Write a list of keywords for the given query:

Query: what makes natural language processing natural
Keywords: make, natur, languag, process, decis, support, relat, debug, view, speech

Query: what is ahp
Keywords: ahp, system, decis, analyt, hierarchi, approach, elsevi, ltd, prefer, reserv

Query: audio based animal recognition
Keywords: audio, base, anim, recognit, biodivers, identif, plant, speci, snake, lifeclef, herbarium, teaser

Query: What is your name?
Keywords: 


In [21]:
#!/usr/bin/env python3
from tira.third_party_integrations import persist_and_normalize_run, ir_datasets
from tira.rest_api_client import Client

# Identifier of the dataset to load.
input_dataset = 'ir-lab-sose-2024/ir-acl-anthology-20240504-training'
# `ir_datasets` here is the name imported from tira.third_party_integrations
# in this cell, not a top-level `import ir_datasets`.
dataset = ir_datasets.load(input_dataset)
# Printing the handle shows its id and the parts it provides
# (the captured output lists docs, queries and qrels).
print(dataset)



Dataset(id='ir-lab-sose-2024/ir-acl-anthology-20240504-training', provides=['docs', 'queries', 'qrels'])


In [47]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
import re


# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
# ensure_pyterrier_is_loaded() is called before any pt.* function is used below.
ensure_pyterrier_is_loaded()
tira = Client()

# The dataset: the union of the IR Anthology and the ACL Anthology
# This line creates an IRDSDataset object and registers it under the name provided as an argument.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

# A (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

# Same dataset id as in the pt.get_dataset call above, without the 'irds:' prefix.
input_dataset = 'ir-lab-sose-2024/ir-acl-anthology-20240504-training'

# NOTE: `ir_datasets` is not imported in this cell; it must already be in the
# kernel namespace from the earlier tira.third_party_integrations import.
dataset = ir_datasets.load(input_dataset)

# docs_store is used below to look up a document by its docno.
docs_store = dataset.docs_store()
# Retriever over the loaded index using the "BM25" weighting model.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
# NOTE: this rebinds the notebook-level `query` used by the earlier prompt cells.
query = "stemming arabic"
# Result of a single-query search; indexed by its 'docno' column below.
# Presumably a ranked pandas DataFrame, best hit first -- TODO confirm.
prf = bm25.search(query)

# Extract the top num_samples docno
# (`num_samples` is not set in this cell; it comes from an earlier cell.)
top_k_docnos = prf['docno'][:num_samples].tolist()
# Fetch the text of each selected document from the document store.
prf_docs = [docs_store.get(docno).text for docno in top_k_docnos]

print(prf_docs)
print(len(prf_docs))


['Towards an error-free Arabic stemming\n\n\n ABSTRACTStemming is a computational process for reducing words to their roots (or stems). It can be classified as a recall-enhancing or precision-enhancing component.Existing Arabic stemmers suffer from high stemming error-rates. Arabic stemmers blindly stem all the words and perform poorly especially with compound words, nouns and foreign Arabized words.The Educated Text Stemmer (ETS) is presented in this paper. ETS is a dictionary free, simple, and highly effective Arabic stemming algorithm that can reduce stemming errors in addition to decreasing computational time and data storage.The novelty of the work arises from the use of neglected Arabic stop-words. These stop-words can be highly important and can provide a significant improvement to processing Arabic documents.The ETS stemmer is evaluated by comparison with output from human generated stemming and the stemming weight technique.', 'Impact of Stemmer on Arabic Text Retrieval\n\n\n 

In [50]:

# A header keyword in title case only counts when it is NOT the prefix of a
# longer lowercase word; otherwise "Abstractive" would be rewritten to ". ive"
# and "Introductions" to "s". The all-caps forms are left unguarded because a
# glued header is normally followed by a capital ("ABSTRACTStemming ...").
_ABSTRACT_HEADER = r'(ABSTRACT|Abstract(?![a-z]))'
_INTRODUCTION_HEADER = r'(INTRODUCTION|Introduction(?![a-z]))'

# Ordered (pattern, replacement) clean-up steps, compiled once instead of on
# every document. The order matters: later steps rely on earlier ones.
_CLEANUP_STEPS = (
    # Replace each run of newline characters with a single space.
    (re.compile(r'\n+'), ' '),
    # Drop a period that directly follows the abstract header.
    (re.compile(r'\b' + _ABSTRACT_HEADER + r'\s*\.\s*'), r'\1 '),
    # Separate the abstract header from text glued onto it.
    (re.compile(r'\b' + _ABSTRACT_HEADER + r'(\S)'), r'\1 \2'),
    # Replace the header (and surrounding whitespace) with a sentence break,
    # so the preceding text and the abstract read as two sentences.
    (re.compile(r'\s*' + _ABSTRACT_HEADER + r'\s*'), '. '),
    # Collapse any run of whitespace to a single space.
    (re.compile(r'\s+'), ' '),
    # Remove an introduction header that is glued onto the following text.
    (re.compile(r'\b' + _INTRODUCTION_HEADER + r'(\S)'), r'\2'),
)


def process_documents(docs):
    """Normalise raw document texts into single-line strings.

    Newlines and repeated whitespace are collapsed, an "ABSTRACT"/"Abstract"
    header is replaced by ". " so the text before it and the abstract become
    separate sentences, and an "INTRODUCTION"/"Introduction" header glued
    directly onto following text is removed.

    Title-case headers are ignored when they are the start of a longer
    lowercase word (e.g. "Abstractive", "Introductions"), so ordinary
    vocabulary is not mangled.

    NOTE: a standalone capitalised "Abstract" (as in "Abstract Meaning
    Representation") is still treated as a header; the patterns cannot tell
    the two apart.

    Args:
        docs: Iterable of document strings.

    Returns:
        A list with one cleaned, stripped string per input document, in the
        same order.
    """
    processed_docs = []
    for doc in docs:
        for pattern, replacement in _CLEANUP_STEPS:
            doc = pattern.sub(replacement, doc)
        processed_docs.append(doc.strip())

    return processed_docs

# Clean the feedback documents held in prf_docs.
processed_prf_docs = process_documents(prf_docs)
print(processed_prf_docs)
# Only the last expression of a cell is displayed automatically, so a bare
# `len(...)` in the middle of the cell is silently discarded; print it
# explicitly to actually show the count.
print(len(processed_prf_docs))

# Preview the first 10 characters of each cleaned document.
for doc in processed_prf_docs:
    print(doc[:10])


['Towards an error-free Arabic stemming. Stemming is a computational process for reducing words to their roots (or stems). It can be classified as a recall-enhancing or precision-enhancing component.Existing Arabic stemmers suffer from high stemming error-rates. Arabic stemmers blindly stem all the words and perform poorly especially with compound words, nouns and foreign Arabized words.The Educated Text Stemmer (ETS) is presented in this paper. ETS is a dictionary free, simple, and highly effective Arabic stemming algorithm that can reduce stemming errors in addition to decreasing computational time and data storage.The novelty of the work arises from the use of neglected Arabic stop-words. These stop-words can be highly important and can provide a significant improvement to processing Arabic documents.The ETS stemmer is evaluated by comparison with output from human generated stemming and the stemming weight technique.', 'Impact of Stemmer on Arabic Text Retrieval. Stemming is a proc