In [1]:
#imports
import openai
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt

In [2]:
# Module-level OpenAI client; `ask_gpt` uses it for every chat-completion call.
# No API key is passed explicitly, so no credential is hardcoded in the notebook.
# NOTE(review): presumably the SDK reads the key from the environment
# (e.g. OPENAI_API_KEY) — confirm before running on a fresh machine.
client = openai.OpenAI()

In [3]:
def ask_gpt(prompt, model="gpt-4", temperature=0):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``client`` to issue one chat-completion request.

    Args:
        prompt: Text placed in the only ``user`` message of the request.
        model: Chat model name forwarded to the API (default ``"gpt-4"``).
        temperature: Sampling temperature forwarded to the API (default ``0``).

    Returns:
        The ``content`` of the first returned choice as a string. When the API
        reports no text content (``None``), an empty string is returned so
        callers can safely chain string methods such as ``.lower()``.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    content = response.choices[0].message.content
    # `content` is nullable in the API schema; every caller in this notebook
    # immediately calls `.lower()` on the result, which would fail on None.
    return content if content is not None else ""

In [4]:
# Make sure PyTerrier is started before any `pt.*` call below
# (the cell output reports the PyTerrier/Terrier versions that were loaded).
ensure_pyterrier_is_loaded()
# REST client to the TIRA platform; used below to retrieve the pre-indexed data.
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [5]:
# The dataset: the union of the IR Anthology and the ACL Anthology
# (the dataset id below refers to the 2024-05-04 training version).
# This call creates an IRDSDataset object registered under the name passed as argument.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

# A (pre-built) PyTerrier index for this dataset, loaded from TIRA via the REST client.
# NOTE(review): `index` is not used by the cells visible here — presumably consumed
# by later retrieval cells; confirm it is still needed.
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

In [6]:
# Load the dataset's topics using the 'query' variant. The preview in the next
# cell shows the result has `qid` and `query` columns (presumably a pandas
# DataFrame — it is indexed by column name and row slice further down).
querys = pt_dataset.get_topics('query')

In [10]:
# Preview rows 50-59 of the topics: a mix of keyword queries and question-style
# queries ("what is ...") that the following cells try to tell apart.
# NOTE(review): this cell's execution count (10) sits between cells 6 and 7, i.e.
# it was run out of order — re-run the notebook top to bottom before sharing.
querys[50:60]

Unnamed: 0,qid,query
50,52,reverse indexing
51,53,index compression techniques
52,54,search engine optimization with query logs
53,55,bm25
54,56,what makes natural language processing natural
55,57,principle of a information retrieval indexing
56,58,architecture of web search engine
57,59,what is ahp
58,60,what is information retrieval
59,61,efficient retrieval algorithms


In [7]:
# Ask the LLM, once per topic, whether the query is phrased as a question.
# Result: `answers` maps qid (as str) -> True when the reply contains the word "yes".
answers = dict()
# Iterate over the column values directly. The previous `querys[col][i]` form is a
# label-based lookup and only works while the index happens to be exactly 0..n-1.
for raw_qid, query_text in zip(querys['qid'], querys['query']):
    question_prompt = f""" 
    You are an scientific expert especially in the domain of Information Retrieval. Your task is to detect whether
    a given query, which is given as a text below delimited by triple quotes, is a question. You can detect whether a query
    is a question if it contains question words like 'What' or 'Why'.
    For example given a query 'What is crx' you should answer yes, since the query contains 'What' which is a 
    question word. However if the given query is 'Information Retrieval' you should answer no, since there is no
    question word in the query.

    query: '''{query_text}'''
    """
    answer = ask_gpt(prompt=question_prompt)
    qid = str(raw_qid)
    # The model may reply with more than a bare yes/no. Match "yes" as a whole
    # word (surrounding punctuation stripped) rather than as a substring, so a
    # negative reply that merely quotes e.g. "bayes" or "yesterday" is not
    # misread as a positive one.
    reply_words = [word.strip(".,:;!?'\"`*()[]") for word in answer.lower().split()]
    answers[qid] = "yes" in reply_words
print(answers)

{'1': False, '2': False, '3': False, '4': False, '5': False, '6': False, '7': False, '8': False, '9': False, '10': False, '11': False, '12': False, '13': False, '14': False, '15': False, '16': False, '17': False, '19': False, '20': False, '21': False, '22': False, '23': False, '24': False, '25': False, '26': False, '27': False, '28': False, '29': False, '30': False, '31': False, '32': False, '33': False, '34': False, '35': False, '36': False, '37': False, '38': False, '39': False, '40': False, '41': False, '42': False, '43': False, '44': False, '45': False, '46': False, '47': False, '48': False, '49': False, '50': False, '51': False, '52': False, '53': False, '54': False, '55': False, '56': True, '57': False, '58': False, '59': True, '60': True, '61': False, '62': False, '63': False, '64': False, '65': False, '66': False, '67': False, '68': False, '18': False}


In [11]:
# For every topic classified as a question, ask the LLM for expanded queries.
# Build the qid -> query text lookup once instead of rescanning all topics for
# each question. qids are normalised with str() because that is how the keys
# of `answers` were built.
query_by_qid = {str(qid): text for qid, text in zip(querys['qid'], querys['query'])}

for key, is_question in answers.items():
    if is_question:
        #find query
        query = query_by_qid.get(key)
        if query is None:
            # `answers` and `querys` are out of sync (e.g. stale kernel state);
            # skip instead of silently reusing the query of a previous iteration.
            print("No query found for qid: ", key)
            continue
        print("Old query: ",query)
        #ask gpt to expand query
        expand = f""" 
        You are an expert in Information Retrieval.  I am building an retrieval System specificially for the scientific domain, just like Google Scholar I want to find scientific papers for a query,. I want to improve the precision of my retrieval system by expanding specific question querys. Please understand that the information need behind a question query is much different than for a keyword query. For example the intent behind a question query like "What is Deep Learning" much more likely is to find papers that focus on an introduction to deep learning and explains core concepts. Meanwhile the intent behind the keyword query "Deep Learning" is find any papers that focus on deep learning.
        Your task is to semanticly interpret a question query, which is below delimited by triple quotes, and return three to six expanded queries so I can gather the score for each query and then aggregate over the scores to calculate a final score. Here are some examples:
        For the question query 'What is deep learning' the intent is to find papers that introduce and explain deep learning. Therefore expanded querys could be:
        'Introduction to deep Learning'
        'Overview of deep Learning'
        'basic concepts of deep learning'
        For the question query 'Why use boolean retrieval model' the inent is to find papers that gve insights to reasons to apply the boolean retrieval model. Therefore expanded querys could be:
        'Reasons to use boolean retrieval model'
        'Introduction to boolean retrieval model'

        query: '''{query}'''
        """
        # Lower-case the reply and replace quote characters with spaces; the
        # result is one string holding the expanded queries, one per line.
        # NOTE(review): presumably the quotes are removed so the text can be fed
        # to the retrieval query parser — confirm against the downstream cell.
        new_query = ask_gpt(prompt=expand).lower().strip().replace("'", " ").replace('"', ' ')
        print("New query: ",new_query)

Old query:  what makes natural language processing natural
New query:   fundamentals of natural language processing 
 understanding natural language processing 
 concepts behind natural language processing 
 principles of natural language processing 
 nature of natural language processing 
Old query:  what is ahp
New query:   introduction to ahp 
 overview of ahp 
 basic concepts of ahp 
 understanding ahp 
 principles of ahp 
 fundamentals of ahp 
Old query:  what is information retrieval
New query:   introduction to information retrieval 
 overview of information retrieval 
 basic concepts of information retrieval 
 principles of information retrieval 
 understanding information retrieval 
 fundamentals of information retrieval 
