In [1]:
#imports
import openai
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt

In [2]:
# Shared OpenAI client used by ask_gpt. No credentials are passed here, so they are
# presumably picked up from the environment (e.g. OPENAI_API_KEY) -- TODO confirm.
client = openai.OpenAI() #connect to OpenAI API

In [3]:
def ask_gpt(prompt, model="gpt-4", temperature=0):
    """Send a single-turn user prompt to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the only (user) message of the conversation.
        model: Name of the chat model to query.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The text content of the first returned choice, or an empty string when the
        API returns no text content, so callers can always apply string methods.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    content = response.choices[0].message.content
    # message.content is optional in the API response; normalise a missing value to "".
    return content if content is not None else ""

In [4]:
# Make sure PyTerrier is loaded before any pt.* call further down.
ensure_pyterrier_is_loaded()
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [5]:
# The dataset: the union of the IR Anthology and the ACL Anthology
# This line creates an IRDSDataset object and registers it under the name provided as an argument.
# The returned object is reused below to fetch the topics and to resolve the index.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

# A (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

In [6]:
# Topics of the dataset as a DataFrame; the code below reads and rewrites its 'qid' and 'query' columns.
querys = pt_dataset.get_topics('query')

In [7]:
def extract_integers_from_qid(df):
    """Return the values of the ``qid`` column of ``df`` as a plain list of ints."""
    qid_column = df['qid']
    qids_as_int = qid_column.astype(int)
    return qids_as_int.tolist()

In [8]:
def get_query_by_qid(df, qid):
    """Look up the query text stored for ``qid``.

    ``qid`` is converted to a string before it is compared against the ``qid``
    column. Returns the ``query`` value of the first matching row, or ``None``
    when no row matches.
    """
    wanted_qid = str(qid)
    matching_queries = df.loc[df['qid'] == wanted_qid, 'query']
    if matching_queries.empty:
        return None
    return matching_queries.iloc[0]

In [9]:
def set_query_by_qid(df, qid, new_query):
    """Overwrite, in place, the ``query`` of every row whose ``qid`` equals ``str(qid)``."""
    row_selector = df['qid'] == str(qid)
    df.loc[row_selector, 'query'] = new_query

In [10]:
# Integer ids of all topics; iterated below to look up each query by its id.
qid_list = extract_integers_from_qid(querys)

In [11]:
# Ask the model, once per topic, whether the query contains an abbreviation.
# Result: answers maps qid (as str) -> True if the model said yes, else False.
# NOTE(review): the example in the prompt spells the abbreviation both 'crx' and 'cxr';
# the prompt text is left unchanged here so the model input stays identical.
answers = dict()
for i in qid_list:
    determine_abbrevation = f""" 
    You are an scientific expert especially in the domain of Information Retrieval. Your task is to detect whether
    a given query, which is given as a text below delimited by triple quotes, contains an abbrevation then answer with yes or not then answer with no.
    For example given a query 'What is crx' you should answer yes, since cxr is the abbrevation for the medical term
    'chest X-Ray'. However if the given query is 'What is Information Retrieval' you should answer no, since there is no
    abbrevation in the query.

    query: '''{get_query_by_qid(querys, i)}'''
    """
    answer = ask_gpt(prompt=determine_abbrevation)
    qid = str(i)
    # The model may return more than a bare yes/no. Match "yes" as a whole word
    # (punctuation stripped) instead of as a substring, so replies that merely
    # contain the letters "yes" inside another word are not counted as a yes.
    reply_words = [word.strip(".,;:!?'\"") for word in (answer or "").lower().split()]
    answers[qid] = "yes" in reply_words
print(answers)

{'1': False, '2': False, '3': False, '4': False, '5': False, '6': False, '7': False, '8': False, '9': False, '10': False, '11': True, '12': False, '13': False, '14': False, '15': False, '16': False, '17': False, '19': False, '20': False, '21': False, '22': False, '23': False, '24': False, '25': False, '26': False, '27': False, '28': False, '29': False, '30': False, '31': False, '32': False, '33': False, '34': False, '35': False, '36': False, '37': False, '38': False, '39': False, '40': False, '41': False, '42': False, '43': False, '44': False, '45': False, '46': False, '47': False, '48': False, '49': False, '50': False, '51': False, '52': False, '53': False, '54': False, '55': True, '56': False, '57': False, '58': False, '59': True, '60': False, '61': False, '62': False, '63': False, '64': False, '65': False, '66': False, '67': False, '68': False, '18': False}


In [12]:
# Expand every query that was flagged as containing an abbreviation: the model is asked
# to append the written-out form, and the result overwrites the query in `querys`.
# NOTE: `querys` is modified in place, so re-running this cell without reloading
# `querys` sends the already expanded queries to the model again.
# NOTE(review): the example in the prompt spells the abbreviation both 'crx' and 'cxr';
# the prompt text is left unchanged here so the model input stays identical.
for key, has_abbreviation in answers.items():
    if has_abbreviation:
        # Direct lookup by id; skip ids for which no query is stored.
        query = get_query_by_qid(querys, key)
        if query is None:
            continue
        print("Old query: ",query)
        #ask gpt to expand query
        expand = f""" 
        You are an scientific expert especially in the domain of Information Retrieval. Your are given a query, which is below
        delimited by triple quotes, which contains an abbrevation. Your task is to identify the abbrevation and write it, then
        concat the original query with the written out abbrevation and return this new query as string only.
        For example given a query 'What is crx' you should detect that the abbrevation is crx, since cxr is the abbrevation for the medical term
        'chest X-Ray', then you should concat the originial query with the abbrevation 'chest X-Ray' resulting in a new query 'What is crx chest x-ray' which
        you should return. Another example, given the query 'Algorithms of nlp' you should detect that the abbrevation is nlp, since nlp is the abbrevation
        for the term 'natural language processing', then you should concat the original query 'Algorithms of nlp' with the abbrevation 'natural language processing'  
        resulting in a new query 'Algorithms of nlp natural language processing' which you should return.
        Please only answer with the new query. So your answer should only include the original query and the detected abbreviated words and no additional information.
        Don't wrap your answer in quotation marks.

        query: '''{query}'''
        """
        # Lower-case the reply and replace quote characters with spaces before using it as a query.
        reply = ask_gpt(prompt=expand) or ""
        new_query = reply.lower().strip().replace("'", " ").replace('"', ' ')
        if not new_query:
            # Keep the original query rather than overwriting it with an empty string.
            continue
        print("New query: ",new_query)
        set_query_by_qid(querys, key, new_query) #overwrite old query

Old query:  algorithm acceleration with nvidia cuda
New query:  algorithm acceleration with nvidia cuda compute unified device architecture
Old query:  bm25
New query:  bm25 best match 25
Old query:  what is ahp
New query:  what is ahp analytic hierarchy process


In [15]:
# BM25 retriever over the index loaded from TIRA.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [16]:
# Retrieve for all topics in `querys`, including the queries that were overwritten with their expanded form.
run = bm25(querys)

In [21]:
# Peek at a fixed row window of the run; in the saved output these rows are the results for qid 59.
# NOTE(review): the saved output shows a 'system' column with the value 'abbrevations'; it
# presumably was added by persist_and_normalize_run, whose cell has a lower execution
# count than this one -- confirm with a fresh Restart & Run All.
run[57000:58000]

Unnamed: 0,qid,docid,docno,rank,score,query,system
57000,59,124882,2006.ipm_journal-ir0volumeA42A4.0,0,27.781898,what is ahp analytic hierarchy process,abbrevations
57001,59,114774,2017.ijirr_journal-ir0volumeA7A3.1,1,25.656730,what is ahp analytic hierarchy process,abbrevations
57002,59,118484,2013.jasis_journal-ir0volumeA64A7.12,2,22.011661,what is ahp analytic hierarchy process,abbrevations
57003,59,95978,2007.cikm_conference-2007.51,3,19.821424,what is ahp analytic hierarchy process,abbrevations
57004,59,114140,2018.wwwjournals_journal-ir0volumeA21A1.3,4,18.963078,what is ahp analytic hierarchy process,abbrevations
...,...,...,...,...,...,...,...
57995,59,38713,D17-1123,995,7.263059,what is ahp analytic hierarchy process,abbrevations
57996,59,58604,C90-3076,996,7.263059,what is ahp analytic hierarchy process,abbrevations
57997,59,81248,2004.sigirconf_conference-2004.62,997,7.263059,what is ahp analytic hierarchy process,abbrevations
57998,59,82476,1998.sigirconf_conference-98.19,998,7.263059,what is ahp analytic hierarchy process,abbrevations


In [19]:
# Persist the run under the system name 'abbrevations'; per the saved output it is written to ./runs/run.txt.
persist_and_normalize_run(run, system_name='abbrevations', default_output='./runs')

The run file is normalized outside the TIRA sandbox, I will store it at "./runs".
Done. run file is stored under "./runs/run.txt".


In [22]:
# Comparison run: same BM25 retriever, but on topics fetched fresh from the dataset
# (the saved output below shows the unexpanded query text, e.g. "what is ahp").
run2 = bm25(pt_dataset.get_topics('text'))

In [23]:
# Fixed row window around qid 59 in the comparison run; in the saved output the
# unexpanded query "what is ahp" has only three result rows (ranks 0-2).
run2[56279:56284]

Unnamed: 0,qid,docid,docno,rank,score,query
56279,58,113091,2005.ir_journal-ir0volumeA8A4.1,999,13.11225,architecture of web search engine
56280,59,124882,2006.ipm_journal-ir0volumeA42A4.0,0,10.066961,what is ahp
56281,59,114140,2018.wwwjournals_journal-ir0volumeA21A1.3,1,8.99716,what is ahp
56282,59,126037,2016.ipm_journal-ir0volumeA52A6.8,2,6.783162,what is ahp
56283,60,84131,2003.sigirconf_conference-2003.73,0,6.025493,what is information retrieval
