In [104]:
#imports
import openai
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt


In [105]:

import os
#openai.api_key = os.getenv("OPENAI_API_KEY_NEEDTHEGRADE")
client = openai.OpenAI() #connect to OpenAI API


In [106]:
def ask_gpt(prompt, model="gpt-4", temperature=0):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature
    )
    return response.choices[0].message.content

In [107]:
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

In [108]:
# The dataset: the union of the IR Anthology and the ACL Anthology
# This line creates an IRDSDataset object and registers it under the name provided as an argument.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

# A (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

In [109]:
querys = pt_dataset.get_topics('query')
expanded_queries_list = pt_dataset.get_topics('query').values.tolist()
print(expanded_queries_list)
print(expanded_queries_list[67][1])
print(len(expanded_queries_list))

[['1', 'retrieval system improving effectiveness'], ['2', 'machine learning language identification'], ['3', 'social media detect self harm'], ['4', 'stemming for arabic languages'], ['5', 'audio based animal recognition'], ['6', 'comparison different retrieval models'], ['7', 'cache architecture'], ['8', 'document scoping formula'], ['9', 'pseudo relevance feedback'], ['10', 'how to represent natural conversations in word nets'], ['11', 'algorithm acceleration with nvidia cuda'], ['12', 'mention of algorithm'], ['13', 'at least three authors'], ['14', 'german domain'], ['15', 'mention of open source'], ['16', 'inclusion of text mining'], ['17', 'the ethics of artificial intelligence'], ['19', 'machine learning for more relevant results'], ['20', 'crawling websites using machine learning'], ['21', 'recommenders influence on users'], ['22', 'search engine caching effects'], ['23', 'consumer product reviews'], ['24', 'limitations machine learning'], ['25', 'medicine related research'], [

## Identify question queries

In [110]:
#gpt goes through each query, question querys qid will be stored in answers (list)
answers = list()
for i in range(len(expanded_queries_list)):
#for i in range(0, 2):
    determine_abbrevation = f""" 
    You are an scientific expert especially in the domain of Information Retrieval. Your task is to detect whether
    a given query, which is given as a text below delimited by triple quotes, is a question. You can detect whether a query
    is a question if it contains question words like 'What' or 'Why'.
    For example given a query 'What is crx' you should answer yes, since the query contains 'What' which is a 
    question word. However if the given query is 'Information Retrieval' you should answer no, since there is no
    question word in the query.

    query: '''{querys['query'][i]}'''
    """

    determine_ngrams = f""" 
    You are an scientific expert in the domain of Information Retrieval and linguistics. Your task is to detect whether
    a given query, which is given as a text below delimited by triple quotes, contains bigrams or trigrams. This means, you should check for all 
    bigrams and trigrams in the query, if they are an existing term consisting of multiple words. Then, your answer should be the original query 
    with all the ngrams you found appended in the format word1$$word2. Your answer should only include the query and the bigrams, no additional information.
    This means that when there are no existing ngrams in the query, your answer should just be the original query. You should not wrap your answer in quotation marks.
    For example given a query 'usage of machine learning in image recognition' you should answer
      'usage of machine learning in image recognition machine$$learning image$$recognition'.

    query: '''{expanded_queries_list[i][1]}'''
    """
    answer = ask_gpt(prompt=determine_ngrams) #check answer more carefully perhaps model will return not only {yes,no}
    print(answer)
    expanded_queries_list[i].append(answer)

retrieval system improving effectiveness retrieval$$system
machine learning language identification machine$$learning language$$identification
social media detect self harm social$$media self$$harm
stemming for arabic languages arabic$$languages
audio based animal recognition audio$$based animal$$recognition
comparison different retrieval models retrieval$$models
cache architecture
document scoping formula document$$scoping
pseudo relevance feedback pseudo$$relevance$$feedback pseudo$$relevance relevance$$feedback
how to represent natural conversations in word nets natural$$conversations word$$nets
algorithm acceleration with nvidia cuda nvidia$$cuda
mention of algorithm
at least three authors three$$authors
german domain german$$domain
mention of open source open$$source
inclusion of text mining text$$mining
the ethics of artificial intelligence artificial$$intelligence
machine learning for more relevant results machine$$learning relevant$$results
crawling websites using machine learn

In [111]:
print(expanded_queries_list)

[['1', 'retrieval system improving effectiveness', 'retrieval system improving effectiveness retrieval$$system'], ['2', 'machine learning language identification', 'machine learning language identification machine$$learning language$$identification'], ['3', 'social media detect self harm', 'social media detect self harm social$$media self$$harm'], ['4', 'stemming for arabic languages', 'stemming for arabic languages arabic$$languages'], ['5', 'audio based animal recognition', 'audio based animal recognition audio$$based animal$$recognition'], ['6', 'comparison different retrieval models', 'comparison different retrieval models retrieval$$models'], ['7', 'cache architecture', 'cache architecture'], ['8', 'document scoping formula', 'document scoping formula document$$scoping'], ['9', 'pseudo relevance feedback', 'pseudo relevance feedback pseudo$$relevance$$feedback pseudo$$relevance relevance$$feedback'], ['10', 'how to represent natural conversations in word nets', 'how to represent n

### Cut question queries out of dataframe

In [112]:
for i in range(len(querys)):
    if querys['qid'][i] in answers:
        print(querys['qid'][i])
        index = querys[querys['qid'] == querys['qid'][i]].index
        querys = querys.drop(index)

In [113]:
#illustrate querys have been deleted
querys[53:57]

Unnamed: 0,qid,query
53,55,bm25
54,56,what makes natural language processing natural
55,57,principle of a information retrieval indexing
56,58,architecture of web search engine


rank queries with bm25

In [114]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [115]:
#create first ranking, will later 'merge' with run2 which contains ranked question querys 
run1 = bm25(querys)
run1.head(3)

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,94858,2004.cikm_conference-2004.47,0,15.681777,retrieval system improving effectiveness
1,1,125137,1989.ipm_journal-ir0volumeA25A4.2,1,15.04738,retrieval system improving effectiveness
2,1,125817,2005.ipm_journal-ir0volumeA41A5.11,2,14.144223,retrieval system improving effectiveness


## rank question queries seperately

In [116]:
#reset querys since now we need to find the previous deleted querys
querys = pt_dataset.get_topics('query')

In [117]:
questions = list()
for qid in answers:
    for i in range(len(querys)):
        if querys['qid'][i] == qid:
            query= querys['query'][i]
            print("Old query: ",query)
            #ask gpt to expand query
            expand = f""" 
            You are an expert in Information Retrieval.  I am building an retrieval System specificially for the scientific domain, just like Google Scholar I want to find scientific papers for a query,. I want to improve the precision of my retrieval system by expanding specific question querys. Please understand that the information need behind a question query is much different than for a keyword query. For example the intent behind a question query like "What is Deep Learning" much more likely is to find papers that focus on an introduction to deep learning and explains core concepts. Meanwhile the intent behind the keyword query "Deep Learning" is find any papers that focus on deep learning.
            Your task is to semanticly interpret a question query, which is below delimited by triple quotes, and return three to six expanded queries so I can gather the score for each query and then aggregate over the scores to calculate a final score. Here are some examples:
            For the question query 'What is deep learning' the intent is to find papers that introduce and explain deep learning. At the end of each query please add a '$'.
            Therefore expanded querys could be:
            'Introduction to deep Learning$'
            'Overview of deep Learning$'
            'basic concepts of deep learning$'
            For the question query 'Why use boolean retrieval model' the inent is to find papers that gve insights to reasons to apply the boolean retrieval model. Therefore expanded querys could be:
            'Reasons to use boolean retrieval model$'
            'Introduction to boolean retrieval model$'

            query: '''{query}'''
            """
            new_query = ask_gpt(prompt=expand).lower().strip().replace("'", " ").replace('"', ' ')
            print("New query: ",new_query)
            questions.append({'qid':qid,'query': query,'subqueries': new_query})

In [118]:
print(questions)

[]


In [119]:
#temporary
questions = [{'qid': '56', 'query': 'what makes natural language processing natural', 'subqueries': ' fundamentals of natural language processing$ \n understanding natural language processing$ \n concepts behind natural language processing$ \n principles of natural language processing$ \n introduction to natural language processing$ '}, {'qid': '59', 'query': 'what is ahp', 'subqueries': ' introduction to ahp$ \n overview of ahp$ \n basic concepts of ahp$ \n understanding ahp$ \n principles of ahp$ '}, {'qid': '60', 'query': 'what is information retrieval', 'subqueries': ' introduction to information retrieval$ \n overview of information retrieval$ \n basic concepts of information retrieval$ \n principles of information retrieval$ \n understanding information retrieval$ '}]

In [120]:
for item in questions:
    item['subqueries'] = item['subqueries'].replace('\n', ' ').split('$')
    del item['subqueries'][-1] #last item in list would be empty string, remove it
print(questions)

[{'qid': '56', 'query': 'what makes natural language processing natural', 'subqueries': [' fundamentals of natural language processing', '   understanding natural language processing', '   concepts behind natural language processing', '   principles of natural language processing', '   introduction to natural language processing']}, {'qid': '59', 'query': 'what is ahp', 'subqueries': [' introduction to ahp', '   overview of ahp', '   basic concepts of ahp', '   understanding ahp', '   principles of ahp']}, {'qid': '60', 'query': 'what is information retrieval', 'subqueries': [' introduction to information retrieval', '   overview of information retrieval', '   basic concepts of information retrieval', '   principles of information retrieval', '   understanding information retrieval']}]


now do the ranking of each subquery


In [121]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [122]:
import pandas as pd
final_score= list()
for query in questions: #iterate list to get dict of query
    qscores = list()
    qid = query['qid']
    old_query = query['query']
    for i,subquery in enumerate(query['subqueries']): #iterate through subqueries key of dict
        results = bm25.search(subquery) #rank each subquery, returns pd dataframe
        for index, row in results.iterrows(): #iterate through dataframe and format it
            docid = row['docid']
            docno = row['docno']
            score = row['score']
            df_row = {'qid':qid, 'docid':docid, 'docno':docno, 'rank': index, 'score': score, 'query': subquery}
            qscores.append(df_row)
    final_score.append(qscores)
df = pd.DataFrame(final_score[0])
print(len(df))

5000


In [123]:
#construct dataframe for each query
dataframes = list()
for i in range(len(final_score)):
    dataframes.append(pd.DataFrame(final_score[i]))

In [124]:
dataframes[1].head(3)

Unnamed: 0,qid,docid,docno,rank,score,query
0,59,3867,2009.mtsummit-plenaries.10,0,11.694037,introduction to ahp
1,59,22155,1997.eamt-1.1,1,11.694037,introduction to ahp
2,59,28715,2014.lilt-9.1,2,11.694037,introduction to ahp


In [125]:
new_dataframes = list()
for j in range(len(dataframes)):
    dataframe = dataframes[j]
    #print("DATAFRAME:",dataframe[:1])
    query_dataframe = list()
    seen_docnos = list()
    for index, row in dataframe.iterrows(): 
        #print(row)
        docno = row['docno']
        if docno not in seen_docnos:
            seen_docnos.append(docno)
            indices = dataframe.index[dataframe['docno'] == docno].tolist()
            #print(indices)
            sum_score = 0
            for i in indices:
                score = df.loc[i, 'score']
                sum_score += score
            query_dataframe.append({'qid':dataframe.loc[indices[0], 'qid'], 'docid': dataframe.loc[indices[0], 'docid'], 'docno': docno,'rank':0, 'score': sum_score, 'query': dataframe.loc[indices[0], 'query']})
    new_dataframes.append(query_dataframe)

In [126]:
print(new_dataframes[2])

[{'qid': '60', 'docid': 117725, 'docno': '2010.jasis_journal-ir0volumeA61A4.16', 'rank': 0, 'score': 18.680955810882573, 'query': ' introduction to information retrieval'}, {'qid': '60', 'docid': 119268, 'docno': '1992.jasis_journal-ir0volumeA43A2.6', 'rank': 0, 'score': 18.6628273388435, 'query': ' introduction to information retrieval'}, {'qid': '60', 'docid': 126763, 'docno': '2005.tois_journal-ir0volumeA23A1.0', 'rank': 0, 'score': 18.205315687878688, 'query': ' introduction to information retrieval'}, {'qid': '60', 'docid': 112767, 'docno': '2006.ir_journal-ir0volumeA9A2.0', 'rank': 0, 'score': 18.021716906196797, 'query': ' introduction to information retrieval'}, {'qid': '60', 'docid': 117446, 'docno': '1996.jasis_journal-ir0volumeA47A4.0', 'rank': 0, 'score': 17.55207118980423, 'query': ' introduction to information retrieval'}, {'qid': '60', 'docid': 123246, 'docno': '1999.ipm_journal-ir0volumeA35A4.0', 'rank': 0, 'score': 17.302915888802687, 'query': ' introduction to informa

In [127]:
#hier noch die liste absteigend sortieren und demenstsprechend den rank updaten

In [128]:
dfs = list()
for i in range(len(new_dataframes)):
    dfs.append(pd.DataFrame(new_dataframes[i]))

In [129]:
#format dataframe
sorted_dfs = list()
for df in dfs:
    sort_df = df.sort_values(by='score', ascending=False)
    sort_df.reset_index(drop=True, inplace=True)
    sort_df['rank'] = range(len(sort_df))
    sorted_dfs.append(sort_df)
    print(sort_df.head(3))

  qid  docid     docno  rank      score  \
0  56  64770  J86-2001     0  73.413873   
1  56  65521  J95-3006     1  69.230891   
2  56  47348  N03-5001     2  69.113129   

                                          query  
0   fundamentals of natural language processing  
1   fundamentals of natural language processing  
2   fundamentals of natural language processing  
  qid   docid                                      docno  rank      score  \
0  59  124882          2006.ipm_journal-ir0volumeA42A4.0     0  67.893637   
1  59  114140  2018.wwwjournals_journal-ir0volumeA21A1.3     1  56.630256   
2  59  112550          2013.tist_journal-ir0volumeA4A4.0     2  47.144506   

                  query  
0   introduction to ahp  
1   introduction to ahp  
2   introduction to ahp  
  qid  docid                               docno  rank      score  \
0  60  82311  2010.sigirconf_conference-2010.199     0  52.533601   
1  60  81648  2018.sigirconf_conference-2018.227     1  45.962207   
2  60  

In [130]:
#questions datastructure:
#[{'qid': '56', 'query': 'what makes natural language processing natural', 'subqueries': [' fundamentals of natural language processing', '   understandi

In [131]:
#todo: iterate through each dataframe and alter query to old_query -> compare over qid?
#for each dataframe iterate through rows
#get qid and see if match in questions, replace query column with query of questions

for df in sorted_dfs:
    for index, row in df.iterrows():
        qid = row['qid']
        for item in questions:
            if qid == item['qid']:
                old_query = item['query']
                #print(old_query)
                break
        df.at[index, 'query'] = old_query
        #print("ALTERED:",df.at[index, 'query'])
        


In [132]:
for i,df in enumerate(sorted_dfs):
    cut_df = df.iloc[:1000]
    sorted_dfs[i] = cut_df

In [133]:
dfs_combined = pd.concat(sorted_dfs, ignore_index=True)

In [134]:
len(dfs_combined)

3000

In [135]:
#ready to merge?

In [136]:
run2 = pd.concat([dfs_combined, run1], ignore_index=True)
len(run2)

69283

In [137]:
persist_and_normalize_run(run2, system_name='bm25-question-query', default_output='../runs')

The run file is normalized outside the TIRA sandbox, I will store it at "../runs".
Done. run file is stored under "../runs/run.txt".
