In [12]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
import pandas as pd
import re
import openai
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


In [13]:
# ensure_pyterrier_is_loaded() is called first so that the `pt.*` calls in the
# following cells run against a loaded PyTerrier.
# Then create a REST client to the TIRA platform for retrieving the pre-indexed data.
# NOTE(review): `tira` is not referenced by any other cell visible in this notebook.
ensure_pyterrier_is_loaded()
tira = Client()

In [14]:
# Dataset handle for the "-training" variant of the ir-lab-sose-2024 ACL Anthology
# collection; the topics used below are read from it.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

In [15]:
# Connect to the OpenAI API. No key is passed here, so the credentials must come
# from the client's default configuration rather than from this notebook.
client = openai.OpenAI()

In [16]:
def ask_gpt(prompt, model="gpt-4", temperature=0):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the only (user) message of the conversation.
        model: Chat model name forwarded to the API.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The `content` of the first choice in the API response.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [17]:
# Topics of the dataset, requested in their 'query' variant. The helper functions
# below access the result through its 'qid' and 'query' columns.
querys = pt_dataset.get_topics('query')

In [18]:
def extract_integers_from_qid(df):
    """Return the values of the `qid` column of `df` as a list of integers."""
    qid_column = df['qid']
    qids_as_int = qid_column.astype(int)
    return qids_as_int.tolist()

In [19]:
def get_query_by_qid(df, qid):
    """Look up the query text stored for `qid`.

    `qid` is converted to a string before it is compared with the `qid` column.

    Returns:
        The `query` value of the first matching row, or None if no row matches.
    """
    matches = df.loc[df['qid'] == str(qid), 'query']
    if matches.empty:
        return None
    return matches.iloc[0]

In [20]:
def set_query_by_qid(df, qid, new_query):
    """Overwrite, in place, the `query` of every row of `df` whose `qid` equals str(qid)."""
    row_mask = df['qid'] == str(qid)
    df.loc[row_mask, 'query'] = new_query

In [21]:
# Integer ids of all topics in `querys`.
qid_list = extract_integers_from_qid(querys)

In [None]:
# Expand every topic with the bigrams GPT-4 detects in it.
# NOTE: this sends one API request per topic and overwrites the 'query' column
# of `querys` in place. Re-running the cell without re-creating `querys` would
# therefore feed already-expanded queries back into the prompt.
for i in qid_list:
    determine_ngrams = f""" 
    You are an scientific expert in the domain of Information Retrieval and linguistics. Your task is to detect whether
    a given query, which is given as a text below delimited by triple quotes, contains bigrams. This means, you should check for all 
    bigrams in the query, if they are an existing term consisting of multiple words. Then, your answer should be the original query 
    with all the bigrams you found appended in the format word1$$word2. Your answer should only include the query and the bigrams, no additional information.
    This means that when there are no existing bigrams in the query, your answer should just be the original query. You should not wrap your answer in quotation marks.
    For example given a query 'usage of machine learning in image recognition' you should answer
      'usage of machine learning in image recognition machine$$learning image$$recognition'.

    query: '''{get_query_by_qid(querys, i)}'''
    """
    answer = ask_gpt(prompt=determine_ngrams)
    set_query_by_qid(querys, i, answer)

# Snapshot the rewritten topics as plain row lists. This is taken only after the
# loop, since a snapshot made before it would be overwritten here anyway.
expanded_queries_list = querys.values.tolist()
print(expanded_queries_list)

In [None]:
# Keep only the first two fields of every row, as a tuple.
data_expd = [(entry[0], entry[1]) for entry in expanded_queries_list]
print(data_expd)

In [42]:
expanded_queries = [['1', 'retrieval system improving effectiveness', 'retrieval system improving effectiveness retrieval$$system'], ['2', 'machine learning language identification', 'machine learning language identification machine$$learning language$$identification'], ['3', 'social media detect self harm', 'social media detect self harm social$$media self$$harm'], ['4', 'stemming for arabic languages', 'stemming for arabic languages arabic$$languages'], ['5', 'audio based animal recognition', 'audio based animal recognition audio$$based animal$$recognition'], ['6', 'comparison different retrieval models', 'comparison different retrieval models retrieval$$models'], ['7', 'cache architecture', 'cache architecture'], ['8', 'document scoping formula', 'document scoping formula document$$scoping'], ['9', 'pseudo relevance feedback', 'pseudo relevance feedback pseudo$$relevance$$feedback pseudo$$relevance relevance$$feedback'], ['10', 'how to represent natural conversations in word nets', 'how to represent natural conversations in word nets natural$$conversations word$$nets'], ['11', 'algorithm acceleration with nvidia cuda', 'algorithm acceleration with nvidia cuda nvidia$$cuda'], ['12', 'mention of algorithm', 'mention of algorithm'], ['13', 'at least three authors', 'at least three authors three$$authors'], ['14', 'german domain', 'german domain german$$domain'], ['15', 'mention of open source', 'mention of open source open$$source'], ['16', 'inclusion of text mining', 'inclusion of text mining text$$mining'], ['17', 'the ethics of artificial intelligence', 'the ethics of artificial intelligence artificial$$intelligence'], ['19', 'machine learning for more relevant results', 'machine learning for more relevant results machine$$learning relevant$$results'], ['20', 'crawling websites using machine learning', 'crawling websites using machine learning machine$$learning'], ['21', 'recommenders influence on users', 'recommenders influence on users'], ['22', 'search engine 
caching effects', 'search engine caching effects search$$engine caching$$effects'], ['23', 'consumer product reviews', 'consumer product reviews consumer$$product product$$reviews'], ['24', 'limitations machine learning', 'limitations machine learning machine$$learning'], ['25', 'medicine related research', 'medicine related research medicine$$related related$$research'], ['26', 'natural language processing', 'natural language processing natural$$language$$processing natural$$language'], ['27', 'graph based ranking', 'graph based ranking graph$$based based$$ranking'], ['28', 'medical studies that use information retrieval', 'medical studies that use information retrieval information$$retrieval'], ['29', 'information retrieval on different language sources', 'information retrieval on different language sources information$$retrieval language$$sources'], ['30', 'papers that compare multiple information retrieval methods', 'papers that compare multiple information retrieval methods information$$retrieval'], ['31', 'risks of information retrieval in social media', 'risks of information retrieval in social media information$$retrieval social$$media'], ['32', 'actual experiments that strengthen theoretical knowledge', 'actual experiments that strengthen theoretical knowledge theoretical$$knowledge'], ['33', 'fake news detection', 'fake news detection fake$$news news$$detection'], ['34', 'multimedia retrieval', 'multimedia retrieval multimedia$$retrieval'], ['35', 'processing natural language for information retrieval', 'processing natural language for information retrieval natural$$language information$$retrieval'], ['36', 'recommendation systems', 'recommendation systems recommendation$$systems'], ['37', 'personalised search in e commerce', 'personalised search in e commerce personalised$$search e$$commerce'], ['38', 'sentiment analysis', 'sentiment analysis sentiment$$analysis'], ['39', 'informational retrieval using neural networks', 'informational retrieval using 
neural networks informational$$retrieval neural$$networks'], ['40', 'query log analysis', 'query log analysis query$$log log$$analysis'], ['41', 'entity recognition', 'entity recognition entity$$recognition'], ['42', 'relevance assessments', 'relevance assessments relevance$$assessments'], ['43', 'deep neural networks', 'deep neural networks deep$$neural neural$$networks'], ['44', 'information retrieval', 'information retrieval information$$retrieval'], ['45', 'analysis for android apps', 'analysis for android apps android$$apps'], ['46', 'the university of amsterdam', 'the university of amsterdam university$$of of$$amsterdam'], ['47', 'neural ranking for ecommerce product search', 'neural ranking for ecommerce product search neural$$ranking product$$search'], ['48', 'web pages evolution', 'web pages evolution web$$pages'], ['49', 'exhaustivity of index', 'exhaustivity of index'], ['50', 'query optimization', 'query optimization query$$optimization'], ['51', 'cosine similarity vector', 'cosine similarity vector cosine$$similarity'], ['52', 'reverse indexing', 'reverse indexing reverse$$indexing'], ['53', 'index compression techniques', 'index compression techniques index$$compression compression$$techniques'], ['54', 'search engine optimization with query logs', 'search engine optimization with query logs search$$engine engine$$optimization query$$logs'], ['55', 'bm25', 'bm25'], ['56', 'what makes natural language processing natural', 'what makes natural language processing natural natural$$language$$processing natural$$language'], ['57', 'principle of a information retrieval indexing', 'principle of a information retrieval indexing information$$retrieval'], ['58', 'architecture of web search engine', 'architecture of web search engine web$$search search$$engine'], ['59', 'what is ahp', 'what is ahp'], ['60', 'what is information retrieval', 'what is information retrieval information$$retrieval'], ['61', 'efficient retrieval algorithms', 'efficient retrieval 
algorithms retrieval$$algorithms'], ['62', 'how to avoid spam results', 'how to avoid spam results spam$$results'], ['63', 'information retrieval with algorithms', 'information retrieval with algorithms information$$retrieval'], ['64', 'misspellings in queries', 'misspellings in queries'], ['65', 'information in different language', 'information in different language different$$language'], ['66', 'abbreviations in queries', 'abbreviations in queries'], ['67', 'lemmatization algorithms', 'lemmatization algorithms'], ['68', 'filter ad rich documents', 'filter ad rich documents ad$$rich'], ['18', 'advancements in information retrieval', 'advancements in information retrieval information$$retrieval']]
print(expanded_queries)

# Pair each qid (field 0) with its expanded query text (field 2); field 1 is skipped.
data_expd = [(entry[0], entry[2]) for entry in expanded_queries]

# Two-column DataFrame ('qid', 'query') built from those pairs.
df_expd = pd.DataFrame(data_expd, columns=['qid', 'query'])

[['1', 'retrieval system improving effectiveness', 'retrieval system improving effectiveness retrieval$$system'], ['2', 'machine learning language identification', 'machine learning language identification machine$$learning language$$identification'], ['3', 'social media detect self harm', 'social media detect self harm social$$media self$$harm'], ['4', 'stemming for arabic languages', 'stemming for arabic languages arabic$$languages'], ['5', 'audio based animal recognition', 'audio based animal recognition audio$$based animal$$recognition'], ['6', 'comparison different retrieval models', 'comparison different retrieval models retrieval$$models'], ['7', 'cache architecture', 'cache architecture'], ['8', 'document scoping formula', 'document scoping formula document$$scoping'], ['9', 'pseudo relevance feedback', 'pseudo relevance feedback pseudo$$relevance$$feedback pseudo$$relevance relevance$$feedback'], ['10', 'how to represent natural conversations in word nets', 'how to represent n

In [44]:
# Show the rows at positions 60 up to (but excluding) 70; the frame ends earlier, so fewer rows are displayed.
df_expd.iloc[60:70]

Unnamed: 0,qid,query
60,62,how to avoid spam results spam$$results
61,63,information retrieval with algorithms informat...
62,64,misspellings in queries
63,65,information in different language different$$l...
64,66,abbreviations in queries
65,67,lemmatization algorithms
66,68,filter ad rich documents ad$$rich
67,18,advancements in information retrieval informat...


### Clean the text

In [45]:
def clean_text_with_dollar_signs(text, return_as_list=False, keep_dollar_signs=False):
    """Replace every character that is not a letter, digit or whitespace by a space.

    Args:
        text: Input string. Newlines are turned into spaces first.
        return_as_list: If True, return the whitespace-separated tokens as a
            list instead of the cleaned string.
        keep_dollar_signs: If True, every '$$' pair (the joiner of n-gram
            tokens such as 'machine$$learning') is preserved; any other '$'
            is still replaced by a space.

    Returns:
        The cleaned string, or the list of its tokens when `return_as_list`
        is True.
    """
    text = text.replace('\n', ' ')

    if keep_dollar_signs:
        # '$$' is listed before the generic character class, so a marker pair
        # is matched as a unit and written back unchanged. Doing this in one
        # pass avoids routing the marker through a placeholder word, which
        # would also turn a literal occurrence of that word into '$$'.
        cleaned_text = re.sub(
            r'\$\$|[^a-zA-Z0-9\s]',
            lambda match: '$$' if match.group(0) == '$$' else ' ',
            text,
        )
    else:
        cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    return cleaned_text.split() if return_as_list else cleaned_text

In [46]:
# First query-rewriting step of the pipeline: pt.rewrite.tokenise is given a callable
# that maps a query string to a list of tokens. Here punctuation is stripped while
# the '$$' joiners of the n-gram tokens are kept.
transf_clean_text = pt.rewrite.tokenise(lambda query: clean_text_with_dollar_signs(query, return_as_list=True, keep_dollar_signs=True))

### Remove stopwords

In [47]:
# Directory the 'stopwords' corpus is downloaded into.
NLTK_DATA_DIR = "/usr/nltk_data"

# Make sure NLTK searches the directory the corpus is downloaded to. The
# previously configured location is kept on the search path as well. The
# membership test avoids appending duplicates when the cell is re-run.
for data_dir in ("/usr/local/nltk_data", NLTK_DATA_DIR):
    if data_dir not in nltk.data.path:
        nltk.data.path.append(data_dir)

# Download 'stopwords' corpus to the specified directory
nltk.download('stopwords', download_dir=NLTK_DATA_DIR)

# Get the set of stopwords for the English language
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
def remove_stopwords_with_dollar_signs(text, return_as_list=False):
    """Drop stopwords from the whitespace-separated tokens of `text`.

    A token containing '$$' is treated as an n-gram: it is dropped as a whole
    as soon as any of its '$$'-separated parts is a stopword. Tokens are
    lower-cased before they are looked up in the module-level `stop_words`.

    Returns:
        The surviving tokens joined by single spaces, or as a list when
        `return_as_list` is True.
    """
    def is_kept(token):
        # A plain word splits into a single part, so one rule covers both
        # ordinary words and '$$'-joined n-grams.
        return not any(piece.lower() in stop_words for piece in token.split('$$'))

    kept_tokens = [token for token in text.split() if is_kept(token)]
    return kept_tokens if return_as_list else ' '.join(kept_tokens)

In [49]:
# Second query-rewriting step: stopword removal that drops an n-gram token entirely
# when one of its parts is a stopword. The callable returns the tokens as a list.
transf_remove_stopwords = pt.rewrite.tokenise(lambda query: remove_stopwords_with_dollar_signs(query, return_as_list=True))

### Stemming

In [50]:
# English Snowball stemmer; stem_text_with_dollar_signs reads this module-level object.
stemmer = SnowballStemmer('english')

In [51]:
def stem_text_with_dollar_signs(text, return_as_list=False):
    """Stem every whitespace-separated token of `text` with the module-level `stemmer`.

    A token containing '$$' is treated as an n-gram: each '$$'-separated part
    is stemmed on its own and the parts are joined with '$$' again.

    Returns:
        The stemmed tokens joined by single spaces, or as a list when
        `return_as_list` is True.
    """
    def stem_token(token):
        if '$$' not in token:
            return stemmer.stem(token)
        return '$$'.join(stemmer.stem(piece) for piece in token.split('$$'))

    stemmed_tokens = [stem_token(token) for token in text.split()]
    return stemmed_tokens if return_as_list else ' '.join(stemmed_tokens)

In [52]:
# Third query-rewriting step: stem each token, handling the parts of '$$'-joined
# n-grams separately. The callable returns the tokens as a list.
transf_stem_text = pt.rewrite.tokenise(lambda query: stem_text_with_dollar_signs(query, return_as_list=True))

### Pipeline

In [53]:
# Open the index described by ./ngramindex/data.properties. No visible cell of this
# notebook builds that index, so it has to exist before this cell is run.
index_factory = pt.IndexFactory.of("./ngramindex/data.properties")

In [54]:
# This transformer will do the retrieval using bm25, and explicitly not apply any stemming and stopword removal
# ("termpipelines" is set to the empty string): cleaning, stopword removal and stemming are done by the
# custom query transformers chained in front of it, which keep the '$$' n-gram tokens intact.
# Only the "bm25.b" control is overridden (0.2); the "bm25.k_1" override at the end of the line is commented out.
bm25 = pt.BatchRetrieve(index_factory, wmodel="BM25", verbose = True, properties={"termpipelines" : ""}, controls={"bm25.b": 0.2})#, "bm25.k_1": 0.1})

# This is our retrieval pipeline: clean text -> remove stopwords -> stem -> BM25 retrieval
retr_pipeline = transf_clean_text >> transf_remove_stopwords >> transf_stem_text >> bm25

In [59]:
# Run the whole pipeline on the expanded queries in df_expd ('qid' and 'query' columns).
run = retr_pipeline(df_expd)

BR(BM25):  15%|█▍        | 10/68 [00:00<00:01, 42.46q/s]

BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 49.19q/s]


In [60]:
# Store the run under the system name 'ngrams'; './runs/' is passed as the default output location.
persist_and_normalize_run(run, system_name='ngrams', default_output='./runs/')

The run file is normalized outside the TIRA sandbox, I will store it at "./runs/".
Done. run file is stored under "./runs//run.txt".
