In [1]:
#expanding abbrevations in a query -> combine with question query if there is a question query which contains an abbrevation
#test phrase based expansion, expand to "This paper introduces Information Retrieval" -> set retrieval, boolean retrieval?
#adjust bm25 parameters
#adjust question prompt so that it catches the intent behind why questions better
#What and Why are probably most common question words -> specialize on them

Thesis

Setup

In [2]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
import pandas as pd
import re
from collections import Counter

In [3]:
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
# The dataset: the union of the IR Anthology and the ACL Anthology
# This line creates an IRDSDataset object and registers it under the name provided as an argument.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

# A (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/2024-05-04-16-05-53.zip
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 19.5M/19.5M [00:00<00:00, 60.4MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_runs/ir-lab-sose-2024/ir-acl-anthology-20240504-training/tira-ir-starter


## Build Ngram Index

In [5]:
docs =  pt_dataset.get_corpus_iter()
docs = list(docs)
count = sum(1 for _ in docs)
docs = docs[:126959]
print("Number of documents:", count)

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/ir-acl-anthology-20240504-inputs.zip?download=1
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 39.4M/39.4M [00:00<00:00, 59.6MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:02<00:00, 53248.46it/s]

Number of documents: 126958





In [6]:
#Method that removes all special characters from a String, and returns either a String or a list of all words
def clean_text(text, return_as_list = False):
    text = text.replace('\n', ' ')
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text) #remove non-alphanumeric characters, except spaces
    if return_as_list:
        word_list = cleaned_text.split()
        return word_list
    else:
        return cleaned_text

In [7]:
import nltk
from nltk.corpus import stopwords
import re

# Ensure NLTK data directory is set correctly
nltk.data.path.append("/usr/local/nltk_data")

# Download 'stopwords' corpus to the specified directory
nltk.download('stopwords', download_dir="/usr/nltk_data")

# Get the set of stopwords for the English language
stop_words = set(stopwords.words('english'))

def remove_stopwords(text, return_as_list = False):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Join the filtered words back into a single string

    if return_as_list:
        return filtered_words
    else:
        return ' '.join(filtered_words)

[nltk_data] Downloading package stopwords to /usr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')

def stem_text(text, return_as_list = False):
    words = text.split()

    stemmed_words = [stemmer.stem(word) for word in words]
    if return_as_list:
        return stemmed_words
    # Join the stemmed words back into a single string
    else:
        return ' '.join(stemmed_words)

In [9]:
#This is our ngram tokenizer. It takes a string and returns a dict of all ngrams, where each ngram is seperated by $$ so it will be parsed as one token

def tokenize_ngrams_to_dict(text, n1=1, n2=2):
    
    # Tokenize the text into words
    words = text.split(' ')
    words = [word for word in words if len(''.join(format(ord(c), '08b') for c in word)) <= 60]

    # Initialize an empty Counter to hold all n-grams
    all_ngram_counts = Counter()
    
    # Loop through each n from n1 to n2
    for n in range(n1, n2 + 1):
        # Generate n-grams for the current n
        ngrams = ['$$'.join(words[i:i+n]) for i in range(len(words)-n+1)]
        
        # Update the Counter with the current n-grams
        all_ngram_counts.update(ngrams)
    
    return dict(all_ngram_counts)

In [10]:
for doc in docs:
        if 'text' in doc:
            doc['text'] = clean_text(doc['text'])
            doc['text'] = remove_stopwords(doc['text'])
            doc['text'] = stem_text(doc['text'])
            
            doc_1gram = tokenize_ngrams_to_dict(doc['text'], n1=1, n2=2) # Apply n-gram tokenization to the dataset

            doc['toks'] = doc_1gram # create new toks field for tokenfrequency
            del doc['text']  #This will delete the 'text' field from the documents
    
for i, doc in enumerate(docs):
     if i == 3:
           break
     print(doc)

#remove all empty documents
docs = [d for d in docs if any(k != '' for k in d['toks'].keys())]

{'docno': 'O02-2002', 'toks': {'studi': 1, 'word': 7, 'similar': 8, 'use': 3, 'context': 5, 'vector': 3, 'model': 2, 'need': 1, 'measur': 2, 'process': 1, 'natur': 1, 'languag': 1, 'especi': 1, 'general': 1, 'classif': 1, 'exampl': 1, 'base': 3, 'usual': 1, 'two': 1, 'defin': 1, 'accord': 3, 'distanc': 1, 'semant': 6, 'class': 2, 'less': 1, 'consid': 1, 'syntact': 5, 'ie': 2, 'howev': 1, 'real': 1, 'applic': 1, 'requir': 1, 'weight': 1, 'differ': 1, 'mixtur': 1, 'paper': 1, 'propos': 1, 'relat': 1, 'co': 2, 'occurr': 2, 'adopt': 1, 'inform': 1, 'theoret': 1, 'solv': 1, 'problem': 1, 'data': 1, 'spars': 1, 'precis': 1, 'featur': 2, 'deriv': 1, 'pars': 1, 'environ': 1, 'adjust': 1, 'idf': 1, 'invers': 1, 'valu': 2, 'agglom': 1, 'cluster': 1, 'appli': 1, 'group': 2, 'turn': 1, 'togeth': 1, 'studi$$word': 1, 'word$$similar': 4, 'similar$$use': 1, 'use$$context': 1, 'context$$vector': 3, 'vector$$model': 1, 'model$$need': 1, 'need$$measur': 1, 'measur$$word': 1, 'similar$$process': 1, 'proc

In [11]:
# Initialize the IterDictIndexer with pretokenised set to True
iter_indexer = pt.IterDictIndexer("./ngramindex", overwrite=True, meta={'docno': 35}, pretokenised=True, verbose = True, type = pt.index.IndexingType.SINGLEPASS)

# Index our pretokenized documents
index_ref = iter_indexer.index(docs)

index_ngram = pt.IndexFactory.of(index_ref)

#Print some stats about our index
print(index_ngram.getCollectionStatistics())

# Access the MetaIndex and Lexicon
meta = index_ngram.getMetaIndex()
lexicon = index_ngram.getLexicon()


i = 0
for term, le in index.getLexicon():
    i = i+1
    if i == 5:
        break
    print(term) 
    print(le.getFrequency())

Number of documents: 126824
Number of terms: 1735213
Number of postings: 11740755
Number of fields: 0
Number of tokens: 14688164
Field names: []
Positions:   false

0
9406
00
278
000
1761
0001
14


## Query Analysis

In [12]:
import openai
client = openai.OpenAI() #connect to OpenAI API

In [13]:
def ask_gpt(prompt, model="gpt-4", temperature=0):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature
    )
    return response.choices[0].message.content

Differentiate between question querys, querys that contain an abbrevation and keyword querys to treat each query properly

In [14]:
querys = pt_dataset.get_topics('query')

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/ir-acl-anthology-20240504-truth.zip?download=1
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 29.6k/29.6k [00:00<00:00, 1.49MiB/s]

Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/





Write out each detected abbrevation and concat it to the query in question

In [15]:
answers = dict()
for i in range(len(querys)):
    determine_abbrevation = f""" 
    You are an scientific expert especially in the domain of Information Retrieval. Your task is to detect whether
    a given query, which is given as a text below delimited by triple quotes, contains an abbrevation then answer with yes or not then answer with no.
    For example given a query 'What is crx' you should answer yes, since cxr is the abbrevation for the medical term
    'chest X-Ray'. However if the given query is 'What is Information Retrieval' you should answer no, since there is no
    abbrevation in the query.

    query: '''{querys['query'][i]}'''
    """
    answer = ask_gpt(prompt=determine_abbrevation) #check answer more carefully perhaps model will return not only {yes,no}
    #print(answer)
    qid = str(querys['qid'][i])
    if "yes" in answer.lower().strip():
        answers[qid] = True
    else:
        answers[qid] = False
print(answers)

{'1': False, '2': False, '3': False, '4': False, '5': False, '6': False, '7': False, '8': False, '9': False, '10': False, '11': True, '12': False, '13': False, '14': False, '15': False, '16': False, '17': False, '19': False, '20': False, '21': False, '22': False, '23': False, '24': False, '25': False, '26': False, '27': False, '28': False, '29': False, '30': False, '31': False, '32': False, '33': False, '34': False, '35': False, '36': False, '37': False, '38': False, '39': False, '40': False, '41': False, '42': False, '43': False, '44': False, '45': False, '46': False, '47': False, '48': False, '49': False, '50': False, '51': False, '52': False, '53': False, '54': False, '55': True, '56': False, '57': False, '58': False, '59': True, '60': False, '61': False, '62': False, '63': False, '64': False, '65': False, '66': False, '67': False, '68': False, '18': False}


In [16]:
#could be more efficient if in answers text is saved only for qid where abbrevation=yes
for key in answers.keys():
    if bool(answers[key]):
        #find query
        for i in range(len(querys)):
            if querys['qid'][i] == key:
                query= querys['query'][i]
                print("Old query: ",query)
        #ask gpt to expand query
        expand = f""" 
        You are an scientific expert especially in the domain of Information Retrieval. Your are given a query, which is below
        delimited by triple quotes, which contains an abbrevation. Your task is to identify the abbrevation and write it, then
        concat the original query with the written out abbrevation and return this new query as string only.
        For example given a query 'What is crx' you should detect that the abbrevation is crx, since cxr is the abbrevation for the medical term
        'chest X-Ray', then you should concat the originial query with the abbrevation 'chest X-Ray' resulting in a new query 'What is crx chest x-ray' which
        you should return. Another example, given the query 'Algorithms of nlp' you should detect that the abbrevation is nlp, since nlp is the abbrevation
        for the term 'natural language processing', then you should concat the original query 'Algorithms of nlp' with the abbrevation 'natural language processing'  
        resulting in a new query 'Algorithms of nlp natural language processing' which you should return.
        Please only answer with the new query. So your answer should only include the original query and the detected abbreviated words and no additional information.
        Don't wrap your answer in quotation marks.

        query: '''{query}'''
        """
        new_query = ask_gpt(prompt=expand).lower().strip().replace("'", " ").replace('"', ' ')
        print("New query: ",new_query)
        #overwrite old query
        querys['query'][i] = new_query

Old query:  algorithm acceleration with nvidia cuda
New query:  algorithm acceleration with nvidia cuda compute unified device architecture
Old query:  bm25
New query:  bm25 best match 25
Old query:  what is ahp
New query:  what is ahp analytic hierarchy process


Now differentiate between a keyword query where ngram tokenization is usefull and question querys:

In [27]:
#querys = pt_dataset.get_topics('query')
expanded_queries_list = querys.values.tolist()
print(expanded_queries_list)
print(expanded_queries_list[67][1])
print(len(expanded_queries_list))
#gpt goes through each query, question querys qid will be stored in answers (list)
answers = list()
for i in range(len(expanded_queries_list)):
#for i in range(0, 2):
    

    determine_ngrams = f""" 
    You are an scientific expert in the domain of Information Retrieval and linguistics. Your task is to detect whether
    a given query, which is given as a text below delimited by triple quotes, contains bigrams. This means, you should check for all 
    bigrams in the query, if they are an existing term consisting of multiple words. Then, your answer should be the original query 
    with all the bigrams you found appended in the format word1$$word2. Your answer should only include the query and the bigrams, no additional information.
    This means that when there are no existing bigrams in the query, your answer should just be the original query. You should not wrap your answer in quotation marks.
    For example given a query 'usage of machine learning in image recognition' you should answer
      'usage of machine learning in image recognition machine$$learning image$$recognition'.

    query: '''{expanded_queries_list[i][1]}'''
    """
    answer = ask_gpt(prompt=determine_ngrams) #check answer more carefully perhaps model will return not only {yes,no}
    print(answer)
    expanded_queries_list[i].append(answer)


[['1', 'retrieval system improving effectiveness'], ['2', 'machine learning language identification'], ['3', 'social media detect self harm'], ['4', 'stemming for arabic languages'], ['5', 'audio based animal recognition'], ['6', 'comparison different retrieval models'], ['7', 'cache architecture'], ['8', 'document scoping formula'], ['9', 'pseudo relevance feedback'], ['10', 'how to represent natural conversations in word nets'], ['11', 'algorithm acceleration with nvidia cuda'], ['12', 'mention of algorithm'], ['13', 'at least three authors'], ['14', 'german domain'], ['15', 'mention of open source'], ['16', 'inclusion of text mining'], ['17', 'the ethics of artificial intelligence'], ['19', 'machine learning for more relevant results'], ['20', 'crawling websites using machine learning'], ['21', 'recommenders influence on users'], ['22', 'search engine caching effects'], ['23', 'consumer product reviews'], ['24', 'limitations machine learning'], ['25', 'medicine related research'], [

### Define the retrieval pipeline 

In [29]:
#1. Chatgpt anwenden

# Extract qid and query from each sublist
data_expd = expanded_queries_list
data_expd = [(sublist[0], sublist[1]) for sublist in expanded_queries_list]
# Create pandas DataFrame
df_expd = pd.DataFrame(data_expd, columns=['qid', 'query'])

'''#Nimmt eine qid als String und returnt die expanded query von chatgpt
def get_expanded_query(qid):
    for query_list in expanded_queries:
        if query_list[0] == qid:
            return query_list[2]
    return None

print(get_expanded_query('1'))'''

#TODO über die liste ist es nicht so gut, da wir in der pipeline eh jede query einzeln als string behandeln
#daher hier die chatgpt methode in the transformer einfügen, und dann am ende noch als liste tokenisen
#expand_query_transf = pt.rewrite.tokenise




#2. special characters löschen ausser $$

import re

def clean_text_with_dollar_signs(text, return_as_list=False, keep_dollar_signs=False):
    text = text.replace('\n', ' ')
    
    if keep_dollar_signs:
        # Replace double dollar signs with a unique placeholder
        text = text.replace('$$', 'DOUBLEDOLLARNGRAMS')
        # Remove all non-alphanumeric characters except spaces
        cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
        # Replace placeholder back to double dollar signs
        cleaned_text = cleaned_text.replace('DOUBLEDOLLARNGRAMS', '$$')
    else:
        # Remove all non-alphanumeric characters except spaces
        cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    
    if return_as_list:
        word_list = cleaned_text.split()
        return word_list
    else:
        return cleaned_text

# Examples
#print(clean_text_with_dollar_signs(get_expanded_query('1'), keep_dollar_signs=True, return_as_list=True))

transf_clean_text = pt.rewrite.tokenise(lambda query: clean_text_with_dollar_signs(query, return_as_list=True, keep_dollar_signs=True))

#3. stopwords löschen


import nltk
from nltk.corpus import stopwords
import re

# Ensure NLTK data directory is set correctly
nltk.data.path.append("/usr/local/nltk_data")

# Download 'stopwords' corpus to the specified directory
nltk.download('stopwords', download_dir="/usr/nltk_data")

# Get the set of stopwords for the English language
stop_words = set(stopwords.words('english'))

def remove_stopwords_with_dollar_signs(text, return_as_list=False):
    words = text.split()
    filtered_words = []
    
    for word in words:
        if '$$' in word:
            # Split the word at '$$' and check both parts
            #part1, part2 = word.split('$$')
            #if part1.lower() in stop_words or part2.lower() in stop_words:
            #    continue  # Skip this word if either part is a stopword
            #else:
            #    filtered_words.append(word)

            parts = word.split('$$')
            skip_word = False
            for part in parts:
                if part.lower() in stop_words:
                    skip_word = True
                    break  # If any part is a stopword, skip the entire word
            if not skip_word:
                filtered_words.append(word)
        else:
            if word.lower() not in stop_words:
                filtered_words.append(word)
    
    if return_as_list:
        return filtered_words
    else:
        return ' '.join(filtered_words)


transf_remove_stopwords = pt.rewrite.tokenise(lambda query: remove_stopwords_with_dollar_signs(query, return_as_list=True))
# Examples
print(remove_stopwords_with_dollar_signs("machine$$learning of learning$$of machine", return_as_list=True))

#4. stemming

from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')

def stem_text_with_dollar_signs(text, return_as_list=False):
    words = text.split()

    stemmed_words = []
    for word in words:
        if '$$' in word:
            # Split the word at '$$' and stem each part
            #part1, part2 = word.split('$$')
            #stemmed_part1 = stemmer.stem(part1)
            #stemmed_part2 = stemmer.stem(part2)
            #stemmed_word = f"{stemmed_part1}$${stemmed_part2}"

            parts = word.split('$$')
            stemmed_parts = [stemmer.stem(part) for part in parts]
            stemmed_word = '$$'.join(stemmed_parts)
        else:
            # Stem the word normally
            stemmed_word = stemmer.stem(word)
        
        stemmed_words.append(stemmed_word)

    if return_as_list:
        return stemmed_words
    else:
        return ' '.join(stemmed_words)
    

transf_stem_text = pt.rewrite.tokenise(lambda query: stem_text_with_dollar_signs(query, return_as_list=True))
# Example
print(stem_text("running$$shoes are better than runing $$fast", return_as_list=False))

#5. bm25 ohne termpipelines

# This transformer will do the retrieval using bm25, and explicitly not apply any stemming and stopword removal
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25", verbose = True, properties={"termpipelines" : ""}, controls={"bm25.b": 0.2})#, "bm25.k_1": 0.1})

# This is our retrieval pipeline
#retr_pipeline = remove_special_characters >> remove_stopwords_from_query >> stem_query >> tokenise_query_ngram >> bm25
retr_pipeline = transf_clean_text >> transf_remove_stopwords >> transf_stem_text >> bm25

['machine$$learning', 'machine']
running$$sho are better than rune $$fast


[nltk_data] Downloading package stopwords to /usr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Create the run

In [30]:
print('Now we do the retrieval...')
run = retr_pipeline(pt_dataset.get_topics()) #queries

print('Done. Here are the first 10 entries of the run')
run.head(10)

Now we do the retrieval...
There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


BR(BM25):   0%|          | 0/68 [00:00<?, ?q/s]

BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 37.04q/s]


Done. Here are the first 10 entries of the run


Unnamed: 0,qid,docid,docno,rank,score,text,title,query_2,description,narrative,query_1,query_0,query
0,1,94795,2004.cikm_conference-2004.47,0,16.287685,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
1,1,124670,2006.ipm_journal-ir0volumeA42A3.2,1,15.362926,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
2,1,83395,1997.sigirconf_conference-97.36,2,15.220951,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
3,1,81597,2018.sigirconf_conference-2018.234,3,15.035347,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
4,1,82044,2007.sigirconf_conference-2007.212,4,15.028518,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
5,1,82438,1998.sigirconf_conference-98.39,5,15.008899,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
6,1,122357,2010.sigirjournals_journal-ir0volu,6,14.928209,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
7,1,125684,2005.ipm_journal-ir0volumeA41A5.11,7,14.908299,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
8,1,84816,2016.ntcir_conference-2016.90,8,14.725026,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
9,1,94352,2008.cikm_conference-2008.183,9,14.694944,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect


### Persist run file

In [31]:
persist_and_normalize_run(run, system_name='ngrams', default_output='../runs/')

import os
os.rename('../runs/run.txt', '../runs/runngram.txt')

The run file is normalized outside the TIRA sandbox, I will store it at "../runs/".
Done. run file is stored under "../runs//run.txt".
