In [1]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
import pandas as pd
import re
import openai
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


In [2]:
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')
ir_query_interpretation = tira.pt.transform_queries('ir-lab-sose-2024/needthegrade/ir-query-interpretation', pt_dataset)

Download: 4.00kiB [00:00, 14.0MiB/s]

Download finished. Extract...
Extraction finished:  /root/.tira/extracted_runs/ir-lab-sose-2024/ir-acl-anthology-20240504-training/needthegrade





### CREATE INDEX

In [4]:
docs =  pt_dataset.get_corpus_iter()
docs = list(docs)
count = sum(1 for _ in docs)
docs = docs[:126959]
print("Number of documents:", count)

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/ir-acl-anthology-20240504-inputs.zip?download=1
	This is only used for last spot checks before archival to Zenodo.


Download:   0%|          | 0.00/39.4M [00:00<?, ?iB/s]

Download: 100%|██████████| 39.4M/39.4M [00:00<00:00, 45.1MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:03<00:00, 32125.17it/s]

Number of documents: 126958





In [5]:
#Method that removes all special characters from a String, and returns either a String or a list of all words
def clean_text(text, return_as_list = False):
    text = text.replace('\n', ' ')
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text) #remove non-alphanumeric characters, except spaces
    if return_as_list:
        word_list = cleaned_text.split()
        return word_list
    else:
        return cleaned_text

In [6]:
import nltk
from nltk.corpus import stopwords
import re

# Ensure NLTK data directory is set correctly
nltk.data.path.append("/usr/local/nltk_data")

# Download 'stopwords' corpus to the specified directory
nltk.download('stopwords', download_dir="/usr/nltk_data")

# Get the set of stopwords for the English language
stop_words = set(stopwords.words('english'))

def remove_stopwords(text, return_as_list = False):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Join the filtered words back into a single string

    if return_as_list:
        return filtered_words
    else:
        return ' '.join(filtered_words)

[nltk_data] Downloading package stopwords to /usr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')

def stem_text(text, return_as_list = False):
    words = text.split()

    stemmed_words = [stemmer.stem(word) for word in words]
    if return_as_list:
        return stemmed_words
    # Join the stemmed words back into a single string
    else:
        return ' '.join(stemmed_words)

In [8]:
#This is our ngram tokenizer. It takes a string and returns a dict of all ngrams, where each ngram is seperated by $$ so it will be parsed as one token

def tokenize_ngrams_to_dict(text, n1=1, n2=2):
    
    # Tokenize the text into words
    words = text.split(' ')
    words = [word for word in words if len(''.join(format(ord(c), '08b') for c in word)) <= 60]

    # Initialize an empty Counter to hold all n-grams
    all_ngram_counts = Counter()
    
    # Loop through each n from n1 to n2
    for n in range(n1, n2 + 1):
        # Generate n-grams for the current n
        ngrams = ['$$'.join(words[i:i+n]) for i in range(len(words)-n+1)]
        
        # Update the Counter with the current n-grams
        all_ngram_counts.update(ngrams)
    
    return dict(all_ngram_counts)

In [9]:
for doc in docs:
        if 'text' in doc:
            doc['text'] = clean_text(doc['text'])
            doc['text'] = remove_stopwords(doc['text'])
            doc['text'] = stem_text(doc['text'])
            
            doc_1gram = tokenize_ngrams_to_dict(doc['text'], n1=1, n2=2) # Apply n-gram tokenization to the dataset

            doc['toks'] = doc_1gram # create new toks field for tokenfrequency
            del doc['text']  #This will delete the 'text' field from the documents
    
for i, doc in enumerate(docs):
     if i == 3:
           break
     print(doc)

#remove all empty documents
docs = [d for d in docs if any(k != '' for k in d['toks'].keys())]

{'docno': 'O02-2002', 'toks': {'studi': 1, 'word': 7, 'similar': 8, 'use': 3, 'context': 5, 'vector': 3, 'model': 2, 'need': 1, 'measur': 2, 'process': 1, 'natur': 1, 'languag': 1, 'especi': 1, 'general': 1, 'classif': 1, 'exampl': 1, 'base': 3, 'usual': 1, 'two': 1, 'defin': 1, 'accord': 3, 'distanc': 1, 'semant': 6, 'class': 2, 'less': 1, 'consid': 1, 'syntact': 5, 'ie': 2, 'howev': 1, 'real': 1, 'applic': 1, 'requir': 1, 'weight': 1, 'differ': 1, 'mixtur': 1, 'paper': 1, 'propos': 1, 'relat': 1, 'co': 2, 'occurr': 2, 'adopt': 1, 'inform': 1, 'theoret': 1, 'solv': 1, 'problem': 1, 'data': 1, 'spars': 1, 'precis': 1, 'featur': 2, 'deriv': 1, 'pars': 1, 'environ': 1, 'adjust': 1, 'idf': 1, 'invers': 1, 'valu': 2, 'agglom': 1, 'cluster': 1, 'appli': 1, 'group': 2, 'turn': 1, 'togeth': 1, 'studi$$word': 1, 'word$$similar': 4, 'similar$$use': 1, 'use$$context': 1, 'context$$vector': 3, 'vector$$model': 1, 'model$$need': 1, 'need$$measur': 1, 'measur$$word': 1, 'similar$$process': 1, 'proc

In [10]:
# Initialize the IterDictIndexer with pretokenised set to True
iter_indexer = pt.IterDictIndexer("./ngramindex", overwrite=True, meta={'docno': 35}, pretokenised=True, verbose = True, type = pt.index.IndexingType.SINGLEPASS)

# Index our pretokenized documents
index_ref = iter_indexer.index(docs)

index_ngram = pt.IndexFactory.of(index_ref)

#Print some stats about our index
print(index_ngram.getCollectionStatistics())

# Access the MetaIndex and Lexicon
meta = index_ngram.getMetaIndex()
lexicon = index_ngram.getLexicon()


i = 0
for term, le in index_ngram.getLexicon():
    i = i+1
    if i == 5:
        break
    print(term) 
    print(le.getFrequency())

Number of documents: 126824
Number of terms: 1735213
Number of postings: 11740755
Number of fields: 0
Number of tokens: 14688164
Field names: []
Positions:   false

0
9406
0$$0
84
0$$00
11
0$$000
2


### ABBREVIATIONS

### Clean the text

In [11]:
def clean_text_with_dollar_signs(text, return_as_list=False, keep_dollar_signs=False):
    text = text.replace('\n', ' ')
    
    if keep_dollar_signs:      
        text = text.replace('$$', 'DOUBLEDOLLARNGRAMS')                 # Replace double dollar signs with a unique placeholder
        cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)             # Remove all non-alphanumeric characters except spaces    
        cleaned_text = cleaned_text.replace('DOUBLEDOLLARNGRAMS', '$$') # Replace placeholder back to double dollar signs
    else:
        
        cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)             # Remove all non-alphanumeric characters except spaces
    
    if return_as_list:
        word_list = cleaned_text.split()
        return word_list
    else:
        return cleaned_text

In [12]:
transf_clean_text = pt.rewrite.tokenise(lambda query: clean_text_with_dollar_signs(query, return_as_list=True, keep_dollar_signs=True))

### remvoe stopwords

In [13]:
# Ensure NLTK data directory is set correctly
nltk.data.path.append("/usr/local/nltk_data")

# Download 'stopwords' corpus to the specified directory
nltk.download('stopwords', download_dir="/usr/nltk_data")

# Get the set of stopwords for the English language
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
def remove_stopwords_with_dollar_signs(text, return_as_list=False):
    words = text.split()
    filtered_words = []
    
    for word in words:
        if '$$' in word:
            parts = word.split('$$')
            skip_word = False
            for part in parts:
                if part.lower() in stop_words:
                    skip_word = True
                    break  # If any part is a stopword, skip the entire word
            if not skip_word:
                filtered_words.append(word)
        else:
            if word.lower() not in stop_words:
                filtered_words.append(word)
    
    if return_as_list:
        return filtered_words
    else:
        return ' '.join(filtered_words)

In [15]:
transf_remove_stopwords = pt.rewrite.tokenise(lambda query: remove_stopwords_with_dollar_signs(query, return_as_list=True))

### stemming

In [16]:
stemmer = SnowballStemmer('english')

In [17]:
def stem_text_with_dollar_signs(text, return_as_list=False):
    words = text.split()

    stemmed_words = []
    for word in words:
        if '$$' in word:
            parts = word.split('$$')
            stemmed_parts = [stemmer.stem(part) for part in parts]
            stemmed_word = '$$'.join(stemmed_parts)
        else:
            stemmed_word = stemmer.stem(word)   # Stem the word normally
        
        stemmed_words.append(stemmed_word)

    if return_as_list:
        return stemmed_words
    else:
        return ' '.join(stemmed_words)

In [18]:
transf_stem_text = pt.rewrite.tokenise(lambda query: stem_text_with_dollar_signs(query, return_as_list=True))

### Pipeline

In [19]:
index_factory = pt.IndexFactory.of("./ngramindex/data.properties")

In [20]:
# This transformer will do the retrieval using bm25, and explicitly not apply any stemming and stopword removal
bm25 = pt.BatchRetrieve(index_factory, wmodel="BM25", verbose = True, properties={"termpipelines" : ""}, controls={"bm25.b": 0.2})#, "bm25.k_1": 0.1})

# This is our retrieval pipeline
retr_pipeline = transf_clean_text >> transf_remove_stopwords >> transf_stem_text >> bm25

In [21]:
run = retr_pipeline(ir_query_interpretation(pt_dataset.get_topics())) #queries

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/ir-acl-anthology-20240504-truth.zip?download=1
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 29.6k/29.6k [00:00<00:00, 1.44MiB/s]

Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/





There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 27.42q/s]


In [22]:
run.head(10)

Unnamed: 0,qid,docid,docno,rank,score,text,title,query_2,description,narrative,original_query,is_abbreviations,expanded_abbreviation,ngrams,query_1,query_0,query
0,1,94795,2004.cikm_conference-2004.47,0,16.287685,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,False,,retrieval system improving effectiveness retri...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
1,1,124670,2006.ipm_journal-ir0volumeA42A3.2,1,15.362926,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,False,,retrieval system improving effectiveness retri...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
2,1,83395,1997.sigirconf_conference-97.36,2,15.220951,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,False,,retrieval system improving effectiveness retri...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
3,1,81597,2018.sigirconf_conference-2018.234,3,15.035347,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,False,,retrieval system improving effectiveness retri...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
4,1,82044,2007.sigirconf_conference-2007.212,4,15.028518,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,False,,retrieval system improving effectiveness retri...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
5,1,82438,1998.sigirconf_conference-98.39,5,15.008899,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,False,,retrieval system improving effectiveness retri...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
6,1,122357,2010.sigirjournals_journal-ir0volu,6,14.928209,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,False,,retrieval system improving effectiveness retri...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
7,1,125684,2005.ipm_journal-ir0volumeA41A5.11,7,14.908299,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,False,,retrieval system improving effectiveness retri...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
8,1,84816,2016.ntcir_conference-2016.90,8,14.725026,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,False,,retrieval system improving effectiveness retri...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect
9,1,94352,2008.cikm_conference-2008.183,9,14.694944,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,False,,retrieval system improving effectiveness retri...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect


In [24]:
persist_and_normalize_run(run, system_name='ngrams', default_output='./runs/fullrun/submission')

The run file is normalized outside the TIRA sandbox, I will store it at "./runs/fullrun/submission".
Done. run file is stored under "./runs/fullrun/submission/run.txt".
