### Import libraries

In [1]:
import pyterrier as pt
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import re

from tira.third_party_integrations import persist_and_normalize_run,  ensure_pyterrier_is_loaded
from tira.rest_api_client import Client

### Start pyterrier

In [2]:
# Initialize PyTerrier
if not pt.started():
    pt.init()

ensure_pyterrier_is_loaded()

#start Tira
tira = Client()

PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


### Process dataset and Indexing
First, we get the Dataset from pyterrier

In [3]:
#Get dataset
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')
#Create Iterator of our dataset
docs =  pt_dataset.get_corpus_iter()
docs = list(docs)

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:02<00:00, 52751.91it/s]


First, we delete all but the first n documents. 
We need to do so as otherwise the notebook will crash.

TODO: Use full dataset and try it in Tira.

In [4]:
from itertools import islice

#Create List from our docs iterator
#docs = list(docs)
#cutoff the list after the first 500 documents
#docs = docs[:500]

#Print some example documents
for i,doc in enumerate(docs):
    if i == 3:
        break
    print(doc)


{'text': 'A Study on Word Similarity using Context Vector Models\n\n\n There is a need to measure word similarity when processing natural languages, especially when using generalization, classification, or example -based approaches. Usually, measures of similarity between two words are defined according to the distance between their semantic classes in a semantic taxonomy . The taxonomy approaches are more or less semantic -based that do not consider syntactic similarit ies. However, in real applications, both semantic and syntactic similarities are required and weighted differently. Word similarity based on context vectors is a mixture of syntactic and semantic similarit ies. In this paper, we propose using only syntactic related co-occurrences as context vectors and adopt information theoretic models to solve the problems of data sparseness and characteristic precision. The probabilistic distribution of co-occurrence context features is derived by parsing the contextual environment o

Here, we process the documents in multiple steps.
First, we remove all special characters.

In [5]:
#Method that removes all special characters from a String, and returns either a String or a list of all words
def clean_text(text, return_as_list = False):
    text = text.replace('\n', ' ')
    # Use regular expression to remove non-alphanumeric characters, except spaces
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    if return_as_list:
        word_list = cleaned_text.split()
        return word_list
    else:
        return cleaned_text



Then, we define a method to remove all stopwords from our document text fields.

In [6]:
import nltk
from nltk.corpus import stopwords
import re
nltk.download('stopwords')

# Get the set of stopwords for the English language
stop_words = set(stopwords.words('english'))

def remove_stopwords(text, return_as_list = False):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Join the filtered words back into a single string

    if return_as_list:
        return filtered_words
    else:
        return ' '.join(filtered_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Then, we define a method to apply a Snowball Stemmer to the document text fields.

In [7]:
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')

def stem_text(text, return_as_list = False):
    words = text.split()

    stemmed_words = [stemmer.stem(word) for word in words]
    if return_as_list:
        return stemmed_words
    # Join the stemmed words back into a single string
    else:
        return ' '.join(stemmed_words)
'''
def remove_stopwords_from_text(text):
    # Tokenize the text
    words = word_tokenize(text)

    #filtered_and_stemmed_text = [stemmer.stem(word) for word in words not in stop_words]

    # Filter out the stopwords
    filtered_and_stemmed_text = [word for word in words if word.lower() not in stop_words]
    # Reconstruct the string from the filtered words
    filtered_text = ' '.join(filtered_and_stemmed_text)
    return filtered_text
'''


"\ndef remove_stopwords_from_text(text):\n    # Tokenize the text\n    words = word_tokenize(text)\n\n    #filtered_and_stemmed_text = [stemmer.stem(word) for word in words not in stop_words]\n\n    # Filter out the stopwords\n    filtered_and_stemmed_text = [word for word in words if word.lower() not in stop_words]\n    # Reconstruct the string from the filtered words\n    filtered_text = ' '.join(filtered_and_stemmed_text)\n    return filtered_text\n"

Now, we define our ngram tokeniser method. We will use monograms, bigrams, and trigrams.

In [8]:
#This is our ngram tokenizer. It takes a string and returns a dict of all ngrams, where each ngram is seperated by $$ so it will be parsed as one token

def tokenize_ngrams_to_dict(text, n1=1, n2=3):
    # Replace spaces with dollar signs
    #text_with_dollar_signs = re.sub(r'\s+', '$', text)
    
    # Tokenize the text into words
    #words = text_with_dollar_signs.split('$')
    words = text.split(' ')
    # Initialize an empty Counter to hold all n-grams
    all_ngram_counts = Counter()
    
    # Loop through each n from n1 to n2
    for n in range(n1, n2 + 1):
        # Generate n-grams for the current n
        ngrams = ['$$'.join(words[i:i+n]) for i in range(len(words)-n+1)]
        
        # Update the Counter with the current n-grams
        all_ngram_counts.update(ngrams)
    
    return dict(all_ngram_counts)

Now, we process our dataset by applying the methods we just specified to our documents.

In [9]:
# Apply n-gram tokenization to the dataset
#This will delete the 'text' field from the documents,
# and create a new 'toks' field which contains the tokens with their frequencies
for doc in docs:
        if 'text' in doc:
            doc['text'] = clean_text(doc['text'])
            doc['text'] = remove_stopwords(doc['text'])
            doc['text'] = stem_text(doc['text'])
            
            doc_1gram = tokenize_ngrams_to_dict(doc['text'], n1=1, n2=3)

            doc['toks'] = doc_1gram
            del doc['text']  
    
for i, doc in enumerate(docs):
     if i == 3:
           break
     print(doc)


{'docno': 'O02-2002', 'toks': {'studi': 1, 'word': 7, 'similar': 8, 'use': 3, 'context': 5, 'vector': 3, 'model': 2, 'need': 1, 'measur': 2, 'process': 1, 'natur': 1, 'languag': 1, 'especi': 1, 'general': 1, 'classif': 1, 'exampl': 1, 'base': 3, 'approach': 2, 'usual': 1, 'two': 1, 'defin': 1, 'accord': 3, 'distanc': 1, 'semant': 6, 'class': 2, 'taxonomi': 2, 'less': 1, 'consid': 1, 'syntact': 5, 'similarit': 2, 'ie': 2, 'howev': 1, 'real': 1, 'applic': 1, 'requir': 1, 'weight': 1, 'differ': 1, 'mixtur': 1, 'paper': 1, 'propos': 1, 'relat': 1, 'co': 2, 'occurr': 2, 'adopt': 1, 'inform': 1, 'theoret': 1, 'solv': 1, 'problem': 1, 'data': 1, 'spars': 1, 'characterist': 1, 'precis': 1, 'probabilist': 1, 'distribut': 1, 'featur': 2, 'deriv': 1, 'pars': 1, 'contextu': 1, 'environ': 1, 'adjust': 1, 'idf': 1, 'invers': 1, 'document': 1, 'frequenc': 1, 'valu': 2, 'agglom': 1, 'cluster': 1, 'algorithm': 1, 'appli': 1, 'group': 2, 'turn': 1, 'categori': 1, 'togeth': 1, 'studi$$word': 1, 'word$$si

Finally, we can create our Index. 
We will use an IterDictIndexer with pretokenised=True, as we already created the tokens manually.

In [10]:

# Initialize the IterDictIndexer with pretokenised set to True
iter_indexer = pt.IterDictIndexer("./ngramindex", overwrite=True, meta={'docno': 20}, pretokenised=True)

# Index our pretokenized documents
index_ref = iter_indexer.index(docs)

index = pt.IndexFactory.of(index_ref)

#Print some stats about our index
print(index.getCollectionStatistics())

# Access the MetaIndex and Lexicon
meta = index.getMetaIndex()
lexicon = index.getLexicon()

#Print some example terms from the index. We see that numbers arent removed
i = 0
for term, le in index.getLexicon():
    i = i+1
    if i == 5:
        break
    print(term) 
    print(le.getFrequency())

Number of documents: 500
Number of terms: 60550
Number of postings: 88887
Number of fields: 0
Number of tokens: 102993
Field names: []
Positions:   false

0
47
0$$2
2
0$$2$$0
1
0$$2$$rather
1


### Define the retrieval pipeline
First, we define a new method that works exactly like the index ngram tokeniser method, just that it will return a list of tokens instead of a dictionary, and it will not count each token.

This method will be used as one of the transformers for our retrieval pipeline and it is responsible for tokenising the query in the same format as we tokenized our index.

In [11]:
#takes a string and returns a list with all ngrams
def tokenize_ngrams_to_list(text, n1=1, n2=3):
    words = text.split()

    # Initialize an empty list to hold all n-grams
    all_ngrams = []
    
    # Loop through each n between n1 to n2
    for n in range(n1, n2 + 1):
        # Generate n-grams for the current n
        ngrams = ['$$'.join(words[i:i+n]) for i in range(len(words)-n+1)]
        
        # Add all current ngrams to the all_ngrams list
        all_ngrams.extend(ngrams)
    
    return all_ngrams

Now, we can finally define the actual retrieval pipeline. It consists of two steps; first, we tokenize the queries into ngrams, and after that, we apply bm25 to the query with our index

In [12]:
remove_special_characters = pt.rewrite.tokenise(lambda query: clean_text(query, return_as_list=True))
remove_stopwords_from_query = pt.rewrite.tokenise(lambda query: remove_stopwords(query, return_as_list=True))
stem_query = pt.rewrite.tokenise(lambda query: stem_text(query, return_as_list=True))
# This transformer will tokenise the queries into the ngrams
tokenise_query_ngram = pt.rewrite.tokenise(lambda query: tokenize_ngrams_to_list(query))

# This transformer will do the retrieval using bm25, and explicitly not apply any stemming and stopword removal
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25", verbose = True, properties={"termpipelines" : ""})

# This is our retrieval pipeline
retr_pipeline = remove_special_characters >> remove_stopwords_from_query >> stem_query >> tokenise_query_ngram >> bm25

In [13]:
#Look at some example queries
print(pt_dataset.get_topics().columns)
pt_dataset.get_topics('query').head(3)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.
Index(['qid', 'text', 'title', 'query', 'description', 'narrative'], dtype='object')


Unnamed: 0,qid,query
0,1,retrieval system improving effectiveness
1,2,machine learning language identification
2,3,social media detect self harm


In [14]:
# Not needed anymore since we process the queries directly in the pipeline
'''

df = pd.DataFrame(pt_dataset.get_topics())
if 'query' not in df.columns:
    df['query'] = df['text']


# Convert the DataFrame to a list of dictionaries
queries = df[['qid', 'query']].to_dict(orient='records')

# Print the result
print(queries)

#TODO sonderzeichen aus query löschen
def clean_queries(queries):
    for query in queries:
        if 'query' in query:
            query['query'] = clean_text(query['query'])
clean_queries(queries)
for query in queries:
        if 'query' in query:
            query['query'] = remove_stopwords(query['query'])
            query['query'] = stem_text(query['query'])

for i,query in enumerate(queries):
   

    query_ngram = tokenize_ngrams_to_dict(query['query'], n1=1, n2=3)

    query['query'] = query_ngram
  
for query in queries:
     print(query)
'''

"\n\ndf = pd.DataFrame(pt_dataset.get_topics())\nif 'query' not in df.columns:\n    df['query'] = df['text']\n\n\n# Convert the DataFrame to a list of dictionaries\nqueries = df[['qid', 'query']].to_dict(orient='records')\n\n# Print the result\nprint(queries)\n\n#TODO sonderzeichen aus query löschen\ndef clean_queries(queries):\n    for query in queries:\n        if 'query' in query:\n            query['query'] = clean_text(query['query'])\nclean_queries(queries)\nfor query in queries:\n        if 'query' in query:\n            query['query'] = remove_stopwords(query['query'])\n            query['query'] = stem_text(query['query'])\n\nfor i,query in enumerate(queries):\n   \n\n    query_ngram = tokenize_ngrams_to_dict(query['query'], n1=1, n2=3)\n\n    query['query'] = query_ngram\n  \nfor query in queries:\n     print(query)\n"

### Create the run

In [15]:
print('Now we do the retrieval...')
run = retr_pipeline(pt_dataset.get_topics()) #queries

print('Done. Here are the first 10 entries of the run')
run.head(10)

Now we do the retrieval...
There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


BR(BM25): 100%|██████████| 68/68 [00:00<00:00, 123.99q/s]


Done. Here are the first 10 entries of the run


Unnamed: 0,qid,docid,docno,rank,score,text,title,query_3,description,narrative,query_2,query_1,query_0,query
0,1,414,S07-1088,0,17.367886,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect,retriev system improv effect retriev$$system s...
1,1,308,D19-3006,1,12.982733,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect,retriev system improv effect retriev$$system s...
2,1,264,2012.iwslt-evaluati,2,12.363764,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect,retriev system improv effect retriev$$system s...
3,1,341,P05-1007,3,12.021729,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect,retriev system improv effect retriev$$system s...
4,1,121,C10-2174,4,11.991876,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect,retriev system improv effect retriev$$system s...
5,1,306,2021.emnlp-main.148,5,9.380649,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect,retriev system improv effect retriev$$system s...
6,1,360,L16-1093,6,9.101785,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect,retriev system improv effect retriev$$system s...
7,1,44,R13-1056,7,9.022026,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect,retriev system improv effect retriev$$system s...
8,1,366,2009.mtsummit-poste,8,8.699967,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect,retriev system improv effect retriev$$system s...
9,1,475,W18-5026,9,7.870108,retrieval system improving effectiveness,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectivene...,Relevant papers include research on what makes...,retrieval system improving effectiveness,retrieval system improving effectiveness,retriev system improv effect,retriev system improv effect retriev$$system s...


### Persist the run file for evaluation

In [16]:
persist_and_normalize_run(run, system_name='ngrams', default_output='../runs/')

import os
os.rename('../runs/run.txt', '../runs/runngram.txt')

The run file is normalized outside the TIRA sandbox, I will store it at "../runs/".
Done. run file is stored under "../runs//run.txt".


In [17]:
'''queries = [
    {'qid':1 , 'query':'machine learning'},
    {'qid':2 , 'query':'natural language processing techniques'}
]

# Print the new query representation with ngrams included. This is how our query will get passed to bm25
df = pd.DataFrame(queries)
print(df)
transformed_df = tokenise_query_ngram.transform(df)
print("Transformed:")
print(transformed_df)
'''

'queries = [\n    {\'qid\':1 , \'query\':\'machine learning\'},\n    {\'qid\':2 , \'query\':\'natural language processing techniques\'}\n]\n\n# Print the new query representation with ngrams included. This is how our query will get passed to bm25\ndf = pd.DataFrame(queries)\nprint(df)\ntransformed_df = tokenise_query_ngram.transform(df)\nprint("Transformed:")\nprint(transformed_df)\n'

In [18]:
'''for index, row in transformed_df.iterrows():
    query_id = row['qid']
    query_text = row['query']
    print("test")
    print(f"Processing query ID {query_id} with text: {query_text}")
    
    # Execute the search
    results = retr_pipeline.search(query_text)
    
    # Print or process the results
    print(f"Results for query ID {query_id}:")
    print(results)'''

'for index, row in transformed_df.iterrows():\n    query_id = row[\'qid\']\n    query_text = row[\'query\']\n    print("test")\n    print(f"Processing query ID {query_id} with text: {query_text}")\n    \n    # Execute the search\n    results = retr_pipeline.search(query_text)\n    \n    # Print or process the results\n    print(f"Results for query ID {query_id}:")\n    print(results)'