In [1]:
import pyterrier as pt
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import re

from tira.third_party_integrations import persist_and_normalize_run,  ensure_pyterrier_is_loaded
from tira.rest_api_client import Client

In [2]:
# Initialize PyTerrier
if not pt.started():
    pt.init()

ensure_pyterrier_is_loaded()

PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [31]:
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

In [32]:
docs =  pt_dataset.get_corpus_iter()
#now, we will only take the first 1000 documents for testing because otherwise the notebook will crash
#TODO use all documents and compute in tira

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:   0%|          | 0/126958 [00:00<?, ?it/s]

expermient with dataset

only take the first n documents for testing purposes

In [33]:
from itertools import islice
#docs = islice(docs, 10)
docs = list(docs)
docs = docs[:500]
print(docs)
for i,doc in enumerate(docs):
    print(doc)
 

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:03<00:00, 31987.32it/s]

[{'text': 'A Study on Word Similarity using Context Vector Models\n\n\n There is a need to measure word similarity when processing natural languages, especially when using generalization, classification, or example -based approaches. Usually, measures of similarity between two words are defined according to the distance between their semantic classes in a semantic taxonomy . The taxonomy approaches are more or less semantic -based that do not consider syntactic similarit ies. However, in real applications, both semantic and syntactic similarities are required and weighted differently. Word similarity based on context vectors is a mixture of syntactic and semantic similarit ies. In this paper, we propose using only syntactic related co-occurrences as context vectors and adopt information theoretic models to solve the problems of data sparseness and characteristic precision. The probabilistic distribution of co-occurrence context features is derived by parsing the contextual environment 




remove all special characters

In [34]:
import re

def clean_text(text):
    text = text.replace('\n', ' ')
    # Use regular expression to remove non-alphanumeric characters, except spaces
    return re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

def clean_documents(documents):
    for document in documents:
        if 'text' in document:
            document['text'] = clean_text(document['text'])
    return documents

clean_documents(docs)
for i,doc in enumerate(docs):
    print(doc)


{'text': 'A Study on Word Similarity using Context Vector Models    There is a need to measure word similarity when processing natural languages  especially when using generalization  classification  or example  based approaches  Usually  measures of similarity between two words are defined according to the distance between their semantic classes in a semantic taxonomy   The taxonomy approaches are more or less semantic  based that do not consider syntactic similarit ies  However  in real applications  both semantic and syntactic similarities are required and weighted differently  Word similarity based on context vectors is a mixture of syntactic and semantic similarit ies  In this paper  we propose using only syntactic related co occurrences as context vectors and adopt information theoretic models to solve the problems of data sparseness and characteristic precision  The probabilistic distribution of co occurrence context features is derived by parsing the contextual environment of e

remove all stopwords and stemming

In [35]:
import nltk
from nltk.corpus import stopwords
import re
nltk.download('stopwords')


# Get the set of stopwords for the English language
stop_words = set(stopwords.words('english'))


from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
#nltk.download('punkt')
# Ensure stopwords and punkt tokenizer are downloaded

stemmer = SnowballStemmer('english')

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Join the filtered words back into a single string
    return ' '.join(filtered_words)

def stem_text(text):
    words = text.split()

    stemmed_words = [stemmer.stem(word) for word in words]
    # Join the stemmed words back into a single string
    return ' '.join(stemmed_words)

def remove_stopwords_from_text(text):
    # Tokenize the text
    words = word_tokenize(text)

    #filtered_and_stemmed_text = [stemmer.stem(word) for word in words not in stop_words]

    # Filter out the stopwords
    filtered_and_stemmed_text = [word for word in words if word.lower() not in stop_words]
    # Reconstruct the string from the filtered words
    filtered_text = ' '.join(filtered_and_stemmed_text)
    return filtered_text

for doc in docs:
        if 'text' in doc:
            doc['text'] = remove_stopwords(doc['text'])
            doc['text'] = stem_text(doc['text'])
for doc in docs:
     print(doc)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'text': 'studi word similar use context vector model need measur word similar process natur languag especi use general classif exampl base approach usual measur similar two word defin accord distanc semant class semant taxonomi taxonomi approach less semant base consid syntact similarit ie howev real applic semant syntact similar requir weight differ word similar base context vector mixtur syntact semant similarit ie paper propos use syntact relat co occurr context vector adopt inform theoret model solv problem data spars characterist precis probabilist distribut co occurr context featur deriv pars contextu environ word context featur adjust accord idf invers document frequenc valu agglom cluster algorithm appli group similar word accord similar valu turn word similar syntact categori semant class group togeth', 'docno': 'O02-2002'}
{'text': 'bootstrap larg sens tag corpora', 'docno': 'L02-1310'}
{'text': 'headerless quoteless hopeless use pairwis email classif disentangl email thread

In [36]:
#This is our ngram tokenizer. It takes a string and returns a dict of all ngrams, where each ngram is seperated by $$ so it will be parsed as one token

def tokenize_ngrams_to_dict(text, n1=1, n2=3):
#TODO warum hier nochmal stopwords drin? doppelt
    #text = remove_stopwords_from_text(text)
    # Replace spaces with dollar signs
    text_with_dollar_signs = re.sub(r'\s+', '$', text)
    
    # Tokenize the text into words
    words = text_with_dollar_signs.split('$')
    
    # Initialize an empty Counter to hold all n-grams
    all_ngram_counts = Counter()
    
    # Loop through each n from n1 to n2
    for n in range(n1, n2 + 1):
        # Generate n-grams for the current n
        ngrams = ['$$'.join(words[i:i+n]) for i in range(len(words)-n+1)]
        
        # Update the Counter with the current n-grams
        all_ngram_counts.update(ngrams)
    
    return dict(all_ngram_counts)

In [37]:
# Apply n-gram tokenization to the dataset

for i,doc in enumerate(docs):
    if i == 1000: #limit to 1000 for now since kernel crashes if performed on whole set of docs
        break

    # remove stopwords
    #stemming
    #remove special characters
    doc_1gram = tokenize_ngrams_to_dict(doc['text'], n1=1, n2=3)

    doc['toks'] = doc_1gram
    del doc['text']  # Remove the 'text' field as it's not needed anymore
    #if doc['docno'] == 'N09-1041':
    #    print("N09-1041:",doc)
    #documents.append(doc)
    print(doc)

{'docno': 'O02-2002', 'toks': {'studi': 1, 'word': 7, 'similar': 8, 'use': 3, 'context': 5, 'vector': 3, 'model': 2, 'need': 1, 'measur': 2, 'process': 1, 'natur': 1, 'languag': 1, 'especi': 1, 'general': 1, 'classif': 1, 'exampl': 1, 'base': 3, 'approach': 2, 'usual': 1, 'two': 1, 'defin': 1, 'accord': 3, 'distanc': 1, 'semant': 6, 'class': 2, 'taxonomi': 2, 'less': 1, 'consid': 1, 'syntact': 5, 'similarit': 2, 'ie': 2, 'howev': 1, 'real': 1, 'applic': 1, 'requir': 1, 'weight': 1, 'differ': 1, 'mixtur': 1, 'paper': 1, 'propos': 1, 'relat': 1, 'co': 2, 'occurr': 2, 'adopt': 1, 'inform': 1, 'theoret': 1, 'solv': 1, 'problem': 1, 'data': 1, 'spars': 1, 'characterist': 1, 'precis': 1, 'probabilist': 1, 'distribut': 1, 'featur': 2, 'deriv': 1, 'pars': 1, 'contextu': 1, 'environ': 1, 'adjust': 1, 'idf': 1, 'invers': 1, 'document': 1, 'frequenc': 1, 'valu': 2, 'agglom': 1, 'cluster': 1, 'algorithm': 1, 'appli': 1, 'group': 2, 'turn': 1, 'categori': 1, 'togeth': 1, 'studi$$word': 1, 'word$$si

In [38]:
for doc in docs:    
    print(doc)

{'docno': 'O02-2002', 'toks': {'studi': 1, 'word': 7, 'similar': 8, 'use': 3, 'context': 5, 'vector': 3, 'model': 2, 'need': 1, 'measur': 2, 'process': 1, 'natur': 1, 'languag': 1, 'especi': 1, 'general': 1, 'classif': 1, 'exampl': 1, 'base': 3, 'approach': 2, 'usual': 1, 'two': 1, 'defin': 1, 'accord': 3, 'distanc': 1, 'semant': 6, 'class': 2, 'taxonomi': 2, 'less': 1, 'consid': 1, 'syntact': 5, 'similarit': 2, 'ie': 2, 'howev': 1, 'real': 1, 'applic': 1, 'requir': 1, 'weight': 1, 'differ': 1, 'mixtur': 1, 'paper': 1, 'propos': 1, 'relat': 1, 'co': 2, 'occurr': 2, 'adopt': 1, 'inform': 1, 'theoret': 1, 'solv': 1, 'problem': 1, 'data': 1, 'spars': 1, 'characterist': 1, 'precis': 1, 'probabilist': 1, 'distribut': 1, 'featur': 2, 'deriv': 1, 'pars': 1, 'contextu': 1, 'environ': 1, 'adjust': 1, 'idf': 1, 'invers': 1, 'document': 1, 'frequenc': 1, 'valu': 2, 'agglom': 1, 'cluster': 1, 'algorithm': 1, 'appli': 1, 'group': 2, 'turn': 1, 'categori': 1, 'togeth': 1, 'studi$$word': 1, 'word$$si

Index

In [39]:

# Initialize the IterDictIndexer with pretokenised set to True
iter_indexer = pt.IterDictIndexer("./ngramindex", overwrite=True, meta={'docno': 20}, pretokenised=True)

# Index the pretokenized dataset

index_ref = iter_indexer.index(docs)

print(f"Indexing complete: {index_ref}")
# Now you can use the index_ref as usual
index = pt.IndexFactory.of(index_ref)

print(index.getCollectionStatistics())

# Access the MetaIndex and Lexicon
meta = index.getMetaIndex()
lexicon = index.getLexicon()

i = 0
for term, le in index.getLexicon():
    i = i+1
    if i == 20:
        break
    print(term) 
    print(le.getFrequency())

Indexing complete: <org.terrier.querying.IndexRef at 0x78ce6c922f20 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x5f9ba77165d0 at 0x78ce6c4f1430>>
Number of documents: 500
Number of terms: 60510
Number of postings: 88834
Number of fields: 0
Number of tokens: 102942
Field names: []
Positions:   false

0
47
0$$2
2
0$$2$$0
1
0$$2$$rather
1
0$$25
1
0$$25$$rank
1
0$$35
1
0$$35$$extract
1
0$$40
1
0$$40$$recal
1
0$$41
1
0$$41$$recal
1
0$$42
1
0$$45
1
0$$45$$recogn
1
0$$5
3
0$$5$$0
1
0$$5$$gain
1
0$$5$$would
1


Pipeline

In [40]:
#takes a string and returns a list with all ngrams
def tokenize_ngrams_to_list(text, n1=1, n2=3):
    #Split the text into all individual words
    #TODO use another tokenizer here first to get rid of all special characters and to do stemming etc
    words = text.split()

    # Initialize an empty list to hold all n-grams
    all_ngrams = []
    
    # Loop through each n between n1 to n2
    for n in range(n1, n2 + 1):
        # Generate n-grams for the current n
        ngrams = ['$$'.join(words[i:i+n]) for i in range(len(words)-n+1)]
        
        # Add all current ngrams to the all_ngrams list
        all_ngrams.extend(ngrams)
    
    return all_ngrams

In [86]:
# This transformer will tokenise the queries into the ngrams
tokenise_query_ngram = pt.rewrite.tokenise(lambda query: tokenize_ngrams_to_list(query))

# This transformer will do the retrieval using bm25
#bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25", verbose = True)

# This is our retrieval pipeline
retr_pipeline = tokenise_query_ngram >> pt.BatchRetrieve(index_ref, wmodel="BM25", verbose = True)

In [87]:
i = 0
for topic in pt_dataset.get_topics():
    i = i+1
    if i == 20: 
        break
    print(topic)
pt_dataset.get_topics('query').head(10)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.
qid
text
title
query
description
narrative


Unnamed: 0,qid,query
0,1,retrieval system improving effectiveness
1,2,machine learning language identification
2,3,social media detect self harm
3,4,stemming for arabic languages
4,5,audio based animal recognition
5,6,comparison different retrieval models
6,7,cache architecture
7,8,document scoping formula
8,9,pseudo relevance feedback
9,10,how to represent natural conversations in word...


In [88]:
data = {
    'qid': [1, 2, 3],
    'text': ['machine learning', 'natural language processing techniques', 'deep learning applications']
}

df = pd.DataFrame(pt_dataset.get_topics())
if 'query' not in df.columns:
    df['query'] = df['text']


# Convert the DataFrame to a list of dictionaries
queries = df[['qid', 'query']].to_dict(orient='records')

# Print the result
print(queries)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.
[{'qid': '1', 'query': 'retrieval system improving effectiveness'}, {'qid': '2', 'query': 'machine learning language identification'}, {'qid': '3', 'query': 'social media detect self harm'}, {'qid': '4', 'query': 'stemming for arabic languages'}, {'qid': '5', 'query': 'audio based animal recognition'}, {'qid': '6', 'query': 'comparison different retrieval models'}, {'qid': '7', 'query': 'cache architecture'}, {'qid': '8', 'query': 'document scoping formula'}, {'qid': '9', 'query': 'pseudo relevance feedback'}, {'qid': '10', 'query': 'how to represent natural conversations in word nets'}, {'qid': '11', 'query': 'algorithm acceleration with nvidia cuda'}, {'qid': '12', 'query': 'mention of algorithm'}, {'qid': '13', 'query': 'at least three authors'}, {'qid': '14', 'query': 'german domain'}, {'qid': '15', 'query

In [89]:
#TODO sonderzeichen aus query löschen
def clean_queries(queries):
    for query in queries:
        if 'query' in query:
            query['query'] = clean_text(query['query'])
clean_queries(queries)
for query in queries:
        if 'query' in query:
            query['query'] = remove_stopwords(query['query'])
            query['query'] = stem_text(query['query'])
'''
for i,query in enumerate(queries):
   

    query_ngram = tokenize_ngrams_to_dict(query['query'], n1=1, n2=3)

    query['query'] = query_ngram
  '''
for query in queries:
     print(query)

{'qid': '1', 'query': 'retriev system improv effect'}
{'qid': '2', 'query': 'machin learn languag identif'}
{'qid': '3', 'query': 'social media detect self harm'}
{'qid': '4', 'query': 'stem arab languag'}
{'qid': '5', 'query': 'audio base anim recognit'}
{'qid': '6', 'query': 'comparison differ retriev model'}
{'qid': '7', 'query': 'cach architectur'}
{'qid': '8', 'query': 'document scope formula'}
{'qid': '9', 'query': 'pseudo relev feedback'}
{'qid': '10', 'query': 'repres natur convers word net'}
{'qid': '11', 'query': 'algorithm acceler nvidia cuda'}
{'qid': '12', 'query': 'mention algorithm'}
{'qid': '13', 'query': 'least three author'}
{'qid': '14', 'query': 'german domain'}
{'qid': '15', 'query': 'mention open sourc'}
{'qid': '16', 'query': 'inclus text mine'}
{'qid': '17', 'query': 'ethic artifici intellig'}
{'qid': '19', 'query': 'machin learn relev result'}
{'qid': '20', 'query': 'crawl websit use machin learn'}
{'qid': '21', 'query': 'recommend influenc user'}
{'qid': '22',

In [91]:
print('Now we do the retrieval...')
run = retr_pipeline(queries)

print('Done. Here are the first 10 entries of the run')
run.head(10)

Now we do the retrieval...


BR(BM25): 100%|██████████| 68/68 [00:00<00:00, 125.98q/s]

Done. Here are the first 10 entries of the run





Unnamed: 0,qid,docid,docno,rank,score,query_0,query
0,1,414,S07-1088,0,17.364669,retriev system improv effect,retriev system improv effect retriev$$system s...
1,1,308,D19-3006,1,12.980152,retriev system improv effect,retriev system improv effect retriev$$system s...
2,1,264,2012.iwslt-evaluati,2,12.361925,retriev system improv effect,retriev system improv effect retriev$$system s...
3,1,341,P05-1007,3,12.020239,retriev system improv effect,retriev system improv effect retriev$$system s...
4,1,121,C10-2174,4,11.989796,retriev system improv effect,retriev system improv effect retriev$$system s...
5,1,306,2021.emnlp-main.148,5,9.379248,retriev system improv effect,retriev system improv effect retriev$$system s...
6,1,360,L16-1093,6,9.099911,retriev system improv effect,retriev system improv effect retriev$$system s...
7,1,44,R13-1056,7,9.020675,retriev system improv effect,retriev system improv effect retriev$$system s...
8,1,366,2009.mtsummit-poste,8,8.698839,retriev system improv effect,retriev system improv effect retriev$$system s...
9,1,475,W18-5026,9,7.868823,retriev system improv effect,retriev system improv effect retriev$$system s...


In [44]:
queries = [
    {'qid':1 , 'query':'machine learning'},
    {'qid':2 , 'query':'natural language processing techniques'}
]

# Print the new query representation with ngrams included. This is how our query will get passed to bm25
df = pd.DataFrame(queries)
print(df)
transformed_df = tokenise_query_ngram.transform(df)
print("Transformed:")
print(transformed_df)


   qid                                   query
0    1                             machine lea
1    2  natural language processing techniques
Transformed:
   qid                                 query_0  \
0    1                             machine lea   
1    2  natural language processing techniques   

                                               query  
0                           machine lea machine$$lea  
1  natural language processing techniques natural...  


RUN

In [43]:
for index, row in transformed_df.iterrows():
    query_id = row['qid']
    query_text = row['query']
    print("test")
    print(f"Processing query ID {query_id} with text: {query_text}")
    
    # Execute the search
    results = retr_pipeline.search(query_text)
    
    # Print or process the results
    print(f"Results for query ID {query_id}:")
    print(results)

test
Processing query ID 1 with text: machine learning algorithms machine$$learning learning$$algorithms machine$$learning$$algorithms
Results for query ID 1:
Empty DataFrame
Columns: [docid, docno, rank, score, qid, query_0, query]
Index: []
test
Processing query ID 2 with text: natural language processing techniques natural$$language language$$processing processing$$techniques natural$$language$$processing language$$processing$$techniques
Results for query ID 2:
Empty DataFrame
Columns: [docid, docno, rank, score, qid, query_0, query]
Index: []
