# Operations, Transformers and Pipelines

This notebook goes a little into detail about the section Transformer-pipelines in the dataset_analysis.ipynb Notebook

## Setup

Libraries and Imports

In [1]:
# If using Google Colab:
#!pip3 install tira ir-datasets python-terrier

In [4]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.third_party_integrations import ir_datasets
from tira.rest_api_client import Client
import pyterrier as pt

import pandas as pd
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
#pd.set_option("display.max_rows", None)
pd.set_option("display.precision", 4)

In [5]:
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


## Indexing

For all retrieval related stuff in pyterrier the index is used.

In [6]:
# get the dataset
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

# (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

## Transformers & Operations

In [10]:
tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")
tf = pt.BatchRetrieve(index, wmodel="Tf")

scores = tf( pt_dataset.get_topics(variant="title").head(1) ).head(10)
print(scores)

  qid   docid                                          docno  rank  score  \
0   1   73688               2001.sigirconf_workshop-2001w1.0     0  128.0   
1   1  122235  1989.sigirjournals_journal-ir0volumeA23A34.11     1   74.0   
2   1   83452                1997.sigirconf_conference-97.33     2   72.0   
3   1   76558                      2000.clef_workshop-2000.0     3   71.0   
4   1  126292             2019.tois_journal-ir0volumeA37A1.2     4   50.0   
5   1   84107              2003.sigirconf_conference-2003.49     5   46.0   
6   1  101146                   2010.spire_conference-2010.0     6   46.0   
7   1  122479   2010.sigirjournals_journal-ir0volumeA44A2.15     7   45.0   
8   1   82479                1998.sigirconf_conference-98.22     8   41.0   
9   1   82496                1998.sigirconf_conference-98.39     9   41.0   

                                      query  
0  retrieval system improving effectiveness  
1  retrieval system improving effectiveness  
2  retrieval s

## Pipeline Construction

## Beispielhafter Versuchsaufbau

In [None]:
from sklearn.model_selection import train_test_split


# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.third_party_integrations import ir_datasets
from tira.rest_api_client import Client
import pyterrier as pt

import pandas as pd
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
#pd.set_option("display.max_rows", None)
pd.set_option("display.precision", 4)

# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [None]:
RANK_CUTOFF = 10
SEED = 10

dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')
#dataset = ir_datasets.load("ir-lab-sose-2024/ir-acl-anthology-20240504-training")

topics = dataset.get_topics() # variant??
qrels = dataset.get_qrels()

train_val_topics, test_topics = train_test_split(topics, test_size=15, random_state=SEED)
train_topics, val_topics = train_test_split(train_val_topics, test_size=15, random_state=SEED)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


In [None]:
# FEATURES

index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', dataset)

bm25 = pt.BatchRetrieve(index, wmodel="BM25")
sdm = pt.rewrite.SDM()
qe = pt.rewrite.Bo1QueryExpansion(index)

ltr_feats1 = (bm25 % RANK_CUTOFF) >> pt.text.get_text(dataset, ["doc_id", "text"]) >> (
    pt.transformer.IdentityTransformer()
    ** # sequential dependence
    (sdm >> bm25)
    #** # score of text for query 'tomato'
    #(pt.apply.query(lambda row: 'tomato') >> bm25)
    ** # score of title (not originally indexed)
    (pt.text.scorer(body_attr="text", takes='docs', wmodel='BM25')) 
#    ** # date 2020
#    (pt.apply.doc_score(lambda row: int("2020" in row["date"])))
    ** # has doi
    (pt.apply.doc_score(lambda row: int( row["doc_id"] is not None and len(row["doc_id"]) > 0) ))
    ** # abstract coordinate match
    pt.BatchRetrieve(index, wmodel="CoordinateMatch")
)

In [None]:
res = ltr_feats1.search("neural")
res = res[["qid", "docid", "docno", "rank", "score", "query", "doc_id", "features"]]
res



Unnamed: 0,qid,docid,docno,rank,score,query,doc_id,features
0,1,46911,N16-1065,0,6.7668,neural,N16-1065,"[6.7668194554010475, 6.7668194554010475, -8.459695810623721, 1.0, 6.7668194554010475]"
1,1,29197,W17-4914,1,6.6466,neural,W17-4914,"[6.646604380515833, 6.646604380515833, -8.334060219255585, 1.0, 6.646604380515833]"
2,1,42273,P18-1223,2,6.5876,neural,P18-1223,"[6.587611735634998, 6.587611735634998, -8.265185567105398, 1.0, 6.587611735634998]"
3,1,39667,P18-2060,3,6.5553,neural,P18-2060,"[6.555326659823115, 6.555326659823115, -8.209298568342081, 1.0, 6.555326659823115]"
4,1,108626,2005.wwwconf_conference-2005si.54,4,6.54,neural,2005.wwwconf_conference-2005si.54,"[6.540047885527931, 6.540047885527931, -8.19331749741392, 1.0, 6.540047885527931]"
5,1,30184,N18-1055,5,6.535,neural,N18-1055,"[6.535039286684509, 6.535039286684509, -8.226413902194334, 1.0, 6.535039286684509]"
6,1,91713,2020.ecir_conference-20202.58,6,6.5232,neural,2020.ecir_conference-20202.58,"[6.523177539953064, 6.523177539953064, -8.188358290973635, 1.0, 6.523177539953064]"
7,1,4564,C18-1266,7,6.512,neural,C18-1266,"[6.5120015894951395, 6.5120015894951395, -8.197809577415388, 1.0, 6.5120015894951395]"
8,1,66623,2021.cl-1.6,8,6.5112,neural,2021.cl-1.6,"[6.5111804822252495, 6.5111804822252495, -8.184819674598971, 1.0, 6.5111804822252495]"
9,1,42858,W17-4123,9,6.5097,neural,W17-4123,"[6.509703010835169, 6.509703010835169, -8.161541294071368, 1.0, 6.509703010835169]"


In [25]:
# Define Pipeline with >> operator
# this is called a composition

pipeline = tf >> tfidf

query = pt_dataset.get_topics(variant="title").head(1)
print(query)

#print(tf(query).head(50))
#print( pipeline( pt_dataset.get_topics(variant="title").head(1) ).head(10) )

  qid                                     query
0   1  retrieval system improving effectiveness
   qid   docid                                          docno  rank  score  \
0    1   73688               2001.sigirconf_workshop-2001w1.0     0  128.0   
1    1  122235  1989.sigirjournals_journal-ir0volumeA23A34.11     1   74.0   
2    1   83452                1997.sigirconf_conference-97.33     2   72.0   
3    1   76558                      2000.clef_workshop-2000.0     3   71.0   
4    1  126292             2019.tois_journal-ir0volumeA37A1.2     4   50.0   
5    1   84107              2003.sigirconf_conference-2003.49     5   46.0   
6    1  101146                   2010.spire_conference-2010.0     6   46.0   
7    1  122479   2010.sigirjournals_journal-ir0volumeA44A2.15     7   45.0   
8    1   82479                1998.sigirconf_conference-98.22     8   41.0   
9    1   82496                1998.sigirconf_conference-98.39     9   41.0   
10   1  122484   2010.sigirjournals_journal-ir

In [20]:
# RANKER

tf_at_10 = tf( pt_dataset.get_topics(variant="title").head(1) ).head(100)
tfidf_at_10 = tfidf( pt_dataset.get_topics(variant="title").head(1) ).head(100)
#print(tf_at_10)
#print(tfidf_at_10)

In [21]:

best_tf_docids = set(tf_at_10["docid"].values)
best_tfidf_docids = set(tfidf_at_10["docid"].values)
#print(best_tf_docids)
#print(best_tfidf_docids)

intersection = best_tf_docids & best_tfidf_docids
print(intersection)



{90115, 83984, 83987, 59419, 82460, 29725, 112161, 82465, 80937, 74282, 82479, 82486, 83000, 123967, 82496, 84545, 126531, 121932, 82513, 82528, 88166, 122478, 122479, 122481, 122484, 126596, 94858, 84107, 84123, 86693, 113323, 82102, 82104, 101568, 106180, 209, 124115, 125153, 50402, 83691, 81643, 81645, 106229, 94453, 81655, 83708, 81664, 83712, 88834, 122117, 122632, 76558, 83728, 77585, 122130, 80658, 93970, 122131, 122133, 80665, 101146, 96540, 80668, 81703, 82229, 77621, 83765, 83768, 83770, 81726, 82250, 114506, 101710, 126292, 123748, 79731, 121718, 122235, 75134, 124799, 75653, 92043, 122252, 82316, 122253, 82327, 126875, 122271, 76197, 81318, 83877, 80304, 83395, 82374, 77772, 101842, 73688, 81884, 83452, 83455}
{74247, 82952, 94732, 126479, 90655, 82472, 87595, 82475, 114223, 111672, 82490, 92224, 80449, 101445, 86088, 53327, 17496, 82538, 84081, 84593, 124023, 83075, 75400, 94858, 94859, 82580, 83604, 79515, 94364, 123051, 96429, 73906, 111285, 82102, 74431, 111300, 94415, 