In [1]:
# Uncomment the next line to install PyTerrier (the `python-terrier` package) if it is missing.
# `%pip` is preferred over `!pip` because it installs into the running kernel's environment.
# %pip install python-terrier

In [2]:
import pandas as pd
import pyterrier as pt

In [3]:
# Load the Terrier index artifact identified by this ID via `pt.Artifact.from_hf`.
MSMARCO_INDEX_ID = "macavaney/msmarco-passage.terrier"
terrier_index = pt.Artifact.from_hf(MSMARCO_INDEX_ID)

# Build a BM25 transformer from the index.
bm25 = terrier_index.bm25()

# Let's see which input column configurations BM25 accepts!
pt.inspect.transformer_inputs(bm25)

Java started (triggered by TerrierIndex.index_ref) and loaded: pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


09:02:44.344 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.9 GiB of memory would be required.


[['qid', 'query'],
 ['qid', 'query', 'docid'],
 ['qid', 'query_toks'],
 ['qid', 'docno', 'query']]

In [4]:
# Nice - BM25 accepts several different input configurations.
# Take the first one reported above and ask which columns BM25 produces for it.
retrieve_input_columns = ['qid', 'query']
pt.inspect.transformer_outputs(bm25, retrieve_input_columns)

['qid', 'docid', 'docno', 'rank', 'score', 'query']

In [5]:
# Passing in qid and query yields qid, docid, docno, rank, score, and query.
# Let's see it in action on a one-row query frame!
single_query = pd.DataFrame({'qid': ['1'], 'query': ['terrier']})
bm25(single_query)

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,299652,299652,0,22.947724,terrier
1,1,2873114,2873114,1,22.879183,terrier
2,1,6088052,6088052,2,22.667023,terrier
3,1,1740938,1740938,3,22.372571,terrier
4,1,5944797,5944797,4,22.250772,terrier
...,...,...,...,...,...,...
995,1,8251288,8251288,995,15.911523,terrier
996,1,8454982,8454982,996,15.911523,terrier
997,1,8796747,8796747,997,15.911523,terrier
998,1,595850,595850,998,15.906338,terrier


In [6]:
# Pipelines made of several transformers can be inspected as well.
text_loader = terrier_index.text_loader()
bm25_and_load_text = bm25 >> text_loader

# With the text loader appended, the reported output columns also include 'text'.
pt.inspect.transformer_outputs(bm25_and_load_text, ['qid', 'query'])

['qid', 'docid', 'rank', 'score', 'query', 'docno', 'text']

In [7]:
# Let's say we pass in an invalid configuration (a dense vector version of the query).
# The resulting error tells us which inputs we should have provided instead.
try:
    pt.inspect.transformer_outputs(bm25, ['qid', 'query_vec'])
except pt.validate.InputValidationError as err:
    # The failure is the point of this cell, so catch it and display the message rather
    # than letting the exception stop a "Restart Kernel -> Run All" before later cells run.
    print(f"{type(err).__name__}: {err}")

InputValidationError: DataFrame(columns=['qid', 'query_vec']) does not match required columns for this transformer. TerrierRetr(BM25) [TransformerMode(missing_columns=['query'], extra_columns=[], mode_name='retrieve'), TransformerMode(missing_columns=['query', 'docid'], extra_columns=[], mode_name='rerank'), TransformerMode(missing_columns=['query_toks'], extra_columns=[], mode_name='retrieve_toks'), TransformerMode(missing_columns=['docno', 'query'], extra_columns=[], mode_name='rerank')]

In [8]:
# We can also check a transformer's attributes: the call below returns a list of
# TransformerAttribute records (name, value, init default, parameter kind) for bm25.
pt.inspect.transformer_attributes(bm25)

[TransformerAttribute(name='index_location', value=<org.terrier.querying.IndexRef at 0x7fad60219ee0 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x5ae3818 at 0x7fad78833b10>>, init_default_value=<class 'inspect._empty'>, init_parameter_kind=None),
 TransformerAttribute(name='num_results', value=1000, init_default_value=<class 'inspect._empty'>, init_parameter_kind=None),
 TransformerAttribute(name='metadata', value=['docno'], init_default_value=<class 'inspect._empty'>, init_parameter_kind=None),
 TransformerAttribute(name='wmodel', value='BM25', init_default_value=<class 'inspect._empty'>, init_parameter_kind=None),
 TransformerAttribute(name='threads', value=1, init_default_value=<class 'inspect._empty'>, init_parameter_kind=None),
 TransformerAttribute(name='verbose', value=False, init_default_value=<class 'inspect._empty'>, init_parameter_kind=None),
 TransformerAttribute(name='terrierql', value='on', init_default_value=<class 'inspect._empty'>, init_parameter_kind=None

In [9]:
# pt.inspect can also hand back a new version of the transformer with modified attributes.
# The attribute name contains a dot, so it is supplied through an unpacked dict.
attribute_overrides = {'bm25.b': 0.9}
new_bm25 = pt.inspect.transformer_apply_attributes(bm25, **attribute_overrides)

# List the new transformer's attributes; bm25.b should now appear with the updated value.
pt.inspect.transformer_attributes(new_bm25)

[TransformerAttribute(name='index_location', value=<org.terrier.querying.IndexRef at 0x7fad60219ee0 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x5ae3818 at 0x7fad78833b10>>, init_default_value=<class 'inspect._empty'>, init_parameter_kind=None),
 TransformerAttribute(name='num_results', value=1000, init_default_value=<class 'inspect._empty'>, init_parameter_kind=None),
 TransformerAttribute(name='metadata', value=['docno'], init_default_value=<class 'inspect._empty'>, init_parameter_kind=None),
 TransformerAttribute(name='wmodel', value='BM25', init_default_value=<class 'inspect._empty'>, init_parameter_kind=None),
 TransformerAttribute(name='threads', value=1, init_default_value=<class 'inspect._empty'>, init_parameter_kind=None),
 TransformerAttribute(name='verbose', value=False, init_default_value=<class 'inspect._empty'>, init_parameter_kind=None),
 TransformerAttribute(name='terrierql', value='on', init_default_value=<class 'inspect._empty'>, init_parameter_kind=None