In [1]:
from sys import modules

IN_COLAB = 'google.colab' in modules
if IN_COLAB:
    !pip install -q ir_axioms[examples] python-terrier

In [2]:
# Start/initialize PyTerrier.
# `started()` is checked first so that re-running this cell in a live kernel
# does not call `init()` a second time.
from pyterrier import started, init

if not started():
    # NOTE(review): tqdm="auto" presumably selects the progress-bar
    # implementation PyTerrier uses — confirm against the init() docs.
    init(tqdm="auto")

PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
from pyterrier.datasets import get_dataset, Dataset

# Load the dataset under its base identifier and under the "/train" and
# "/test" identifiers derived from the same name.
dataset_name = "antique"
dataset: Dataset
dataset_train: Dataset
dataset_test: Dataset
dataset, dataset_train, dataset_test = (
    get_dataset(irds_id)
    for irds_id in (
        f"irds:{dataset_name}",
        f"irds:{dataset_name}/train",
        f"irds:{dataset_name}/test",
    )
)

In [4]:
from pathlib import Path
from pyterrier.index import IterDictIndexer
from ir_axioms.backend.pyterrier import IndexRef, IndexFactory

# Build the index on first use and reuse it afterwards: the presence of the
# index directory is what decides between the two paths.
index_dir = Path(f"cache/indices/{dataset_name.replace('/', '-')}").absolute()
index_ref: IndexRef
if not index_dir.exists():
    # No index directory yet: index the corpus of the base dataset.
    # NOTE(review): only `fields=["text"]` is passed here and no `meta`
    # argument, so whether the 'text' field ends up in the meta index depends
    # on the IterDictIndexer defaults — confirm if downstream code relies on it.
    indexer = IterDictIndexer(str(index_dir))
    index_ref = indexer.index(dataset.get_corpus_iter(), fields=["text"])
else:
    # Directory already present: point at the existing index instead of
    # re-indexing.
    index_ref = IndexRef.of(str(index_dir))
index = IndexFactory.of(index_ref)

In [5]:
from pyterrier.batchretrieve import BatchRetrieve

# BM25 baseline retrieval, cut off at 20 documents per query (num_results=20).
pipeline_bm25 = BatchRetrieve(index, wmodel="BM25", num_results=20)
# Run the baseline on the test topics; as the last expression of the cell,
# the returned value is displayed.
pipeline_bm25(dataset_test.get_topics())

Unnamed: 0,qid,docid,docno,rank,score,query
0,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething
1,3990512,30676,3931664_0,1,15.621619,how can we get concentration onsomething
2,3990512,173781,4366141_0,2,15.395085,how can we get concentration onsomething
3,3990512,179429,1011598_10,3,15.134176,how can we get concentration onsomething
4,3990512,194913,4222212_0,4,15.134176,how can we get concentration onsomething
...,...,...,...,...,...,...
3877,1340574,136249,2323025_6,17,25.933856,why do some people only go to church on easter...
3878,1340574,116651,2036141_2,18,25.540481,why do some people only go to church on easter...
3879,1340574,123602,1757874_0,19,25.338768,why do some people only go to church on easter...
3880,1971899,234068,1971899_6,0,28.234530,what is masturbat


In [6]:
from pathlib import Path

# Relative directory for cached artefacts (resolved against the working
# directory of the kernel). Plain string literal: there is nothing to
# interpolate, so no f-string prefix is needed.
cache_dir = Path("cache/")

In [7]:
from ir_axioms.axiom import (
    ArgUC, QTArg, QTPArg, aSL, PROX1, PROX2, PROX3, PROX4, PROX5, TFC1, TFC3, RS_TF, RS_TF_IDF, RS_BM25, RS_PL2, RS_QL,
    AND, LEN_AND, M_AND, LEN_M_AND, DIV, LEN_DIV, M_TDC, LEN_M_TDC, STMC1, STMC1_f, STMC2, STMC2_f, LNC1, TF_LNC, LB1,
    REG, ANTI_REG
)

# Axiom classes used for feature generation, in list order.
# Deliberately left out (very slow due to network access):
# ArgUC, QTArg, QTPArg.
_axiom_classes = (
    aSL,
    LNC1, TF_LNC, LB1,
    PROX1, PROX2, PROX3, PROX4, PROX5,
    REG, ANTI_REG,
    AND, LEN_AND, M_AND, LEN_M_AND,
    DIV, LEN_DIV,
    RS_TF, RS_TF_IDF, RS_BM25, RS_PL2, RS_QL,
    TFC1, TFC3,
    M_TDC, LEN_M_TDC,
    STMC1, STMC1_f, STMC2, STMC2_f,
)

# Each axiom is instantiated without arguments and wrapped with the unary `~`
# operator.
# NOTE(review): `~` presumably turns on caching for the axiom — confirm
# against the ir_axioms documentation.
axioms = [~axiom_class() for axiom_class in _axiom_classes]

In [8]:
from ir_axioms.backend.pyterrier.transformers import AggregatedAxiomaticPreference

def _bm25_with_preferences(**aggregation_kwargs):
    """Build one `~`-wrapped BM25 >> AggregatedAxiomaticPreference pipeline.

    Any keyword arguments (e.g. ``aggregation=min``) are forwarded to
    ``AggregatedAxiomaticPreference`` on top of the shared settings; calling
    without arguments leaves the aggregation at the transformer's default.
    """
    return ~(
        pipeline_bm25 >>
        AggregatedAxiomaticPreference(
            axioms=axioms,
            index=index,
            dataset=dataset_name,
            verbose=True,
            **aggregation_kwargs
        )
    )


# Three variants of the same pipeline (default, min and max aggregation),
# combined left to right with `^`.
# NOTE(review): in PyTerrier `^` is the result-concatenation operator, while
# `**` is the feature-union operator. If the goal is one feature vector that
# contains all three aggregations, `**` may have been intended — confirm.
pipeline_features = (
    _bm25_with_preferences() ^
    _bm25_with_preferences(aggregation=min) ^
    _bm25_with_preferences(aggregation=max)
)


In [9]:
# Run the feature pipeline on the test topics; as the last expression of the
# cell, the returned value is displayed.
pipeline_features.transform(dataset_test.get_topics())

Aggregating query axiom preferences:   0%|          | 0/198 [00:00<?, ?query/s]

Unnamed: 0,qid,docid,docno,rank,score,query,features
0,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething,"[-3, 1, 0, 0, 0]"
1,3990512,30676,3931664_0,1,15.621619,how can we get concentration onsomething,"[12, 5, 0, 0, 0]"
2,3990512,173781,4366141_0,2,15.395085,how can we get concentration onsomething,"[2, 10, 0, 0, 0]"
3,3990512,179429,1011598_10,3,15.134176,how can we get concentration onsomething,"[2, 5, 0, 0, 0]"
4,3990512,194913,4222212_0,4,15.134176,how can we get concentration onsomething,"[12, 5, 0, 0, 0]"
...,...,...,...,...,...,...,...
3877,1340574,136249,2323025_6,17,25.933856,why do some people only go to church on easter...,"[-4, -5, 0, 0, 0]"
3878,1340574,116651,2036141_2,18,25.540481,why do some people only go to church on easter...,"[-6, -5, 0, 0, 0]"
3879,1340574,123602,1757874_0,19,25.338768,why do some people only go to church on easter...,"[-1, -1, 0, 0, 0]"
3880,1971899,234068,1971899_6,0,28.234530,what is masturbat,"[0, 0, 0, 0, 0]"


In [10]:
from pyterrier.pipelines import Experiment
from pyterrier.ltr import apply_learned_model
from sklearn.ensemble import RandomForestRegressor

# Fixed seed for the random forest: without it every run trains a different
# model, so the learned-to-rank scores cannot be reproduced after a kernel
# restart.
RANDOM_STATE = 42

random_forest = RandomForestRegressor(n_estimators=400, random_state=RANDOM_STATE)
# Feed the feature pipeline's output into the regressor as a learned model.
pipeline_random_forest = pipeline_features >> apply_learned_model(random_forest)
# Train on the topics and relevance judgments of the training split.
pipeline_random_forest.fit(
    dataset_train.get_topics(),
    dataset_train.get_qrels()
)

Aggregating query axiom preferences:   0%|          | 0/2413 [00:00<?, ?query/s]

In [11]:
# Compare the BM25 baseline with the learned-to-rank pipeline on the test
# split, reporting the "map" measure for each system.
systems = [pipeline_bm25, pipeline_random_forest]
test_topics = dataset_test.get_topics()
test_qrels = dataset_test.get_qrels()
measures = ["map"]
system_names = ["BM25 Baseline", "LTR"]

Experiment(systems, test_topics, test_qrels, measures, names=system_names)

Unnamed: 0,name,map
0,BM25 Baseline,0.3203
1,LTR,0.267358
