In [None]:
import os

# Detect if we are in the TIRA sandbox
# Install the required dependencies if we are not in the sandbox.
if 'TIRA_DATASET_ID' not in os.environ:
    !pip3 install python-terrier tira==0.0.88 ir_datasets trectools
else:
    print('We are in the TIRA sandbox.')


In [None]:
# create index
import pyterrier as pt

def create_index(documents):
    """Index ``documents`` with PyTerrier and return the opened index.

    Args:
        documents: Iterable of objects exposing ``doc_id`` and ``text``
            attributes; each one is turned into a ``{'docno', 'text'}``
            record for the indexer.

    Returns:
        The object produced by ``pt.IndexFactory.of`` for the index
        reference that the indexer returns.

    Side effects:
        Builds the index under ``./tmp/index``; ``overwrite=True`` is passed
        to the indexer.
    """
    # NOTE(review): ``stopwords`` receives a file path here — confirm that the
    # installed PyTerrier version accepts a path (and not only its built-in
    # options or a list of terms), otherwise the list may not be applied.
    # ``meta`` presumably gives the maximum stored length per metadata field
    # — TODO confirm.
    indexer_options = {
        'overwrite': True,
        'stopwords': "./stopwordlists/stopwords_english_long.txt",
        'meta': {'docno': 100, 'text': 20480},
        'stemmer': 'porter',
    }
    indexer = pt.IterDictIndexer("./tmp/index", **indexer_options)

    # Lazily map each document to the dict layout handed to the indexer.
    records = ({'docno': doc.doc_id, 'text': doc.text} for doc in documents)
    index_ref = indexer.index(records)
    return pt.IndexFactory.of(index_ref)


In [None]:
# create model
import pyterrier as pt

def create_model(index, wmodel="BM25"):
    """Build a PyTerrier retriever over ``index``.

    Args:
        index: Index object (or reference) passed straight through to
            ``pt.BatchRetrieve``.
        wmodel: Name of the weighting model to use. Defaults to ``"BM25"``,
            so existing ``create_model(index)`` calls behave as before.

    Returns:
        The ``pt.BatchRetrieve`` instance configured with ``wmodel``.
    """
    return pt.BatchRetrieve(index, wmodel=wmodel)

In [None]:
# load dataset
from tira.third_party_integrations import ir_datasets
import pyterrier as pt

def load_dataset():
    """Load the training queries and documents.

    ``ir_datasets`` here is the wrapper imported from
    ``tira.third_party_integrations``, not the upstream package.

    Returns:
        dict with two entries:
            'documents': the result of ``docs_iter()`` on the loaded dataset.
            'queries': the result of ``pt.io.read_topics`` on the dataset's
                topics file, read with ``format='trecxml'``.
    """
    dataset_id = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'

    # Resolve the topics file first, then parse it.
    topics_path = ir_datasets.topics_file(dataset_id)
    topics = pt.io.read_topics(topics_path, format='trecxml')

    corpus = ir_datasets.load(dataset_id)
    return dict(documents=corpus.docs_iter(), queries=topics)

In [None]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
import pyterrier as pt

from load_dataset import load_dataset 
from create_index import create_index
from create_model import create_model
from test_model import test_model


# End-to-end baseline: start PyTerrier, load the data, build the index,
# create the retrieval model, run it on the queries and persist the run.

# Start the PyTerrier runtime if it is not running yet.
# NOTE(review): pt.init() runs before ensure_pyterrier_is_loaded(); if the TIRA
# helper is meant to perform the initialisation itself (e.g. with
# sandbox-specific settings), this explicit init may pre-empt it — confirm the
# intended order.
if not pt.started():
    pt.init()

ensure_pyterrier_is_loaded()

# load data: the documents to index and the queries to run
load_dataset_result = load_dataset()
documents, queries = load_dataset_result['documents'], load_dataset_result['queries']
print("data load")

# create index over the loaded documents
index = create_index(documents)
print("index created")

# create the retrieval model on top of the index
model = create_model(index)
print("model created")

# run model on all queries
run = model(queries)
# Hand the run to the TIRA helper; 'bm25-baseline' is presumably the
# system/run name recorded with the output — TODO confirm.
persist_and_normalize_run(run, 'bm25-baseline')