# BM25+RM3 with Pyserini

### Step 1: Import everything and load variables

In [2]:
from pyserini.search.lucene import LuceneSearcher
import pandas as pd
from tira.third_party_integrations import get_input_directory_and_output_directory, persist_and_normalize_run
import json
from tqdm import tqdm

# Ask the TIRA integration where to read inputs from and where to write the run.
# The path passed here is the small bundled sample input.
input_directory, output_directory = get_input_directory_and_output_directory(
    './sample-input-full-rank'
)

  from .autonotebook import tqdm as notebook_tqdm


I will use a small hardcoded example located in ./sample-input-full-rank.
The output directory is /tmp/


### Step 2: Create Index and Searcher

In [3]:
!mkdir -p /tmp/anserini-docs

with open(f'{input_directory}/documents.jsonl') as documents, open(f'/tmp/anserini-docs/part-01.json', 'w') as ans:
    for doc in tqdm(documents):
        doc = json.loads(doc)
        ans.write(json.dumps({"id": doc['docno'], "contents": doc['text']}) + '\n')

!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input /tmp/anserini-docs \
  --index /tmp/index \
  --generator DefaultLuceneDocumentGenerator \
  --threads 1 \
  --storePositions --storeDocvectors

searcher = LuceneSearcher('/tmp/index')
searcher.set_bm25()
searcher.set_rm3()

5it [00:00, 5370.43it/s]


2023-07-21 07:31:17,518 INFO  [main] index.IndexCollection (IndexCollection.java:250) - Setting log level to INFO
2023-07-21 07:31:17,519 INFO  [main] index.IndexCollection (IndexCollection.java:253) - Starting indexer...
2023-07-21 07:31:17,520 INFO  [main] index.IndexCollection (IndexCollection.java:255) - DocumentCollection path: /tmp/anserini-docs
2023-07-21 07:31:17,520 INFO  [main] index.IndexCollection (IndexCollection.java:256) - CollectionClass: JsonCollection
2023-07-21 07:31:17,520 INFO  [main] index.IndexCollection (IndexCollection.java:257) - Generator: DefaultLuceneDocumentGenerator
2023-07-21 07:31:17,520 INFO  [main] index.IndexCollection (IndexCollection.java:258) - Threads: 1
2023-07-21 07:31:17,521 INFO  [main] index.IndexCollection (IndexCollection.java:259) - Language: en
2023-07-21 07:31:17,521 INFO  [main] index.IndexCollection (IndexCollection.java:260) - Stemmer: porter
2023-07-21 07:31:17,521 INFO  [main] index.IndexCollection (IndexCollection.java:261) - Keep

### Step 3: Create Run

In [4]:
# Retrieve up to 1000 hits per query and collect one record per (query, document) pair.
run_records = []

with open(f'{input_directory}/queries.jsonl', encoding='utf-8') as queries:
    for line in queries:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        query = json.loads(line)
        for hit in searcher.search(query['query'], 1000):
            run_records.append({"qid": query['qid'], "score": hit.score, "docno": hit.docid})

# Explicit columns keep the frame's schema stable even when no query returned a hit.
run = pd.DataFrame(run_records, columns=['qid', 'score', 'docno'])

### Step 4: Persist Run

In [5]:
# Hand the run to the TIRA helper for writing to the output directory.
# NOTE(review): the exact normalization it applies is not visible here; see the TIRA docs.
persist_and_normalize_run(
    run,
    output_file=output_directory,
    system_name='BM25+RM3',
    depth=1000,
)