# BM25 Re-Ranking with Pyserini

### Step 1: Import everything and load variables

In [1]:
import gzip
import json
import shutil
import subprocess
import sys
from pathlib import Path

import pandas as pd
from pyserini.search.lucene import LuceneSearcher
from tira.third_party_integrations import get_input_directory_and_output_directory, persist_and_normalize_run
from tqdm import tqdm

# Ask the TIRA helper where to read the re-rank input and where to write the
# run. './sample-input' is the directory used for this local run (see the cell
# output); presumably TIRA substitutes its own directories inside the sandbox
# -- TODO confirm against the tira documentation.
input_directory, output_directory = get_input_directory_and_output_directory(
    './sample-input'
)

  from .autonotebook import tqdm as notebook_tqdm


I will use a small hardcoded example located in ./sample-input.
The output directory is /tmp/


### Step 2: Load and Group the Data


In [2]:
# Group the re-rank candidates by query id. Resulting layout:
#   data[qid] = {'query': <query text>, <docno>: <document text>, ...}
# The query text shares the dict with the documents, so consumers must treat
# the 'query' key specially.
data = {}

with gzip.open(f'{input_directory}/rerank.jsonl.gz') as rerank:
    for raw_line in rerank:
        record = json.loads(raw_line)
        qid = record['qid']

        # The first record seen for a query creates its entry and stores the query text.
        if qid not in data:
            data[qid] = {'query': record['query']}

        data[qid][record['docno']] = record['text']


### Step 3: Re-rank the Data

In [3]:
# Re-rank the candidate documents of every query with BM25.
#
# For each query a throw-away Lucene index is built from that query's
# candidates only, the query is searched against it, and every candidate gets
# a row in the run. Candidates the search does not return receive a score one
# below the lowest retrieved score, so they rank after all retrieved documents.

# Scratch locations; wiped and rebuilt for every query.
work_dir = Path('/tmp/anserini-re-rank')
docs_dir = work_dir / 'anserini-docs'
index_dir = work_dir / 'index'

run_rows = []

for qid in tqdm(data):
    # data[qid] stores the query text under 'query' next to the
    # docno -> text entries. Separate the two so the query is neither indexed
    # as a document nor emitted as a 'query' row in the run.
    # NOTE: with this layout a document whose docno is literally 'query'
    # cannot be told apart from the query entry.
    query = data[qid]['query']
    docs = {docno: text for docno, text in data[qid].items() if docno != 'query'}
    if not docs:
        # Nothing to index or rank for this query.
        continue

    # Start from an empty working directory so documents of the previous
    # query never leak into this query's index.
    shutil.rmtree(work_dir, ignore_errors=True)
    docs_dir.mkdir(parents=True, exist_ok=True)
    with open(docs_dir / 'part-01.json', 'w') as collection_file:
        for docno, text in docs.items():
            collection_file.write(json.dumps({"id": docno, "contents": text}) + '\n')

    # Run the indexer with the kernel's own interpreter (the one that imported
    # pyserini above); check=True raises if indexing fails instead of silently
    # continuing with a missing or stale index.
    subprocess.run(
        [
            sys.executable, '-m', 'pyserini.index.lucene',
            '--collection', 'JsonCollection',
            '--input', str(docs_dir),
            '--index', str(index_dir),
            '--generator', 'DefaultLuceneDocumentGenerator',
            '--threads', '1',
            '--storePositions', '--storeDocvectors',
        ],
        check=True,
    )

    searcher = LuceneSearcher(str(index_dir))
    searcher.set_bm25()
    # Ask for as many hits as there are candidates so every matching document
    # keeps its real BM25 score, however large the candidate set is.
    scores = {hit.docid: hit.score for hit in searcher.search(query, len(docs))}

    # default=0 covers a query that matches none of its candidates.
    min_score = min(scores.values(), default=0) - 1
    for docno in docs:
        run_rows.append({"qid": qid, "score": scores.get(docno, min_score), "docno": docno})

# Explicit columns keep the frame well-formed even when no rows were produced.
run = pd.DataFrame(run_rows, columns=['qid', 'score', 'docno'])

  0%|                                                                                                                       | 0/2 [00:00<?, ?it/s]

2023-07-21 08:11:37,595 INFO  [main] index.IndexCollection (IndexCollection.java:250) - Setting log level to INFO
2023-07-21 08:11:37,597 INFO  [main] index.IndexCollection (IndexCollection.java:253) - Starting indexer...
2023-07-21 08:11:37,597 INFO  [main] index.IndexCollection (IndexCollection.java:255) - DocumentCollection path: /tmp/anserini-re-rank/anserini-docs
2023-07-21 08:11:37,598 INFO  [main] index.IndexCollection (IndexCollection.java:256) - CollectionClass: JsonCollection
2023-07-21 08:11:37,598 INFO  [main] index.IndexCollection (IndexCollection.java:257) - Generator: DefaultLuceneDocumentGenerator
2023-07-21 08:11:37,598 INFO  [main] index.IndexCollection (IndexCollection.java:258) - Threads: 1
2023-07-21 08:11:37,598 INFO  [main] index.IndexCollection (IndexCollection.java:259) - Language: en
2023-07-21 08:11:37,598 INFO  [main] index.IndexCollection (IndexCollection.java:260) - Stemmer: porter
2023-07-21 08:11:37,599 INFO  [main] index.IndexCollection (IndexCollection

 50%|███████████████████████████████████████████████████████▌                                                       | 1/2 [00:03<00:03,  3.93s/it]

2023-07-21 08:11:41,454 INFO  [main] index.IndexCollection (IndexCollection.java:250) - Setting log level to INFO
2023-07-21 08:11:41,456 INFO  [main] index.IndexCollection (IndexCollection.java:253) - Starting indexer...
2023-07-21 08:11:41,456 INFO  [main] index.IndexCollection (IndexCollection.java:255) - DocumentCollection path: /tmp/anserini-re-rank/anserini-docs
2023-07-21 08:11:41,456 INFO  [main] index.IndexCollection (IndexCollection.java:256) - CollectionClass: JsonCollection
2023-07-21 08:11:41,456 INFO  [main] index.IndexCollection (IndexCollection.java:257) - Generator: DefaultLuceneDocumentGenerator
2023-07-21 08:11:41,457 INFO  [main] index.IndexCollection (IndexCollection.java:258) - Threads: 1
2023-07-21 08:11:41,457 INFO  [main] index.IndexCollection (IndexCollection.java:259) - Language: en
2023-07-21 08:11:41,457 INFO  [main] index.IndexCollection (IndexCollection.java:260) - Stemmer: porter
2023-07-21 08:11:41,457 INFO  [main] index.IndexCollection (IndexCollection

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.79s/it]


### Step 4: Persist Run

In [4]:
# Hand the re-ranked run to the TIRA helper for writing, labelled as 'BM25'.
# NOTE(review): depth=1000 presumably caps the number of results kept per
# query -- confirm against the tira documentation.
persist_and_normalize_run(
    run,
    output_file=output_directory,
    system_name='BM25',
    depth=1000,
)