In [None]:
!apt-get install -y openjdk-21-jdk maven
!pip install pyserini
!pip install ir_datasets


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  ca-certificates-java default-jre-headless fonts-dejavu-core
  fonts-dejavu-extra java-common libaopalliance-java libapache-pom-java
  libatinject-jsr330-api-java libatk-wrapper-java libatk-wrapper-java-jni
  libcdi-api-java libcommons-cli-java libcommons-io-java libcommons-lang3-java
  libcommons-parent-java libgeronimo-annotation-1.3-spec-java
  libgeronimo-interceptor-3.0-spec-java libguava-java libguice-java
  libhawtjni-runtime-java libjansi-java libjansi-native-java libjsr305-java
  libmaven-parent-java libmaven-resolver-java libmaven-shared-utils-java
  libmaven3-core-java libpcsclite1 libplexus-cipher-java
  libplexus-classworlds-java libplexus-component-annotations-java
  libplexus-interpolation-java libplexus-sec-dispatcher-java
  libplexus-utils2-java libsisu-inject-java libsisu-plexus-java libslf4j-java
  libwagon-file-java l

In [None]:
# Clone the anserini-tools repository into ./anserini-tools.
# The recorded run of this cell received 795.88 MiB, so this is a heavy step.
# NOTE(review): none of the cells visible in this notebook reference the
# checkout -- confirm it is still needed before re-running.
!git clone https://github.com/castorini/anserini-tools

Cloning into 'anserini-tools'...
remote: Enumerating objects: 1163, done.[K
remote: Counting objects: 100% (126/126), done.[K
remote: Compressing objects: 100% (67/67), done.[K
remote: Total 1163 (delta 74), reused 96 (delta 59), pack-reused 1037 (from 3)[K
Receiving objects: 100% (1163/1163), 795.88 MiB | 34.71 MiB/s, done.
Resolving deltas: 100% (259/259), done.
Updating files: 100% (860/860), done.


In [1]:
import json
from pathlib import Path
import ir_datasets

# Dataset handle: the BEIR TREC-COVID collection, resolved through
# ir_datasets. Every later cell reads documents, queries and qrels from it.
DATASET_ID = "beir/trec-covid"
dataset = ir_datasets.load(DATASET_ID)


In [2]:
# Handles onto the three parts of the collection.
# `queries` is materialised as a list so it can be indexed later on;
# `docs` and `qrels` are kept as the lazy objects returned by ir_datasets.
# NOTE(review): these two are presumably single-pass iterators -- a cell that
# loops over them a second time would then see no items; confirm.
queries = list(dataset.queries_iter())
docs = dataset.docs_iter()
qrels = dataset.qrels_iter()

In [3]:
# JSONL export of the document collection.
# Each line is one JSON object {"id": <doc_id>, "contents": <title + text>},
# written to data/trec_covid_jsonl/docs.jsonl -- the folder the indexing cell
# reads as its --input.
output_folder = Path("data/trec_covid_jsonl")
output_folder.mkdir(parents=True, exist_ok=True)
output_path = output_folder / "docs.jsonl"

with open(output_path, "w", encoding="utf-8") as f_out:
    # Ask the dataset for a fresh iterator instead of consuming the shared
    # `docs` object created in another cell: the file is opened in "w" mode,
    # so re-running this cell with an already exhausted iterator would
    # silently replace the collection with an empty file.
    for doc in dataset.docs_iter():
        # Documents without a title attribute (or with a None/empty title)
        # contribute only their text; strip() removes the joining space.
        title = getattr(doc, "title", None) or ""
        text = f"{title} {doc.text or ''}".strip()
        record = {"id": doc.doc_id, "contents": text}
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f_out.write(json.dumps(record, ensure_ascii=False) + "\n")

In [4]:
# Export the relevance judgements to data/qrels/qrels.txt, one judgement per
# line as space-separated columns: query_id iteration doc_id relevance.
qrels_folder = Path("data/qrels")
qrels_folder.mkdir(parents=True, exist_ok=True)
qrels_path = qrels_folder / "qrels.txt"

with open(qrels_path, "w", encoding="utf-8") as f_out:
    # Request a fresh iterator rather than reusing the shared `qrels` object
    # from another cell, so that re-running this cell on its own cannot
    # overwrite the file with an empty one.
    for qrel in dataset.qrels_iter():
        f_out.write(f"{qrel.query_id} {qrel.iteration} {qrel.doc_id} {qrel.relevance}\n")

In [5]:
# pyserini index creation
from pyserini.index.lucene import LuceneIndexReader
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input data/trec_covid_jsonl \
  --index pyserini_indexes/trec_covid_index \
  --generator DefaultLuceneDocumentGenerator \
  --threads 4 \
  --storePositions\
  --storeDocvectors \
  --storeRaw



2025-11-21 15:04:35,477 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:205) - Setting log level to INFO
2025-11-21 15:04:35,478 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:209) - AbstractIndexer settings:
2025-11-21 15:04:35,478 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:210) -  + DocumentCollection path: data/trec_covid_jsonl
2025-11-21 15:04:35,478 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:211) -  + CollectionClass: JsonCollection
2025-11-21 15:04:35,478 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:212) -  + Index path: pyserini_indexes/trec_covid_index
2025-11-21 15:04:35,479 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:213) -  + Threads: 4
2025-11-21 15:04:35,479 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:214) -  + Optimize (merge segments)? false
Nov 21, 2025 3:04:35 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable 

In [6]:
# Term statistics lookup (how often a term occurs in the indexed documents).
# Not needed (exploratory check only).
index_reader = LuceneIndexReader('pyserini_indexes/trec_covid_index')

term = 'beauty'

# analyze() yields the token list the index analyzer produces for the term;
# an empty result means nothing was left after analysis.
analyzed_form = index_reader.analyze(term)

if not analyzed_form:
    print(f'Term "{term}" is a stopword or not indexed.')
else:
    # Document frequency (df) and collection frequency (cf) of the term.
    df, cf = index_reader.get_term_counts(term)
    print(f'Analyzed form of term "{analyzed_form[0]}": df={df}, cf={cf}')

Analyzed form of term "beauti": df=28, cf=30


Nov 21, 2025 3:05:02 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


In [7]:
# BM25 searching: run one query against the Lucene index and show, for each
# of the top-10 hits, its score and the relevance recorded in the qrels file.
from pyserini.search.lucene import LuceneSearcher
import pandas as pd

# Second query of the collection; position 0 of a query record is its id and
# position 1 its text.
query_id = int(queries[1][0])
query_text = queries[1][1]

lucene_bm25_searcher = LuceneSearcher('pyserini_indexes/trec_covid_index')
hits = lucene_bm25_searcher.search(query_text, k=10)

print(query_id," ", query_text, ":\n" )

# Load the judgements written by the qrels export cell. doc_id is pinned to
# str so that identifiers that happen to look numeric are not converted by
# pandas' type inference and still compare equal to hit.docid (a str).
data_frame = pd.read_csv(
    "data/qrels/qrels.txt",
    sep=" ",
    names=["query_id", "iteration", "doc_id", "relevance"],
    dtype={"query_id": int, "doc_id": str},
)
# Judgements for the query being inspected.
filtered = data_frame[data_frame["query_id"] == query_id]

# One doc_id -> relevance mapping built up front, instead of re-filtering the
# DataFrame for every hit. drop_duplicates keeps the first judgement of a
# document, the same row the per-hit filter would have picked.
relevance_by_doc = (
    filtered.drop_duplicates("doc_id")
    .set_index("doc_id")["relevance"]
    .to_dict()
)

for rank, hit in enumerate(hits, start=1):
    relevance_value = relevance_by_doc.get(hit.docid)
    if relevance_value is None:
        # No judgement exists for this (query, document) pair: it is reported
        # as relevance 0, with a marker line so that unjudged hits can be told
        # apart from hits judged non-relevant.
        relevance_value = 0
        print("non esiste")

    print(f"{rank:2}. DOC: {hit.docid:7} SCORE: {hit.score:.5f} | Relevance: {relevance_value}")

# To inspect the stored raw document of a hit: hits[8].lucene_document.get('raw')


2   how does the coronavirus respond to changes in the weather :

 1. DOC: w5kjmw88 SCORE: 10.45690 | Relevance: 2
 2. DOC: gan10za0 SCORE: 10.33940 | Relevance: 1
 3. DOC: 124czudi SCORE: 9.44770 | Relevance: 0
 4. DOC: 526elsrf SCORE: 8.48690 | Relevance: 2
 5. DOC: h5ufxzv9 SCORE: 8.25990 | Relevance: 0
 6. DOC: r1yjphnn SCORE: 8.02550 | Relevance: 2
 7. DOC: w7ycc07b SCORE: 8.02550 | Relevance: 2
 8. DOC: amzc5yrd SCORE: 7.97300 | Relevance: 0
non esiste
 9. DOC: 16k5946u SCORE: 7.85980 | Relevance: 0
non esiste
10. DOC: xwz7hj2b SCORE: 7.85980 | Relevance: 0


In [None]:
!python -m pyserini.encode \
  input   --corpus data/trec_covid_jsonl \
          --fields contents \
          --delimiter "\n" \
          --shard-id 0 \
          --shard-num 1 \
  output  --embeddings pyserini_indexes/dense_vector_index \
  encoder --encoder castorini/tct_colbert-v2-hnp-msmarco \
          --fields text \
          --batch 8 \
          --device cpu



171332it [00:00, 216285.04it/s]
  0%|                                                 | 0/21417 [00:00<?, ?it/s]
Traceback (most recent call last):
  File [35m"<frozen runpy>"[0m, line [35m198[0m, in [35m_run_module_as_main[0m
  File [35m"<frozen runpy>"[0m, line [35m88[0m, in [35m_run_code[0m
  File [35m"/Users/luuk/Uni/IR/2025IRProject/.venv/lib/python3.13/site-packages/pyserini/encode/__main__.py"[0m, line [35m144[0m, in [35m<module>[0m
    kwargs['texts'] = [31mbatch_info[0m[1;31m['text'][0m # pyserini text encoders takes 'texts' as default input
                      [31m~~~~~~~~~~[0m[1;31m^^^^^^^^[0m
[1;35mKeyError[0m: [35m'text'[0m


In [None]:
from pyserini.search.lucene import LuceneHnswDenseSearcher

# Dense retrieval for the query text chosen in the BM25 cell; prints rank,
# document id and score of the top-10 hits.
# NOTE(review): the index path is the directory written by the BM25 indexing
# cell, not the dense_vector_index output of the encoding cell -- confirm it
# really holds an HNSW index.
# NOTE(review): the query encoder here is 'BgeBaseEn15', while the encoding
# cell uses castorini/tct_colbert-v2-hnp-msmarco -- confirm they are meant to
# differ.
lucene_hnsw_searcher = LuceneHnswDenseSearcher(
    'pyserini_indexes/trec_covid_index',
    ef_search=1000,
    encoder='BgeBaseEn15',
)
hits_ = lucene_hnsw_searcher.search(query_text, 10)

for rank, hit in enumerate(hits_, start=1):
    print(f'{rank:2} {hit.docid:7} {hit.score:.5f}')