In [1]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import json
import time
import random
import pandas as pd
import os
import subprocess

# Metrics
from src.metrics import (
    mrr_score,
    map_score,
    mr_score,
    mf1_score,
    mndcg_score,
)

# Implementations
from rank_bm25 import BM25Okapi
from pyserini.search.lucene import LuceneSearcher
from fastbm25 import fastbm25

import pickle

In [2]:
index_folder = 'index'

In [3]:
# Load datasets

with open('data/temp_data/dataset.pkl', 'rb') as f:
    dataset = pickle.load(f)
dataset_tokenized_docs = [word_tokenize(doc) for doc in tqdm(dataset.documents.values())]

with open('data/temp_data/dataset_lower.pkl', 'rb') as f:
    dataset_lower = pickle.load(f)
dataset_lower_tokenized_docs = [word_tokenize(doc) for doc in tqdm(dataset_lower.documents.values())]

with open('data/temp_data/dataset_prepro.pkl', 'rb') as f:
    dataset_prepro = pickle.load(f)
dataset_prepro_tokenized_docs = [word_tokenize(doc) for doc in tqdm(dataset_prepro.documents.values())]

docs_ids = list(dataset.documents.keys())

100%|██████████| 277168/277168 [00:56<00:00, 4866.22it/s]
100%|██████████| 277168/277168 [00:56<00:00, 4928.77it/s]
100%|██████████| 277168/277168 [00:28<00:00, 9891.09it/s] 


In [4]:
# Build indexes
os.makedirs(index_folder, exist_ok=True)

def build_index(dataset, dataset_name):
    index_dataset_folder = os.path.join(index_folder, 'input_' + dataset_name)
    os.makedirs(index_dataset_folder, exist_ok=True)

    with open(os.path.join(index_dataset_folder, 'index.jsonl'), 'w') as f:
        for id, text in tqdm(dataset.documents.items()):
            f.write(json.dumps({"id": id, "contents": text}) + "\n")
    
    index_dataset_folder_output = index_dataset_folder.replace('input_', 'output_')
    os.makedirs(index_dataset_folder_output, exist_ok=True)
    cmd = [
        "python", "-m", "pyserini.index.lucene",
        "--collection", "JsonCollection",
        "--input", index_dataset_folder,
        "--index", index_dataset_folder_output,
        "--generator", "DefaultLuceneDocumentGenerator",
        "--threads", "8",
        "--storePositions", "--storeDocvectors", "--storeRaw"
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

build_index(dataset, 'dataset')
build_index(dataset_lower, 'dataset_lower')
build_index(dataset_prepro, 'dataset_prepro')

100%|██████████| 277168/277168 [00:02<00:00, 138566.94it/s]
100%|██████████| 277168/277168 [00:02<00:00, 118615.18it/s]
100%|██████████| 277168/277168 [00:01<00:00, 160500.02it/s]


In [5]:
random.seed(42)
queries_ids = random.choices(list(dataset.train_query_ids), k=len(dataset.test_query_ids))
print(f'Using {len(queries_ids)} queries for validate BM25 implementation')

Using 555 queries for validate BM25 implementation


In [6]:
models = [
    ('dataset', BM25Okapi(dataset_tokenized_docs), fastbm25(dataset_tokenized_docs), LuceneSearcher(os.path.join(index_folder, 'output_dataset'))),
    ('dataset_lower', BM25Okapi(dataset_lower_tokenized_docs), fastbm25(dataset_lower_tokenized_docs), LuceneSearcher(os.path.join(index_folder, 'output_dataset_lower'))),
    ('dataset_prepro', BM25Okapi(dataset_prepro_tokenized_docs), fastbm25(dataset_prepro_tokenized_docs), LuceneSearcher(os.path.join(index_folder, 'output_dataset_prepro'))),
]

In [7]:
results_score_docs = {}
results_times = {}
for dataset_name, bm25, bm25_f, bm25_s in models:
    score_docs = {}
    score_docs_f = {}
    score_docs_s = {}

    times = []
    times_f = []
    times_s = []

    k = 100

    for query_id in tqdm(queries_ids, desc=f'Processing {dataset_name}'):
        if dataset_name == 'dataset':
            query = dataset.queries[query_id]
        elif dataset_name == 'dataset_lower':
            query = dataset_lower.queries[query_id]
        elif dataset_name == 'dataset_prepro':
            query = dataset_prepro.queries[query_id]
        tokenized_query = word_tokenize(query)

        start = time.time()
        scores = bm25.get_scores(tokenized_query)
        scores = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[:k]
        scores = [(docs_ids[i], float(score)) for i, score in scores]
        end = time.time()
        times.append(end - start)
        score_docs[query_id] = scores

        start_f = time.time()
        scores_f = bm25_f.top_k_sentence(tokenized_query, k)
        scores_f = [(docs_ids[index], score) for doc, index, score in scores_f]
        end_f = time.time()
        times_f.append(end_f - start_f)
        score_docs_f[query_id] = scores_f

        start_s = time.time()
        scores_s = bm25_s.search(query, k)
        scores_s = [(hit.docid, hit.score) for hit in scores_s]
        end_s = time.time()
        times_s.append(end_s - start_s)
        score_docs_s[query_id] = scores_s

    results_score_docs[dataset_name] = {
        'bm25': score_docs,
        'fastbm25': score_docs_f,
        'pyserini': score_docs_s,
    }

    results_times[dataset_name] = {
        'bm25': times,
        'fastbm25': times_f,
        'pyserini': times_s,
    }

Processing dataset:  52%|█████▏    | 288/555 [05:24<05:01,  1.13s/it]


KeyboardInterrupt: 

In [None]:
results_bm25_folder = os.path.join('results', 'raw', 'bm25')
os.makedirs(results_bm25_folder, exist_ok=True)

for dataset_name, scores in results_score_docs.items():
    for implementation, score_docs in scores.items():
        with open(os.path.join(results_bm25_folder, f'{dataset_name}_{implementation}.json'), 'w') as f:
            json.dump({
                'score_docs': score_docs,
                'times': results_times[dataset_name][implementation],
            }, f)

In [None]:
k = 10
results_list = []
for dataset_name in ['dataset', 'dataset_lower', 'dataset_prepro']:
    score_docs = results_score_docs[dataset_name]['bm25']
    score_docs_f = results_score_docs[dataset_name]['fastbm25']
    score_docs_s = results_score_docs[dataset_name]['pyserini']

    times = results_times[dataset_name]['bm25']
    times_f = results_times[dataset_name]['fastbm25']
    times_s = results_times[dataset_name]['pyserini']
    for name, ss, ts in [
        ('bm25', score_docs, times),
        ('fastbm25', score_docs_f, times_f),
        ('pyserini', score_docs_s, times_s),
    ]:
        results_list.append({
            'name': name,
            'dataset': dataset_name,
            'mrr': mrr_score(ss, dataset.qrels, k=k),
            'map': map_score(ss, dataset.qrels, k=k),
            'mr': mr_score(ss, dataset.qrels, k=k),
            'mf1': mf1_score(ss, dataset.qrels, k=k),
            'mndcg': mndcg_score(ss, dataset.qrels, k=k),
            'avg_time': sum(ts) / len(ts),
            'time_std': pd.Series(ts).std(),
            'time_max': max(ts),
            'time_min': min(ts),
        })
results = pd.DataFrame(results_list)
results.to_csv(os.path.join(results_bm25_folder, 'metrics.csv'), index=False)
results.sort_values(by=['mrr']).style.format({
    'mrr': '{:.2f}',
    'map': '{:.2f}',
    'mr': '{:.2f}',
    'mf1': '{:.2f}',
    'mndcg': '{:.2f}',
    'avg_time': '{:.2f}',
    'max_time': '{:.2f}',
    'min_time': '{:.2f}',
    'std_time': '{:.2f}',
})

Unnamed: 0,name,dataset,mrr,map,mr,mf1,mndcg,avg_time,time_std,time_max,time_min
0,bm25,dataset,0.19,0.03,0.32,0.06,0.22,1.45,0.791564,16.244423,0.552158
1,fastbm25,dataset,0.19,0.03,0.32,0.06,0.22,0.19,0.389642,8.839134,0.0
3,bm25,dataset_lower,0.43,0.06,0.62,0.11,0.46,1.55,0.729511,12.155577,0.549005
4,fastbm25,dataset_lower,0.43,0.06,0.62,0.11,0.46,0.19,0.160835,2.126905,0.000993
7,fastbm25,dataset_prepro,0.48,0.07,0.67,0.12,0.52,0.07,0.143135,3.091843,0.001001
6,bm25,dataset_prepro,0.48,0.07,0.67,0.12,0.52,1.12,0.562874,11.776602,0.531992
8,pyserini,dataset_prepro,0.5,0.07,0.67,0.12,0.53,0.01,0.02802,0.548584,0.002884
2,pyserini,dataset,0.51,0.07,0.68,0.13,0.54,0.03,0.158303,3.710677,0.006531
5,pyserini,dataset_lower,0.51,0.07,0.68,0.13,0.54,0.02,0.045316,0.988006,0.0
