In [7]:
!pip install beir
!pip install -U sentence-transformers
!pip install rank_bm25

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [44]:
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader

import logging, pathlib, os, string
from tqdm.notebook import tqdm

from rank_bm25 import BM25Okapi

import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Make sure the NLTK stop-word corpus is available locally, then keep the
# English list as a set so membership tests during pre-processing are cheap.
nltk.download("stopwords")
en_stops = {stop_word for stop_word in stopwords.words("english")}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Available datasets in BEIR:

| Dataset   | Website| BEIR-Name | Domain     | Relevancy| Queries  | Documents | Avg. Docs/Q | Download | 
| -------- | -----| ---------| ----------- | ---------| ---------| --------- | ------| ------------| 
| MSMARCO    | [``Homepage``](https://microsoft.github.io/msmarco/)| ``msmarco`` | Misc.       |  Binary  |  6,980   |  8.84M     |    1.1 | Yes |  
| TREC-COVID |  [``Homepage``](https://ir.nist.gov/covidSubmit/index.html)| ``trec-covid``| Bio-Medical |  3-level|50|  171K| 493.5 | Yes | 
| NFCorpus   | [``Homepage``](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) | ``nfcorpus``  | Bio-Medical |  3-level |  323     |  3.6K     |  38.2 | Yes |
| BioASQ     | [``Homepage``](http://bioasq.org) | ``bioasq``| Bio-Medical |  Binary  |   500    |  14.91M    |  8.05 | No | 
| NQ         | [``Homepage``](https://ai.google.com/research/NaturalQuestions) | ``nq``| Wikipedia   |  Binary  |  3,452   |  2.68M  |  1.2 | Yes | 
| HotpotQA   | [``Homepage``](https://hotpotqa.github.io) | ``hotpotqa``| Wikipedia   |  Binary  |  7,405   |  5.23M  |  2.0 | Yes |
| FiQA-2018  | [``Homepage``](https://sites.google.com/view/fiqa/) | ``fiqa``    | Finance     |  Binary  |  648     |  57K    |  2.6 | Yes | 
| Signal-1M (RT) | [``Homepage``](https://research.signal-ai.com/datasets/signal1m-tweetir.html)| ``signal1m`` | Twitter     |  3-level  |   97   |  2.86M  |  19.6 | No |
| TREC-NEWS  | [``Homepage``](https://trec.nist.gov/data/news2019.html) | ``trec-news``    | News     |  5-level  |   57    |  595K    |  19.6 | No |
| ArguAna    | [``Homepage``](http://argumentation.bplaced.net/arguana/data) | ``arguana`` | Misc.       |  Binary  |  1,406     |  8.67K    |  1.0 | Yes |
| Touche-2020| [``Homepage``](https://webis.de/events/touche-20/shared-task-1.html) | ``webis-touche2020``| Misc.       |  6-level  |  49     |  382K    |  49.2 |  Yes |
| CQADupstack| [``Homepage``](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | ``cqadupstack``| StackEx.      |  Binary  |  13,145 |  457K  |  1.4 |  Yes |
| Quora| [``Homepage``](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs) | ``quora``| Quora  | Binary  |  10,000     |  523K    |  1.6 |  Yes | 
| DBPedia | [``Homepage``](https://github.com/iai-group/DBpedia-Entity/) | ``dbpedia-entity``| Wikipedia |  3-level  |  400    |  4.63M    |  38.2 |  Yes | 
| SCIDOCS| [``Homepage``](https://allenai.org/data/scidocs) | ``scidocs``| Scientific |  Binary  |  1,000     |  25K    |  4.9 |  Yes | 
| FEVER| [``Homepage``](http://fever.ai) | ``fever``| Wikipedia     |  Binary  |  6,666     |  5.42M    |  1.2|  Yes | 
| Climate-FEVER| [``Homepage``](http://climatefever.ai) | ``climate-fever``| Wikipedia |  Binary  |  1,535     |  5.42M |  3.0 |  Yes |
| SciFact| [``Homepage``](https://github.com/allenai/scifact) | ``scifact``| Scientific |  Binary  |  300     |  5K    |  1.1 |  Yes |

# Load dataset: scifact

In [3]:
# Name of the BEIR dataset to fetch (see the BEIR-Name column of the table above).
dataset = "scifact"

# Archives are unpacked into a "datasets" folder under the current working directory.
out_dir = os.path.join(os.getcwd(), "datasets")
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)

# download_and_unzip returns the path of the extracted dataset folder.
data_path = util.download_and_unzip(url, out_dir)
print("Dataset downloaded here: {}".format(data_path))

/content/datasets/scifact.zip:   0%|          | 0.00/2.69M [00:00<?, ?iB/s]

Dataset downloaded here: /content/datasets/scifact


In [58]:
# Load the corpus, the queries and the relevance judgements (qrels) of one split
# from the local dataset folder. Available splits: train, test, dev.
data_path = "datasets/scifact"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="train")

  0%|          | 0/5183 [00:00<?, ?it/s]

In [33]:
# Raw document bodies (the 'text' field only), in corpus iteration order.
corpus_list = [document['text'] for document in corpus.values()]

# Documents Pre-Processing

In [62]:
ps = PorterStemmer()

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every loop iteration.
PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
  """Convert raw text into the list of tokens used for BM25 indexing/search.

  Steps: strip punctuation, lower-case, split on any run of whitespace,
  drop English stop words, then Porter-stem what is left.

  Args:
    text: the raw document or query string.

  Returns:
    A list of stemmed, lower-cased tokens (possibly empty).
  """
  # Lower-case BEFORE the stop-word check so capitalised stop words ("The",
  # "In", ...) are removed too. split() with no argument also splits on
  # newlines/tabs and never yields empty-string tokens, unlike split(' ').
  tokens = text.translate(PUNCTUATION_TABLE).lower().split()
  return [ps.stem(token) for token in tokens if token not in en_stops]


# Document id -> token list. Insertion order follows `corpus`, so positions in
# corpus_preprocessed.values() line up with list(corpus.keys()).
corpus_preprocessed = {
  idx: preprocess(text_title['text'])
  for idx, text_title in tqdm(corpus.items(), total=len(corpus), desc='Documents Pre-Processing')
}

# Query id -> token list, processed exactly like the documents.
queries_preprocessed = {
  idx: preprocess(text)
  for idx, text in tqdm(queries.items(), total=len(queries), desc='Queries Pre-Processing')
}


Documents Pre-Processing:   0%|          | 0/5183 [00:00<?, ?it/s]

Queries Pre-Processing:   0%|          | 0/809 [00:00<?, ?it/s]

# Sparse Representation

In [63]:
# Build the BM25 index over the tokenised documents.
bm25 = BM25Okapi(corpus_preprocessed.values())

# Query id -> BM25 scores of that query against the indexed documents.
query_doc_scores = {
  query_id: bm25.get_scores(query_tokens)
  for query_id, query_tokens in tqdm(
    queries_preprocessed.items(),
    total=len(queries_preprocessed),
    desc='Obtaining Query - Documents Score',
  )
}

Optaining the score query - documents:   0%|          | 0/809 [00:00<?, ?it/s]

In [67]:
# Document ids in corpus iteration order; used to map a score position back to an id.
doc_keys = list(corpus)

# For every query, show the highest-scoring document next to the qrels entry.
for idx in query_doc_scores:
  scores = query_doc_scores[idx]
  winner = np.argmax(scores)  # position of the best-scoring document
  print(f'Predicted - Best document for query {idx} is the {doc_keys[winner]}')
  print(f'Ground Truth - Best document for query {idx} is the {qrels[str(idx)]}\n')

Predicted - Best document for query 0 is the 26071782
Ground Truth - Best document for query 0 is the {'31715818': 1}

Predicted - Best document for query 2 is the 13734012
Ground Truth - Best document for query 2 is the {'13734012': 1}

Predicted - Best document for query 4 is the 1387104
Ground Truth - Best document for query 4 is the {'22942787': 1}

Predicted - Best document for query 6 is the 23117378
Ground Truth - Best document for query 6 is the {'2613775': 1}

Predicted - Best document for query 9 is the 44265107
Ground Truth - Best document for query 9 is the {'44265107': 1}

Predicted - Best document for query 10 is the 13780287
Ground Truth - Best document for query 10 is the {'32587939': 1}

Predicted - Best document for query 11 is the 13780287
Ground Truth - Best document for query 11 is the {'32587939': 1}

Predicted - Best document for query 12 is the 11705328
Ground Truth - Best document for query 12 is the {'33409100': 1}

Predicted - Best document for query 14 is th

# Dense Representation