In [1]:
!pip install beir

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from typing import Dict, List, Tuple
from __future__ import annotations

from beir import util
from beir.datasets.data_loader import GenericDataLoader

import pathlib, os, string
from tqdm.notebook import tqdm

import time
import numpy as np
from scipy import sparse
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import multiprocessing as mp
import string
import math
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import tqdm


# Available Datasets

| Dataset   | Website| BEIR-Name | Domain     | Relevancy| Queries  | Documents | Avg. Docs/Q | Download | 
| -------- | -----| ---------| ----------- | ---------| ---------| --------- | ------| ------------| 
| MSMARCO    | [``Homepage``](https://microsoft.github.io/msmarco/)| ``msmarco`` | Misc.       |  Binary  |  6,980   |  8.84M     |    1.1 | Yes |  
| TREC-COVID |  [``Homepage``](https://ir.nist.gov/covidSubmit/index.html)| ``trec-covid``| Bio-Medical |  3-level|50|  171K| 493.5 | Yes | 
| NFCorpus   | [``Homepage``](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) | ``nfcorpus``  | Bio-Medical |  3-level |  323     |  3.6K     |  38.2 | Yes |
| BioASQ     | [``Homepage``](http://bioasq.org) | ``bioasq``| Bio-Medical |  Binary  |   500    |  14.91M    |  8.05 | No | 
| NQ         | [``Homepage``](https://ai.google.com/research/NaturalQuestions) | ``nq``| Wikipedia   |  Binary  |  3,452   |  2.68M  |  1.2 | Yes | 
| HotpotQA   | [``Homepage``](https://hotpotqa.github.io) | ``hotpotqa``| Wikipedia   |  Binary  |  7,405   |  5.23M  |  2.0 | Yes |
| FiQA-2018  | [``Homepage``](https://sites.google.com/view/fiqa/) | ``fiqa``    | Finance     |  Binary  |  648     |  57K    |  2.6 | Yes | 
| Signal-1M (RT) | [``Homepage``](https://research.signal-ai.com/datasets/signal1m-tweetir.html)| ``signal1m`` | Twitter     |  3-level  |   97   |  2.86M  |  19.6 | No |
| TREC-NEWS  | [``Homepage``](https://trec.nist.gov/data/news2019.html) | ``trec-news``    | News     |  5-level  |   57    |  595K    |  19.6 | No |
| ArguAna    | [``Homepage``](http://argumentation.bplaced.net/arguana/data) | ``arguana`` | Misc.       |  Binary  |  1,406     |  8.67K    |  1.0 | Yes |
| Touche-2020| [``Homepage``](https://webis.de/events/touche-20/shared-task-1.html) | ``webis-touche2020``| Misc.       |  6-level  |  49     |  382K    |  49.2 |  Yes |
| CQADupstack| [``Homepage``](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | ``cqadupstack``| StackEx.      |  Binary  |  13,145 |  457K  |  1.4 |  Yes |
| Quora| [``Homepage``](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs) | ``quora``| Quora  | Binary  |  10,000     |  523K    |  1.6 |  Yes | 
| DBPedia | [``Homepage``](https://github.com/iai-group/DBpedia-Entity/) | ``dbpedia-entity``| Wikipedia |  3-level  |  400    |  4.63M    |  38.2 |  Yes | 
| SCIDOCS| [``Homepage``](https://allenai.org/data/scidocs) | ``scidocs``| Scientific |  Binary  |  1,000     |  25K    |  4.9 |  Yes | 
| FEVER| [``Homepage``](http://fever.ai) | ``fever``| Wikipedia     |  Binary  |  6,666     |  5.42M    |  1.2|  Yes | 
| Climate-FEVER| [``Homepage``](http://climatefever.ai) | ``climate-fever``| Wikipedia |  Binary  |  1,535     |  5.42M |  3.0 |  Yes |
| SciFact| [``Homepage``](https://github.com/allenai/scifact) | ``scifact``| Scientific |  Binary  |  300     |  5K    |  1.1 |  Yes |


In [3]:
def download_dataset(dataset: str) -> Dict[str, Dict[str, str]]:
  '''
  PURPOSE: download the dataset (only if it is not already on disk) and load its corpus
  ARGUMENTS:
    - dataset (str): string describing the beir dataset
  RETURN:
    - corpus (Dict[str, Dict[str, str]]): documents of the respective dataset;
      the queries and relevance judgements returned by the loader are discarded
  '''
  data_path = f'datasets/{dataset}'
  # Skip the download when the dataset folder already exists in the working directory
  if not os.path.isdir(data_path):
    url = f'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip'
    out_dir = os.path.join(os.getcwd(), 'datasets')
    data_path = util.download_and_unzip(url, out_dir)
    print(f'Dataset downloaded here: {data_path}')
  # The loader yields three objects; only the first one (the corpus) is needed here
  corpus, _, _ = GenericDataLoader(data_path).load(split="test")
  return corpus

# Chosen datasets
datasets = ['scifact', 'nfcorpus']

# Similarity thresholds evaluated for every dataset
thresholds = [
    0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99,
]

# Corpus of each chosen dataset, keyed by its BEIR name
datasets_data = {
    ds_name: download_dataset(ds_name)
    for ds_name in datasets
}

  0%|          | 0/5183 [00:00<?, ?it/s]

  0%|          | 0/3633 [00:00<?, ?it/s]

In [4]:
def all_pairs_docs_sim1(ds_name, threshold):
    '''
    PURPOSE: naive baseline - count the pairs of documents of a dataset whose TF-IDF
             cosine similarity reaches the threshold, comparing one pair at a time
    ARGUMENTS:
        - ds_name (str): key of the dataset inside `datasets_data`
        - threshold (float): minimum cosine similarity for a pair to be counted
    RETURN:
        - (dict): 'threshold' (the given threshold), 'count' (number of unordered
          pairs, each counted once) and 'elapsed' (seconds spent in the pairwise
          scan; the TF-IDF fitting is not timed)
    '''
    texts = [title_text['text'] for title_text in datasets_data[ds_name].values()]
    vectorizer = TfidfVectorizer(use_idf=True)
    features = vectorizer.fit_transform(texts)
    n_docs = features.shape[0]

    count = 0
    start = time.time()
    for doc_1 in tqdm(range(n_docs), total=n_docs, desc=f'Scanning {ds_name} with threshold {threshold} - 1° loop'):
        feature_vector_1 = features[doc_1]
        # Cosine similarity is symmetric, so only doc_2 > doc_1 is visited: every
        # pair is counted exactly once and a document is never compared with itself.
        # No nested progress bar: one widget per row would distort the timing.
        for doc_2 in range(doc_1 + 1, n_docs):
            if cosine_similarity(feature_vector_1, features[doc_2])[0][0] >= threshold:
                count += 1
    end = time.time()

    return {'threshold': threshold, 'count': count, 'elapsed': end-start}

In [5]:
def all_pairs_docs_sim2(ds_name, threshold):
    '''
    PURPOSE: count the pairs of documents of a dataset whose TF-IDF cosine similarity
             reaches the threshold, using the full similarity matrix with every row
             sorted in descending order
    ARGUMENTS:
        - ds_name (str): key of the dataset inside `datasets_data`
        - threshold (float): minimum cosine similarity for a pair to be counted
    RETURN:
        - (dict): 'threshold' (the given threshold), 'count' (matches found over
          all rows, halved) and 'elapsed' (seconds spent computing and scanning
          the similarity matrix; the TF-IDF fitting is not timed)
    '''
    corpus_texts = [entry['text'] for entry in datasets_data[ds_name].values()]
    tfidf = TfidfVectorizer(use_idf=True).fit_transform(corpus_texts)

    matches = 0
    t_begin = time.time()
    sim_matrix = cosine_similarity(tfidf)
    # Negating before and after the sort orders every row from highest to lowest
    rows_descending = -np.sort(-sim_matrix)
    for row in rows_descending:
        # The first (largest) entry is skipped - presumably the document's
        # similarity with itself
        for value in row[1:]:
            if not (value >= threshold):
                # Row is sorted: nothing after this point can reach the threshold
                break
            matches += 1
    t_end = time.time()

    # Every pair shows up in two rows of the matrix, hence the halving
    return {'threshold': threshold, 'count': int(matches/2), 'elapsed': t_end-t_begin}

In [6]:
def all_pairs_docs_sim3(ds_name, threshold):
    '''
    PURPOSE: count the pairs of documents of a dataset whose TF-IDF cosine similarity
             reaches the threshold, looking only at the part of the similarity
             matrix above the main diagonal
    ARGUMENTS:
        - ds_name (str): key of the dataset inside `datasets_data`
        - threshold (float): minimum cosine similarity for a pair to be counted
    RETURN:
        - (dict): 'threshold' (the given threshold), 'count' (number of unordered
          pairs, each counted once) and 'elapsed' (seconds spent computing and
          scanning the similarity matrix; the TF-IDF fitting is not timed)
    '''
    texts = [title_text['text'] for title_text in datasets_data[ds_name].values()]
    vectorizer = TfidfVectorizer(use_idf=True)
    features = vectorizer.fit_transform(texts)

    count = 0
    start = time.time()
    similarities = cosine_similarity(features)
    for doc_1, doc_sims in enumerate(similarities):
        # Entries after position doc_1 are the pairs (doc_1, doc_2) with doc_2 > doc_1.
        # Counting them in one vectorised call avoids a Python-level loop over
        # every element of the matrix while producing the same total.
        count += int(np.count_nonzero(doc_sims[(doc_1+1):] >= threshold))
    end = time.time()

    return {'threshold': threshold, 'count': count, 'elapsed': end-start}

In [7]:
def perform_all_pairs_docs_sim():
    '''
    PURPOSE: run `all_pairs_docs_sim2` for every dataset in `datasets_data` and
             every value in `thresholds`, showing progress bars while it runs
    ARGUMENTS: none (reads the module-level `datasets_data` and `thresholds`)
    RETURN:
        - (dict): dataset name -> list with one result dict per threshold, in the
          same order as `thresholds`
    '''
    outcome = {}
    outer_bar = tqdm(datasets_data.items(), total=len(datasets_data), desc='All Documents Pairs Similarities')
    for name, _ in outer_bar:
        outer_bar.set_description(f'All Documents Pairs Similarities - {name}', refresh=True)
        runs = []
        outcome[name] = runs
        # Inner bar disappears once the dataset is done (leave=False)
        inner_bar = tqdm(thresholds, total=len(thresholds), desc='Running for each thresholds', leave=False)
        for cutoff in inner_bar:
            inner_bar.set_description(f'Running with threshold {cutoff}', refresh=True)
            runs.append(all_pairs_docs_sim2(name, cutoff))
    return outcome

In [8]:
# Run the experiment for every dataset/threshold combination and keep the results;
# the bare `res` on the last line makes the notebook display them as the cell output.
res = perform_all_pairs_docs_sim() 
res

All Documents Pairs Similarities:   0%|          | 0/2 [00:00<?, ?it/s]

Running for each thresholds:   0%|          | 0/7 [00:00<?, ?it/s]

Running for each thresholds:   0%|          | 0/7 [00:00<?, ?it/s]

{'scifact': [{'threshold': 0.7, 'count': 15, 'elapsed': 5.46851921081543},
  {'threshold': 0.75, 'count': 7, 'elapsed': 7.785690546035767},
  {'threshold': 0.8, 'count': 4, 'elapsed': 4.478631258010864},
  {'threshold': 0.85, 'count': 1, 'elapsed': 4.177727937698364},
  {'threshold': 0.9, 'count': 0, 'elapsed': 4.030917644500732},
  {'threshold': 0.95, 'count': 0, 'elapsed': 5.20185661315918},
  {'threshold': 0.99, 'count': 0, 'elapsed': 4.056122779846191}],
 'nfcorpus': [{'threshold': 0.7, 'count': 54, 'elapsed': 2.0246214866638184},
  {'threshold': 0.75, 'count': 47, 'elapsed': 2.2564122676849365},
  {'threshold': 0.8, 'count': 42, 'elapsed': 2.6439292430877686},
  {'threshold': 0.85, 'count': 41, 'elapsed': 2.018648624420166},
  {'threshold': 0.9, 'count': 41, 'elapsed': 2.0352766513824463},
  {'threshold': 0.95, 'count': 41, 'elapsed': 2.0306661128997803},
  {'threshold': 0.99, 'count': 41, 'elapsed': 2.4832704067230225}]}