In [1]:
!pip install beir
!pip install pandas
!pip install sklearn
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip install ipywidgets

Collecting sklearn
  Downloading sklearn-0.0.post4.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[18 lines of output][0m
  [31m   [0m The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  [31m   [0m rather than 'sklearn' for pip commands.
  [31m   [0m 
  [31m   [0m Here is how to fix this error in the main use cases:
  [31m   [0m - use 'pip install scikit-learn' rather than 'pip install sklearn'
  [31m   [0m - replace 'sklearn' by 'scikit-learn' in your pip requirements files
  [31m   [0m   (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  [31m   [0m - if the 'sklearn' package is used by one of your dependencies,
  [31m   [0m   it would be great if you take some time to track which package uses
  [31m   [0m   'sklearn' instead of 'scikit-lear

In [13]:
from typing import Dict, List, Tuple

from beir import util
from beir.datasets.data_loader import GenericDataLoader

import os
try:
    import ipywidgets
    from tqdm.auto import tqdm
except ModuleNotFoundError:
    from tqdm import tqdm


import spacy

import time
import numpy as np
from concurrent.futures import ProcessPoolExecutor
import multiprocessing as mp

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Available Datasets

| Dataset   | Website| BEIR-Name | Domain     | Relevancy| Queries  | Documents | Avg. Docs/Q | Download | 
| -------- | -----| ---------| ----------- | ---------| ---------| --------- | ------| ------------| 
| MSMARCO    | [``Homepage``](https://microsoft.github.io/msmarco/)| ``msmarco`` | Misc.       |  Binary  |  6,980   |  8.84M     |    1.1 | Yes |  
| TREC-COVID |  [``Homepage``](https://ir.nist.gov/covidSubmit/index.html)| ``trec-covid``| Bio-Medical |  3-level|50|  171K| 493.5 | Yes | 
| NFCorpus   | [``Homepage``](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) | ``nfcorpus``  | Bio-Medical |  3-level |  323     |  3.6K     |  38.2 | Yes |
| BioASQ     | [``Homepage``](http://bioasq.org) | ``bioasq``| Bio-Medical |  Binary  |   500    |  14.91M    |  8.05 | No | 
| NQ         | [``Homepage``](https://ai.google.com/research/NaturalQuestions) | ``nq``| Wikipedia   |  Binary  |  3,452   |  2.68M  |  1.2 | Yes | 
| HotpotQA   | [``Homepage``](https://hotpotqa.github.io) | ``hotpotqa``| Wikipedia   |  Binary  |  7,405   |  5.23M  |  2.0 | Yes |
| FiQA-2018  | [``Homepage``](https://sites.google.com/view/fiqa/) | ``fiqa``    | Finance     |  Binary  |  648     |  57K    |  2.6 | Yes | 
| Signal-1M (RT) | [``Homepage``](https://research.signal-ai.com/datasets/signal1m-tweetir.html)| ``signal1m`` | Twitter     |  3-level  |   97   |  2.86M  |  19.6 | No |
| TREC-NEWS  | [``Homepage``](https://trec.nist.gov/data/news2019.html) | ``trec-news``    | News     |  5-level  |   57    |  595K    |  19.6 | No |
| ArguAna    | [``Homepage``](http://argumentation.bplaced.net/arguana/data) | ``arguana`` | Misc.       |  Binary  |  1,406     |  8.67K    |  1.0 | Yes |
| Touche-2020| [``Homepage``](https://webis.de/events/touche-20/shared-task-1.html) | ``webis-touche2020``| Misc.       |  6-level  |  49     |  382K    |  49.2 |  Yes |
| CQADupstack| [``Homepage``](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | ``cqadupstack``| StackEx.      |  Binary  |  13,145 |  457K  |  1.4 |  Yes |
| Quora| [``Homepage``](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs) | ``quora``| Quora  | Binary  |  10,000     |  523K    |  1.6 |  Yes | 
| DBPedia | [``Homepage``](https://github.com/iai-group/DBpedia-Entity/) | ``dbpedia-entity``| Wikipedia |  3-level  |  400    |  4.63M    |  38.2 |  Yes | 
| SCIDOCS| [``Homepage``](https://allenai.org/data/scidocs) | ``scidocs``| Scientific |  Binary  |  1,000     |  25K    |  4.9 |  Yes | 
| FEVER| [``Homepage``](http://fever.ai) | ``fever``| Wikipedia     |  Binary  |  6,666     |  5.42M    |  1.2|  Yes | 
| Climate-FEVER| [``Homepage``](http://climatefever.ai) | ``climate-fever``| Wikipedia |  Binary  |  1,535     |  5.42M |  3.0 |  Yes |
| SciFact| [``Homepage``](https://github.com/allenai/scifact) | ``scifact``| Scientific |  Binary  |  300     |  5K    |  1.1 |  Yes |


# Dataset Download & Pre-Processing

In [14]:
def download_dataset(dataset: str) -> List[str]:
	'''
	PURPOSE: load the corpus of a BEIR dataset, downloading it first when it is not on disk
	ARGUMENTS:
		- dataset (str): BEIR name of the dataset
	RETURN:
		- (List[str]) the 'text' field of every document in the corpus
	'''
	data_path = f'datasets/{dataset}'
	already_downloaded = os.path.isdir(data_path)
	if not already_downloaded:
		# Fetch the archive and unpack it under <current working directory>/datasets
		target_dir = os.path.join(os.getcwd(), 'datasets')
		archive_url = f'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip'
		data_path = util.download_and_unzip(archive_url, target_dir)
		print(f'Dataset downloaded here: {data_path}')
	# Only the first value returned by load() (the corpus) is used; the other two are discarded
	corpus, _, _ = GenericDataLoader(data_path).load(split="test")
	return [entry['text'] for entry in corpus.values()]
	

# Chosen datasets (BEIR names)
datasets = ['scifact', 'nfcorpus']
# Similarity thresholds evaluated for every dataset
thresholds = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99]

# Dictionary of dataset name: list of raw document texts
datasets_data = {name: download_dataset(name) for name in datasets}

  0%|          | 0/5183 [00:00<?, ?it/s]

  0%|          | 0/3633 [00:00<?, ?it/s]

## Pre-Processing with spaCy

In [4]:
nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words


def clean_tokens(tokens):
	'''
	PURPOSE: turn spaCy tokens into a normalised string of lowercase lemmas
	ARGUMENTS:
		- tokens: iterable of spaCy tokens (e.g. the Doc returned by nlp)
	RETURN:
		- str: space-separated lowercase lemmas, without stop words and punctuation
	'''
	# The stop-word set holds strings, so the token's lowercase text is compared with it:
	# testing the Token object itself never matches and would filter nothing out.
	return ' '.join(
		token.lemma_.lower()
		for token in tokens
		if token.lower_ not in stopwords and not token.is_punct
	)

In [5]:
def pre_process(corpus: str) -> str:
	'''
	PURPOSE: run one document through the spaCy pipeline and clean its tokens
	ARGUMENTS:
		- corpus (str): text of the document to pre-process
	RETURN:
		- str: cleaned document, as produced by clean_tokens
	'''
	parsed = nlp(corpus)
	return clean_tokens(parsed)



def documents_preprocessing(dataset_name: str, documents: List[str]) -> List[str]:
	'''
	PURPOSE: pre-process all the documents of a dataset in parallel
	ARGUMENTS:
		- dataset_name (str): string describing the dataset name (shown in the progress bar)
		- documents (List[str]): documents list
	RETURN:
		- new_documents (List[str]): list of cleaned documents, in the same order as the input
	'''
	with ProcessPoolExecutor(max_workers=mp.cpu_count()) as executor:
		# executor.map yields the results in input order; tqdm only wraps the iterator to report progress
		progress = tqdm(
			executor.map(pre_process, documents),
			total=len(documents),
			desc=f'{dataset_name} - Documents Pre-Processing',
		)
		# Consume the results while the pool is still open
		new_documents = list(progress)
	return new_documents

In [6]:
# Dictionary of dataset name: list of its pre-processed documents
pre_processed_data = {
    name: documents_preprocessing(name, raw_docs)
    for name, raw_docs in datasets_data.items()
}

scifact - Documents Pre-Processing: 100%|██████████| 5183/5183 [01:59<00:00, 43.29it/s]
nfcorpus - Documents Pre-Processing: 100%|██████████| 3633/3633 [01:30<00:00, 40.08it/s]


# Sequential Version - All Pairs Documents Similarity

In [7]:
def all_pairs_docs_sim1(ds_name, docs_list, threshold):
    '''
    PURPOSE: naive baseline that counts the document pairs whose TF-IDF cosine
             similarity reaches the threshold, comparing one pair at a time
    ARGUMENTS:
        - ds_name: dataset name, shown in the progress bars
        - docs_list: documents to compare
        - threshold: minimum cosine similarity for a pair to be counted
    RETURN:
        - dict with the 'threshold', the pair 'count' (each unordered pair counted once,
          like all_pairs_docs_sim2 and all_pairs_docs_sim3) and the 'elapsed' seconds
          (vectorisation excluded)
    '''
    count = 0
    vectorizer = TfidfVectorizer(use_idf=True)
    features = vectorizer.fit_transform(docs_list)
    n_docs = features.shape[0]

    start = time.time()
    for doc_1 in tqdm(range(n_docs), total=n_docs, desc=f'Scanning {ds_name} with threshold {threshold} - 1° loop'):
        feature_vector_1 = features[doc_1]
        # Only documents after doc_1 are visited: this skips the self-comparison and
        # avoids counting the same pair a second time in the opposite order.
        for doc_2 in tqdm(range(doc_1 + 1, n_docs), total=n_docs - doc_1 - 1, desc=f'Scanning {ds_name} with threshold {threshold} - 2° loop', leave=False):
            if cosine_similarity(feature_vector_1, features[doc_2])[0][0] >= threshold:
                count += 1
    end = time.time()

    return {'threshold': threshold, 'count': count, 'elapsed': end - start}

In [8]:
def all_pairs_docs_sim2(ds_name, docs_list, threshold):
    '''
    PURPOSE: count the document pairs whose TF-IDF cosine similarity reaches the
             threshold, scanning every row of the similarity matrix in descending order
    ARGUMENTS:
        - ds_name: dataset name (not used by this implementation)
        - docs_list: documents to compare
        - threshold: minimum cosine similarity for a pair to be counted
    RETURN:
        - dict with the 'threshold', the pair 'count' and the 'elapsed' seconds
          (vectorisation excluded)
    '''
    tfidf = TfidfVectorizer(use_idf=True).fit_transform(docs_list)

    start = time.time()
    pairwise = cosine_similarity(tfidf)
    # Negating before and after np.sort orders each row from highest to lowest
    rows_descending = -np.sort(-pairwise)
    matches = 0
    for row in rows_descending:
        # The largest entry of each row is skipped — presumably the document's similarity with itself
        for value in row[1:]:
            if not value >= threshold:
                # The row is sorted, so no later entry can reach the threshold
                break
            matches += 1
    end = time.time()

    # The matrix compares the documents with themselves, so each pair shows up in two rows
    return {'threshold': threshold, 'count': int(matches / 2), 'elapsed': end - start}

In [9]:
def all_pairs_docs_sim3(ds_name, docs_list, threshold):
    '''
    PURPOSE: count the document pairs whose TF-IDF cosine similarity reaches the
             threshold, reading only the part of the similarity matrix above the diagonal
    ARGUMENTS:
        - ds_name: dataset name (not used by this implementation)
        - docs_list: documents to compare
        - threshold: minimum cosine similarity for a pair to be counted
    RETURN:
        - dict with the 'threshold', the pair 'count' and the 'elapsed' seconds
          (vectorisation excluded)
    '''
    tfidf = TfidfVectorizer(use_idf=True).fit_transform(docs_list)

    start = time.time()
    pairwise = cosine_similarity(tfidf)
    matches = 0
    for row_idx, row in enumerate(pairwise):
        # Entries to the right of the diagonal only: every pair once, self-similarity excluded
        matches += sum(1 for value in row[row_idx + 1:] if value >= threshold)
    end = time.time()

    return {'threshold': threshold, 'count': matches, 'elapsed': end - start}

In [16]:
def perform_all_pairs_docs_sim(docs_by_dataset=None, similarity_fn=None):
    '''
    PURPOSE: run an all-pairs similarity function on every dataset, once per threshold
    ARGUMENTS:
        - docs_by_dataset (dict, optional): dataset name -> list of documents;
          defaults to datasets_data
        - similarity_fn (callable, optional): called as similarity_fn(name, docs, threshold);
          defaults to all_pairs_docs_sim2
    RETURN:
        - dict: dataset name -> list with one result per entry of thresholds
    '''
    # Defaults are resolved at call time, so the notebook-level globals are read when
    # the function runs rather than when it is defined.
    # NOTE(review): the default input is the raw datasets_data; pre_processed_data is not
    # used by any visible cell — pass it explicitly if the cleaned documents should be compared.
    if docs_by_dataset is None:
        docs_by_dataset = datasets_data
    if similarity_fn is None:
        similarity_fn = all_pairs_docs_sim2

    result = {}
    pbar1 = tqdm(docs_by_dataset.items(), total=len(docs_by_dataset), desc='All Documents Pairs Similarities')
    for datasets_name, docs_list in pbar1:
        pbar1.set_description(f'All Documents Pairs Similarities - {datasets_name}', refresh=True)
        result[datasets_name] = []
        pbar2 = tqdm(thresholds, total=len(thresholds), desc='Running for each thresholds', leave=False)
        for threshold in pbar2:
            pbar2.set_description(f'Running with threshold {threshold}', refresh=True)
            result[datasets_name].append(similarity_fn(datasets_name, docs_list, threshold))
    # Restore the generic description once every dataset has been processed
    pbar1.set_description('All Documents Pairs Similarities', refresh=True)
    return result

In [17]:
# Run the similarity search for every dataset and threshold, then display the collected results
res = perform_all_pairs_docs_sim() 
res

All Documents Pairs Similarities:   0%|          | 0/2 [00:00<?, ?it/s]

Running for each thresholds:   0%|          | 0/7 [00:00<?, ?it/s]

Running for each thresholds:   0%|          | 0/7 [00:00<?, ?it/s]

{'scifact': [{'threshold': 0.7, 'count': 15, 'elapsed': 3.777374029159546},
  {'threshold': 0.75, 'count': 7, 'elapsed': 3.8133339881896973},
  {'threshold': 0.8, 'count': 4, 'elapsed': 3.8306713104248047},
  {'threshold': 0.85, 'count': 1, 'elapsed': 3.9922029972076416},
  {'threshold': 0.9, 'count': 0, 'elapsed': 4.012606143951416},
  {'threshold': 0.95, 'count': 0, 'elapsed': 3.945665121078491},
  {'threshold': 0.99, 'count': 0, 'elapsed': 3.7787435054779053}],
 'nfcorpus': [{'threshold': 0.7, 'count': 54, 'elapsed': 2.09859037399292},
  {'threshold': 0.75, 'count': 47, 'elapsed': 1.9046380519866943},
  {'threshold': 0.8, 'count': 42, 'elapsed': 1.8886833190917969},
  {'threshold': 0.85, 'count': 41, 'elapsed': 1.9578287601470947},
  {'threshold': 0.9, 'count': 41, 'elapsed': 1.9463560581207275},
  {'threshold': 0.95, 'count': 41, 'elapsed': 1.9689230918884277},
  {'threshold': 0.99, 'count': 41, 'elapsed': 2.1748411655426025}]}

# Parallel Version with Map Reduce from PySpark - All Pairs Documents Similarity

### Download PySpark 

In [None]:
# Download spark-hadoop
!wget https://dlcdn.apache.org/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz
# control the checksum with the one provided
!sha512sum spark-3.4.0-bin-hadoop3.tgz
# unzip
!tar -xzf spark-3.4.0-bin-hadoop3.tgz

### Activate PySpark

In [None]:
# Start the pyspark launcher shipped in the extracted Spark distribution.
# NOTE(review): this looks like the interactive PySpark shell; a notebook cell has no
# interactive terminal, so confirm that running it through `!` behaves as intended.
!./spark-3.4.0-bin-hadoop3/bin/pyspark