In [52]:
!pip install beir
!pip install pandas
!pip install sklearn
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip install ipywidgets

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [53]:
from typing import Dict, List

from beir import util
from beir.datasets.data_loader import GenericDataLoader

import os
try:
    import ipywidgets
    from tqdm.auto import tqdm
except ModuleNotFoundError:
    from tqdm import tqdm

import spacy

import time
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import multiprocessing as mp

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Available Datasets

| Dataset   | Website| BEIR-Name | Domain     | Relevancy| Queries  | Documents | Avg. Docs/Q | Download | 
| -------- | -----| ---------| ----------- | ---------| ---------| --------- | ------| ------------| 
| MSMARCO    | [``Homepage``](https://microsoft.github.io/msmarco/)| ``msmarco`` | Misc.       |  Binary  |  6,980   |  8.84M     |    1.1 | Yes |  
| TREC-COVID |  [``Homepage``](https://ir.nist.gov/covidSubmit/index.html)| ``trec-covid``| Bio-Medical |  3-level|50|  171K| 493.5 | Yes | 
| NFCorpus   | [``Homepage``](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) | ``nfcorpus``  | Bio-Medical |  3-level |  323     |  3.6K     |  38.2 | Yes |
| BioASQ     | [``Homepage``](http://bioasq.org) | ``bioasq``| Bio-Medical |  Binary  |   500    |  14.91M    |  8.05 | No | 
| NQ         | [``Homepage``](https://ai.google.com/research/NaturalQuestions) | ``nq``| Wikipedia   |  Binary  |  3,452   |  2.68M  |  1.2 | Yes | 
| HotpotQA   | [``Homepage``](https://hotpotqa.github.io) | ``hotpotqa``| Wikipedia   |  Binary  |  7,405   |  5.23M  |  2.0 | Yes |
| FiQA-2018  | [``Homepage``](https://sites.google.com/view/fiqa/) | ``fiqa``    | Finance     |  Binary  |  648     |  57K    |  2.6 | Yes | 
| Signal-1M (RT) | [``Homepage``](https://research.signal-ai.com/datasets/signal1m-tweetir.html)| ``signal1m`` | Twitter     |  3-level  |   97   |  2.86M  |  19.6 | No |
| TREC-NEWS  | [``Homepage``](https://trec.nist.gov/data/news2019.html) | ``trec-news``    | News     |  5-level  |   57    |  595K    |  19.6 | No |
| ArguAna    | [``Homepage``](http://argumentation.bplaced.net/arguana/data) | ``arguana`` | Misc.       |  Binary  |  1,406     |  8.67K    |  1.0 | Yes |
| Touche-2020| [``Homepage``](https://webis.de/events/touche-20/shared-task-1.html) | ``webis-touche2020``| Misc.       |  6-level  |  49     |  382K    |  49.2 |  Yes |
| CQADupstack| [``Homepage``](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) | ``cqadupstack``| StackEx.      |  Binary  |  13,145 |  457K  |  1.4 |  Yes |
| Quora| [``Homepage``](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs) | ``quora``| Quora  | Binary  |  10,000     |  523K    |  1.6 |  Yes | 
| DBPedia | [``Homepage``](https://github.com/iai-group/DBpedia-Entity/) | ``dbpedia-entity``| Wikipedia |  3-level  |  400    |  4.63M    |  38.2 |  Yes | 
| SCIDOCS| [``Homepage``](https://allenai.org/data/scidocs) | ``scidocs``| Scientific |  Binary  |  1,000     |  25K    |  4.9 |  Yes | 
| FEVER| [``Homepage``](http://fever.ai) | ``fever``| Wikipedia     |  Binary  |  6,666     |  5.42M    |  1.2|  Yes | 
| Climate-FEVER| [``Homepage``](http://climatefever.ai) | ``climate-fever``| Wikipedia |  Binary  |  1,535     |  5.42M |  3.0 |  Yes |
| SciFact| [``Homepage``](https://github.com/allenai/scifact) | ``scifact``| Scientific |  Binary  |  300     |  5K    |  1.1 |  Yes |


# Dataset Download & Pre-Processing

In [54]:
def download_dataset(dataset: str) -> Dict[str, List[str]]:
	'''
	PURPOSE: download the dataset
	ARGUMENTS:
		- dataset (str): string describing the beir dataset
	RETURN:
		- (List[str]) list of documents
	'''
	data_path = f'datasets/{dataset}'
	if not os.path.isdir(data_path):
		url = f'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip'
		out_dir = os.path.join(os.getcwd(), 'datasets')
		data_path = util.download_and_unzip(url, out_dir)
		print(f'Dataset downloaded here: {data_path}')
	corpus, _, _ = GenericDataLoader(data_path).load(split="test")
	return {doc_id: title_text['title'] + ' ' + title_text['text'] for doc_id, title_text in corpus.items()}

datasets = ['nfcorpus'] # Choosen datasets
threshold = 0.8

datasets_data = {dataset: download_dataset(dataset) for dataset in datasets}

  0%|          | 0/3633 [00:00<?, ?it/s]

## Pre-Processing with Spacy

In [55]:
nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words
clean_tokens = lambda tokens : ' '.join([token.lemma_.lower() for token in tokens if token not in stopwords and not token.is_punct])
# Lambda for text pre-processing

In [56]:
def pre_process(dictionary):
	'''
	PURPOSE: preprocess the text using spaCy
	ARGUMENTS:
		- corpus (str): string of document to pre-process
	RETURN:
		- str: cleaned document
	'''
	key, value = dictionary
	return {key: clean_tokens(nlp(value))}



def documents_preprocessing(dataset_name: str, documents: Dict[str, str]) -> Dict[str, str]:
	'''
	PURPOSE: preprocess all the documents and query for the relative dataset
	ARGUMENTS:
		- dataset_name (str): string describing the dataset name
		- documents (Dict[str, List[str]]): doc_id, document_text dictionary
	RETURN: 
		- new_documents (Dict[str, List[str]]): dictionary of cleaned documents
	'''
 
	path_datasets = os.path.join(os.getcwd(), 'datasets')
	if os.path.exists(os.path.join(path_datasets, dataset_name, 'pre_processed_corpus.parquet')):
		return pd.read_parquet(os.path.join(path_datasets, dataset_name, 'pre_processed_corpus.parquet')).to_dict()[0]
	
 
	new_documents = {}

	with ProcessPoolExecutor(max_workers=mp.cpu_count()) as executor:
		results = list(
				tqdm(
					executor.map(pre_process, documents.items()),
					total=len(documents),
					desc=f'{dataset_name} - Documents Pre-Processing',
				)
			)

	for result in results:
		new_documents |= result

	write_pd = pd.DataFrame.from_dict(new_documents, orient='index')
	write_pd.to_parquet(os.path.join(path_datasets, dataset_name, 'pre_processed_corpus.parquet'))

	return new_documents

In [57]:
# Dictionary of dataset: pre-processed documents
pre_processed_data = {dataset: documents_preprocessing(dataset, docs_dict) for dataset, docs_dict in datasets_data.items()}

# Sequential Version - All Pairs Documents Similarity

In [58]:
def classic_all_pairs_docs_sim(docs_list: List[str], threshold: float):
    count = 0
    doc_similaritis = []
    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(docs_list)
    
    
    start = time.time()
    similarities = cosine_similarity(features)
    for doc_1, doc_sims in enumerate(similarities):
        for doc_2, doc_sim in enumerate(doc_sims[(doc_1+1):], start=doc_1+1):
            if doc_sim >= threshold:
                count += 1
                doc_similaritis.append((doc_1, doc_2, doc_sim))
    end = time.time()
    
    
    return doc_similaritis, {'threshold': threshold, 'similar_doc': count, 'elapsed': end-start}

In [59]:
def npargwhere_all_pairs_docs_sim(docs_list: List[str], threshold: float):
    print()
    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(docs_list)
    
    
    start = time.time()
    similarities = cosine_similarity(features)
    np.fill_diagonal(similarities, 0.0)
    idx_doc_similaritis = np.argwhere(similarities > threshold)
    end = time.time()
    

    return [(similar.tolist(), similarities[similar[0], similar[1]]) for similar in idx_doc_similaritis], \
        {'threshold': threshold, 'similar_doc': int(len(idx_doc_similaritis)/2), 'elapsed': end-start}

In [60]:
def perform_all_pairs_docs_sim(data, threshold):
    result = {}
    for datasets_name, docs_list in data.items():
        print(f'All Documents Pairs Similarities - {datasets_name}')
        similar_list, stat = npargwhere_all_pairs_docs_sim(list(docs_list.values()), threshold)
        for tuple in similar_list: print(tuple)
        result[datasets_name] = stat
    return result

In [61]:
res = perform_all_pairs_docs_sim(pre_processed_data, threshold) 
res

All Documents Pairs Similarities - nfcorpus

([2, 1503], 1.0)
([4, 1675], 1.0000000000000002)
([8, 9], 0.8032354716247146)
([9, 8], 0.8032354716247146)
([10, 1844], 1.0000000000000002)
([27, 32], 1.0)
([32, 27], 1.0)
([54, 963], 1.0)
([272, 3090], 0.8403826284168192)
([376, 1120], 1.0000000000000002)
([930, 1376], 0.9999999999999999)
([963, 54], 1.0)
([1101, 2931], 1.0)
([1120, 376], 1.0000000000000002)
([1147, 3489], 1.0000000000000004)
([1162, 3235], 1.0000000000000004)
([1179, 1188], 1.0000000000000004)
([1188, 1179], 1.0000000000000004)
([1376, 930], 0.9999999999999999)
([1444, 2908], 1.0)
([1503, 2], 1.0)
([1591, 3599], 1.0000000000000004)
([1600, 1605], 0.8233866288823122)
([1605, 1600], 0.8233866288823122)
([1671, 1736], 1.0000000000000002)
([1672, 1742], 1.0)
([1674, 1739], 1.0)
([1675, 4], 1.0000000000000002)
([1678, 1729], 1.0000000000000002)
([1691, 2910], 1.0000000000000002)
([1695, 2972], 1.0000000000000002)
([1706, 3160], 1.0)
([1729, 1678], 1.0000000000000002)
([1736, 16

{'nfcorpus': {'threshold': 0.8,
  'similar_doc': 44,
  'elapsed': 1.1976044178009033}}

In [62]:
def perform_all_pairs_docs_sim(data, threshold):
    result = {}
    for datasets_name, docs_list in data.items():
        print(f'All Documents Pairs Similarities - {datasets_name}')
        similar_list, stat = classic_all_pairs_docs_sim(list(docs_list.values()), threshold)
        for tuple in similar_list: print(tuple)
        result[datasets_name] = stat
    return result

In [63]:
res = perform_all_pairs_docs_sim(pre_processed_data, threshold) 
res

All Documents Pairs Similarities - nfcorpus
(2, 1503, 1.0)
(4, 1675, 1.0000000000000002)
(8, 9, 0.8032354716247146)
(10, 1844, 1.0000000000000002)
(27, 32, 1.0)
(54, 963, 1.0)
(272, 3090, 0.8403826284168192)
(376, 1120, 1.0000000000000002)
(930, 1376, 0.9999999999999999)
(1101, 2931, 1.0)
(1147, 3489, 1.0000000000000004)
(1162, 3235, 1.0000000000000004)
(1179, 1188, 1.0000000000000004)
(1444, 2908, 1.0)
(1591, 3599, 1.0000000000000004)
(1600, 1605, 0.8233866288823122)
(1671, 1736, 1.0000000000000002)
(1672, 1742, 1.0)
(1674, 1739, 1.0)
(1678, 1729, 1.0000000000000002)
(1691, 2910, 1.0000000000000002)
(1695, 2972, 1.0000000000000002)
(1706, 3160, 1.0)
(1833, 2282, 1.0000000000000002)
(1851, 1861, 0.9999999999999999)
(1894, 1902, 1.0000000000000002)
(2014, 2730, 1.0000000000000004)
(2303, 2749, 1.0000000000000002)
(2306, 3471, 0.9999999999999999)
(2314, 2317, 1.0000000000000002)
(2316, 2320, 1.0)
(2346, 3154, 1.0)
(2354, 2355, 0.9978727360754368)
(2366, 2370, 1.0)
(2599, 2971, 1.0)
(2623

{'nfcorpus': {'threshold': 0.8,
  'similar_doc': 44,
  'elapsed': 2.0110719203948975}}

# Parallel Version with Map Reduce from PySpark - All Pairs Documents Similarity

## Download PySpark 

In [64]:
!pip install pyspark



## Active PySpark

In [66]:
from pyspark.sql import SparkSession

# Create SparkSession 
spark = SparkSession.builder \
    .master('local[1]') \
    .config("spark.driver.memory", "15g") \
    .appName("all_pairs_docs_similarity.com") \
    .getOrCreate()

sc = spark.sparkContext

ConnectionRefusedError: [Errno 111] Connection refused

## PySpark Dataset Creation

In [None]:
# Create the features and columns vectors
vectorizer = TfidfVectorizer()
tfidf_features = vectorizer.fit_transform(pre_processed_data['nfcorpus'].values())
tfidf_columns = vectorizer.get_feature_names_out()

In [1]:
# Create a dictionary of key document ID and value the list of TF-IDF values
dict_pre_rrd = list(
    zip(pre_processed_data['nfcorpus'].keys(), tfidf_features.toarray())
)

l = [
	('ciao',  np.array([1,2,3,4,5,6,7])),
	('ciao1', np.array([4,4,4,4,4,6,7])),
	('ciao2', np.array([1,1,1,1,1,1,7])),
	('ciao3', np.array([2,2,2,2,2,2,8]))
]

rdd = sc.parallelize(dict_pre_rrd) #tfidf_features.toarray()
#rdd.take(5)

# Get the d_star with the maximum TF-IDF value of each term from any documents
# todo

NameError: name 'pre_processed_data' is not defined

In [None]:
'''# Create the pandas Dataframe and convert it into a PySpark Dataframe
scifact_tfidf_pdf  = pd.DataFrame(data=tfidf_features.toarray(), index=pre_processed_data['scifact'].keys(), columns=tfidf_columns)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled","true")
scifact_tfidf_df = spark.createDataFrame(scifact_tfidf_pdf)
scifact_rdd = scifact_tfidf_df.rdd # Obtain the rrd from the dataframe
print(scifact_rdd.take(5))

# Get the d_star with the maximum TF-IDF value of each term from any documents
d_star = scifact_tfidf_pdf.max(axis='rows')'''

'# Create the pandas Dataframe and convert it into a PySpark Dataframe\nscifact_tfidf_pdf  = pd.DataFrame(data=tfidf_features.toarray(), index=pre_processed_data[\'scifact\'].keys(), columns=tfidf_columns)\nspark.conf.set("spark.sql.execution.arrow.pyspark.enabled","true")\nscifact_tfidf_df = spark.createDataFrame(scifact_tfidf_pdf)\nscifact_rdd = scifact_tfidf_df.rdd # Obtain the rrd from the dataframe\nprint(scifact_rdd.take(5))\n\n# Get the d_star with the maximum TF-IDF value of each term from any documents\nd_star = scifact_tfidf_pdf.max(axis=\'rows\')'

In [None]:
def print_fun(x): print(x)

In [None]:
# Map function
def map_fun(pair):
    doc_id, tf_idf_list = pair
    return [(idx, (doc_id, tf_idf_list)) for idx in range(len(tf_idf_list))]
    

In [None]:
mapped = rdd.flatMap(map_fun)
mapped.take(5)

----------------------------------------ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/riccardo/Desktop/all-pairs-doc-sim/lib/python3.10/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/riccardo/Desktop/all-pairs-doc-sim/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/riccardo/Desktop/all-pairs-doc-sim/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving

Exception occurred during processing of request from ('127.0.0.1', 43770)
Traceback (most recent call last):
  File "/usr/lib/py

Py4JError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/riccardo/Desktop/all-pairs-doc-sim/lib/python3.10/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/riccardo/Desktop/all-pairs-doc-sim/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/riccardo/Desktop/all-pairs-doc-sim/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


In [None]:
rdd_temp = sc.parallelize([('0', ('ciao', np.array([1, 2, 3, 4, 5, 6, 7]))),
 ('1', ('ciao', np.array([1, 2, 3, 4, 5, 6, 7]))),
 ('2', ('ciao', np.array([1, 2, 3, 4, 5, 6, 7]))),
 ('3', ('ciao', np.array([1, 2, 3, 4, 5, 6, 7]))),
 ('4', ('ciao', np.array([1, 2, 3, 4, 5, 6, 7]))),
 ('5', ('ciao', np.array([1, 2, 3, 4, 5, 6, 7]))),
 ('6', ('ciao', np.array([1, 2, 3, 4, 5, 6, 7]))),
 ('0', ('ciao1', np.array([4, 4, 4, 4, 4, 6, 7]))),
 ('1', ('ciao1', np.array([4, 4, 4, 4, 4, 6, 7]))),
 ('2', ('ciao1', np.array([4, 4, 4, 4, 4, 6, 7]))),
 ('3', ('ciao1', np.array([4, 4, 4, 4, 4, 6, 7]))),
 ('4', ('ciao1', np.array([4, 4, 4, 4, 4, 6, 7]))),
 ('5', ('ciao1', np.array([4, 4, 4, 4, 4, 6, 7]))),
 ('6', ('ciao1', np.array([4, 4, 4, 4, 4, 6, 7]))),
 ('0', ('ciao2', np.array([1, 1, 1, 1, 1, 1, 7]))),
 ('1', ('ciao2', np.array([1, 1, 1, 1, 1, 1, 7]))),
 ('2', ('ciao2', np.array([1, 1, 1, 1, 1, 1, 7]))),
 ('3', ('ciao2', np.array([1, 1, 1, 1, 1, 1, 7]))),
 ('4', ('ciao2', np.array([1, 1, 1, 1, 1, 1, 7]))),
 ('5', ('ciao2', np.array([1, 1, 1, 1, 1, 1, 7]))),
 ('6', ('ciao2', np.array([1, 1, 1, 1, 1, 1, 7]))),
 ('0', ('ciao3', np.array([2, 2, 2, 2, 2, 2, 8]))),
 ('1', ('ciao3', np.array([2, 2, 2, 2, 2, 2, 8]))),
 ('2', ('ciao3', np.array([2, 2, 2, 2, 2, 2, 8]))),
 ('3', ('ciao3', np.array([2, 2, 2, 2, 2, 2, 8]))),
 ('4', ('ciao3', np.array([2, 2, 2, 2, 2, 2, 8]))),
 ('5', ('ciao3', np.array([2, 2, 2, 2, 2, 2, 8]))),
 ('6', ('ciao3', np.array([2, 2, 2, 2, 2, 2, 8])))])#[("1", ('cacca', np.array([1,2,3,4,5]))), ("2", ('mucca', np.array([1,2,3,4,5]))), ("1", ('cucca', np.array([1,2,3,4,5])))])
#rdd_temp.collect()
rdd_temp1 = rdd_temp.groupByKey().mapValues(list)
rdd_temp1.collect()

[('0',
  [('ciao', array([1, 2, 3, 4, 5, 6, 7])),
   ('ciao1', array([4, 4, 4, 4, 4, 6, 7])),
   ('ciao2', array([1, 1, 1, 1, 1, 1, 7])),
   ('ciao3', array([2, 2, 2, 2, 2, 2, 8]))]),
 ('1',
  [('ciao', array([1, 2, 3, 4, 5, 6, 7])),
   ('ciao1', array([4, 4, 4, 4, 4, 6, 7])),
   ('ciao2', array([1, 1, 1, 1, 1, 1, 7])),
   ('ciao3', array([2, 2, 2, 2, 2, 2, 8]))]),
 ('2',
  [('ciao', array([1, 2, 3, 4, 5, 6, 7])),
   ('ciao1', array([4, 4, 4, 4, 4, 6, 7])),
   ('ciao2', array([1, 1, 1, 1, 1, 1, 7])),
   ('ciao3', array([2, 2, 2, 2, 2, 2, 8]))]),
 ('3',
  [('ciao', array([1, 2, 3, 4, 5, 6, 7])),
   ('ciao1', array([4, 4, 4, 4, 4, 6, 7])),
   ('ciao2', array([1, 1, 1, 1, 1, 1, 7])),
   ('ciao3', array([2, 2, 2, 2, 2, 2, 8]))]),
 ('4',
  [('ciao', array([1, 2, 3, 4, 5, 6, 7])),
   ('ciao1', array([4, 4, 4, 4, 4, 6, 7])),
   ('ciao2', array([1, 1, 1, 1, 1, 1, 7])),
   ('ciao3', array([2, 2, 2, 2, 2, 2, 8]))]),
 ('5',
  [('ciao', array([1, 2, 3, 4, 5, 6, 7])),
   ('ciao1', array([4, 4, 4, 4

In [None]:
groupby = rdd.groupByKey().mapValues(list)
groupby.collect()

[('ciao', [array([1, 2, 3, 4, 5, 6, 7])]),
 ('ciao1', [array([4, 4, 4, 4, 4, 6, 7])]),
 ('ciao2', [array([1, 1, 1, 1, 1, 1, 7])]),
 ('ciao3', [array([2, 2, 2, 2, 2, 2, 8])])]

In [None]:
# Reduce function
def reduce_fun(doc_id_doc_list):
    res = []
    for id1, d1 in doc_id_doc_list:
        res.extend(
            (id1, id2, cosine_similarity(d1, d2))
            for id2, d2 in doc_id_doc_list
            if cosine_similarity(d1, d2) >= threshold
        )
    return res

In [None]:
reduced = groupby.reduceByKey(reduce_fun)
reduced.take(5)

                                                                                

[('MED-861', [array([0., 0., 0., ..., 0., 0., 0.])]),
 ('MED-1273', [array([0., 0., 0., ..., 0., 0., 0.])]),
 ('MED-5109',
  [array([0.        , 0.04891914, 0.        , ..., 0.        , 0.        ,
          0.        ])]),
 ('MED-1063',
  [array([0.09279332, 0.        , 0.        , ..., 0.        , 0.        ,
          0.        ])]),
 ('MED-1246', [array([0., 0., 0., ..., 0., 0., 0.])])]

In [None]:
#sc.stop()