In [None]:
!nvidia-smi

Links from - https://github.com/beir-cellar/beir


*   /nfcorpus (lightest dataset for faster experiments)
*   /scidocs
*   /scifact
*   /arguana
*   /fiqa
*   /trec-covid





In [1]:
dataset = "nfcorpus"

In [3]:
!wget "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/nfcorpus.zip"

--2023-01-04 15:55:22--  https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/nfcorpus.zip
Resolving public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)... 130.83.167.186
Connecting to public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)|130.83.167.186|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2448432 (2.3M) [application/zip]
Saving to: ‘nfcorpus.zip’


2023-01-04 15:55:25 (1.76 MB/s) - ‘nfcorpus.zip’ saved [2448432/2448432]



Sorry, not sure how to pass the variable to the bash unzip

In [4]:
!unzip nfcorpus.zip

Archive:  nfcorpus.zip
   creating: nfcorpus/
   creating: nfcorpus/qrels/
  inflating: nfcorpus/qrels/train.tsv  
  inflating: nfcorpus/qrels/test.tsv  
  inflating: nfcorpus/qrels/dev.tsv  
  inflating: nfcorpus/corpus.jsonl   
  inflating: nfcorpus/queries.jsonl  


In [5]:
import json

with open(f"./{dataset}/corpus.jsonl", "r") as json_file:
  json_list = list(json_file)

error_counter = 0

corpus = []
org_docID_to_seq_docID = {} # We use this key-value to match the relevance labels for query-doc pairs

for idx in range(0, len(json_list), 1):
  result = json.loads(json_list[idx])
  new_doc_obj = {}
  new_doc_obj["document"] = result["text"]
  # This might give you trouble, some BEIR datasets are string keys, others int!!
  org_docID_to_seq_docID[(result["_id"])] = idx
  new_doc_obj["DocID"] = idx

  corpus.append(new_doc_obj)

In [6]:
len(corpus)

3633

In [7]:
import json

with open(f"./{dataset}/queries.jsonl", "r") as json_file:
  json_list = list(json_file)

queries = []

for json_str in json_list:
  result = json.loads(json_str)
  new_query_obj = {}
  new_query_obj["queryID"] = result["_id"] # NOTE some are string keys others int
  new_query_obj["query"] = result["text"]
  
  queries.append(new_query_obj)

In [8]:
len(queries)

3237

In [10]:
import os
os.listdir("nfcorpus/qrels")

['test.tsv', 'train.tsv', 'dev.tsv']

In [11]:
import pandas as pd
import os

all_data = pd.DataFrame()
# there will be at most ['test.tsv', 'train.tsv', 'dev.tsv'], not using splitting yet
for label_file in os.listdir(f"{dataset}/qrels"):
  data = pd.read_csv(f"{dataset}/qrels/{label_file}", sep="\t")
  all_data = pd.concat([data, all_data])

In [12]:
all_data.head(2)

Unnamed: 0,query-id,corpus-id,score
0,PLAIN-1,MED-2421,2
1,PLAIN-1,MED-2422,2


In [13]:
found_key_counter, missing_key_counter = 0, 0 # sanity check
for query_idx, query in enumerate(queries):
  queryID = query["queryID"]
  # note that "query-id" might not be the key name for all BEIR datasets
  matchingDocIDs = list(all_data[all_data["query-id"] == (queryID)]["corpus-id"].to_numpy())
  processed_matchingDocIDs = []
  for matchingDoc in matchingDocIDs:
    try:
      processed_matchingDocIDs.append(org_docID_to_seq_docID[str(matchingDoc)]) # str wrapper for CQADupstack
      found_key_counter += 1
    except:
      missing_key_counter += 1
  if len(processed_matchingDocIDs) > 0:
    query["matchingDocIDs"] = processed_matchingDocIDs
    query["original_matchingDocIDs"] = matchingDocIDs # if numpy ints: use --> [id.item() for id in matchingDocIDs]
  else:
    queries.pop(query_idx) # removign if a query has no labeled matching docs, msising_key_counter will count # of these

In [14]:
print(missing_key_counter)
print(found_key_counter)

0
134294


Ready to Vectorize!!

In [15]:
!pip install sentence-transformers > /dev/null

In [16]:
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L12-v2")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L12-v2")
model.save_pretrained("all-MiniLM-L12-v2")
tokenizer.save_pretrained("all-MiniLM-L12-v2")

model = SentenceTransformer("all-MiniLM-L12-v2")

Downloading:   0%|          | 0.00/573 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/352 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [17]:
from google.colab import drive
drive.mount("/drive")

Mounted at /drive


In [19]:
import time
start = time.time()


corpus_save_path = f"/drive/My Drive/BEIR-Files/all-MiniLM-L12-v2/{dataset}-Corpus.json"

for idx in range(0, len(corpus), 1):
  corpus_dict = corpus[idx]
  if idx % 10_000 == 9_999: 
    print(f"{idx} texts vectorized, ran for {time.time() - start} so far...")
  vector = model.encode(corpus_dict["document"])
  corpus[idx]["vector"] = vector.tolist()

print("\n")
print(f"Vectorized {idx+1} texts in {time.time() - start} seconds on T4.\n")
print("Saving vectors to disk...")
import json
Corpus_json_object = json.dumps(corpus, indent=4)
with open(corpus_save_path, "w") as outfile:
  outfile.write(Corpus_json_object)

print("Finished saving vectors...")
print(len(corpus))
print(corpus_save_path)



Vectorized 3633 texts in 59.40937399864197 seconds on T4.

Saving vectors to disk...
Finished saving vectors...
3633
/drive/My Drive/BEIR-Files/all-MiniLM-L12-v2/nfcorpus-Corpus.json
