In [17]:
from sentence_transformers import SentenceTransformer
from datasets import load_from_disk, load_dataset
import numpy as np
import faiss
import os
import torch
from tqdm.notebook import tqdm

# Show the compile options of the installed FAISS build and how many GPUs it reports.
print(faiss.get_compile_options())
n_gpus = faiss.get_num_gpus()
print("Number of GPUs available to FAISS:", n_gpus)

%reload_ext autoreload
%autoreload 2

OPTIMIZE GENERIC GPU 
Number of GPUs available to FAISS: 1


In [18]:
# Create a FAISS GPU resources handle.
# NOTE(review): `res` is not referenced by any other cell visible in this notebook
# (compute_distances creates its own handle) — confirm whether this cell is still needed.
res = faiss.StandardGpuResources()

In [19]:
# Pool of MCQA examples that will receive relevance scores (its 'train' split is selected below).
unifiedds = load_dataset('brygotti/MNLP_M3_mcqa_dataset')
# Evaluation sets the pool is compared against; compute_distances reads their 'test' split.
mlp4educ_evalds = load_dataset('brygotti/NLP4Education_english_single_mcq_4_choices')
mmlu_ds = load_dataset('brygotti/mmlu')

# Option labels, in order; choices beyond the 26th get no label and are dropped by zip().
LETTER_INDICES = [
    "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
    "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
]

def relevance_text(line):
    """Build the text used for similarity search for one example.

    Args:
        line: mapping with a "question" string and a "choices" sequence of
            strings (may be empty or None); may already carry "relevance_text".

    Returns:
        ``{"relevance_text": <question + lettered choices>}`` when the example
        has no non-empty "relevance_text" yet, otherwise ``{}`` so that the
        existing value is left untouched by ``Dataset.map``.
    """
    # Guard clause: an existing, non-empty value wins.
    if line.get("relevance_text"):
        return {}

    pieces = [line["question"].strip() + "\n"]
    options = line["choices"]
    if options:
        for letter, option in zip(LETTER_INDICES, options):
            pieces.append(f"{letter}. {option.strip()}\n")

    return {"relevance_text": "".join(pieces)}

# Fill in `relevance_text` on every split where it is missing, then keep only the 'train' split.
unifiedds = unifiedds.map(relevance_text)['train']

In [6]:
# Run the encoder on CUDA when PyTorch reports it as available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Sentence encoder used by `embed` below.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)  # otherwise, use: 'multi-qa-MiniLM-L6-cos-v1'

def embed(batch):
    """Encode a batch of examples with the notebook-level sentence encoder.

    Args:
        batch: mapping whose 'relevance_text' entry holds the texts to encode.

    Returns:
        ``{'emb': array}`` with the encoder output cast to float32.
    """
    vectors = model.encode(
        batch['relevance_text'],
        convert_to_numpy=True,
        show_progress_bar=False,
        device=device,
    )
    # FAISS needs float32
    return {'emb': vectors.astype('float32')}

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Compute embeddings and nearest-neighbour similarities

In [7]:
# Embed the whole unified set; each call to `embed` receives up to 4096*8 rows.
ds_A = unifiedds.map(embed, batched=True, batch_size=4096*8)
# Stack the per-row embeddings into a single float32 matrix (one row per example).
A_embs = np.vstack(ds_A['emb']).astype('float32')
# L2-normalise in place so that inner products between rows are cosine similarities.
faiss.normalize_L2(A_embs)

Map:   0%|          | 0/816351 [00:00<?, ? examples/s]

In [8]:
cache_dir = './cache'
def compute_distances(dataset, dataset_name, split='test', k=1):
    """Score every row of ``A_embs`` against its nearest neighbours in ``dataset``.

    The chosen split of ``dataset`` is embedded with ``embed``, L2-normalised and
    stored in a flat inner-product FAISS index. Every row of the notebook-level
    ``A_embs`` matrix is then used as a query. Because both sides are
    L2-normalised, the saved "distances" are cosine similarities: higher means
    more similar.

    Results are written to
    ``{cache_dir}/all-MiniLM-L6-v2/distances_{dataset_name}.npy`` and
    ``{cache_dir}/all-MiniLM-L6-v2/indices_{dataset_name}.npy``; nothing is
    returned.

    Args:
        dataset: dataset dictionary exposing ``map`` and containing ``split``.
        dataset_name: tag used in the names of the cached ``.npy`` files.
        split: name of the split to index (default ``'test'``).
        k: number of neighbours to retrieve per query (default ``1``).
    """
    dataset = dataset.map(relevance_text)[split]
    ds_B = dataset.map(embed, batched=True, batch_size=64)
    B_embs = np.vstack(ds_B['emb']).astype('float32')
    faiss.normalize_L2(B_embs)

    # Inner product on normalised vectors == cosine similarity.
    dim = B_embs.shape[1]
    index = faiss.IndexFlatIP(dim)

    # Move the index to GPU 0 only when this FAISS build can actually see a GPU;
    # otherwise keep the exact CPU index so the function still works (just slower).
    if faiss.get_num_gpus() > 0:
        gpu_res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(gpu_res, 0, index)

    index.add(B_embs)

    # A_embs must already be float32 and L2-normalised (done where it is built).
    distances, indices = index.search(A_embs, k=k)

    out_dir = f"{cache_dir}/all-MiniLM-L6-v2"
    os.makedirs(out_dir, exist_ok=True)
    np.save(f"{out_dir}/distances_{dataset_name}.npy", distances)
    np.save(f"{out_dir}/indices_{dataset_name}.npy", indices)

In [9]:
# Score the unified set against the NLP4Education eval set; results are cached under the "mlp4educ" tag.
compute_distances(mlp4educ_evalds, "mlp4educ")

Map:   0%|          | 0/1962 [00:00<?, ? examples/s]

Map:   0%|          | 0/1962 [00:00<?, ? examples/s]

In [11]:
# Score the unified set against the MMLU eval set; results are cached under the "mmlu" tag.
compute_distances(mmlu_ds, "mmlu")

Map:   0%|          | 0/14042 [00:00<?, ? examples/s]

Map:   0%|          | 0/14042 [00:00<?, ? examples/s]

In [20]:
def load_distances(dataset_name):
    """Read back the arrays cached by ``compute_distances`` for one eval set.

    Args:
        dataset_name: tag that was used when the ``.npy`` files were written.

    Returns:
        ``(distances, indices)`` loaded from
        ``{cache_dir}/all-MiniLM-L6-v2/``.
    """
    return (
        np.load(f"{cache_dir}/all-MiniLM-L6-v2/distances_{dataset_name}.npy"),
        np.load(f"{cache_dir}/all-MiniLM-L6-v2/indices_{dataset_name}.npy"),
    )

distances_mcq4educ, indices_mcq4educ = load_distances("mlp4educ")
distances_mmlu, indices_mmlu = load_distances("mmlu")

# The cached arrays are joined to `unifiedds` purely by row position. A cache
# written from a different version of the dataset would either fail midway with
# an IndexError or silently attach scores to the wrong rows, so check first.
assert len(distances_mcq4educ) == len(unifiedds), (
    f"cached 'mlp4educ' scores have {len(distances_mcq4educ)} rows, "
    f"dataset has {len(unifiedds)}; recompute the cache"
)
assert len(distances_mmlu) == len(unifiedds), (
    f"cached 'mmlu' scores have {len(distances_mmlu)} rows, "
    f"dataset has {len(unifiedds)}; recompute the cache"
)

# Add one relevance column per eval set: the similarity of each row to its
# closest neighbour in that eval set (first column of the cached array).
unifiedds = unifiedds.map(
    lambda x, idx: {
        'relevance_nlp4educ': distances_mcq4educ[idx][0],
        'relevance_mmlu': distances_mmlu[idx][0],
    },
    with_indices=True,
)
unifiedds

Dataset({
    features: ['dataset', 'question', 'choices', 'question_type', 'answer', 'explanation', 'prompt', 'completion', 'relevance_text', 'relevance1', 'relevance2', 'relevance_nlp4educ', 'relevance_mmlu', 'relevance_othereval'],
    num_rows: 816351
})

In [23]:
# Publish the dataset with the added relevance columns.
# NOTE(review): this pushes to the same repo id the data was loaded from at the top of the notebook.
unifiedds.push_to_hub('brygotti/MNLP_M3_mcqa_dataset')

Uploading the dataset shards:   0%|          | 0/7 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/117 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/117 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/117 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/117 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/117 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/117 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/117 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/brygotti/unified-0.8M/commit/5bfb19f5c04d7dd3821112f0b4c45f4c20b2abcd', commit_message='Upload dataset', commit_description='', oid='5bfb19f5c04d7dd3821112f0b4c45f4c20b2abcd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/brygotti/unified-0.8M', endpoint='https://huggingface.co', repo_type='dataset', repo_id='brygotti/unified-0.8M'), pr_revision=None, pr_num=None)