In [2]:

import transformers

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
import torch
import argparse
import re
from functools import partial
import logging
from datasets import load_dataset, Dataset
from tqdm import tqdm
import os
from torch.utils.data import DataLoader
import json
import copy

import pandas as pd

os.environ['CUDA_VISIBLE_DEVICES']='3'
t2p='data/musique_dev_t2p.jsonl'
vector='data/musique.pt'
query='data/musique_dev_query_full.jsonl'
result='data/musique_dev_query_full_scored.jsonl'

def mean_pooling(token_embeddings, mask):
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
    return sentence_embeddings

def encode_batch(data, tokenizer, model, batch_size=128):
    embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(data), batch_size)):
            batch = data[i:i + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to('cuda')
            outputs = model(**inputs)
            batch_embeddings = mean_pooling(outputs.last_hidden_state, inputs['attention_mask'])
            embeddings.append(batch_embeddings.cpu())
    return torch.cat(embeddings, dim=0)

encoder_model = 'facebook/contriever'

def get_similarities(search_space, query, model, tokenizer):
    with torch.no_grad():
        query_inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True).to('cuda')
        query_embedding = mean_pooling(model(**query_inputs).last_hidden_state,query_inputs['attention_mask']).cpu()  # Move to CPU
        query_embedding = query_embedding / query_embedding.norm(dim=1, keepdim=True)
        search_space = search_space / search_space.norm(dim=1, keepdim=True)
        similarities = torch.matmul(search_space, query_embedding.T).squeeze()
        return similarities
    

def map_nearest(row,space,model,tokenizer):
    l = get_similarities(space,row['question'],model,tokenizer)
    row["score"] = l
    return row

tokenizer = AutoTokenizer.from_pretrained(encoder_model)
model = AutoModel.from_pretrained(encoder_model).to('cuda')

if os.path.exists(vector):
    vec_passages=torch.load(vector)
else:
    with open(t2p, 'r', encoding='utf-8') as file:
        data = [json.loads(line)['passage'] for line in file]
    vec_passages = encode_batch(data, tokenizer, model)
    torch.save(vec_passages, vector)

dataset = load_dataset('json', data_files=query)["train"]
dataset = dataset.map(partial(map_nearest,space=vec_passages,model=model,tokenizer=tokenizer,))
dataset.to_json(result, orient="records", lines=True)

  vec_passages=torch.load(vector)


Map:   0%|          | 0/2417 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

549818770