In [1]:
import torch
from vector_quantize_pytorch import VectorQuantize
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import tqdm
import numpy as np

## Load models for vector generation

In [2]:
roberta_tokenizer = AutoTokenizer.from_pretrained("allenai/biomed_roberta_base")
roberta_model = AutoModel.from_pretrained("allenai/biomed_roberta_base").cuda()

Some weights of the model checkpoint at allenai/biomed_roberta_base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def embed(term):
    inputs = roberta_tokenizer(term, padding=True, truncation=True, return_tensors="pt")
    inputs['input_ids'] = inputs['input_ids'].cuda()
    inputs['attention_mask'] = inputs['attention_mask'].cuda()
    outputs = roberta_model(**inputs).last_hidden_state.cpu().detach().numpy()[0][-1]
    return outputs

def embed2(term):
    inputs = roberta_tokenizer(term, padding=True, truncation=True, return_tensors="pt")
    inputs['input_ids'] = inputs['input_ids'].cuda()
    inputs['attention_mask'] = inputs['attention_mask'].cuda()
    outputs = roberta_model(**inputs)[1].last_hidden_state.cpu().detach().numpy()[0]
    return outputs

## Load data

In [4]:
data = pd.read_json("../data/msmarco-triplets/msmarco-triplets.jsonl", lines=True)

ValueError: Invalid octet in UTF-8 sequence when decoding 'string'

In [None]:
data['pos'] = data['pos'].apply(lambda x: x[0])
data['neg'] = data['neg'].apply(lambda x: x[0])

## Measure cosine similarity differences between pos and neg

In [None]:
metrics = []
def calculate_metrics(method, data):
    pos_similarity = []
    neg_similarity = []

    for _, sample in tqdm.tqdm(data.iterrows(), total=len(data)):
        query, pos, neg = [method(i) for i in sample.values]
        pos_similarity.append(cosine_similarity([query], [pos])[0][0])
        neg_similarity.append(cosine_similarity([query], [neg])[0][0])

    pos_similarity = np.array(pos_similarity)
    neg_similarity = np.array(neg_similarity)
    
    ratio = np.sum(pos_similarity>neg_similarity)/len(pos_similarity)
    mse = np.mean((pos_similarity-neg_similarity)**2)
    mean_diff = pos_similarity.mean()-neg_similarity.mean()
    diff_of_means = (pos_similarity-neg_similarity).mean()
    
    return pd.DataFrame.from_dict([{
                        "ratio": ratio,
                        "mse": mse,
                        "mean_diff": mean_diff,
                        "diff_of_means": diff_of_means,
                        "method": "CLS vector"
                    }])

In [None]:
metrics.append(calculate_metrics(embed, data))

  0%|                                                                                                                        | 41/499184 [00:01<5:17:33, 26.20it/s]

In [None]:
metrics.append(calculate_metrics(embed2, data))

In [131]:
reg_vals

Unnamed: 0,ratio,mse,mean_diff,diff_of_means
0,0.6,5e-06,4.6e-05,4.6e-05


In [132]:
reg_metrics

Unnamed: 0,ratio,mse,mean_diff,diff_of_means,method
0,0.6,5e-06,7.6e-05,7.7e-05,CLS vector


## Quantize vectors

In [None]:
vq = VectorQuantize(
    dim = 256,
    codebook_size = 256,
    use_cosine_sim = True   # set this to True
)

x = torch.randn(1, 1024, 256)
quantized, indices, commit_loss = vq(x)

In [3]:
quantized

tensor([[[-0.0436, -0.0356,  0.0504,  ...,  0.0734,  0.0101, -0.0043],
         [ 0.0023,  0.0299,  0.0954,  ..., -0.0674, -0.0306, -0.0673],
         [ 0.0594,  0.0038,  0.0323,  ...,  0.0005,  0.0146, -0.0669],
         ...,
         [ 0.0659,  0.0087,  0.0428,  ..., -0.0188,  0.1388,  0.0502],
         [ 0.0321,  0.0005, -0.0922,  ..., -0.0186,  0.0722, -0.0732],
         [ 0.0866,  0.0397,  0.0194,  ...,  0.0013, -0.0600,  0.0446]]])

In [5]:
indices.shape

torch.Size([1, 1024])

In [7]:
x

tensor([[[ 0.5812,  0.3534,  0.3396,  ...,  0.8195, -0.7648, -0.1033],
         [-0.4561,  0.5710,  1.4759,  ...,  1.1099, -0.5926,  1.8276],
         [-0.9794,  0.6377, -0.3734,  ...,  0.3372, -0.6377, -0.2107],
         ...,
         [ 0.6335, -0.9268, -1.0811,  ...,  0.7970,  1.7144,  1.3704],
         [-0.5405,  0.8778, -0.2181,  ...,  0.5724, -0.6577, -0.2691],
         [ 0.5452,  0.5831,  1.4473,  ...,  0.6675, -2.0544,  0.6891]]])