In [1]:
# %pip install -U transformers torch

In [2]:
import numpy as np
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


We will use a pretrained SPLADE model from the [Hugging Face Hub](https://huggingface.co/naver/splade-cocondenser-selfdistil).

In [20]:
# Checkpoint to load. The ensemble-distilled variant below is kept as an
# alternative that can be swapped in:
# model_id = 'naver/splade-cocondenser-ensembledistil'
model_id = 'naver/splade-cocondenser-selfdistil'

# Load the masked-LM model and its tokenizer from the same checkpoint; later
# cells index the model's output columns with this tokenizer's vocabulary ids.
model = AutoModelForMaskedLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# test text

In [21]:
# Short sample input used to walk through the encoding steps one at a time.
text = "test text"

In [22]:
# Encode the sample text as PyTorch tensors.
tokens = tokenizer(text, return_tensors='pt')

# Show the word pieces (including the special tokens) the model will see.
token_strings = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])
print(f' tokenized text: {token_strings}')

# Forward pass through the masked-LM head; the last expression displays the
# raw model output.
output = model(**tokens)
output

 tokenized text: ['[CLS]', 'test', 'text', '[SEP]']


MaskedLMOutput(loss=None, logits=tensor([[[ -6.3943,  -7.9640,  -7.5507,  ...,  -7.7696,  -7.7066,  -6.0523],
         [-36.7605, -25.9343, -22.5490,  ..., -25.1820, -26.5855, -27.3972],
         [-26.7355, -20.2880, -22.2167,  ..., -19.8303, -19.0085, -24.0596],
         [-19.8575, -15.9112, -15.6052,  ..., -15.7236, -15.2050, -16.4795]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [23]:
# One text, one row per input token (4 here, including [CLS]/[SEP]), and one
# logit per vocabulary entry in the last axis.
output.logits.shape

torch.Size([1, 4, 30522])

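Now we have, for every input token, a vector of logits (unnormalized scores) over the whole vocabulary. We want a single vector for the entire text instead. The SPLADE paper obtains it by taking, for each vocabulary entry $j$, the maximum over the input tokens $i$ of the log-saturated ReLU of the logit: $w_j = \max_i \log\left(1 + \mathrm{ReLU}(w_{ij})\right)$.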

In [24]:


# Pool the per-token logits into one vector for the text:
#   1. log(1 + ReLU(logit)) keeps only positive logits and dampens large ones
#   2. multiplying by the attention mask zeroes positions the mask marks as 0
#   3. the max over the sequence axis leaves one weight per vocabulary entry
saturated = torch.log(1 + torch.relu(output.logits))
masked = saturated * tokens.attention_mask.unsqueeze(-1)
pooled = torch.max(masked, dim=1)

# torch.max over a dim yields (values, indices); keep the values and drop the
# singleton batch axis.
vec = pooled.values.squeeze()

vec.shape

torch.Size([30522])

In [25]:
# Display the pooled vector; the truncated repr shows only zeros because just
# a small fraction of its entries are non-zero.
vec

tensor([0., 0., 0.,  ..., 0., 0., 0.], grad_fn=<SqueezeBackward0>)

In [26]:
# Positions (vocabulary ids) of the non-zero entries. `nonzero()` returns a
# (k, 1) tensor for a 1-D input; squeeze only that trailing axis so that a
# vector with exactly one non-zero entry still yields a list instead of a bare
# int (which would make `len(cols)` fail).
cols = vec.nonzero().squeeze(-1).cpu().tolist()
print(f"amount of non-zero values: {len(cols)}")

# Weights at those positions, in the same order as `cols`.
weights = vec[cols].cpu().tolist()
# Sparse representation of the text: token id -> weight.
sparse_dict = dict(zip(cols, weights))

print("the non-zero values:")
sparse_dict

amount of non-zero values: 58
the non-zero values:


{2508: 0.1254245936870575,
 2653: 0.042251672595739365,
 2731: 0.12503007054328918,
 2739: 0.01955387368798256,
 2773: 0.09141194820404053,
 3076: 0.16296613216400146,
 3160: 0.309316486120224,
 3189: 0.15591995418071747,
 3231: 2.970235586166382,
 3259: 0.0020148707553744316,
 3350: 0.1404428482055664,
 3433: 0.20468543469905853,
 3485: 0.5482600927352905,
 3642: 0.09496811777353287,
 3661: 0.5951390266418457,
 3752: 0.48842114210128784,
 3793: 2.7863576412200928,
 3836: 0.1300884634256363,
 3945: 0.0756244882941246,
 4106: 0.1932976394891739,
 4289: 0.03775309771299362,
 4345: 0.015433406457304955,
 4357: 0.5832338929176331,
 4431: 0.08798141032457352,
 4471: 0.5469196438789368,
 4613: 0.22698090970516205,
 4742: 0.09591808915138245,
 4807: 0.38130977749824524,
 4918: 0.21712850034236908,
 5074: 0.04295560345053673,
 5604: 2.032471179962158,
 5616: 0.012767007574439049,
 5896: 0.20169223845005035,
 6254: 0.5167056918144226,
 6498: 0.13640891015529633,
 6868: 0.02545907348394394,
 698

These token IDs do not tell us much, so let's map them back to their tokens:

In [27]:
# Invert the tokenizer vocabulary (token string -> id) into an id -> token
# string lookup table.
idx2token = dict(
    (token_id, token_str)
    for token_str, token_id in tokenizer.get_vocab().items()
)

In [28]:
# Replace each token id by its token string and round the weight to two
# decimals for display.
rounded_weights = {
    idx2token[token_id]: round(token_weight, 2)
    for token_id, token_weight in zip(cols, weights)
}

# Order by (rounded) weight, highest first. The sort is stable, so tokens with
# equal rounded weights keep their original relative order.
sparse_dict_tokens = dict(
    sorted(rounded_weights.items(), key=lambda pair: pair[1], reverse=True)
)
sparse_dict_tokens

{'test': 2.97,
 'text': 2.79,
 'texts': 2.38,
 'testing': 2.03,
 'exam': 1.12,
 'quote': 0.64,
 'letter': 0.6,
 'interview': 0.58,
 'journal': 0.55,
 'message': 0.55,
 'document': 0.52,
 'reading': 0.49,
 'assessment': 0.45,
 'communication': 0.38,
 'grammar': 0.36,
 'bert': 0.34,
 'question': 0.31,
 'sample': 0.31,
 'malcolm': 0.28,
 'speech': 0.23,
 'charlie': 0.22,
 'response': 0.2,
 'script': 0.2,
 'analysis': 0.19,
 'archive': 0.18,
 'student': 0.16,
 'report': 0.16,
 'scan': 0.16,
 'phonetic': 0.16,
 'phrase': 0.15,
 'evidence': 0.14,
 'josh': 0.14,
 'james': 0.13,
 'training': 0.13,
 'teacher': 0.13,
 'logic': 0.13,
 'math': 0.13,
 'quiz': 0.12,
 'signal': 0.1,
 'word': 0.09,
 'code': 0.09,
 'reference': 0.09,
 'search': 0.08,
 'pearson': 0.06,
 'telegraph': 0.05,
 'language': 0.04,
 'format': 0.04,
 'roger': 0.04,
 'watson': 0.04,
 'collins': 0.03,
 'tutor': 0.03,
 'news': 0.02,
 'oxford': 0.02,
 'roland': 0.02,
 'doc': 0.02,
 'toby': 0.02,
 'emma': 0.01,
 'paper': 0.0}

# Comparing vectors

We will now compare three pieces of text to each other to see how that works:

In [29]:
# Three example sentences whose sparse vectors are compared pairwise below.
texts = [
    "information retrieval is hard to understand, but lovely when you understand it.",
    "I love going to the University of Amsterdam",
    "I don't want to go to school mum... we need to do information retrieval",
]

In [30]:
# Encode all texts as one padded batch; the attention mask marks which
# positions are real tokens and which are padding.
tokens = tokenizer(
    texts, return_tensors='pt',
    padding=True, truncation=True
)

# Inference only: no_grad() avoids building an autograd graph for the forward
# pass, so no .detach() is needed before converting to NumPy.
with torch.no_grad():
    output = model(**tokens)
    # Same pooling as for the single text: log(1 + ReLU(logit)) per token,
    # zeroed at padding positions, then max over the sequence axis.
    token_weights = (
        torch.log(1 + torch.relu(output.logits))
        * tokens.attention_mask.unsqueeze(-1)
    )
    # No .squeeze() here: the result must stay 2-D (one row per text) even for
    # a batch of one, because the similarity cell takes norms along axis=1.
    vecs = torch.max(token_weights, dim=1)[0].cpu().numpy()
vecs.shape

(3, 30522)

In [31]:
# Cosine similarity between every pair of rows in `vecs`.
sim = np.zeros((vecs.shape[0], vecs.shape[0]))

# The norms of all rows do not change between iterations, so compute them once.
row_norms = np.linalg.norm(vecs, axis=1)

# The loop variable is named `row` (not `vec`) so that it does not overwrite
# the single-text tensor `vec` created in an earlier cell.
for i, row in enumerate(vecs):
    sim[i, :] = np.dot(row, vecs.T) / (np.linalg.norm(row) * row_norms)

In [32]:
# Pairwise cosine similarities; row i / column j compares texts[i] with texts[j].
sim

array([[1.        , 0.01146347, 0.39074919],
       [0.01146347, 1.00000012, 0.16770877],
       [0.39074919, 0.16770874, 1.        ]])