# Trace Cross Encoder

In [1]:
# Checkpoint identifier handed to every `from_pretrained` / `CrossEncoder` call below.
model_id = "cross-encoder/nli-deberta-base"

# Sentence pairs scored by each variant of the model so the outputs can be compared.
test_sentences = [
    (
        'How many people live in Berlin?',
        'How many people live in Berlin?',
    ),
    (
        'Berlin has a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.',
        'New York City is famous for the Metropolitan Museum of Art.',
    ),
]

In [7]:
import torch
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the tokenizer and the sequence-classification model for `model_id`,
# and put the model into evaluation mode before it is traced.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)
model.eval()

# Tokenize three placeholder (query, paragraph) pairs; the resulting tensors
# serve as the example inputs for tracing.
features = tokenizer(
    [('Query', 'Paragraph1'), ('Query', 'Paragraph2'), ('Query', 'Paragraph3')],
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [8]:
# Display the tokenizer's repr to inspect its configuration.
tokenizer

DebertaTokenizerFast(name_or_path='cross-encoder/nli-deberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [9]:
# Display the tokenized example batch (input_ids, token_type_ids, attention_mask).
features

{'input_ids': tensor([[    1, 48382,     2, 22011, 44947,   134,     2],
        [    1, 48382,     2, 22011, 44947,   176,     2],
        [    1, 48382,     2, 22011, 44947,   246,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])}

In [10]:
# Example inputs are passed to the model positionally, so they must follow the
# order of the Hugging Face forward signature:
#   forward(input_ids, attention_mask, token_type_ids, ...)
# Supplying token_type_ids second would make the all-zero segment ids act as
# the attention mask while tracing.
ex_input = (features["input_ids"], features["attention_mask"], features["token_type_ids"])

# strict=False is kept so that the traced module may return the model's
# dict-like output (later cells read pt_prediction['logits']).
traced_model = torch.jit.trace(model, ex_input, strict=False)

# Persist the traced module; it is reloaded with torch.jit.load below.
torch.jit.save(traced_model, "traced_cross_encoder.pt")

In [5]:
# Reload the traced module from disk and switch it to evaluation mode
# (eval() returns the module itself), then display it.
loaded_model = torch.jit.load("traced_cross_encoder.pt").eval()
loaded_model

In [None]:
# Tokenize the test sentence pairs with the same padding/truncation settings
# that were used for the tracing example.
test_features = tokenizer(test_sentences,  padding=True, truncation=True, return_tensors="pt")
# NOTE(review): the traced module is called with keyword arguments here —
# confirm the traced forward accepts input_ids / token_type_ids /
# attention_mask by name; otherwise pass the tensors positionally.
pt_prediction = loaded_model(**test_features)
# Display the raw output of the traced model.
pt_prediction

In [None]:
from torch import nn
# Identity leaves the scores untouched; the trailing comment records the
# alternative (Sigmoid for single-label models) that is not used here.
default_activation_function = nn.Identity() # nn.Sigmoid() if config.num_labels == 1 else nn.Identity()
activation_fct = default_activation_function
# The traced model's output is indexed by key to obtain the logits.
logits = activation_fct(pt_prediction['logits'])

In [None]:
# Display the logits produced by the traced model.
logits

In [None]:
# Class names by output index.
# NOTE(review): assumed to match the classifier's output column order — confirm against the model card.
label_mapping = ['contradiction', 'entailment', 'neutral']

# Pick the highest-scoring class per row and translate each index to its name.
labels = [label_mapping[class_idx] for class_idx in logits.argmax(dim=1).tolist()]
print(labels)

# Compare Output

In [11]:
from sentence_transformers import CrossEncoder
# Reference run through sentence-transformers' CrossEncoder on the same pairs.
# The wrapper gets its own name: rebinding `model` here would replace the
# Hugging Face model that the tracing cell reads, so re-running that cell
# afterwards would silently trace a different kind of object.
cross_encoder = CrossEncoder(model_id)
scores = cross_encoder.predict(test_sentences)
print(scores)

# Map the highest-scoring column of each row to its class name.
label_mapping = ['contradiction', 'entailment', 'neutral']
labels = [label_mapping[score_max] for score_max in scores.argmax(axis=1)]
print(labels)

[[-3.381288    3.6616564  -1.1681075 ]
 [ 3.9306588  -4.44145     0.85258216]]
['entailment', 'contradiction']


In [24]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import torch

# Reload config, model and tokenizer for `model_id` for the manual
# (non-traced, non-CrossEncoder) inference path below.
config = AutoConfig.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Upper bound on tokenized length, read by the collate function when truncating.
max_length = 512

In [25]:
from torch.utils.data import DataLoader

def smart_batching_collate_text_only(batch):
    """Collate a batch of text tuples into tokenized model inputs.

    Each example in ``batch`` is a sequence of strings. The strings are
    regrouped by position (all first texts together, all second texts
    together, ...), stripped of surrounding whitespace, and handed to the
    notebook-level ``tokenizer`` with padding and ``'longest_first'``
    truncation capped at the notebook-level ``max_length``.

    Returns the tokenizer output with every entry moved to the CPU.
    """
    # One list per text position; sized from the first example.
    columns = [[] for _ in range(len(batch[0]))]
    for row in batch:
        for position, raw_text in enumerate(row):
            columns[position].append(raw_text.strip())

    encoded = tokenizer(
        *columns,
        padding=True,
        truncation='longest_first',
        return_tensors="pt",
        max_length=max_length,
    )

    # Snapshot the items first so entries can be reassigned while looping.
    for key, value in list(encoded.items()):
        encoded[key] = value.to('cpu')

    return encoded

# Batch the test pairs (up to 32 per batch, original order, no worker
# processes) and tokenize each batch with the collate function above.
inp_dataloader = DataLoader(test_sentences, batch_size=32, collate_fn=smart_batching_collate_text_only, num_workers=0, shuffle=False)

In [26]:
# Display the DataLoader object.
inp_dataloader

<torch.utils.data.dataloader.DataLoader at 0x28265ffd0>

In [27]:
from torch import nn
# Identity leaves the logits unchanged; the trailing comment records the
# single-label alternative that is not used here.
default_activation_function = nn.Identity() # nn.Sigmoid() if config.num_labels == 1 else nn.Identity()

In [29]:
import numpy as np
# Manual inference with the plain Hugging Face model, for comparison with the
# CrossEncoder scores above.
activation_fct = default_activation_function

pred_scores = []
model.eval()
model.to('cpu')
with torch.no_grad():
    # The loop variables use their own names so this cell does not overwrite
    # the notebook-level `features` / `logits` that the tracing cells rely on.
    for batch_features in inp_dataloader:
        model_predictions = model(**batch_features, return_dict=True)
        batch_logits = activation_fct(model_predictions.logits)
        # Convert to NumPy explicitly (one row per example) instead of
        # relying on np.array() to coerce a list of torch tensors.
        pred_scores.extend(batch_logits.cpu().numpy())

# No softmax is applied: the scores are the activation output of the logits.
pred_scores = np.asarray(pred_scores)
print(pred_scores)

# Map the highest-scoring column of each row to its class name.
label_mapping = ['contradiction', 'entailment', 'neutral']
labels = [label_mapping[score_max] for score_max in pred_scores.argmax(axis=1)]
print(labels)

[[-3.381288    3.6616564  -1.1681075 ]
 [ 3.9306588  -4.44145     0.85258216]]
['entailment', 'contradiction']
