# Extract contextual embedding vector of a character span

In [1]:
import spacy_alignments as tokenizations
import torch
from transformers import AutoTokenizer, AutoModel

from noisemon.domain.models.entity_span import EntitySpan
from noisemon.domain.services.language_representation.contextual_embedder import ContextualEmbedder

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
class ContextualEmbedderLocalImpl(ContextualEmbedder):
    model_name = "intfloat/multilingual-e5-large"
    def __init__(
            self,
            model_name=None,
            model=None,
            tokenizer=None,
            device=torch.device("cpu")
    ):
        if model_name is not None:
            self.model_name = model_name

        if model is None:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            model = AutoModel.from_pretrained(self.model_name)

        self.model = model
        self.tokenizer = tokenizer
        self.model.to(device)



    def get_char_span_vectors(self, text: str, char_spans: list[EntitySpan]) -> list[torch.Tensor]:
        encoded_text = self.tokenizer([text], truncation=True, max_length=512, return_tensors="pt")
        wordpieces = self.tokenizer.batch_decode(encoded_text.input_ids[0])

        embedding_alignment, _ = tokenizations.get_alignments(list(text), wordpieces)

        with torch.no_grad():
            model_output = self.model(**{k: v.to(self.model.device) for k, v in encoded_text.items()})

        embeddings = model_output.last_hidden_state.cpu()
        embedding = torch.nn.functional.normalize(embeddings).squeeze()

        span_vectors = []
        for span in char_spans:
            span_idxs = [idx
                         for list_of_indices in embedding_alignment[span.span_start: span.span_end]
                         for idx in list_of_indices]
            span_idxs = sorted(set(span_idxs))
            span_emb = embedding[span_idxs]
            span_vector = torch.mean(span_emb, dim=0)
            span_vectors.append(span_vector)

        return span_vectors

In [41]:
import html
import urllib.parse

In [44]:
import bleach

In [48]:
cleaner = bleach.Cleaner(tags=set(), strip=True)

In [49]:
print(html.unescape(cleaner.clean(urllib.parse.unquote(text))))

The studio has produced many critically acclaimed films such as "Titanic", "Footloose", "Breakfast at Tiffany's", "Braveheart", "Ghost", "The Truman Show", "Mean Girls", "Psycho", "Rocketman", "Ferris Bueller's Day Off", "The Curious Case of Benjamin Button", "Days of Thunder", "Rosemary's Baby", "Nebraska", "Sunset Boulevard", "Forrest Gump", "Super 8", "Coming to America", "World War Z", "Babel", "The Conversation", "The Fighter", "Interstellar", "", "Terms of Endearment", "The Wolf of Wall Street" and "A Quiet Place"; as well as commercially successful franchises and/or properties such as: the "Godfather" films, "Star Trek", "", "SpongeBob SquarePants", the "Grease" films, "Sonic the Hedgehog", the "Top Gun" films, "The Italian Job", the "Transformers" films, the "Teenage Mutant Ninja Turtles" films, the "Tomb Raider" films, the "Friday the 13th" films, the "Cloverfield" films, the "G.I. Joe" films, the "Beverly Hills Cop" films, the "Terminator" films, the "Pet Sematary" films, the

In [50]:
len(wordpieces)

512

In [33]:
len(wordpieces)

512

In [26]:
text = """The studio has produced many critically acclaimed films such as "Titanic", "Footloose", "Breakfast at Tiffany's", "Braveheart", "Ghost", "The Truman Show", "Mean Girls", "Psycho", "Rocketman", "Ferris Bueller's Day Off", "The Curious Case of Benjamin Button", "Days of Thunder", "Rosemary's Baby", "Nebraska", "Sunset Boulevard", "Forrest Gump", "Super 8", "Coming to America", "World War Z", "Babel", "The Conversation", "The Fighter", "Interstellar", "", "Terms of Endearment", "The Wolf of Wall Street" and "A Quiet Place"; as well as commercially successful franchises and/or properties such as: the "Godfather" films, "Star Trek", "", "SpongeBob SquarePants", the "Grease" films, "Sonic the Hedgehog", the "Top Gun" films, "The Italian Job", the "Transformers" films, the "Teenage Mutant Ninja Turtles" films, the "Tomb Raider" films, the "Friday the 13th" films, the "Cloverfield" films, the "G.I. Joe" films, the "Beverly Hills Cop" films, the "Terminator" films, the "Pet Sematary" films, the "Without a Paddle" films, "Jackass", the "Odd Couple" films, "South Park", the "Crocodile Dundee" films, the "Charolette's Web" films, the "Wayne's World" films, "<a href="Beavis%20%26amp%3B%20Butthead">Beavis & Butthead</a>", "Jimmy Neutron", the "War of the Worlds" films, the "Naked Gun" films, the "Anchorman" films, "Dora the Explorer", the "Addams Family" films, "Rugrats", the "Zoolander" films, "Æon Flux", the "Ring" films, the "Bad News Bears" films, "The Wild Thornberrys", and the "Paranormal Activity" films; as well as the first four films of the Marvel Cinematic Universe, the "Indiana Jones" films, and various DreamWorks Animation properties (such as "Shrek", the "Madagascar" sequels, the first two "Kung Fu Panda" films, and the first "How to Train Your Dragon") before both studios were respectively acquired by Disney (via Marvel Studios and Lucasfilm) and Universal Studios."""

In [27]:
text[1833:1839]

'Disney'

In [82]:
span = EntitySpan(span= 'Disney', span_start= 1833, span_end=1839 )

In [83]:
embedder = ContextualEmbedderLocalImpl()

In [84]:
vecs = embedder.get_char_span_vectors(text, [span])

In [85]:
vecs

[tensor([ 0.0156,  0.0126,  0.0038,  ..., -0.0428, -0.0342, -0.0352])]

In [86]:
bool(torch.isnan(vecs[0]).any())

False

In [9]:
self = embedder

In [10]:
encoded_text = self.tokenizer([text], truncation=True, max_length=512, return_tensors="pt")
wordpieces = self.tokenizer.batch_decode(encoded_text.input_ids[0])

In [11]:
embedding_alignment, _ = tokenizations.get_alignments(list(text), wordpieces)

In [14]:
with torch.no_grad():
    model_output = self.model(**{k: v.to(self.model.device) for k, v in encoded_text.items()})

In [87]:
embeddings = model_output.last_hidden_state.cpu()
embedding = (embeddings).squeeze()

In [88]:
embeddings

tensor([[[ 0.0953,  0.3670,  0.1145,  ..., -0.4083, -0.3172,  0.1805],
         [-0.0887, -0.1598,  0.1276,  ..., -0.7622, -0.5241, -0.3246],
         [-0.0344,  0.0289,  0.2937,  ..., -0.5823, -0.6273, -0.3763],
         ...,
         [-0.3504,  0.1292,  0.2704,  ..., -0.0404, -0.4498, -0.2149],
         [-0.3526,  0.1246,  0.2574,  ..., -0.0237, -0.4510, -0.2308],
         [ 0.0424,  0.1841,  0.1519,  ..., -0.7243, -0.5346, -0.1693]]])

In [57]:
from scipy.linalg import norm

In [90]:
emb_slice = embedding[[1,2,3]]

In [95]:
emb_vec = torch.mean(emb_slice, dim=0)

In [100]:
norm(emb_vec)

16.222457885742188

In [101]:
emb_vec = torch.nn.functional.normalize(emb_vec, dim=0)

In [102]:
norm(emb_vec)

0.9999999403953552

In [59]:
norm(embeddings[0, :].numpy())

1.1946344375610352

In [64]:
norm(embeddings[0, 0])

15.546648979187012

In [69]:
embeddings.squeeze().shape

torch.Size([512, 1024])

In [67]:
norm(torch.nn.functional.normalize(embeddings.squeeze(), dim=1)[0])

1.0

In [20]:
len(text)

1897

In [19]:
len(embedding_alignment)

1897

In [18]:
embedding_alignment[span.span_start: span.span_end]

[[], [], [], [], [], []]

In [16]:
span_idxs = [idx
             for list_of_indices in embedding_alignment[span.span_start: span.span_end]
             for idx in list_of_indices]

In [17]:
span_idxs

[]

In [23]:
_enc_text = self.tokenizer(["something Disney something"], truncation=True, max_length=512, return_tensors="pt")
_wordpieces = self.tokenizer.batch_decode(_enc_text.input_ids[0])

In [24]:
_wordpieces

['[CLS]', 'something', 'disney', 'something', '[SEP]']

In [None]:
    span_idxs = sorted(set(span_idxs))
    span_emb = embedding[span_idxs]
    span_vector = torch.mean(span_emb, dim=0)
    span_vectors.append(span_vector)