# NLP interpretations
> NLP interpretation tool sets

In [1]:
# default_exp nlp.interp

In [12]:
# export
from unpackai.interp.latent import InterpEmbeddings
import pandas as pd

## Interpret huggingface tokenizer

In [13]:
# export
class InterpEmbeddingsTokenizer(InterpEmbeddings):
    """
    Embedding interpreter driven by a huggingface tokenizer.

    The tokenizer turns text into token ids; those ids index rows of the
    embedding matrix, and the averaged rows are used as the query for a
    similarity search.

    NOTE(review): ``self.base`` and ``self.cosine`` are used below but are
    not defined in this class; presumably ``InterpEmbeddings`` sets them up
    from ``embedding_matrix`` -- confirm against the parent class.
    """

    def __init__(self,
                 embedding_matrix,
                 tokenizer):
        """
        embedding_matrix: np.ndarray, embedding matrix of shape:
            (num_items, hidden_size)
        tokenizer: a huggingface tokenizer
        """
        # Prefer the ``vocab`` attribute (previous behaviour); fall back to
        # ``get_vocab()`` for tokenizers that do not expose ``vocab``.
        vocab = getattr(tokenizer, "vocab", None)
        if vocab is None:
            vocab = tokenizer.get_vocab()

        # The parent receives the inverted mapping: token id -> token string.
        super().__init__(
            embedding_matrix,
            {idx: token for token, idx in vocab.items()})
        self.tokenizer = tokenizer

    def search(
        self,
        word: str,
        filter_special_token: bool = True,
        top_k: int = 20,
    ) -> pd.DataFrame:
        """
        search for similar words with embedding and
            tokenizer's encode/ decode

        word: text to look up; it may be split into several tokens,
            whose embeddings are averaged into a single query vector
        filter_special_token: drop the tokenizer's special tokens
            (the ids listed in ``tokenizer.all_special_ids``) from the
            encoded ids before averaging
        top_k: maximum number of rows to return

        Returns a DataFrame with columns "tokens", "idx" and
        "similarity". The frame is empty when no token id is left
        to search with (e.g. an empty ``word``).
        """
        token_ids = self.tokenizer.encode(word)
        if filter_special_token:
            # Use the tokenizer's own special ids rather than a fixed id
            # threshold: special-token ids differ between vocabularies, and
            # a threshold also discards ordinary low-id tokens.
            special_ids = set(self.tokenizer.all_special_ids)
            token_ids = [t for t in token_ids if t not in special_ids]

        if not token_ids:
            # Averaging zero rows would produce a NaN query vector.
            return pd.DataFrame({
                "tokens": [],
                "idx": [],
                "similarity": []})

        # combine multiple tokens into 1
        vec = self.base[token_ids].mean(0)

        # distance search
        closest, similarity = self.cosine.search(vec, return_similarity=True)
        tokens = self.tokenizer.convert_ids_to_tokens(closest)
        return pd.DataFrame({
            "tokens": tokens,
            "idx": closest,
            "similarity": similarity}).head(top_k)

In [14]:
from transformers import AutoTokenizer, AutoModel

# Checkpoint name shared by the tokenizer and the model loaded below.
PRETRAINED = "albert-base-v2"

# Load the tokenizer and the model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED)
model = AutoModel.from_pretrained(PRETRAINED)

In [5]:
# Display the tokenizer's special tokens, keyed by their role name.
tokenizer.special_tokens_map

{'bos_token': '[CLS]',
 'eos_token': '[SEP]',
 'unk_token': '<unk>',
 'sep_token': '[SEP]',
 'pad_token': '<pad>',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [6]:
# Display the model's embedding module to see its sub-layers and their sizes.
model.embeddings

AlbertEmbeddings(
  (word_embeddings): Embedding(30000, 128, padding_idx=0)
  (position_embeddings): Embedding(512, 128)
  (token_type_embeddings): Embedding(2, 128)
  (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0, inplace=False)
)

In [7]:
# Take the word-embedding weights out of the model and convert them with
# ``.numpy()``; this is the matrix handed to InterpEmbeddingsTokenizer below.
embedding_matrix = model.embeddings.word_embeddings.weight.data.numpy()
# Display the matrix shape as the cell output.
embedding_matrix.shape

(30000, 128)

In [8]:
# ``values`` is a method: call it to list the special tokens themselves
# (without the parentheses the cell only displays the method object).
tokenizer.special_tokens_map.values()

dict_values(['[CLS]', '[SEP]', '<unk>', '[SEP]', '<pad>', '[CLS]', '[MASK]'])

In [None]:
# Build the interpreter from the word-embedding matrix and its tokenizer.
interp = InterpEmbeddingsTokenizer(embedding_matrix, tokenizer=tokenizer)

In [None]:
# Show the tokens whose embeddings are most similar to "wife"
# (at most 20 rows, the default ``top_k``).
interp.search("wife")