# Interpretation
> Easy consine similarity search, search similar features among vectors is a frequently encountered situation

> Most of the code is from Ray's other library ```forgebox.cosine```

In [1]:
# default_exp interp.latent

In [13]:
# export
import numpy as np
import pandas as pd
from typing import Dict, List, Any
from forgebox.html import DOM

## Cosine similarity search

In [8]:
# export
class CosineSearch:
    """
    Build a index search on cosine distance
    cos = CosineSearch(base_array)
    idx_order = cos(vec)
    """

    def __init__(self, base: np.ndarray):
        """
        base: np.ndarray, embedding matrix of shape:
            (num_items, hidden_size)
        """
        assert len(base.shape) == 2,\
            f"Base array has to be 2 dimentional, input is {len(base.shape)}"
        self.base = base
        self.base_norm = self.calc_base_norm(self.base)
        self.normed_base = self.base/self.base_norm[:, None]
        self.dim = self.base.shape[1]
        
    def __repr__(self):
        return f"[Consine Similarity Search] ({len(self)} items)"

    def __len__(self): return self.base.shape[0]

    @staticmethod
    def calc_base_norm(base: np.ndarray) -> np.ndarray:
        return np.sqrt(np.power(base, 2).sum(1))

    def search(self, vec: np.ndarray, return_similarity: bool = False):
        if return_similarity:
            similarity = (vec * self.normed_base /
                          (np.power(vec, 2).sum())).sum(1)
            order = similarity.argsort()[::-1]
            return order, similarity[order]
        return self(vec)

    def __call__(self, vec: np.ndarray) -> np.ndarray:
        """
        Return the order index of the closest vector to the furthest
        vec: an 1 dimentional vector, marks the closest index
            to the further ones
        """
        return (vec * self.normed_base).sum(1).argsort()[::-1]

## Search for similar vecs

Create random vector, assimuating ```500 items x 128 embedding hidden size```

In [6]:
base = np.random.rand(500,128)

In [7]:
cosine = CosineSearch(base)
cosine

[Consine Similarity Search] (500 items)

### Rank the distance to 6th item

In [8]:
cosine(base[5])

array([  5, 268, 327, 309, 365,  34, 388, 173, 415, 135, 151, 461, 307,
       275, 469, 384, 416,  60, 293, 236, 153, 493, 464, 402,  74, 383,
        15, 294,  95, 485, 103, 488, 156, 122, 283, 379, 321, 477, 300,
       348, 100, 381, 317, 209, 231, 182, 174, 457, 332, 314, 256, 326,
       251, 313,   9, 183, 270, 133,  70, 424, 227, 399, 234, 205, 487,
        84,  31, 232, 322, 428, 311,  67, 380,  19, 471, 255, 419, 224,
       413, 247, 328, 436, 367,  64, 385, 279, 344, 406, 306, 238, 357,
       335, 248, 249, 312, 169, 221, 124, 297, 427,  52, 346, 136, 288,
       120,  93, 250, 495,  22, 143, 273, 206, 149, 305, 159, 438, 218,
       343, 195,  24, 142,  50, 150, 199, 434, 465, 123,  69, 223,  79,
       291, 154,  73,  10, 222,  18,  76,  68, 213, 139, 489, 323, 286,
       241, 158, 106,  92,  37, 301, 408, 141, 272,   2, 207, 179,  32,
       395, 390, 290, 366, 121, 138, 181, 292, 377,  20, 134, 282,   7,
       497, 108, 244, 210, 146, 360, 304, 404, 467, 296, 498, 47

### Rank the distance to 10th item
> Returning the similarity value also

In [11]:
order, similarity = cosine.search(base[9], return_similarity=True)

pd.DataFrame({"order": order, "similarity": similarity}).head(10)

Unnamed: 0,order,similarity
0,9,0.146922
1,28,0.122393
2,436,0.122063
3,236,0.121338
4,135,0.120816
5,225,0.120507
6,113,0.120241
7,160,0.120201
8,446,0.11938
9,58,0.119282


## Embedding interpretation
> Interpreting pytorch embedding matrix by utilizing [tensorboard](https://www.tensorflow.org/tensorboard) in colab

In [9]:
# export
class InterpEmbeddings:
    """
    interp = InterpEmbeddings(embedding_matrix, vocab_dict)
    
    interp.search("computer")
    
    # visualize the embedding with tensorboard 
    interp.visualize_in_tb()
    """

    def __init__(
        self,
        embedding_matrix: np.ndarray,
        vocab: Dict[int, str]
    ):
        """
        embedding_matrix: np.ndarray, embedding matrix of shape:
            (num_items, hidden_size)
        """
        self.base = embedding_matrix
        self.cosine = CosineSearch(embedding_matrix)
        self.vocab = vocab
        self.c2i = dict((v, k) for k, v in vocab.items())

    def __repr__(self) -> str:
        cls = self.__class__.__name__
        return f"{cls} with\n\t{self.cosine}"

    def search(
        self,
        category: str,
        top_k: int = 20,
    ) -> pd.DataFrame:
        """
        search for similar words with embedding and vocabulary dictionary
        """
        token_id = self.c2i.get(category)
        if token_id is None:
            match_list = []
            for token_id, token in self.vocab.items():
                if category.lower() in str(token).lower():
                    match_list.append({"token": token, "token_id": token_id})
            if len(match_list)==0:
                raise KeyError(
                    f"[UnpackAI] category: {category} not in vocabulary")
            else:
                match_df = pd.DataFrame(match_list)
                DOM("Search with the following categories","h3")()
                display(match_df)
                token_ids = list(match_df.token_id)
        else:
            DOM(f"Search with token id {token_id}","h3")()
            token_ids = [token_id,]

        # combine multiple tokens into 1
        vec = self.base[token_ids].mean(0)

        # distance search
        closest, similarity = self.cosine.search(vec, return_similarity=True)
        
        closest = closest[:top_k]
        similarity = similarity[:top_k]
        tokens = list(self.vocab.get(idx) for idx in closest)
        return pd.DataFrame({
            "tokens": tokens,
            "idx": closest,
            "similarity": similarity})
    
    def visualize_in_tb(
        self,
        log_dir:str="./logs",
        selection: np.ndarray=None,
        first_k:int=500,
    ) -> None:
        """
        Visualize the embedding in tensorboard
        For now this function is only supported on colab
        """
        # since this won't be excute too many times within a notebook
        # in large chances... so to avoid missing library when import
        # other function under this module: we import related stuff here
        from torch.utils.tensorboard import SummaryWriter
        # this version's pd has vc for quick value counts
        from forgebox.imports import pd
        import tensorflow as tf
        import tensorboard as tb
        import os
        
        # possible tensorflow version error
        tf.io.gfile = tb.compat.tensorflow_stub.io.gfile
        os.system(f"rm -rf {log_dir}")
        writer = SummaryWriter(log_dir=log_dir,)
        self.i2c = dict((v,k) for k,v in self.c2i.items())  
        tokens = list(self.i2c.get(i) for i in range(len(self.i2c)))
        
        if selection is None:
            vecs = self.base[:first_k]
            tokens = tokens[:first_k]
        else:
            selection = np.array(selection).astype(dtype=np.int64)
            # select a pool of tokens for visualizaiton
            tokens = list(np.array(tokens)[selection][:first_k])
            vecs = self.base[selection][:first_k]
        writer.add_embedding(vecs, metadata=tokens,)
        # prompts for next step
        print(f"Please run the the following command in a cell")
        print("%load_ext tensorboard")
        print(f"%tensorboard  --logdir {log_dir}")

### Usage for other task
#### eg. recommender sys

Suppose we have 500 movies, that we do have learnt latent vectors on these movies

Given 1 movie,can we find the most similar ones

As we can create feature eg. "You watched <...>, You may also like..."

In [10]:
NUM_ITEMS = 500

# an embedding maxtrix, in shape of
movie_embedding = np.random.rand(NUM_ITEMS,42)

# a dictionary mapping index to string
vocab = dict((i, f"movie #{i}") for i in range(NUM_ITEMS,))

In [11]:
interp = InterpEmbeddings(movie_embedding, vocab=vocab)

In [14]:
interp.search("movie #22")

Unnamed: 0,tokens,idx,similarity
0,movie #22,22,0.262194
1,movie #295,295,0.227033
2,movie #108,108,0.225516
3,movie #0,0,0.223599
4,movie #144,144,0.222527
5,movie #232,232,0.222521
6,movie #498,498,0.222113
7,movie #381,381,0.220832
8,movie #220,220,0.219798
9,movie #305,305,0.218822
