# Cosine Similarirty
> Easy consine similarity search, search similar features among vectors is a frequently encountered situation

> Most of the code is from Ray's other library ```forgebox.cosine```

In [1]:
# default_exp cosine

In [4]:
# export
import numpy as np
import pandas as pd

In [5]:
# export
class CosineSearch:
    """
    Build a index search on cosine distance
    cos = CosineSearch(base_array)
    idx_order = cos(vec)
    """

    def __init__(self, base: np.ndarray):
        """
        base: np.ndarray, embedding matrix of shape:
            (num_items, hidden_size)
        """
        assert len(base.shape) == 2,\
            f"Base array has to be 2 dimentional, input is {len(base.shape)}"
        self.base = base
        self.base_norm = self.calc_base_norm(self.base)
        self.normed_base = self.base/self.base_norm[:, None]
        self.dim = self.base.shape[1]
        
    def __repr__(self):
        return f"[Consine Similarity Search] ({len(self)} items)"

    def __len__(self): return self.base.shape[0]

    @staticmethod
    def calc_base_norm(base: np.ndarray) -> np.ndarray:
        return np.sqrt(np.power(base, 2).sum(1))

    def search(self, vec: np.ndarray, return_similarity: bool = False):
        if return_similarity:
            similarity = (vec * self.normed_base /
                          (np.power(vec, 2).sum())).sum(1)
            order = similarity.argsort()[::-1]
            return order, similarity[order]
        return self(vec)

    def __call__(self, vec: np.ndarray) -> np.ndarray:
        """
        Return the order index of the closest vector to the furthest
        vec: an 1 dimentional vector, marks the closest index
            to the further ones
        """
        return (vec * self.normed_base).sum(1).argsort()[::-1]

## Search for similar vecs

Create random vector, assimuating ```500 items x 128 embedding hidden size```

In [6]:
base = np.random.rand(500,128)

In [7]:
cosine = CosineSearch(base)
cosine

[Consine Similarity Search] (500 items)

### Rank the distance to 6th item

In [8]:
cosine(base[5])

array([  5, 268, 327, 309, 365,  34, 388, 173, 415, 135, 151, 461, 307,
       275, 469, 384, 416,  60, 293, 236, 153, 493, 464, 402,  74, 383,
        15, 294,  95, 485, 103, 488, 156, 122, 283, 379, 321, 477, 300,
       348, 100, 381, 317, 209, 231, 182, 174, 457, 332, 314, 256, 326,
       251, 313,   9, 183, 270, 133,  70, 424, 227, 399, 234, 205, 487,
        84,  31, 232, 322, 428, 311,  67, 380,  19, 471, 255, 419, 224,
       413, 247, 328, 436, 367,  64, 385, 279, 344, 406, 306, 238, 357,
       335, 248, 249, 312, 169, 221, 124, 297, 427,  52, 346, 136, 288,
       120,  93, 250, 495,  22, 143, 273, 206, 149, 305, 159, 438, 218,
       343, 195,  24, 142,  50, 150, 199, 434, 465, 123,  69, 223,  79,
       291, 154,  73,  10, 222,  18,  76,  68, 213, 139, 489, 323, 286,
       241, 158, 106,  92,  37, 301, 408, 141, 272,   2, 207, 179,  32,
       395, 390, 290, 366, 121, 138, 181, 292, 377,  20, 134, 282,   7,
       497, 108, 244, 210, 146, 360, 304, 404, 467, 296, 498, 47

### Rank the distance to 10th item
> Returning the similarity value also

In [11]:
order, similarity = cosine.search(base[9], return_similarity=True)

pd.DataFrame({"order": order, "similarity": similarity}).head(10)

Unnamed: 0,order,similarity
0,9,0.146922
1,28,0.122393
2,436,0.122063
3,236,0.121338
4,135,0.120816
5,225,0.120507
6,113,0.120241
7,160,0.120201
8,446,0.11938
9,58,0.119282
