**ANN algorithms**

1. LSH: Locality-sensitive hashing
2. Exhaustive Search
3. Product Quantization
4. Trees and Graphs
5. HNSW: Hierarchical Navigable Small World: https://colab.research.google.com/drive/1iOOim6-l0xvK1hmZp5Yg6qA9_CYf3voh?authuser=1#scrollTo=PbfJSeMNFolO

**Implementing the above using Faiss (and Annoy for the tree-based index)**

In [None]:
!pip install faiss



In [None]:
!pip3 install faiss
!sudo apt-get install libopenblas-dev
!sudo apt-get install libomp-dev

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libopenblas-dev is already the newest version (0.2.20+ds-4).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libomp-dev is already the newest version (5.0.1-1).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [None]:
import faiss
import pickle
import pandas as pd


In [None]:
def data_set(path='movies.pickle'):
    """Load the pickled movie-embedding dataset from disk.

    Args:
        path: Location of the pickle file. Defaults to ``'movies.pickle'``
            in the current working directory, which is the file this
            function always read before the parameter existed.

    Returns:
        The object stored in the pickle file. The rest of this notebook
        indexes it with the keys ``"name"`` and ``"vector"``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # NOTE: unpickling can execute arbitrary code, so only point this at
    # files that come from a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)

# Load the dataset once and keep short aliases for its two entries.
data = data_set()
vectors, names = data["vector"], data["name"]
data  # last expression of the cell, so the notebook echoes the whole dict

{'name': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object),
 'vector': array([[-0.01780608, -0.14265831,  0.10308606, ...,  0.09659795,
         -0.17529577, -0.03061521],
        [-0.03357764,  0.16418771,  0.21801303, ...,  0.16502103,
         -0.09166156,  0.05047869],
        [-0.2761452 , -0.01991325, -0.04969981, ...,  0.0258275 ,
         -0.08328608, -0.0152858 ],
        ...,
        [ 0.05142734, -0.01683608, -0.20441587, ...,  0.00045828,
          0.14679626,  0.2462584 ],
        [ 0.04491899, -0.02819411, -0.09472758, ..., -0.02152078,
          0.16223577,  0.19897607],
        [ 0.02531924,  0.03099714,  0.06437534, ..., -0.07260127,
          0.0467432 ,  0.07893164]], dtype=float32)}

**LSH**


In [None]:
class LSHIndex():
    """Nearest-neighbour lookup backed by ``faiss.IndexLSH``.

    Call ``build()`` after construction and before ``query()``; the FAISS
    index object does not exist until then.
    """

    def __init__(self, vectors, labels):
        """Store the data to index.

        Args:
            vectors: 2-D array-like exposing ``.shape`` and ``.astype``; one
                row per item. A float32 copy is kept for FAISS.
            labels: Sequence indexed by row number, giving the label that
                ``query()`` reports for each row of ``vectors``.
        """
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels

    def build(self, num_bits=8):
        """Create the LSH index and add every stored vector to it.

        Args:
            num_bits: Second argument passed to ``faiss.IndexLSH``
                (the number of hash bits per vector).
        """
        self.index = faiss.IndexLSH(self.dimension, num_bits)
        self.index.add(self.vectors)

    def query(self, vectors, k=10):
        """Return the labels of the nearest neighbours of the first query row.

        Args:
            vectors: 2-D array of query vectors. Only the results for the
                first row are returned.
            k: Maximum number of neighbours to return.

        Returns:
            A list of at most ``k`` labels, in the order FAISS ranks them.
        """
        _, indices = self.index.search(vectors, k)
        # FAISS pads the id array with -1 when it finds fewer than k
        # neighbours. Indexing labels with -1 would silently return the
        # *last* label, so those placeholder slots are skipped.
        return [self.labels[i] for i in indices[0] if i >= 0]

In [None]:
# Wrap the movie embeddings and titles in an LSH index, then build it
# with build()'s default settings.
index = LSHIndex(vectors=data["vector"], labels=data["name"])
index.build()

In [None]:
# Echo the embedding matrix (the notebook displays a cell's last expression).
data["vector"]

array([[-0.01780608, -0.14265831,  0.10308606, ...,  0.09659795,
        -0.17529577, -0.03061521],
       [-0.03357764,  0.16418771,  0.21801303, ...,  0.16502103,
        -0.09166156,  0.05047869],
       [-0.2761452 , -0.01991325, -0.04969981, ...,  0.0258275 ,
        -0.08328608, -0.0152858 ],
       ...,
       [ 0.05142734, -0.01683608, -0.20441587, ...,  0.00045828,
         0.14679626,  0.2462584 ],
       [ 0.04491899, -0.02819411, -0.09472758, ..., -0.02152078,
         0.16223577,  0.19897607],
       [ 0.02531924,  0.03099714,  0.06437534, ..., -0.07260127,
         0.0467432 ,  0.07893164]], dtype=float32)

In [None]:
# LSHIndex.query() only reports neighbours for the first query row, so
# search with just that row instead of running every row through the index.
index.query(data['vector'][:1])

['Supercop (1992)',
 'Rumble in the Bronx (1995)',
 'Mission: Impossible (1996)',
 'Four Rooms (1995)',
 'Donnie Brasco (1997)',
 'Cold Comfort Farm (1995)',
 'Toy Story (1995)',
 'Angels and Insects (1995)',
 'Twelve Monkeys (1995)',
 'Lone Star (1996)']

**Exhaustive Search**



In [None]:
class BruteForceIndex():
    """Exact nearest-neighbour lookup backed by ``faiss.IndexFlatL2``.

    The index is created and filled in the constructor, so the object is
    ready to query immediately.
    """

    def __init__(self, vectors, labels):
        """Build the flat L2 index over ``vectors``.

        Args:
            vectors: 2-D array-like exposing ``.shape`` and ``.astype``; one
                row per item. A float32 copy is kept and added to the index.
            labels: Sequence indexed by row number, giving the label that
                ``query()`` reports for each row of ``vectors``.
        """
        self.vectors = vectors.astype('float32')
        self.labels = labels
        self.index = faiss.IndexFlatL2(vectors.shape[1])
        self.index.add(self.vectors)

    def query(self, vectors, k=10):
        """Return the labels of the nearest neighbours of the first query row.

        Args:
            vectors: 2-D array of query vectors. Only the results for the
                first row are returned.
            k: Maximum number of neighbours to return.

        Returns:
            A list of at most ``k`` labels, in the order FAISS ranks them.
        """
        _, indices = self.index.search(vectors, k)
        # FAISS pads the id array with -1 when fewer than k neighbours exist
        # (e.g. k larger than the number of indexed rows). Indexing labels
        # with -1 would silently return the *last* label, so skip those.
        return [self.labels[i] for i in indices[0] if i >= 0]

In [None]:
# The exhaustive index is built inside the constructor, so it is ready to
# query as soon as this line has run.
index = BruteForceIndex(vectors=data["vector"], labels=data["name"])


In [None]:
# query() only reports neighbours for the first query row, so pass a
# one-row slice for movie 80 rather than rows 80..90.
movie_vector, movie_name = data['vector'][80:81], data['name'][80]
simlar_movies_names = '\n* '.join(index.query(movie_vector))
# The f prefix is required; without it the {placeholders} print literally.
print(f"The most similar movies to {movie_name} are:\n* {simlar_movies_names}")


The most similar movies to Hudsucker Proxy, The (1994) are:
* Hudsucker Proxy, The (1994)
* Bob Roberts (1992)
* Ed Wood (1994)
* Heathers (1989)
* This Is Spinal Tap (1984)
* Sirens (1994)
* In the Name of the Father (1993)
* Vanya on 42nd Street (1994)
* Quiz Show (1994)
* What's Eating Gilbert Grape (1993)


**Product Quantization**



In [None]:
class PQIndex():
    """Approximate nearest-neighbour lookup backed by ``faiss.IndexIVFPQ``.

    Call ``build()`` after construction and before ``query()``; the FAISS
    index object does not exist until then.
    """

    def __init__(self, vectors, labels):
        """Store the data to index.

        Args:
            vectors: 2-D array-like exposing ``.shape`` and ``.astype``; one
                row per item. A float32 copy is kept for FAISS.
            labels: Sequence indexed by row number, giving the label that
                ``query()`` reports for each row of ``vectors``.
        """
        # NOTE: the attribute keeps its original (misspelled) name so that
        # existing code reading ``.dimention`` continues to work.
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels

    def build(self, number_of_partition=8, search_in_x_partitions=2, subvector_size=8):
        """Create, train and fill the IVF-PQ index.

        The three parameters are forwarded positionally to
        ``faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)``. Their names are
        kept for backward compatibility, but two of them do not describe
        what FAISS does with the value:

        Args:
            number_of_partition: Passed as ``nlist``, the number of
                inverted-list partitions (coarse clusters).
            search_in_x_partitions: Passed as ``M``, the number of
                sub-quantizers each vector is split into. Despite the name
                it does not set how many partitions are searched at query
                time (FAISS controls that through ``index.nprobe``).
            subvector_size: Passed as ``nbits``, the number of bits used to
                encode each sub-vector.
        """
        quantizer = faiss.IndexFlatL2(self.dimention)
        self.index = faiss.IndexIVFPQ(quantizer,
                                      self.dimention,
                                      number_of_partition,
                                      search_in_x_partitions,
                                      subvector_size)
        # IVF-PQ has to learn its clusters and codebooks before vectors
        # can be added.
        self.index.train(self.vectors)
        self.index.add(self.vectors)

    def query(self, vectors, k=10):
        """Return the labels of the nearest neighbours of the first query row.

        Args:
            vectors: 2-D array of query vectors. Only the results for the
                first row are returned.
            k: Maximum number of neighbours to return.

        Returns:
            A list of at most ``k`` labels, in the order FAISS ranks them.
        """
        _, indices = self.index.search(vectors, k)
        # An IVF index only scans the partitions it probes, so it can come
        # back with fewer than k hits; FAISS fills the remaining slots with
        # -1. Indexing labels with -1 would silently return the *last*
        # label, so those placeholder slots are skipped.
        return [self.labels[i] for i in indices[0] if i >= 0]

In [None]:
# Wrap the movie embeddings and titles in a product-quantization index,
# then train and fill it with build()'s default settings.
index = PQIndex(vectors=data["vector"], labels=data["name"])
index.build()

In [None]:
movie_index = 80
# Slice (rather than index) so the query keeps its 2-D, one-row shape.
movie_vector = data['vector'][movie_index:movie_index+1]
# The f prefix is required; without it the {placeholder} prints literally.
print(f"The most simillar movies to {data['name'][movie_index]} are:")
index.query(movie_vector)

The most simillar movies to Hudsucker Proxy, The (1994) are:


['Hudsucker Proxy, The (1994)',
 'Bob Roberts (1992)',
 'Secret Garden, The (1993)',
 'Ed Wood (1994)',
 'Bullets Over Broadway (1994)',
 'Nikita (La Femme Nikita) (1990)',
 'Harold and Maude (1971)',
 'Sirens (1994)',
 "Microcosmos: Le peuple de l'herbe (1996)",
 'Fearless (1993)']

**Trees and Graphs**

In [None]:
!pip install annoy

Collecting annoy
  Downloading annoy-1.17.0.tar.gz (646 kB)
[?25l[K     |▌                               | 10 kB 29.6 MB/s eta 0:00:01[K     |█                               | 20 kB 33.8 MB/s eta 0:00:01[K     |█▌                              | 30 kB 37.8 MB/s eta 0:00:01[K     |██                              | 40 kB 37.6 MB/s eta 0:00:01[K     |██▌                             | 51 kB 36.9 MB/s eta 0:00:01[K     |███                             | 61 kB 37.1 MB/s eta 0:00:01[K     |███▌                            | 71 kB 38.8 MB/s eta 0:00:01[K     |████                            | 81 kB 39.0 MB/s eta 0:00:01[K     |████▋                           | 92 kB 40.6 MB/s eta 0:00:01[K     |█████                           | 102 kB 39.0 MB/s eta 0:00:01[K     |█████▋                          | 112 kB 39.0 MB/s eta 0:00:01[K     |██████                          | 122 kB 39.0 MB/s eta 0:00:01[K     |██████▋                         | 133 kB 39.0 MB/s eta 0:00:01[K   

In [None]:
import annoy
class AnnoyIndex():
    """Tree-based approximate nearest-neighbour lookup backed by Annoy.

    Call ``build()`` after construction and before ``query()``; the Annoy
    index object does not exist until then.
    """

    def __init__(self, vectors, labels):
        """Store the data to index.

        Args:
            vectors: 2-D array-like exposing ``.shape`` and ``.astype``; one
                row per item. A float32 copy is kept.
            labels: Sequence indexed by row number, giving the label that
                ``query()`` reports for each row of ``vectors``.
        """
        # NOTE: the attribute keeps its original (misspelled) name so that
        # existing code reading ``.dimention`` continues to work.
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels

    def build(self, number_of_trees=5, metric='angular'):
        """Create the Annoy index, add every stored vector and build the trees.

        Args:
            number_of_trees: Forwarded to Annoy's ``build()``.
            metric: Distance metric name forwarded to ``annoy.AnnoyIndex``.
                Defaults to ``'angular'``, the metric Annoy fell back to
                when none was given. Passing it explicitly avoids Annoy's
                deprecation warning about the implicit default.
        """
        self.index = annoy.AnnoyIndex(self.dimention, metric)
        for i, vec in enumerate(self.vectors):
            self.index.add_item(i, vec.tolist())
        self.index.build(number_of_trees)

    def query(self, vector, k=10):
        """Return the labels of the ``k`` nearest neighbours of ``vector``.

        Args:
            vector: A single query vector exposing ``.tolist()`` (unlike the
                FAISS-based classes in this file, this is one vector rather
                than a 2-D batch).
            k: Number of neighbours requested from Annoy.

        Returns:
            A list of labels in the order Annoy returns the neighbour ids.
        """
        indices = self.index.get_nns_by_vector(vector.tolist(), k)
        return [self.labels[i] for i in indices]

In [None]:
# Wrap the movie embeddings and titles in the Annoy-backed index, then
# build its trees with build()'s default settings.
index = AnnoyIndex(vectors=data["vector"], labels=data["name"])
index.build()

  # Remove the CWD from sys.path while we load stuff.


In [None]:
# Look up movie 70; the Annoy wrapper's query() takes a single 1-D vector,
# so the row is indexed directly rather than sliced.
movie_vector = data['vector'][70]
movie_name = data['name'][70]
simlar_movies_names = '\n* '.join(index.query(movie_vector))
print(f"The most similar movies to {movie_name} are:\n* {simlar_movies_names}")

The most similar movies to Lion King, The (1994) are:
* Lion King, The (1994)
* Aladdin (1992)
* Snow White and the Seven Dwarfs (1937)
* Beauty and the Beast (1991)
* Dumbo (1941)
* Cinderella (1950)
* Fantasia (1940)
* Sound of Music, The (1965)
* Pinocchio (1940)
* E.T. the Extra-Terrestrial (1982)
