In [1]:
import faiss
import numpy as np
import pandas as pd

In [2]:
from enum import Enum
class DatasetDirs(Enum):
    """Absolute dataset directory paths (all under /storage/brno6/home/...).

    Each member's ``.value`` is a plain path string; the notebook passes
    ``DatasetDirs.COPHIR_1M.value`` to ``LMI(...)``.

    NOTE(review): the member names suggest the dataset and its size
    (1M / 100k objects) -- confirm against the data on disk.
    """
    COPHIR_1M = "/storage/brno6/home/tslaninakova/learned-indexes/MTree1M"
    COPHIR_100k = "/storage/brno6/home/tslaninakova/learned-indexes/MTree100k"
    PROFI_1M = "/storage/brno6/home/tslaninakova/learned-indexes/MtreeProfi2000"
    
class DatasetDirsLocal(Enum):
    """Dataset directory paths given relative to the current working directory.

    Not referenced by any of the cells visible in this notebook.
    """
    COPHIR_100k = "./Mtree-Cophir-100k"

from LMI import LMI
# Point the LMI helper at the 1M directory and load the dataset as a DataFrame.
li = LMI(DatasetDirs.COPHIR_1M.value)
df = li.get_dataset()
# Preview the first two rows only; the full frame is far too large to display.
df.head(2)

10-03-21 00:10 INFO: Loaded dataset of shape: (1000000, 285)


Unnamed: 0,L1,L2,object_id,0,1,2,3,4,5,6,...,272,273,274,275,276,277,278,279,280,281
0,5,6,337981,-1.003102,-0.007772,-0.697076,-1.839537,0.510095,-1.341517,-0.388986,...,-2.167773,-0.687378,-0.158062,0.255001,0.200754,0.134813,0.200796,0.17399,0.06185,0.280142
1,5,6,549406,-0.91776,0.548654,-0.961553,-0.733066,-0.805921,-0.333378,-1.517734,...,-0.056616,-0.074816,-3.158292,0.255001,0.200754,0.134813,0.200796,0.17399,0.06185,-0.734546


## Faiss Demo
Source: Faiss documentation - https://github.com/facebookresearch/faiss/wiki/Getting-started 


### Steps:
1. Convert data to float32, keep only the descriptors, get the dimensions
2. Set up faiss to use GPUs
3. Set up an IVF index (inverted file with flat, uncompressed vector storage)
4. Train the index on the data and add the vectors to it
5. Query

In [3]:
import faiss
# 1.
# Cast every column except `object_id` to float32 and take `object_id`
# directly to int64. Routing the id through float32 first would silently
# corrupt it: float32 cannot represent every integer above 2**24 (16,777,216).
# `astype` returns a new frame, so re-running this cell is idempotent.
dtype_by_column = {column: np.float32 for column in df.columns}
dtype_by_column["object_id"] = np.int64
df = df.astype(dtype_by_column)

# Keep only the descriptor columns: everything except L1, L2 and the object id.
df_data = df.drop(["L1", "L2", "object_id"], axis=1).values
dimension = df_data.shape[1]  # number of descriptor components per object
n = df_data.shape[0]          # number of objects in the dataset

In [4]:
# Installation guidelines: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md
#!pip install faiss-gpu

In [5]:
# 2.
ngpus = faiss.get_num_gpus()
res = faiss.StandardGpuResources() # declaring a GPU resource, using all the available GPUs
print(f"Faiss will be using all {ngpus} gpu(s)")
cpu_index = faiss.IndexFlatL2(dimension) # creating an index

gpu_index = faiss.index_cpu_to_all_gpus(  # build the index
    cpu_index
)

AttributeError: module 'faiss' has no attribute 'get_num_gpus'

In [25]:
# 3.
nlist = 10
ivf_index = faiss.IndexIVFFlat(gpu_index, dimension, nlist, faiss.METRIC_L2)

In [26]:
# 4.
db_vectors = np.ascontiguousarray(df_data)
ivf_index.train(db_vectors)
print(ivf_index.is_trained)
ivf_index.add(db_vectors)
print(ivf_index.ntotal)

True
1000000


In [30]:
# 5.
from utils import get_knn_objects
df_1k = li.get_sample_1k_objects(df)
df_1k[df_1k.columns] = df_1k[df_1k.columns].apply(np.float32)

gt_knns = li.get_knn_ground_truth()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [39]:
query_vector = np.ascontiguousarray([df_1k.iloc[0].drop(["L1", "L2", "object_id"])])
%time distances, indices = ivf_index.search(query_vector, 30)

CPU times: user 52.5 ms, sys: 0 ns, total: 52.5 ms
Wall time: 52.5 ms


In [41]:
# Show the 30 nearest neighbours of the single query: distances first, then the
# ids faiss assigned to the stored vectors (row positions in `db_vectors`).
distances, indices

(array([[  0.      , 104.310745, 104.76199 , 108.10971 , 111.381195,
         114.86546 , 115.27322 , 116.84767 , 117.91591 , 118.10533 ,
         118.13173 , 118.85106 , 121.0092  , 121.040474, 121.10208 ,
         121.11687 , 121.15764 , 121.579926, 122.750656, 123.054955,
         123.06973 , 123.14552 , 123.74521 , 123.83394 , 124.79977 ,
         124.85843 , 125.08534 , 125.22952 , 125.295135, 125.34945 ]],
       dtype=float32),
 array([[   172, 652359, 550957, 699843, 149304, 797342, 558417, 797348,
         797537, 551033, 991880, 149309, 841081, 663561, 551356, 924292,
         149315, 867333, 149317, 333231, 221325, 797444, 189864, 866823,
         995981, 961729, 149316, 558278, 797949, 798247]]))

## Faiss as k-means

### Training idea:
Starting from the root model:
1. Train K-Means (faiss) to cluster the data; the cluster assignments serve as the labels
2. Train a supervised algorithm on the same data + labels -> predictions
3. Divide the data according to the predictions



In [43]:
import faiss
import numpy as np


class FaissKMeans:
    """Minimal scikit-learn-style wrapper around ``faiss.Kmeans``.

    Parameters
    ----------
    n_clusters : int, default 8
        Number of centroids; forwarded to faiss as ``k``.
    n_init : int, default 10
        Forwarded to faiss as ``nredo``.
    max_iter : int, default 300
        Forwarded to faiss as ``niter``.

    Attributes
    ----------
    kmeans : faiss.Kmeans or None
        The underlying faiss object; ``None`` until ``fit`` has been called.
    cluster_centers_ : array or None
        ``kmeans.centroids`` after fitting.
    inertia_ : float or None
        Last entry of ``kmeans.obj`` after fitting.
    """

    def __init__(self, n_clusters=8, n_init=10, max_iter=300):
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.max_iter = max_iter
        self.kmeans = None
        self.cluster_centers_ = None
        self.inertia_ = None

    @staticmethod
    def _as_faiss_input(X):
        """Return ``X`` as a C-contiguous 2-D float32 array.

        ``X.astype(np.float32)`` keeps the memory layout of its input, so a
        Fortran-ordered or strided array stayed non-contiguous, and anything
        that is not an ndarray (list, DataFrame) was rejected outright.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        vectors = np.ascontiguousarray(X, dtype=np.float32)
        if vectors.ndim != 2:
            raise ValueError(
                f"Expected a 2-D array of shape (n_samples, n_features), got ndim={vectors.ndim}"
            )
        return vectors

    def fit(self, X):
        """Train k-means on ``X`` and store the centroids and final objective.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vectors; converted to contiguous float32 before training.
        """
        vectors = self._as_faiss_input(X)
        self.kmeans = faiss.Kmeans(d=vectors.shape[1],
                                   k=self.n_clusters,
                                   niter=self.max_iter,
                                   nredo=self.n_init)
        self.kmeans.train(vectors)
        self.cluster_centers_ = self.kmeans.centroids
        self.inertia_ = self.kmeans.obj[-1]

    def predict(self, X):
        """Find the nearest centroid for every row of ``X``.

        Returns
        -------
        tuple
            The raw result of ``kmeans.index.search(X, 1)``: a pair
            ``(distances, indices)``, each with one column. Note that this is
            the search tuple, not a flat label array.

        Raises
        ------
        AttributeError
            If called before ``fit``. The type matches what the unguarded
            ``None.index`` access raised before; only the message is clearer.
        """
        if self.kmeans is None:
            raise AttributeError(
                "FaissKMeans is not fitted yet; call fit(X) before predict(X)."
            )
        return self.kmeans.index.search(self._as_faiss_input(X), 1)

In [44]:
# Default settings: 8 clusters, 10 restarts, up to 300 iterations.
fkm = FaissKMeans()
# Fit on the first 1000 database vectors only and time the call.
%time fkm.fit(db_vectors[:1000])

CPU times: user 6.57 s, sys: 5.62 ms, total: 6.57 s
Wall time: 6.58 s


In [48]:
# Preview the sampled query objects (every column was cast to float32 above).
df_1k.head()

Unnamed: 0,L1,L2,object_id,0,1,2,3,4,5,6,...,272,273,274,275,276,277,278,279,280,281
172,5.0,6.0,14274802.0,-0.661732,1.10508,0.228591,0.557815,0.510095,-0.535006,-2.521066,...,-0.056616,0.231465,0.091958,1.843996,0.200754,0.134813,0.200796,0.17399,0.06185,0.280142
212,5.0,6.0,15699747.0,0.277035,-0.749673,-0.4326,-0.917478,0.246892,0.271505,-2.144816,...,1.63231,1.150308,0.842015,-0.936745,-7.242367,0.134813,0.200796,0.17399,0.06185,0.280142
229,5.0,6.0,15930711.0,0.277035,1.476031,0.757544,-0.733066,-0.016311,-0.535006,-1.893983,...,2.054542,4.825678,3.092188,-0.936745,-4.451197,0.134813,0.200796,0.17399,0.06185,6.368273
567,5.0,6.0,24679440.0,0.191693,0.919605,-1.358267,0.00458,2.089313,-1.543144,-1.517734,...,2.899005,2.987993,-1.408157,-0.936745,-0.729636,0.134813,0.200796,0.17399,0.06185,0.280142
702,5.0,6.0,28327244.0,-1.003102,0.363179,1.022021,0.00458,-0.016311,0.67476,-1.266901,...,-0.056616,0.231465,-0.908119,-2.525739,0.200754,0.134813,0.200796,0.17399,0.06185,0.280142


In [49]:
query_vectors = np.ascontiguousarray(df_1k.iloc[:5].drop(["L1", "L2", "object_id"], axis=1))
%time preds = fkm.predict(query_vectors)

CPU times: user 56 µs, sys: 15 µs, total: 71 µs
Wall time: 73 µs


In [50]:
# `preds` is the raw (distances, indices) tuple returned by the centroid search;
# the second array holds the assigned cluster id for each of the five queries.
preds

(array([[ 97.45523],
        [184.4277 ],
        [396.6125 ],
        [206.40402],
        [179.11133]], dtype=float32),
 array([[1],
        [3],
        [3],
        [0],
        [2]]))