In [4]:
import faiss
import numpy as np
import pandas as pd

In [5]:
from enum import Enum
class DatasetDirs(Enum):
    """Absolute dataset directories, one member per dataset.

    A member's ``.value`` is the directory string; ``COPHIR_1M.value`` is the
    one handed to ``LMI(...)`` further down in this cell.
    """
    # NOTE(review): these paths point into one user's home directory on a
    # specific storage mount -- adjust them before running anywhere else.
    COPHIR_1M = "/storage/brno6/home/tslaninakova/learned-indexes/MTree1M"
    COPHIR_100k = "/storage/brno6/home/tslaninakova/learned-indexes/MTree100k"
    PROFI_1M = "/storage/brno6/home/tslaninakova/learned-indexes/MtreeProfi2000"
    
class DatasetDirsLocal(Enum):
    """Dataset directories given relative to the current working directory.

    Not referenced by any of the cells visible in this notebook.
    """
    COPHIR_100k = "./Mtree-Cophir-100k"

# `LMI` is a project-local class (its source is not part of this notebook);
# it is constructed with the directory of the dataset to work on.
from LMI import LMI
li = LMI(DatasetDirs.COPHIR_1M.value)
# Full dataset as returned by the wrapper; the log line printed by this cell
# reports its shape.
df_orig = li.get_dataset()

10-03-21 00:33 INFO: Loaded dataset of shape: (1000000, 285)


## Faiss Demo
Source: [Faiss documentation – Getting started](https://github.com/facebookresearch/faiss/wiki/Getting-started)


### Steps:
1. Convert data to float32, keep only the descriptors, get the dimensions
2. Set up faiss to use GPUs
3. Set up an efficient flat index
4. Train on the data
5. Query

1. Convert data to float32, keep only the descriptors, get the dimensions

## Faiss KMeans

In [6]:
import re

def get_knn_objects(path="./queries.data", should_be_int = True):
    """Collect the object ids listed in a query file.

    Each line of the file is searched for the token
    ``AbstractObjectKey <id>``, where ``<id>`` is a run of digits, ``-`` and
    ``_``. Lines without the token are skipped; when a line contains the
    token several times only the first occurrence is used.

    Parameters
    ----------
    path : str
        Location of the query file.
    should_be_int : bool
        When true, every id is converted with ``int()`` and the result is an
        ``int64`` array. When false, the ids are kept as strings.

    Returns
    -------
    numpy.ndarray
        One entry per matching line, in file order.

    Raises
    ------
    ValueError
        If ``should_be_int`` is true and a matched id is not a valid integer
        literal (for example ``"12-34"``).
    """
    # NOTE(review): on Python 3.6+ `int()` accepts single underscores between
    # digits, so an id such as "12_34" would be parsed as 1234 -- pass
    # should_be_int=False for datasets whose keys contain separators.
    key_pattern = re.compile(r"AbstractObjectKey ([\d\-_]+)")
    knn_object_ids = []
    with open(path) as f:
        # Stream the file line by line instead of materialising it with
        # readlines(); only the first key on a line is needed, so search()
        # is enough.
        for line in f:
            found = key_pattern.search(line)
            if found is None:
                continue
            key = found.group(1)
            knn_object_ids.append(int(key) if should_be_int else key)

    if should_be_int:
        return np.array(knn_object_ids, dtype=np.int64)

    return np.array(knn_object_ids)

def get_sample_1k_objects(df_res, queries_path="/storage/brno6/home/tslaninakova/learned-indexes/datasets/queries.data"):
    """Select the rows of ``df_res`` whose ``object_id`` appears in a query file.

    Parameters
    ----------
    df_res : pandas.DataFrame
        Frame with an ``object_id`` column.
    queries_path : str
        Query file parsed by ``get_knn_objects``. Defaults to the path that
        was previously hard-coded here, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df_res``, in their original order.
    """
    query_ids = get_knn_objects(path=queries_path)
    return df_res[df_res["object_id"].isin(query_ids)]

def unify_types(df):
    """Return a copy of ``df`` with uniform dtypes and ``object_id`` third.

    Every column except ``object_id`` is cast to ``float32``; ``object_id``
    is cast to ``int64`` and placed at position 2, i.e. right after the first
    two remaining columns (``L1`` and ``L2`` in this notebook's data). The
    input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``object_id`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the converted columns.
    """
    features = df.drop(columns=["object_id"]).astype(np.float32)

    ordered = features.columns.tolist()
    # list.insert clamps the position to the end of the list, so a frame with
    # fewer than two feature columns still gets `object_id` exactly once
    # (the previous slice-based reorder listed it twice in that case).
    ordered.insert(2, "object_id")

    unified = features.assign(object_id=df["object_id"].astype(np.int64))
    return unified[ordered]


In [7]:
# Normalise the dtypes, then strip the label/id columns so that only the
# descriptor values remain in the matrix.
df = unify_types(df_orig)
df_data = df.drop(columns=["L1", "L2", "object_id"]).values

# Rows are objects, columns are descriptor dimensions.
n, dimension = df_data.shape

print(dimension, n, sep="\n")

282
1000000


In [8]:
# NOTE(review): `faiss` is already imported in the first cell; this re-import
# is redundant.
import faiss

# Number of GPUs reported by Faiss; only used for the message printed below.
ngpus = faiss.get_num_gpus()
# NOTE(review): `res` is not passed to any call in the visible cells --
# confirm whether this resources object is still needed.
res = faiss.StandardGpuResources() # GPU resources handle
print(f"Faiss will be using all {ngpus} gpu(s)")
cpu_index = faiss.IndexFlatL2(dimension) # flat L2 index over `dimension`-dimensional vectors

gpu_index = faiss.index_cpu_to_all_gpus(  # move the flat index onto all available GPUs
    cpu_index
)

# Number of inverted lists (cells) used by the IVF index.
nlist = 10
# IVF index that uses `gpu_index` as its quantizer, with the L2 metric.
# NOTE(review): `faiss.IndexIVFFlat` itself looks like the CPU-side class, so
# presumably only the quantizer runs on the GPU -- confirm this is intended.
ivf_index = faiss.IndexIVFFlat(gpu_index, dimension, nlist, faiss.METRIC_L2)

Faiss will be using all 1 gpu(s)


In [9]:
# Make the descriptor matrix C-contiguous before handing it to Faiss.
db_vectors = np.ascontiguousarray(df_data)
# Train the IVF index on the full database and confirm it reports as trained.
ivf_index.train(db_vectors)
print(ivf_index.is_trained)
# Add every vector to the index; `ntotal` should then equal the row count.
ivf_index.add(db_vectors)
print(ivf_index.ntotal)

True
1000000


In [10]:
# Ground truth obtained from the LMI wrapper.
# NOTE(review): `gt_knns` is not used in any of the visible cells.
gt_knns = li.get_knn_ground_truth()
# Rows of `df` whose object_id is listed in the queries file.
df_1k = get_sample_1k_objects(df)
df_1k

Unnamed: 0,L1,L2,object_id,0,1,2,3,4,5,6,...,272,273,274,275,276,277,278,279,280,281
172,5.0,6.0,14274802,-0.661732,1.105080,0.228591,0.557815,0.510095,-0.535006,-2.521066,...,-0.056616,0.231465,0.091958,1.843996,0.200754,0.134813,0.200796,0.17399,0.061850,0.280142
212,5.0,6.0,15699747,0.277035,-0.749673,-0.432600,-0.917478,0.246892,0.271505,-2.144816,...,1.632310,1.150308,0.842015,-0.936745,-7.242367,0.134813,0.200796,0.17399,0.061850,0.280142
229,5.0,6.0,15930711,0.277035,1.476031,0.757544,-0.733066,-0.016311,-0.535006,-1.893983,...,2.054542,4.825678,3.092188,-0.936745,-4.451197,0.134813,0.200796,0.17399,0.061850,6.368273
304,5.0,6.0,17355051,0.021008,2.032456,-0.829314,-2.208360,-0.542718,-1.341517,-0.765236,...,-0.056616,0.537746,0.591996,0.255001,0.200754,0.134813,0.200796,0.17399,0.061850,0.280142
390,5.0,6.0,19729509,0.703748,-0.564198,0.360830,1.111050,-0.016311,0.271505,-1.893983,...,-0.056616,-0.993659,0.091958,-0.142247,0.200754,0.134813,0.200796,0.17399,0.061850,-0.227202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984021,18.0,15.0,49783351,-0.064335,1.105080,-0.035885,0.188992,-0.542718,0.876388,-1.768567,...,-0.056616,0.231465,-2.658253,-3.320236,0.200754,0.134813,0.200796,0.17399,0.061850,0.280142
986616,15.0,37.0,84478260,0.703748,1.476031,-1.490505,-2.208360,0.510095,-0.535006,0.112679,...,-0.056616,-0.381097,0.842015,0.255001,0.200754,0.134813,0.200796,0.17399,0.061850,2.309519
990031,18.0,17.0,99783052,0.362378,-1.862525,0.889782,2.217520,-1.858733,0.674760,-1.517734,...,0.787847,0.537746,-2.408234,-1.333993,0.200754,0.134813,0.200796,0.17399,0.061850,0.280142
998682,2.0,60.0,33336832,3.093338,-0.378722,-0.168123,1.295462,0.246892,0.876388,0.739761,...,2.054542,0.844027,0.591996,0.255001,0.200754,0.134813,0.200796,0.17399,-11.170002,0.280142


In [11]:
class FaissKMeans:
    """Thin wrapper around ``faiss.Kmeans`` with scikit-learn-like names.

    Parameters
    ----------
    n_clusters : int
        Number of centroids (passed to Faiss as ``k``).
    n_init : int
        Number of clustering restarts (passed to Faiss as ``nredo``).
    max_iter : int
        Iterations per run (passed to Faiss as ``niter``).
    gpu : bool
        Forwarded unchanged to ``faiss.Kmeans``.

    Attributes
    ----------
    kmeans
        The underlying ``faiss.Kmeans`` object, ``None`` until ``fit`` is called.
    cluster_centers_
        Centroids taken from the fitted Faiss object.
    inertia_
        Last entry of the objective history reported by Faiss.
    """

    # Columns holding labels / identifiers rather than descriptor values.
    _NON_FEATURE_COLUMNS = ["L1", "L2", "object_id"]

    def __init__(self, n_clusters=19, n_init=10, max_iter=300, gpu=True):
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.max_iter = max_iter
        self.kmeans = None
        self.cluster_centers_ = None
        self.inertia_ = None
        self.gpu = gpu

    @staticmethod
    def _as_faiss_input(X):
        """Return ``X`` as a C-contiguous ``float32`` array.

        ``X.astype(np.float32)`` keeps the memory layout of its input, so a
        Fortran-ordered matrix would stay Fortran-ordered; it also copies
        data that is already ``float32``. ``np.ascontiguousarray`` fixes the
        layout and only copies when a conversion is actually required.
        """
        return np.ascontiguousarray(X, dtype=np.float32)

    def fit(self, X):
        """Cluster the rows of ``X`` and store centroids and final objective."""
        self.kmeans = faiss.Kmeans(d=X.shape[1],
                                   k=self.n_clusters,
                                   niter=self.max_iter,
                                   nredo=self.n_init,
                                   gpu=self.gpu)
        self.kmeans.train(self._as_faiss_input(X))
        self.cluster_centers_ = self.kmeans.centroids
        self.inertia_ = self.kmeans.obj[-1]

    def predict(self, X):
        """Look up the nearest centroid for every row of ``X``.

        Returns
        -------
        tuple
            ``(distances, indices)`` exactly as returned by the Faiss index
            search with ``k=1``; both arrays have one column.
        """
        return self.kmeans.index.search(self._as_faiss_input(X), 1)

    def prepare(self, df):
        """Drop the non-descriptor columns and return a contiguous 2-D array.

        Accepts either a DataFrame (many rows) or a single row given as a
        Series; the Series case is wrapped so that the result still has two
        dimensions.
        """
        if len(df.shape) == 1:
            # Single row: the labels to drop live in the Series index.
            return np.ascontiguousarray([df.drop(self._NON_FEATURE_COLUMNS)])

        return np.ascontiguousarray(df.drop(self._NON_FEATURE_COLUMNS, axis=1))

In [12]:
# Cluster the whole database into 19 groups; `%time` reports how long the
# fit takes.
fkm = FaissKMeans(n_clusters = 19)
%time fkm.fit(db_vectors)

CPU times: user 5.52 s, sys: 472 ms, total: 6 s
Wall time: 6 s


In [13]:
# Look up the nearest centroid for every row of `df`; `indices` holds the
# cluster id per row and `distances` the matching distance.
query_vectors = fkm.prepare(df)
%time distances, indices = fkm.predict(query_vectors)

CPU times: user 457 ms, sys: 212 ms, total: 670 ms
Wall time: 671 ms


In [16]:
# Sanity check: number of cluster assignments returned by the search.
len(indices)

1000000

In [27]:
# Cluster assignments of the first 1000 rows (used as training labels below).
# NOTE(review): the saved output shows `1000`, which looks like it came from
# an earlier `len(indices[:1000])` -- re-run the cell to refresh it.
indices[:1000]

1000

In [None]:
#df["KMeans_L1_pred"] = fkm.predict(fkm.prepare(df))[1]

In [20]:
from sklearn.linear_model import LogisticRegression

# Classifier that will learn to reproduce the k-means cluster assignment.
# NOTE(review): the solver is capped at 10 iterations, so it may stop before
# converging -- confirm that this low limit is deliberate.
clf = LogisticRegression(max_iter = 10)

In [28]:
# Train on the descriptors of the first 1000 rows, using their k-means cluster
# ids as labels. `indices` has shape (n, 1); it is flattened here because
# scikit-learn expects 1-D labels and otherwise emits a DataConversionWarning.
clf.fit(df.iloc[:1000].drop(["L1", "L2", "object_id"], axis = 1), indices[:1000].ravel())

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
# Predicted cluster ids for the next 100 rows (1000-1099), which were not part
# of the training slice.
foo = clf.predict(df.iloc[1000:1100].drop(["L1", "L2", "object_id"], axis = 1))

In [42]:
# Show the classifier's predicted cluster ids for rows 1000-1099.
foo

array([14, 14, 14, 12, 12, 18, 12, 14, 10, 14, 18, 14, 12, 12, 12, 14, 14,
       14, 12, 14, 14, 18, 12, 12, 12, 12, 14, 12, 14, 12, 12, 12, 14, 12,
       12, 12,  9, 12, 12, 14, 12, 12, 12, 14, 14, 15, 14, 18, 14, 12, 12,
        9, 18, 14, 18, 12, 14, 14, 14, 14,  9, 14, 12, 14, 12, 14, 14, 14,
       14, 12, 14, 14,  9, 14, 14, 14, 14, 14, 14, 14, 14, 12, 12, 18, 14,
       14, 18,  9, 14, 14,  7, 14, 14,  7, 14, 14, 14, 14, 14, 14])

In [41]:
# K-means cluster ids for the same rows (1000-1099), for comparison with the
# predictions above.
indices[1000:1100]

array([[14],
       [14],
       [14],
       [14],
       [12],
       [18],
       [12],
       [14],
       [10],
       [14],
       [18],
       [11],
       [12],
       [12],
       [12],
       [14],
       [14],
       [14],
       [18],
       [12],
       [14],
       [18],
       [12],
       [12],
       [12],
       [12],
       [12],
       [12],
       [14],
       [12],
       [12],
       [12],
       [14],
       [12],
       [12],
       [12],
       [ 9],
       [12],
       [12],
       [14],
       [12],
       [12],
       [12],
       [14],
       [14],
       [15],
       [14],
       [18],
       [12],
       [12],
       [12],
       [12],
       [18],
       [14],
       [18],
       [12],
       [14],
       [14],
       [14],
       [14],
       [ 9],
       [14],
       [12],
       [14],
       [12],
       [14],
       [14],
       [14],
       [14],
       [14],
       [14],
       [14],
       [14],
       [14],
       [14],
       [14],
       [14],

In [46]:
# Count how many of the 100 predictions agree with the k-means assignment.
# `indices` has a single column, so column 0 is compared element-wise with the
# 1-D prediction array in one vectorised step instead of a Python loop that
# re-slices `indices` on every iteration.
ok = int(np.sum(foo == indices[1000:1100, 0]))

In [47]:
# Number of matching predictions out of the 100 evaluated rows.
ok

89