BUG:
- omp_set_num_threads() doesn't work with local install of swigfaiss

You can `pip install faiss-cpu` to get the release version.   
But you can also install a local FAISS build.   
For example, the build-release version from the crack-ivf project with my custom methods:

If you are in the `faiss-dev-env` environment but not inside the following project:
`/home/vmageirakos/projects/crack-ivf`

then you need to navigate to that directory first, and:
- you may need to `pip uninstall faiss`

Make sure you've compiled swigfaiss with your changes:
``` bash
make -C build-release -j swigfaiss
cd build-release/faiss/python
python setup.py install
```

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# InteractiveShell.ast_node_interactivity = "last"

%load_ext autoreload
%autoreload 2

In [2]:
import faiss
print(faiss.__file__)

/home/vmageirakos/enviroments/miniforge3/envs/faiss-dev-env/lib/python3.12/site-packages/faiss-1.9.0-py3.12.egg/faiss/__init__.py


In [3]:
import faiss
import numpy as np
import time

In [4]:
def train_ivfflat(
    data,
    nlist=10,
    km_n_iter=10,
    km_max_pts=256,
    seed=1,
    nredo=1,
    verbose=True,
    store_dir=None,
    metric='euclidean',
    store=False,
    train_pts=None,
):
    """Build, train and populate a FAISS ``IndexIVFFlat`` over ``data``.

    Args:
        data: 2-D array of vectors (one per row) that is added to the index.
            It is also used as the k-means training set when ``train_pts``
            is ``None``. Presumably float32, as FAISS expects -- confirm.
        nlist: Number of inverted lists (k-means centroids).
        km_n_iter: Number of k-means iterations (``index.cp.niter``).
        km_max_pts: Value for ``index.cp.max_points_per_centroid``.
        seed: Seed for the k-means clustering (``index.cp.seed``).
        nredo: Value for ``index.cp.nredo``.
        verbose: Forwarded to ``index.verbose``.
        store_dir: Directory the index file is written to. Nothing is
            written unless ``store`` is also true.
        metric: ``"euclidean"`` (``METRIC_L2``) or ``"angular"``
            (``METRIC_INNER_PRODUCT``).
        store: When true and ``store_dir`` is set, write the index to disk.
        train_pts: Optional separate training set; ``data`` is used when
            this is ``None``.

    Returns:
        Tuple ``(index, train_seconds, add_seconds)`` where both timings are
        ``time.perf_counter`` differences in seconds.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    # Reject unknown metrics up front; otherwise no index would be created
    # and the failure would surface later as an unrelated unbound-name error.
    if metric not in ("euclidean", "angular"):
        raise ValueError(
            f"unsupported metric {metric!r}; expected 'euclidean' or 'angular'"
        )

    print(f"Kmeans... {nlist=} {km_n_iter=} {km_max_pts=} {seed=} {nredo=}")
    _, d = data.shape

    # NOTE(review): the coarse quantizer is L2 even for "angular";
    # presumably inputs are expected to be L2-normalised -- confirm.
    quantizer = faiss.IndexFlatL2(d)
    if metric == "euclidean":
        metric_type = faiss.METRIC_L2
    else:
        metric_type = faiss.METRIC_INNER_PRODUCT
    index = faiss.IndexIVFFlat(quantizer, d, nlist, metric_type)

    # k-means (clustering) parameters used when training the quantizer.
    index.cp.seed = seed
    index.cp.niter = km_n_iter
    index.cp.max_points_per_centroid = km_max_pts
    index.cp.nredo = nredo
    index.verbose = verbose

    strain = time.perf_counter()
    index.train(data if train_pts is None else train_pts)
    etrain = time.perf_counter()

    sadd = time.perf_counter()
    index.add(data)
    eadd = time.perf_counter()

    train_time = etrain - strain
    add_time = eadd - sadd

    # Writing to disk needs both a target directory and the explicit flag.
    if store_dir is not None and store:
        filename = (
            store_dir
            + f"/index-n_iter_{km_n_iter}-nlist_{nlist}-max_pts_{km_max_pts}-seed_{seed}.index"
        )
        print("storing index", filename)
        faiss.write_index(index, filename)

    print(f"\t---> Index Train Time = {train_time*1000} ms | Add Time = {add_time*1000} ms <---")
    return index, train_time, add_time

def create_random_dataset_fast(d=16, nb=100, nq=1, seed=1234):
    """Generate a small random float32 base/query dataset.

    This is faster than calling SyntheticDataset from FAISS, which also
    returns ground truth etc.

    A private ``RandomState`` seeded with ``seed`` is used, so the global
    NumPy RNG state is left untouched. The generated values are the same as
    those produced after ``np.random.seed(seed)``.

    Args:
        d: Dimensionality of each vector.
        nb: Number of base vectors.
        nq: Number of query vectors.
        seed: Seed that makes the dataset reproducible.

    Returns:
        Tuple ``(xb, xq)`` of float32 arrays with shapes ``(nb, d)`` and
        ``(nq, d)``.
    """
    rng = np.random.RandomState(seed)  # local generator: reproducible, no global side effect

    xb = rng.random_sample((nb, d)).astype('float32')
    # Add a small ramp (row index / 1000) to the first coordinate.
    xb[:, 0] += np.arange(nb) / 1000.

    xq = rng.random_sample((nq, d)).astype('float32')
    xq[:, 0] += np.arange(nq) / 1000.

    print("dataset shape:")
    print(f"{xb.shape=}")
    print(f"{xq.shape=}")
    return xb, xq

In [10]:
# Tiny synthetic dataset: nb base vectors and nq query vectors of dimension d.
d = 5
nb = 100
nq = 2
seed = 42
xb, xq = create_random_dataset_fast(d, nb, nq, seed)

dataset shape:
xb.shape=(100, 5)
xq.shape=(2, 5)


In [11]:
# Settings for the IVF index / k-means training.
nlist = 2
# NOTE(review): the train_ivfflat call in this notebook passes km_n_iter=10
# literally, so this value is not picked up there -- confirm which is intended.
n_iter = 0
max_pts = 256
seed = 42 
# NOTE(review): not referenced by the visible training call, which passes store_dir=None.
result_dir= None
metric='euclidean'

In [12]:
# Request a single OpenMP thread. See the BUG note at the top of this file:
# this call is reported not to work with a local install of swigfaiss.
faiss.omp_set_num_threads(1)

In [13]:
# Train and fill the IVF index on xb; also returns the train/add timings.
index, train_time, add_time = train_ivfflat(
    xb,
    nlist=nlist,
    km_n_iter=10,  # NOTE(review): hard-coded; the n_iter variable defined earlier is not used here
    km_max_pts=max_pts,
    seed=seed,
    store_dir=None,  # directory for the saved index; train_ivfflat also needs store=True to write it
    verbose=True,
    metric=metric,
)

Kmeans... nlist=2 km_n_iter=10 km_max_pts=256 seed=42 nredo=1
Training level-1 quantizer
Training level-1 quantizer on 100 vectors in 5D
Training IVF residual
IndexIVF: no residual training
	---> Index Train Time = 97.33075399708468 ms | Add Time = 0.23175900059868582 ms <---
IndexIVFFlat::add_core: added 100 / 100 vectors


In [14]:
index.nlist
index.invlists.nlist

2

2

Test if .add_empty_list() exists:

It exists, but you have to downcast (dynamic cast) from inverted list to array inverted list and access it

It does not update nlist of the index but it does update the nlist of the invlists...

In [15]:
# index.invlists does not expose add_empty_list() directly, so downcast it
# first and call the method on the result.
# presumably add_empty_list() is one of the custom crack-ivf additions -- confirm.
faiss.downcast_InvertedLists(index.invlists).add_empty_list()

In [16]:
index.nlist
index.invlists.nlist

2

3

In [11]:
dir(index.invlists)

['INVALID_CODE_SIZE',
 'SUBSET_TYPE_ELEMENT_RANGE',
 'SUBSET_TYPE_ID_MOD',
 'SUBSET_TYPE_ID_RANGE',
 'SUBSET_TYPE_INVLIST',
 'SUBSET_TYPE_INVLIST_FRACTION',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__swig_destroy__',
 '__weakref__',
 'add_entries',
 'add_entry',
 'code_size',
 'compute_ntotal',
 'copy_subset_to',
 'get_codes',
 'get_ids',
 'get_iterator',
 'get_single_code',
 'get_single_id',
 'imbalance_factor',
 'is_empty',
 'list_size',
 'merge_from',
 'nlist',
 'prefetch_lists',
 'print_stats',
 'release_codes',
 'release_ids',
 'reset',
 'resize',
 'this',
 'thisown',
 'update_entries',
 'update_entry',
 'use_iterator']

As you can see, the points are still in the first two lists, and the third one is empty.

In [12]:
# Sizes of the three inverted lists (index 2 is the newly appended one).
index.invlists.list_size(0)
index.invlists.list_size(1)
index.invlists.list_size(2)
# index.invlists.list_size(3) # will throw kernel error & out of bounds
# Emptiness check for the same three lists.
index.invlists.is_empty(0)
index.invlists.is_empty(1)
index.invlists.is_empty(2)

72

28

0

False

False

True

Probably cracking works now 