In [1]:
import gc
import glob

import faiss
import numpy as np
import pandas as pd

In [2]:
%%time
# Merge every per-shard embedding file into one float32 matrix and persist it.
# Shards are concatenated in sorted path order. Any path containing "all" is
# skipped, which keeps the merged all.npy written below out of the inputs.
# NOTE(review): the filter tests the whole path string, so a shard whose name
# happens to contain "all" would be skipped as well.
npy_paths = sorted(path for path in glob.glob("../preprocessed/100_embedding/000/*.npy") if "all" not in path)
if not npy_paths:
    # Fail early with a clear message instead of letting np.concatenate
    # choke on an empty list.
    raise FileNotFoundError("no embedding shards found in ../preprocessed/100_embedding/000/")
embeddings = np.concatenate(
    [np.load(path) for path in npy_paths],
    axis=0,
)
gc.collect()
# copy=False: no extra full-size copy is made when the data is already float32.
embeddings = embeddings.astype(np.float32, copy=False)
gc.collect()

all_path = "../preprocessed/100_embedding/000/all.npy"
np.save(all_path, embeddings)

# To skip the merge on a later run, reload the saved matrix instead:
# embeddings = np.load(all_path)

CPU times: user 27.2 s, sys: 26.1 s, total: 53.4 s
Wall time: 2min 8s


'\nembeddings = np.load(all_path)\n'

In [3]:
# Report the in-memory size (in GiB) and the shape of the embedding matrix.
gc.collect()
bytes_per_gib = 1024**3
print(embeddings.nbytes / bytes_per_gib)
print(embeddings.shape)

# Cast to float32 once more (copy=False) and print the size again for comparison.
embeddings = embeddings.astype(np.float32, copy=False)
print(embeddings.nbytes / bytes_per_gib)

22.87635898590088
(15991734, 384)
22.87635898590088


In [4]:
# Build one id table covering every shard. Only the "id" column is read, and
# each row is tagged with the name of the shard file it came from.
parquet_paths = sorted(
    path for path in glob.glob("../preprocessed/100_embedding/000/*.parquet") if "all" not in path
)

dfs = []
for path in parquet_paths:
    df = pd.read_parquet(path, columns=["id"]).assign(file=path.rsplit("/", 1)[-1])
    dfs.append(df)

# Stack the shards in sorted path order and renumber the rows from 0.
parquet_df = pd.concat(dfs, ignore_index=True)

parquet_df.to_parquet("../preprocessed/100_embedding/000/all.parquet")

In [5]:
# Vectors are later added to the index without explicit ids, so search results are
# row positions in `embeddings`. The id table therefore needs exactly one row per
# embedding for those positions to map back to ids.
assert len(parquet_df) == embeddings.shape[0], "id table and embedding matrix row counts differ"
parquet_df.shape

(15991734, 2)

In [6]:
# Load the a.parquet shard with all of its columns (no `columns=` filter is applied here).
# NOTE(review): a_df is not referenced by any later cell visible in this part of the notebook.
a_df = pd.read_parquet("../preprocessed/100_embedding/000/a.parquet")

## IVFPQ

In [7]:
%%time

gc.collect()

# IVFPQ hyper-parameters, passed positionally to faiss.IndexIVFPQ below.
dim = embeddings.shape[1]  # vector dimensionality, read from the data
nlist, n_subquantizer, n_bits = 100, 64, 8

# A flat L2 index is handed to the IVFPQ constructor as its quantizer.
quantizer = faiss.IndexFlatL2(dim)
index = faiss.IndexIVFPQ(quantizer, dim, nlist, n_subquantizer, n_bits)

CPU times: user 103 ms, sys: 0 ns, total: 103 ms
Wall time: 102 ms


In [8]:
%%time
# Clone the CPU index onto a single GPU, with float16 enabled in the cloner options.
res = faiss.StandardGpuResources()
gpu_device = 0
co = faiss.GpuClonerOptions()
co.useFloat16 = True
# `index` is rebound to the GPU copy, so this cell is not meant to be re-run on its own.
index = faiss.index_cpu_to_gpu(res, gpu_device, index, co)

CPU times: user 127 ms, sys: 2.55 s, total: 2.67 s
Wall time: 2.65 s


In [9]:
%%time
# Train on the full embedding matrix. The assertions check that the index starts
# untrained and that training actually flipped the flag.
assert not index.is_trained
index.train(embeddings)
assert index.is_trained

CPU times: user 52.5 s, sys: 0 ns, total: 52.5 s
Wall time: 15.2 s


In [10]:
%%time
# Insert every embedding row into the trained index; no explicit ids are passed.
index.add(embeddings)  # timed with %%time; took longer than training in the recorded run

CPU times: user 17.5 s, sys: 11.7 s, total: 29.2 s
Wall time: 29.1 s


### 実行時間と性能の簡単な確認

In [11]:
%%time
# Sanity check: use the first 6 stored vectors as queries and print the neighbour indices.
k = 10  # neighbours requested per query
index.nprobe = 1
queries = embeddings[:6]
D, I = index.search(queries, k)
print(I[:6])

[[       0  2664772  1794577  1794580  1794584  9080806  1794582  9265231
  11239431  3239991]
 [       1  5512648   212285        2 14232048  5497787 14231392  1514687
  14231170 14231163]
 [       2  1528834 12505410  1531294  1524962  5512648        1  3931942
   1517411 15622389]
 [       3 14648592   718660  5389434   868484  6615637  2258850 14399810
   2936709 12619090]
 [       4  5218866  6603728  4705914  4665079 15701430   449083  3694397
   6165892 12756823]
 [       5  5366892  1553758  2673413  9374517  6492238  4463322  8053988
   8971065  6217212]]
CPU times: user 1.78 ms, sys: 4.81 ms, total: 6.58 ms
Wall time: 3.9 ms


In [12]:
%%time
# Same check with nprobe raised to 10, on the first 10 stored vectors.
index.nprobe = 10
queries = embeddings[:10]
D, I = index.search(queries, k)
print(I[:10])

[[       0  2664772  1794577  1794580  1794584  9080806  1794582  9265231
  11239431  3239991]
 [       1  5512648   212285        2 14232048  5497787 14231392  1514687
  14383889 14231398]
 [       2  1528834 12505410   149575  1531294  1524962  5512648        1
   3931942 14231398]
 [       3  3939877 14648592   718660 12619088  5389434   868484  6615637
   2258850  6724068]
 [       4 15412221  5218866  6603728  4705914  4665079 15701430   449083
   3694397  6165892]
 [       5  5366892  1553758  2673413  9374517  6492238  4463322  8053988
   8971065  6217212]
 [       6 15651448  1606433  1374536  1643661  8945823  1935421   609982
  14537710  3578084]
 [       7        8        9       10  1053456  4544113  4537453 13810074
   4950121  3933272]
 [       8  3468295  5617516 11228705 15620037 13782158 15620038 13466617
  15503259  8131415]
 [       9 14901787 13701537 13308628  9917590  4457404  2224673  4920265
  15746954 13064564]]
CPU times: user 2.85 ms, sys: 8.11 ms, total: 11 

In [13]:
# Show the ids and source files of the first rows, to help interpret the
# row indices printed by the searches above.
parquet_df.head(15)

Unnamed: 0,id,file
0,49495844,a.parquet
1,3579086,a.parquet
2,3579086,a.parquet
3,3579086,a.parquet
4,3579086,a.parquet
5,3579086,a.parquet
6,62397582,a.parquet
7,15547032,a.parquet
8,15547032,a.parquet
9,15547032,a.parquet


In [14]:
%%time
# Time a bulk search: the first 10,000 stored vectors are used as queries.
index.nprobe = 10  # raised from the value of 1 used in the first check
bulk_queries = embeddings[:10000]
D, I = index.search(bulk_queries, k)
print(I)

[[       0  2664772  1794577 ...  9265231 11239431  3239991]
 [       1  5512648   212285 ...  1514687 14383889 14231398]
 [       2  1528834 12505410 ...        1  3931942 14231398]
 ...
 [    9997 14284738  8200788 ... 13596032  2814030  5560265]
 [    9998 14262067     9999 ...  1611181 14361650 14272446]
 [    9999  5589678     9996 ... 14419966 14317346 14048873]]
CPU times: user 1.08 s, sys: 5.44 s, total: 6.52 s
Wall time: 6.52 s


In [15]:
%%time
# Copy the GPU index back to a CPU index so CPU search speed can be timed next.
cpu_index = faiss.index_gpu_to_cpu(index)

CPU times: user 613 ms, sys: 315 ms, total: 928 ms
Wall time: 925 ms


In [16]:
%%time
# CPU timing with nprobe = 1 on 1,000 queries, a batch 10x smaller than the GPU bulk run.
# The GPU bulk run used nprobe = 10, so the two timings are not directly comparable.
cpu_index.nprobe = 1
cpu_queries = embeddings[:1000]
D, I = cpu_index.search(cpu_queries, k)
print(I)

[[       0  2664772  1794577 ...  9265231 11239431  3239991]
 [       1  5512648   212285 ...  1514687 14231170 14231163]
 [       2  1528834 12505410 ...  3931942  1517411 15622389]
 ...
 [     997 15563287 14386251 ... 11765040 14396443 14386253]
 [     998 13594819  8134431 ...  5955552  3125639  3362304]
 [     999  7294470 12081759 ...  5197967 13594449 14352346]]
CPU times: user 15.3 s, sys: 0 ns, total: 15.3 s
Wall time: 2.09 s


In [17]:
# Take a CPU copy of the GPU index and write it to disk.
# NOTE(review): a CPU copy was already made in an earlier cell. It is rebuilt here, so
# the file is written from a fresh conversion and not from that earlier copy.
cpu_index = faiss.index_gpu_to_cpu(index)
# Build the file name from the hyper-parameters the index was constructed with, so the
# name cannot drift from the actual settings. With nlist=100, n_subquantizer=64 and
# n_bits=8 this is ivfpq_100_64_8.index, as before.
index_path = "../preprocessed/100_embedding/000/ivfpq_{}_{}_{}.index".format(nlist, n_subquantizer, n_bits)
faiss.write_index(cpu_index, index_path)

In [18]:
# Free memory: drop both index handles, then run a garbage-collection pass.
del index, cpu_index
# NOTE(review): presumably disables the temporary GPU memory reservation; confirm in the faiss docs.
res.noTempMemory()
gc.collect()

0