In [1]:
import gc
import glob

import faiss
import numpy as np
import pandas as pd

In [2]:
%%time

name = "101_details"

npy_paths = [path for path in glob.glob(f"../preprocessed/{name}/000/*.npy") if "all" not in path]
npy_paths.sort()
embeddings = np.concatenate(
    [np.load(path) for path in npy_paths],
    axis=0,
)
gc.collect()
embeddings = embeddings.astype(np.float32, copy=False)
gc.collect()

all_path = f"../preprocessed/{name}/000/all.npy"
np.save(all_path, embeddings)

"""
embeddings = np.load(all_path)
"""

CPU times: user 18.3 s, sys: 25.8 s, total: 44.2 s
Wall time: 1min 31s


'\nembeddings = np.load(all_path)\n'

In [3]:
gc.collect()
print(embeddings.shape)

# Ensure float32; with copy=False this does not copy when the dtype already matches.
embeddings = embeddings.astype(np.float32, copy=False)
# Array size in GiB (2**30 bytes).
print(embeddings.nbytes / 2**30)

(14402689, 384)
20.603211879730225


In [4]:
# Collect the per-shard parquet files, skipping the merged "all.parquet" that
# this cell writes. The "all" test is applied to the file name only: matching
# against the whole path would drop every shard if `name` contained "all".
parquet_paths = sorted(
    path
    for path in glob.glob(f"../preprocessed/{name}/000/*.parquet")
    if "all" not in path.rsplit("/", 1)[-1]
)

# One frame per shard holding only the "id" column, tagged with the shard's
# file name; concatenated in sorted path order with a fresh 0..N-1 index.
parquet_df = pd.concat(
    [
        pd.read_parquet(path, columns=["id"]).assign(file=path.rsplit("/", 1)[-1])
        for path in parquet_paths
    ],
    ignore_index=True,
)

# Search results are row positions in `embeddings`; they are presumably mapped
# back to ids through `parquet_df`, which only works if both have the same
# number of rows. Fail here rather than write a misaligned lookup table.
assert len(parquet_df) == embeddings.shape[0], (
    f"id table has {len(parquet_df)} rows but embeddings has {embeddings.shape[0]}"
)

parquet_df.to_parquet(f"../preprocessed/{name}/000/all.parquet")

In [5]:
# Sanity check: (rows, columns) of the merged id table. The row count is
# expected to equal embeddings.shape[0] printed earlier.
parquet_df.shape

(14402689, 2)

In [6]:
# Load the "a.parquet" shard with all of its columns.
# NOTE(review): `a_df` is not referenced again in the visible notebook — confirm it is still needed.
a_df = pd.read_parquet(f"../preprocessed/{name}/000/a.parquet")

## IVFPQ

In [7]:
%%time

gc.collect()
dim = embeddings.shape[1]
nlist = 100
n_subquantizer = 64
n_bits = 8
quantizer = faiss.IndexFlatL2(dim)  # the other index
index = faiss.IndexIVFPQ(quantizer, dim, nlist, n_subquantizer, n_bits)

CPU times: user 112 ms, sys: 442 µs, total: 112 ms
Wall time: 109 ms


In [8]:
%%time
res = faiss.StandardGpuResources()  # use a single GPU
co = faiss.GpuClonerOptions()
co.useFloat16 = True
index = faiss.index_cpu_to_gpu(res, 0, index, co)

CPU times: user 345 ms, sys: 2.41 s, total: 2.75 s
Wall time: 2.72 s


In [9]:
%%time
assert not index.is_trained
index.train(
    embeddings,
)
assert index.is_trained

CPU times: user 1min 55s, sys: 803 ms, total: 1min 56s
Wall time: 15.9 s


In [10]:
%%time
# Add every embedding to the trained index. Ids are not passed explicitly, so
# results are expected to refer to row positions in `embeddings`.
index.add(embeddings)  # timed: see wall time in the cell output

CPU times: user 15.7 s, sys: 11.7 s, total: 27.4 s
Wall time: 27.2 s


### 実行時間と性能の簡単な確認

In [11]:
%%time
# Smoke test on the GPU index: 10 nearest neighbours for the first 6 rows,
# probing a single inverted list per query.
k = 10
index.nprobe = 1
D, I = index.search(embeddings[:6], k)  # D: distances, I: neighbour row indices
print(I[:6])

[[       0  7136160  2360485 12061259 12061253 11784258 12301836 12061400
  10109887  9862356]
 [       1        2  3462671  1378108  4980957  1367727  4980966  5141363
   4980958  1365167]
 [       2        1  1367727  8026582   776582  1368243  3076910  3098498
   5923100 14189452]
 [       3  6036590 12466612  9307671  9131174 12519668  3131718  2035262
   7426598  4161378]
 [       4  8104083 11874226  2938327  2730488 11047280  3410072  3848985
   5072513   111712]
 [       5 11744447  3118848 14110188  5080844  8464028 13531655  5817207
   2901626  7600922]]
CPU times: user 4 ms, sys: 11.9 ms, total: 15.9 ms
Wall time: 14.3 ms


In [12]:
%%time
# Same check with 10 probed lists per query, on the first 10 rows; compare the
# neighbour lists with the nprobe=1 run to see the effect of wider probing.
index.nprobe = 10
D, I = index.search(embeddings[:10], k)  # D: distances, I: neighbour row indices
print(I[:10])

[[       0  2399446  1616515  8223953  4873450   836167  7136160 10110068
   9860840  2360485]
 [       1        2  3462671  1378108   134094  4980957  1367727  4980966
   5141363  4980958]
 [       2        1  1367727  8026582   776582  1368243  3076910  3098498
   5923100 14189452]
 [       3  6036590  7279883  7279880 12466612  9307671  9057406  9131174
  12519668  3131718]
 [       4  8104083 11874226  2938327  2637086  2730488  1447962 11047280
   3410072  3848985]
 [       5        6   921020        7 14066914  1621016  2893662 11744447
   3118848  8018113]
 [       6        5    86339   921020 12762945        7  1575131 13957573
  12324362  8492551]
 [       7 10938950  8690291  8492376  8492332  8492321  8492452  8690505
  11974965  8688874]
 [       8  8492519  6995812  1565262  7034554  8689165   788913  8689182
  10651259  8492332]
 [       9 12324265       10  9909667 12325172  7359755  7227377 12957606
   7227378  8492525]]
CPU times: user 1.53 ms, sys: 8.5 ms, total: 10 m

In [13]:
# Show the ids of the first rows, presumably to read alongside the neighbour
# indices printed by the search cells (rows sharing an id appear as neighbours).
parquet_df.head(15)

Unnamed: 0,id,file
0,49495844,a.parquet
1,3579086,a.parquet
2,3579086,a.parquet
3,3579086,a.parquet
4,62397582,a.parquet
5,15547032,a.parquet
6,15547032,a.parquet
7,15547032,a.parquet
8,15547032,a.parquet
9,15547032,a.parquet


In [14]:
%%time
# Time a large batch of queries (first 10000 rows) on the GPU index.
index.nprobe = 10  # probe 10 inverted lists per query
D, I = index.search(embeddings[:10000], k)  # D: distances, I: neighbour row indices
print(I)

[[       0  2399446  1616515 ... 10110068  9860840  2360485]
 [       1        2  3462671 ...  4980966  5141363  4980958]
 [       2        1  1367727 ...  3098498  5923100 14189452]
 ...
 [    9997  2756256 10080710 ...     9998   960129  2756292]
 [    9998  2756256   720358 ...  2756302   720357   720361]
 [    9999  2756256  2756258 ...  2288245  2756302 10080712]]
CPU times: user 1.41 s, sys: 5.65 s, total: 7.06 s
Wall time: 7.06 s


In [15]:
%%time
# Copy the GPU index back to a CPU index (timed).
cpu_index = faiss.index_gpu_to_cpu(index)

CPU times: user 527 ms, sys: 257 ms, total: 784 ms
Wall time: 781 ms


In [16]:
%%time
cpu_index.nprobe = 1  # default nprobe is 1, try a few more
D, I = cpu_index.search(embeddings[: 10000 // 10], k)  # actual search
print(I)

[[       0  7136160  2360485 ... 12061400 10109887  9862356]
 [       1        2  3462671 ...  5141363  4980958  1365167]
 [       2        1  1367727 ...  3098498  5923100 14189452]
 ...
 [     997      991  3398729 ...      996  3398735  3398697]
 [     998  4442885  5083123 ...  6868668 13231187 13237271]
 [     999  7893775  7989197 ...  7723576  7899020  7989198]]
CPU times: user 19.2 s, sys: 1.94 ms, total: 19.2 s
Wall time: 1.28 s


In [17]:
# Take a fresh CPU copy of the GPU index for serialization. This replaces the
# earlier `cpu_index` object, whose nprobe was set to 1 in the CPU timing cell.
cpu_index = faiss.index_gpu_to_cpu(index)

# The file name encodes the build parameters; derive it from the variables used
# to construct the index so it cannot go stale when they are tuned
# (currently resolves to "ivfpq_100_64_8.index").
index_path = f"../preprocessed/{name}/000/ivfpq_{nlist}_{n_subquantizer}_{n_bits}.index"
faiss.write_index(cpu_index, index_path)

In [18]:
# Free memory: drop both index handles, turn off the GPU resource object's
# temporary memory, then run the garbage collector.
del index, cpu_index
res.noTempMemory()
gc.collect()

0