In [1]:
import datasets
import faiss
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset previously saved under the relative path "vacancies_dataset".
dataset = datasets.load_from_disk("vacancies_dataset")
# Bare expression: display the dataset summary (features and row count).
dataset

Dataset({
    features: ['vacancy_id', 'embedding', '__index_level_0__'],
    num_rows: 2734129
})

In [3]:
# Switch the dataset's output format to NumPy; the column reads in the next
# cells display NumPy arrays. The call is not assigned, so `dataset` itself is
# reconfigured in place.
dataset.set_format("numpy")

In [4]:
# Feature matrix: the "embedding" column (displayed below as a float32 array).
X = dataset["embedding"]
X

array([[ 1.6863085e-01,  3.5776415e-01, -3.6538461e-01, ...,
         1.0711843e-03,  9.5793498e-01, -1.8816636e+00],
       [ 4.9633396e-01, -7.8782260e-01, -9.6153843e-01, ...,
        -3.9433387e-01,  9.6770871e-01, -1.7618304e+00],
       [ 4.2058936e-01,  7.7470410e-01,  5.0000000e-01, ...,
         5.4468989e-02,  7.6047087e-01, -1.9440467e+00],
       ...,
       [ 5.0130326e-01, -4.9921575e-01, -2.1153846e-01, ...,
        -6.6612816e-01,  5.4262793e-01, -1.5966307e+00],
       [ 7.6575321e-01, -2.9516611e-02,  1.5384616e-01, ...,
        -7.3348075e-02,  7.1282929e-01, -2.0943880e+00],
       [-4.7516590e-01, -5.1803792e-01,  7.5000000e-01, ...,
        -2.6423350e-02,  5.4217106e-01, -2.0552166e+00]], dtype=float32)

In [5]:
# Labels: the "vacancy_id" column (displayed below as an array of strings).
y = dataset["vacancy_id"]
y

array(['v_862116', 'v_288642', 'v_1840054', ..., 'v_639897', 'v_1636531',
       'v_1689739'], dtype='<U9')

In [6]:
class FaissKNeighbors:
    """Nearest-neighbour label lookup backed by a FAISS flat L2 index on GPU.

    ``fit`` stores the training vectors in a ``faiss.IndexFlatL2`` that is
    moved to the GPUs via ``faiss.index_cpu_to_all_gpus``; ``predict`` returns
    the labels of the ``k`` nearest stored vectors for every query row.

    Note that no majority vote is performed: ``predict`` returns one label per
    neighbour, i.e. an array of shape ``(n_queries, k)``.

    Attributes:
        index: the FAISS index, or ``None`` until ``fit`` has been called.
        y: NumPy array of labels aligned with the indexed vectors, or ``None``.
        k: number of neighbours retrieved per query.
        res: a ``faiss.StandardGpuResources`` handle created at construction.
    """

    def __init__(self, k=1):
        """Create an unfitted model that will look up ``k`` neighbours."""
        self.index = None
        self.y = None
        self.k = k
        # Not referenced by fit()/predict() in this class; kept so that any
        # external code reading `.res` keeps working.
        self.res = faiss.StandardGpuResources()

    def fit(self, X, y):
        """Index the rows of ``X`` and remember their labels ``y``.

        Args:
            X: 2-D array-like of shape ``(n_samples, n_features)``.
            y: 1-D array-like of ``n_samples`` labels, row-aligned with ``X``.

        Raises:
            ValueError: if ``X`` is not 2-D or ``X`` and ``y`` differ in length.
        """
        # ascontiguousarray is a no-op for data that is already C-contiguous
        # float32 (avoiding a full copy of a large matrix) and fixes the layout
        # otherwise; `astype` would always copy and could keep Fortran order.
        vectors = np.ascontiguousarray(X, dtype=np.float32)
        labels = np.asarray(y)  # lists must become arrays for fancy indexing
        if vectors.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got ndim={vectors.ndim}"
            )
        if labels.shape[0] != vectors.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: "
                f"{vectors.shape[0]} != {labels.shape[0]}"
            )

        cpu_index = faiss.IndexFlatL2(vectors.shape[1])
        self.index = faiss.index_cpu_to_all_gpus(cpu_index)
        self.index.add(vectors)
        self.y = labels

    def predict(self, X):
        """Return the labels of the ``k`` nearest indexed vectors per query.

        Args:
            X: 2-D array-like of query vectors, ``(n_queries, n_features)``.

        Returns:
            Array of shape ``(n_queries, k)`` holding the neighbours' labels.

        Raises:
            RuntimeError: if called before ``fit``.
            ValueError: if the index returned fewer than ``k`` neighbours.
        """
        if self.index is None or self.y is None:
            raise RuntimeError("FaissKNeighbors.predict() called before fit()")

        queries = np.ascontiguousarray(X, dtype=np.float32)
        _, indices = self.index.search(queries, k=self.k)
        # FAISS pads missing neighbours with -1; indexing labels with -1 would
        # silently return the *last* label instead of failing.
        if (indices < 0).any():
            raise ValueError(
                f"index returned fewer than k={self.k} neighbours for some queries"
            )
        return self.y[indices]

In [7]:
# Instantiate with the default k=1, i.e. single nearest-neighbour lookup.
faiss_model = FaissKNeighbors()

In [8]:
# Index every embedding in X together with its vacancy_id label.
faiss_model.fit(X, y)

In [9]:
# Pick up to 1,000,000 distinct row positions for evaluation. A seeded
# generator keeps the subset (and therefore the metrics computed from it)
# reproducible across Restart & Run All.
rng = np.random.default_rng(42)
idx = rng.permutation(X.shape[0])[:1_000_000]

In [16]:
# Show the feature matrix dimensions (rows, embedding size).
# NOTE(review): this cell's execution count (16) is out of sequence with its
# neighbours, so it was run after the later cells; re-run the notebook in order.
X.shape

(2734129, 797)

In [10]:
# Evaluation subset selected by the sampled row positions.
# NOTE(review): these rows come from the same X / y that faiss_model.fit()
# indexed, so every query vector is itself present in the index; the scores
# below are not a held-out estimate.
X_test = X[idx]
y_test = y[idx]

In [11]:
# Nearest-neighbour labels for each test row; with k=1 the result has one
# column per row (see the displayed output).
y_pred = faiss_model.predict(X_test)
y_pred

array([['v_2645381'],
       ['v_644091'],
       ['v_1755298'],
       ...,
       ['v_1829184'],
       ['v_472539'],
       ['v_585295']], dtype='<U9')

In [12]:
# Macro-averaged F1 of predicted vs. true vacancy ids.
f1_score(y_test, y_pred, average="macro")

0.9367124311080318

In [13]:
# Macro-averaged recall of predicted vs. true vacancy ids.
# NOTE(review): the captured output shows a `_warn_prf` warning fragment;
# presumably recall is undefined for some labels - confirm before relying on it.
recall_score(y_test, y_pred, average="macro")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.9408256853787818

In [14]:
# Macro-averaged precision of predicted vs. true vacancy ids.
# NOTE(review): the captured output shows a `_warn_prf` warning fragment;
# presumably precision is undefined for some labels - confirm before relying on it.
precision_score(y_test, y_pred, average="macro")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.9347756354886289