In [11]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-5.2.2-py3-none-any.whl (494 kB)
     ---------------------------------------- 0.0/494.1 kB ? eta -:--:--
     --------------- ---------------------- 204.8/494.1 kB 4.1 MB/s eta 0:00:01
     -------------------------------------- 494.1/494.1 kB 5.1 MB/s eta 0:00:00
Collecting torch>=1.11.0
  Downloading torch-2.10.0-cp310-cp310-win_amd64.whl (113.7 MB)
     ---------------------------------------- 0.0/113.7 MB ? eta -:--:--
     --------------------------------------- 1.5/113.7 MB 30.7 MB/s eta 0:00:04
     - ------------------------------------- 3.2/113.7 MB 34.7 MB/s eta 0:00:04
     - ------------------------------------- 5.1/113.7 MB 36.3 MB/s eta 0:00:03
     -- ------------------------------------ 7.0/113.7 MB 37.5 MB/s eta 0:00:03
     --- ----------------------------------- 9.0/113.7 MB 38.3 MB/s eta 0:00:03
     --- ----------------------------------- 9.9/113.7 MB 35.1 MB/s eta 0:00:03
     --- ---------------


[notice] A new release of pip is available: 23.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
from pathlib import Path
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


## Convert Paper Titles into Vectors

In [25]:
# Collect the names of the PDF files found directly inside the data directory.
# Path.iterdir() yields entries in arbitrary, OS-dependent order, so the names
# are sorted: positions in this list are later used as row ids of the vector
# index, and a stable order keeps them reproducible across runs and machines.
path = Path("data/")
papers = sorted(
    file.name
    for file in path.iterdir()
    # Only regular files count; the suffix check is case-insensitive so that
    # a file such as "paper.PDF" is picked up as well.
    if file.is_file() and file.suffix.lower() == ".pdf"
)

In [26]:
# Show the collected PDF file names to confirm which papers will be embedded.
print(papers)

['A geometrical meaning to the electron mass from breakdown of Lorentz invariance.pdf', 'Changing Data Sources in the Age of Machine Learning for Official Statistics.pdf', 'Electronic and magnetic properties of the graphene densely decorated with 3d metallic adatoms.pdf', 'Physics-Inspired Interpretability Of Machine Learning Models.pdf', 'Surface effects on the electronic energy loss of charged particles entering a metal surface.pdf', 'The electronic structure of cuprates from high energy spectroscopy.pdf', 'Ultrafast Electron Dynamics in the Topological Insulator Bi2Se3 Studied by Time-Resolved Photoemission Spectroscopy.pdf']


In [27]:
# Name of the pretrained sentence-embedding model to load.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# `papers` holds file names (".pdf" suffix included), so each vector encodes a
# paper's title text rather than the contents of the PDF itself.
embeddings = model.encode(papers)

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 180.58it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [28]:
# Inspect the embedding matrix produced from `papers`
# (the recorded output shows shape (7, 384) and dtype float32).
embeddings

array([[-0.09818988,  0.01670749,  0.06488056, ...,  0.04266714,
         0.07260435, -0.02000382],
       [-0.05614357, -0.03768128,  0.02812052, ..., -0.08241341,
        -0.02525806,  0.01933871],
       [-0.07868911, -0.00835283, -0.04886892, ..., -0.06218854,
        -0.11740713, -0.00828967],
       ...,
       [-0.00786057,  0.0623866 ,  0.03839498, ...,  0.00964377,
        -0.04893703,  0.05166593],
       [-0.02969235, -0.01151001, -0.02464625, ..., -0.01383608,
         0.00875939, -0.01773524],
       [-0.08166556, -0.01702412, -0.05271094, ..., -0.01419867,
        -0.07008444,  0.0263614 ]], shape=(7, 384), dtype=float32)

## Store Vectors in a Vector Database

In [15]:
!pip install faiss-cpu numpy

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-cp310-win_amd64.whl (18.9 MB)
     ---------------------------------------- 0.0/18.9 MB ? eta -:--:--
     ---------------------------------------- 0.1/18.9 MB 3.3 MB/s eta 0:00:06
      --------------------------------------- 0.4/18.9 MB 3.9 MB/s eta 0:00:05
     -- ------------------------------------- 1.2/18.9 MB 8.3 MB/s eta 0:00:03
     ------ --------------------------------- 2.9/18.9 MB 15.6 MB/s eta 0:00:02
     ------- -------------------------------- 3.4/18.9 MB 18.0 MB/s eta 0:00:01
     ---------- ----------------------------- 5.0/18.9 MB 17.9 MB/s eta 0:00:01
     ------------------ --------------------- 8.6/18.9 MB 26.1 MB/s eta 0:00:01
     --------------------- ----------------- 10.4/18.9 MB 31.2 MB/s eta 0:00:01
     ------------------------- ------------- 12.4/18.9 MB 38.5 MB/s eta 0:00:01
     ----------------------------- --------- 14.3/18.9 MB 43.5 MB/s eta 0:00:01
     --------------------------------- ----


[notice] A new release of pip is available: 23.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
import faiss
import numpy as np

In [29]:
# Build a flat L2 (Euclidean distance) FAISS index over the paper embeddings.
# Row i of the index corresponds to papers[i].
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# FAISS only accepts C-contiguous float32 matrices, so the dtype and memory
# layout are requested explicitly instead of relying on the encoder's defaults.
index.add(np.array(embeddings, dtype=np.float32, order="C"))

In [30]:
# Display the FAISS index object's repr as a quick check that it was created.
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000002C7F9FD0E10> >

## Convert User Query into a Vector

In [31]:
# Free-text search query; it is embedded with the same model as the papers.
query = "Deep Learning for Natural Language Processing"
# The query is wrapped in a list, so it is encoded as a batch of one sentence.
query_vector = model.encode([query])

## Search for Similar Papers

In [36]:
# Look up the k stored papers closest to the query embedding and print their names.
k = 2  # top 2 results
distances, indices = index.search(query_vector, k)

# Row 0 of `indices` holds the labels for the single query that was searched.
for i in indices[0]:
    # When the index holds fewer than k vectors, FAISS pads the result with the
    # label -1. Python would read papers[-1] as "the last paper" and print a
    # bogus match, so padding labels are skipped.
    if i < 0:
        continue
    print(papers[i])

Physics-Inspired Interpretability Of Machine Learning Models.pdf
Changing Data Sources in the Age of Machine Learning for Official Statistics.pdf
