#### Package installation

In [None]:
# pip install faiss-cpu sentence-transformers

#### Import packages

In [None]:
import faiss
import torch
import tqdm as tqdm
from sentence_transformers import SentenceTransformer

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

### Load model

In [3]:
model = SentenceTransformer('intfloat/multilingual-e5-large').to(device)

In [None]:
model

In [None]:
print(dir(model))

### Text from embedding

In [47]:
texts = [
    "این یک متن نمونه است.",
    "Retrieval-Augmented Generation یک تکنیک قدرتمند است.",
    "مدل‌های زبانی بزرگ مانند GPT می‌توانند مفید باشند."
]

### Convert text to Vector

In [48]:
embeddings = model.encode(texts, convert_to_numpy=True)

In [70]:
embeddings.shape

(3, 1024)

#### Count feather vector

In [51]:
dimension = embeddings.shape[1]

## Faiss

#### Distination L2(Euclid)

In [52]:
index = faiss.IndexFlatL2(dimension)

### Add vector to index

In [53]:
index.add(embeddings)

### Save index in file

In [54]:
faiss.write_index(index, "my_faiss_index.index")

### Load index

In [55]:
loaded_index = faiss.read_index("my_faiss_index.index")

### Input new text to vector

In [56]:
query = "RAG یک روش کاربردی است."
query_embedding = model.encode([query], convert_to_numpy=True)

In [57]:
query_embedding

array([[ 0.03608019, -0.00548307, -0.021544  , ..., -0.03164741,
        -0.00523785,  0.02859626]], shape=(1, 1024), dtype=float32)

### Find near vector from new text

In [None]:
k = 2   # count near vector
distances, idx = loaded_index.search(query_embedding, k)

### from one dim

In [None]:
# distances, indices = index.search(query_embedding.reshape(1, -1), k)

In [None]:
query_embedding

array([[ 0.03608019, -0.00548307, -0.021544  , ..., -0.03164741,
        -0.00523785,  0.02859626]], shape=(1, 1024), dtype=float32)

In [69]:
query_embedding.shape

(1, 1024)

In [None]:
print(f"Indices: {idx}")
print(f"Distances: {distances}")
# print(f"متن‌های مشابه: {[texts[i] for i in indices[0]]}")

for i in idx[0]:
    print("Similar text:", texts[i])

Indices: [[1 0]]
Distances: [[0.25858375 0.3586954 ]]
متن‌های مشابه: ['Retrieval-Augmented Generation یک تکنیک قدرتمند است.', 'این یک متن نمونه است.']
