In [None]:
# Install or upgrade the third-party packages this notebook relies on, using
# %pip so they land in the environment of the running kernel.
# NOTE(review): no versions are pinned, so a re-run may pull newer releases.
%pip install -U pymilvus sentence-transformers fsspec s3fs
# huggingface-hub is upgraded in its own step after the packages above.
%pip install --upgrade huggingface-hub

In [None]:
import torch

# Report whether PyTorch can see a CUDA device. The device name is queried only
# when CUDA is available: torch.cuda.current_device() raises on CPU-only
# machines, which would abort this purely diagnostic cell.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA available: {cuda_available}")
if cuda_available:
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("CUDA device: none (running on CPU)")

In [None]:
# Database file name; bump the version suffix whenever a new dataset iteration begins.
filename = "npa_dataset_v2.db"
# Full path of the database file inside the mounted object storage.
database_path = "/home/jupyter/datasphere/s3/hack-object-storage/database/{}".format(filename)
# Embedding dimensionality produced by the chosen model.
embeddings_dim = 1024

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModel

# Load the embedding model on the GPU when one is available and fall back to
# the CPU otherwise, so the notebook still runs on hosts without CUDA.
# `torch` is imported in the CUDA check cell earlier in the notebook.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_sentence = SentenceTransformer("deepvk/USER-bge-m3", device=device)

In [None]:
from pymilvus import MilvusClient

# Collection that the search cell below queries.
collection_name = "npa_storage_one"
# Milvus client bound to the database file configured in `database_path`.
client = MilvusClient(database_path)

In [None]:
# Sample questions that are embedded and searched for further down.
questions = dict(
    question=[
        "Какие требования к объекту капитального строительства?",
        "Как можно сэкономить энергию в ХМАО-Югре, модернизируя электросети и оборудование?",
    ],
)

In [None]:
import re
import string

def _normalize_question(text):
    """Lower-case a question and clean up control characters and repeated spaces.

    Args:
        text: Raw question string.

    Returns:
        The lower-cased string with characters in the ranges \\x00-\\x1F and
        \\x7F-\\x9F removed and every run of consecutive spaces collapsed to
        a single space.
    """
    # str.lower() leaves digits untouched, so no per-character check is needed.
    normalized = text.lower()
    # Strip control characters first: removing one that sits between two
    # spaces would otherwise leave a fresh double space behind.
    # NOTE(review): tabs and newlines fall in this range and are deleted, not
    # turned into spaces - confirm this matches how the indexed texts were cleaned.
    normalized = re.sub(r'[\x00-\x1F\x7F-\x9F]+', '', normalized)
    # Collapse runs of any length; replacing only the literal pair '  ' leaves
    # doubles behind when three or more spaces occur in a row.
    normalized = re.sub(r' {2,}', ' ', normalized)
    return normalized


question_list = [_normalize_question(raw_question) for raw_question in questions["question"]]

# Downstream cells read the cleaned questions from the same dictionary key.
questions["question"] = question_list

In [None]:
# Embed every question in batches of 8, asking the encoder to normalise the vectors.
encoded_questions = model_sentence.encode(
    questions["question"],
    batch_size=8,
    normalize_embeddings=True,
)
# Convert each embedding with .tolist() before handing it to the client.
question_embeddings = [vector.tolist() for vector in encoded_questions]

# Query the collection with all question vectors in a single call.
search_results = client.search(
    collection_name=collection_name,
    data=question_embeddings,
    limit=3,  # number of hits requested per question
    output_fields=["text", "vector", "id"],  # fields returned alongside each hit
)

# Pair each question with its hits and print the matched text and its score.
for question_text, hits in zip(questions["question"], search_results):
    print("Question:", question_text)
    for hit in hits:
        hit_summary = {"text": hit["entity"]["text"], "score": hit["distance"]}
        print(hit_summary)
    print("\n")