# OpenSearch Query Notebook

This notebook demonstrates how to connect to a running OpenSearch instance and execute queries.


In [None]:
from opensearchpy import OpenSearch
import os

# Resolve the OpenSearch endpoint from the environment; without overrides this
# points at localhost:9200.
host = os.getenv("OPENSEARCH_HOST", "localhost")
port = int(os.getenv("OPENSEARCH_PORT", "9200"))

# Client settings: HTTP compression on, SSL off, and the certificate/hostname
# checks plus the related warning explicitly disabled.
client_settings = {
    "hosts": [{"host": host, "port": port}],
    "http_compress": True,
    "use_ssl": False,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
}
client = OpenSearch(**client_settings)

# Smoke-test the connection. A failure is reported but deliberately not
# re-raised, so this cell still finishes when the cluster is unreachable.
try:
    info = client.info()
    print(f"Connected to OpenSearch: {info}")
except Exception as exc:
    print(f"Could not connect to OpenSearch: {exc}")

In [None]:
# Run a match_all query against the "documents" index and list the hits.
# NOTE: no "size" is given, so OpenSearch applies its default page size
# (10 hits) -- this prints a sample, not necessarily every document.
index_name = "documents"
search_body = {"query": {"match_all": {}}}

response = client.search(index=index_name, body=search_body)
print("Search Results:")
for hit in response["hits"]["hits"]:
    # A document is not guaranteed to carry a "title" field (or a stored
    # _source at all); fall back to a placeholder instead of raising KeyError.
    title = hit.get("_source", {}).get("title", "<no title>")
    print(f"ID: {hit['_id']}, Title: {title}")

## K-NN Search

To perform a k-NN search, first generate an embedding for your query text with the same BGE model that produced the document embeddings stored in the index; vectors from different models are not comparable. Then query OpenSearch with that embedding.


In [None]:
from transformers import AutoModel, AutoTokenizer
import torch

# Single checkpoint name shared by the tokenizer and the model, so the two
# cannot drift apart.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"

# Load the BGE tokenizer and model (both must be available in your environment).
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)


def get_embedding(text):
    """Embed ``text`` using the notebook-level ``tokenizer`` and ``model``.

    The text is tokenized (padding and truncation enabled) into PyTorch
    tensors and run through the model with gradient tracking disabled. The
    position-0 vector of the model's first output is then L2-normalized.

    Args:
        text: Input handed straight to ``tokenizer``.

    Returns:
        The normalized vector for the first row of the batch, as a plain
        Python list of floats.
    """
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**tokens)

    # Position 0 of every sequence in the first model output
    # (presumably the [CLS] token's hidden state -- confirm for this model).
    first_token_vectors = outputs[0][:, 0]
    unit_vectors = torch.nn.functional.normalize(first_token_vectors, p=2, dim=1)

    # Only the first row is returned, matching a single-text call.
    return unit_vectors[0].tolist()


# Example k-NN search: embed the query text, then ask OpenSearch for the
# nearest vectors stored in the "embedding" field. `index_name` and `client`
# come from the earlier cells of this notebook.
query_text = "What is attention mechanism?"
query_embedding = get_embedding(query_text)

# One knob for the number of neighbours, so the request's "size" and the
# knn clause's "k" cannot drift apart.
top_k = 2

knn_query = {
    "size": top_k,
    "query": {"knn": {"embedding": {"vector": query_embedding, "k": top_k}}},
    "_source": ["title", "text_content"],
}

response = client.search(index=index_name, body=knn_query)
print("K-NN Search Results:")
for hit in response["hits"]["hits"]:
    # "_source" filtering only limits which fields are returned; a document
    # may still lack "title", so use a placeholder rather than raising KeyError.
    title = hit.get("_source", {}).get("title", "<no title>")
    print(f"ID: {hit['_id']}, Score: {hit['_score']}, Title: {title}")