# OpenSearch Query Notebook

This notebook demonstrates how to connect to a running OpenSearch instance and execute queries.


In [2]:
from opensearchpy import OpenSearch
import os

# Endpoint comes from the environment; falls back to localhost:9200 when the
# variables are unset.
host = os.getenv("OPENSEARCH_HOST", "localhost")
port = int(os.getenv("OPENSEARCH_PORT", "9200"))

# Client settings gathered in one place: compressed HTTP bodies, TLS disabled,
# and every certificate/hostname check turned off to match.
connection_options = {
    "hosts": [{"host": host, "port": port}],
    "http_compress": True,
    "use_ssl": False,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
}
client = OpenSearch(**connection_options)

# Connectivity check: report the outcome either way instead of raising, so the
# rest of the notebook can still be read when the cluster is unreachable.
try:
    info = client.info()
except Exception as connect_error:
    print(f"Could not connect to OpenSearch: {connect_error}")
else:
    print(f"Connected to OpenSearch: {info}")

Connected to OpenSearch: {'name': '9dca2739160b', 'cluster_name': 'opensearch-docker-cluster', 'cluster_uuid': '5RMj-EPnQCiDM2-Iqpd-YQ', 'version': {'distribution': 'opensearch', 'number': '3.2.0', 'build_type': 'tar', 'build_hash': '6adc0bf476e1624190564d7fbe4aba00ccf49ad8', 'build_date': '2025-08-12T03:55:01.226522683Z', 'build_snapshot': False, 'lucene_version': '10.2.2', 'minimum_wire_compatibility_version': '2.19.0', 'minimum_index_compatibility_version': '2.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}


In [3]:
# Run a match_all query against the index and list the id and title of each
# returned hit. No explicit "size" is sent, so the server decides how many
# hits come back.
index_name = "documents"
search_body = {"query": {"match_all": {}}}

response = client.search(index=index_name, body=search_body)
print("Search Results:")
for doc in response["hits"]["hits"]:
    doc_id = doc["_id"]
    doc_title = doc["_source"]["title"]
    print(f"ID: {doc_id}, Title: {doc_title}")

Search Results:
ID: tSLPe5kBliW_qDeLghGZ, Title: attention_is_all_you_need.pdf


In [5]:
# BM25 query: full-text `match` on the text_content field, top 5 hits,
# returning only the title and text_content of each document.
search_term = "attention mechanism"

match_clause = {"match": {"text_content": search_term}}
bm25_query = {
    "size": 5,
    "query": match_clause,
    "_source": ["title", "text_content"],
}

response = client.search(index=index_name, body=bm25_query)
print("\nBM25 Search Results:")
for doc in response["hits"]["hits"]:
    doc_id = doc["_id"]
    doc_score = doc["_score"]
    doc_title = doc["_source"]["title"]
    print(f"ID: {doc_id}, Score: {doc_score}, Title: {doc_title}")


BM25 Search Results:
ID: tSLPe5kBliW_qDeLghGZ, Score: 0.52425647, Title: attention_is_all_you_need.pdf


## K-NN Search

To perform a k-NN search, you first need to generate an embedding for your query text using the same BGE model that was used to embed the indexed documents. You can then query OpenSearch with that embedding.


In [6]:
from transformers import AutoModel, AutoTokenizer
import torch

# Tokenizer and encoder are loaded from the same BGE checkpoint; the checkpoint
# must be resolvable by `from_pretrained` in your environment.
bge_checkpoint = "BAAI/bge-small-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(bge_checkpoint)
model = AutoModel.from_pretrained(bge_checkpoint)


def get_embedding(text):
    """Embed ``text`` and return one L2-normalized vector as a list of floats.

    The text is tokenized with the module-level ``tokenizer`` (padding and
    truncation enabled, PyTorch tensors) and passed through the module-level
    ``model`` with gradient tracking disabled. For every sequence the vector
    at position 0 of the model's first output is kept and scaled to unit L2
    norm; only the first sequence's vector is returned.

    Args:
        text: Input handed straight to ``tokenizer``.

    Returns:
        The first row of the normalized embeddings, converted to a Python list.
    """
    batch = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**batch)
    # Position 0 along the sequence axis -- presumably the [CLS] token; confirm
    # against the tokenizer in use.
    first_token_vectors = outputs[0][:, 0]
    unit_vectors = torch.nn.functional.normalize(first_token_vectors, p=2, dim=1)
    return unit_vectors[0].tolist()


# Example k-NN search: embed the question, then request the 2 nearest
# neighbours on the `embedding` field.
query_text = "What is attention mechanism?"
query_embedding = get_embedding(query_text)

knn_clause = {"embedding": {"vector": query_embedding, "k": 2}}
knn_query = {
    "size": 2,
    "query": {"knn": knn_clause},
    "_source": ["title", "text_content"],
}

response = client.search(index=index_name, body=knn_query)
print("K-NN Search Results:")
for neighbor in response["hits"]["hits"]:
    neighbor_id = neighbor["_id"]
    neighbor_score = neighbor["_score"]
    neighbor_title = neighbor["_source"]["title"]
    print(f"ID: {neighbor_id}, Score: {neighbor_score}, Title: {neighbor_title}")

  from .autonotebook import tqdm as notebook_tqdm


K-NN Search Results:
ID: tSLPe5kBliW_qDeLghGZ, Score: 0.56621206, Title: attention_is_all_you_need.pdf


In [7]:
## Hybrid (BM25 + Dense Vector) Query

# Hybrid query (combining BM25 and k-NN) inside a single bool query:
#   * `must`   - a document has to match the full-text clause to be returned;
#   * `should` - the k-NN clause is optional and only adds to the score of
#                documents it also matches.
# The printed score is the bool query's combined score; no score
# normalization between the lexical and vector parts is applied here.
hybrid_query_text = "How does machine learning work?"
hybrid_query_embedding = get_embedding(hybrid_query_text)

hybrid_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": [{"match": {"text_content": hybrid_query_text}}],
            "should": [
                {"knn": {"embedding": {"vector": hybrid_query_embedding, "k": 5}}}
            ],
        }
    },
    # Only document fields belong in the source filter. `_score` is hit
    # metadata (read from hit["_score"] below), so it is not listed here.
    "_source": ["title", "text_content"],
}

response = client.search(index=index_name, body=hybrid_query)
print("\nHybrid Search Results:")
for hit in response["hits"]["hits"]:
    print(f"ID: {hit['_id']}, Score: {hit['_score']}, Title: {hit['_source']['title']}")


Hybrid Search Results:
ID: tSLPe5kBliW_qDeLghGZ, Score: 1.3661793, Title: attention_is_all_you_need.pdf
