# Azure Cognitive Search Vector Search Code Sample with Azure OpenAI
This notebook demonstrates how to use Azure Cognitive Search vector search with Azure OpenAI embeddings and the Azure SDK for Python.
## Prerequisites
To run the code, install the following packages. Use a pre-release version of `azure-search-documents` (`pip install azure-search-documents --pre`); the outputs shown below were produced with version 11.4.0b6.

In [2]:
! pip install azure-search-documents --pre
! pip install openai
! pip install python-dotenv

Collecting azure-search-documents
  Using cached azure_search_documents-11.4.0b6-py3-none-any.whl (306 kB)
Collecting azure-core<2.0.0,>=1.24.0 (from azure-search-documents)
  Using cached azure_core-1.28.0-py3-none-any.whl (185 kB)
Collecting azure-common~=1.1 (from azure-search-documents)
  Using cached azure_common-1.1.28-py2.py3-none-any.whl (14 kB)
Collecting isodate>=0.6.0 (from azure-search-documents)
  Using cached isodate-0.6.1-py2.py3-none-any.whl (41 kB)
Installing collected packages: azure-common, isodate, azure-core, azure-search-documents
Successfully installed azure-common-1.1.28 azure-core-1.28.0 azure-search-documents-11.4.0b6 isodate-0.6.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is avail

## Import required libraries and environment variables

In [4]:
# Import required libraries  
import os  
import json  
import openai  
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    VectorSearchAlgorithmConfiguration,  
)
# Configure environment variables
# load_dotenv() loads key/value pairs from a `.env` file, if one is found, into
# the process environment so the os.getenv() calls in the next cell can read them.
load_dotenv() 
  

True

In [5]:

# --- Azure Cognitive Search settings (taken from the environment) ---
service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME")
key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")

# --- Azure OpenAI settings (taken from the environment) ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.environ.get("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.environ.get("OPENAI_DEPLOYMENT_VERSION")

# Embedding deployment/model names
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.environ.get("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.environ.get("OPENAI_ADA_EMBEDDING_MODEL_NAME")

# Point the module-level openai client at the Azure OpenAI resource.
openai.api_type = "azure"
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_key = OPENAI_API_KEY

# Credential object reused by the search clients created in later cells.
credential = AzureKeyCredential(key)

## Create embeddings
Read your data, generate OpenAI embeddings, and export them in a format that can be inserted into your Azure Cognitive Search index:

In [6]:
# Smoke-test the embedding setup through LangChain before embedding whole documents.
# `model` receives the value of OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME.
# NOTE(review): chunk_size=1 presumably limits each request to a single input;
# confirm this is still required for your deployment.
embeddingmodel = OpenAIEmbeddings(model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME, chunk_size=1)
vec = embeddingmodel.embed_query("transform to vec")
# Bare expression so the notebook displays the resulting vector as the cell output.
vec

[-0.00942082516849041,
 -0.00464690150693059,
 -0.0015674912137910724,
 -0.006866264622658491,
 0.0005358756170608103,
 0.015242683701217175,
 -0.020154215395450592,
 -0.020111873745918274,
 -0.031106365844607353,
 -0.051401715725660324,
 -0.002101161517202854,
 0.005289070308208466,
 -0.01695042848587036,
 3.806811946560629e-05,
 0.009371427819132805,
 0.004029431845992804,
 0.016357658430933952,
 0.00030961702577769756,
 0.009187950752675533,
 -0.013986573554575443,
 -0.011629602871835232,
 -0.005095008295029402,
 0.005881841294467449,
 -0.012631668709218502,
 -0.02088812179863453,
 0.004558691754937172,
 0.006936832331120968,
 -0.022976934909820557,
 -0.01630120351910591,
 -0.003579560900107026,
 0.010302925482392311,
 0.0035178137477487326,
 -0.004537520930171013,
 -0.028904644772410393,
 -0.009667813777923584,
 -0.01939208060503006,
 0.005659551825374365,
 -0.011827193200588226,
 0.005673665553331375,
 0.0015524955233559012,
 0.011686057783663273,
 -0.009992426261305809,
 -0.01498

In [None]:
# Generate Document Embeddings using OpenAI Ada 002

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(page):
    """Return the embedding vector for a piece of text.

    Used for the title and content fields of each document and for the query
    text of the search cells.

    The call is wrapped in a tenacity retry: randomized exponential back-off
    between attempts (``min=1``, ``max=20``), giving up after 6 attempts.

    Args:
        page: The text to embed.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.
    """
    # openai.api_type is "azure", where `engine` names the *deployment*, which
    # does not have to be called "text-embedding-ada-002". Prefer the configured
    # deployment name and fall back to the previous literal when it is unset.
    response = openai.Embedding.create(
        input=page,
        engine=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME or "text-embedding-ada-002",
    )
    return response['data'][0]['embedding']

In [None]:
# TODO: add multiple documents
doc_title = "Semantic Kernel"
# Load the PDF and split it into pages
fileName = "../data/semantic-kernel.pdf"
loader = PyPDFLoader(fileName)
pages = loader.load_and_split()
print("Number of pages: ", len(pages))

# Every page shares the same title, so embed it once instead of issuing one
# embedding request per page for an identical input.
title_vector = generate_embeddings(doc_title)

# Build one record per page: id, title/content text and their embeddings
doc_with_vector_list = []
for doc_id, page in enumerate(pages):
    doc_with_vector_list.append({
        'id': str(doc_id),
        'title': doc_title,
        'titleVector': title_vector,
        'content': page.page_content,
        'contentVector': generate_embeddings(page.page_content),
    })

# Write the records to the JSON file that the upload cell reads back.
output_path = "../output/sk_vector/sk_Vectors.json"
# Create the output folder if needed so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as f:
    json.dump(doc_with_vector_list, f)

## Create your search index
Create your search index schema and vector search configuration:

In [None]:
# Create a search index
# Note: the Cognitive Search resource must already exist; its endpoint and key
# reach this cell through service_endpoint and credential.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Name linking the vector fields to the algorithm configuration below.
vector_config_name = "sk-vector-config"

# Options shared by both embedding fields.
vector_field_options = dict(
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=1536,
    vector_search_configuration=vector_config_name,
)

fields = [
    # Document id - mandatory key field
    SimpleField(name="id", type=SearchFieldDataType.String, key=True,
                sortable=True, filterable=True, facetable=True),
    # Title text and its embedding
    SearchableField(name="title", type=SearchFieldDataType.String, filterable=True),
    SearchField(name="titleVector", **vector_field_options),
    # Content text and its embedding
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchField(name="contentVector", **vector_field_options),
]

# HNSW algorithm configuration referenced by the vector fields.
hnsw_config = VectorSearchAlgorithmConfiguration(
    name=vector_config_name,
    kind="hnsw",
    hnsw_parameters={
        "m": 4,
        "efConstruction": 400,
        "efSearch": 500,
        "metric": "cosine",
    },
)
vector_search = VectorSearch(algorithm_configurations=[hnsw_config])

# Semantic configuration: "title" as the title field, "content" as the content field.
prioritized = PrioritizedFields(
    title_field=SemanticField(field_name="title"),
    prioritized_content_fields=[SemanticField(field_name="content")],
)
semantic_config = SemanticConfiguration(
    name="sk-semantic-config",
    prioritized_fields=prioritized,
)
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Assemble the index definition and create it (or update it if it already exists).
index = SearchIndex(
    name="sk-cogsrch-vector-index",
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


## Insert text and embeddings into vector store
Add texts and metadata from the JSON data to the vector store:

In [None]:
# Upload the embedded documents written by the embedding cell to the index
with open('../output/sk_vector/sk_Vectors.json', 'r') as file:
    documents = json.load(file)
search_client = SearchClient(endpoint=service_endpoint, index_name="sk-cogsrch-vector-index", credential=credential)
result = search_client.upload_documents(documents)
# upload_documents returns one IndexingResult per document, and a document can be
# rejected even though the request itself went through, so report the number
# that actually succeeded instead of assuming all of them did.
succeeded = sum(1 for item in result if item.succeeded)
print(f"Uploaded {succeeded} of {len(documents)} documents")

## Perform a vector similarity search

In [None]:
# Pure Vector Search: no keyword text, only the query embedding is matched
# against the contentVector field.
query = "what's semantic kernel?"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name="sk-cogsrch-vector-index",
    credential=credential,
)

# Embed the question, then request the 3 nearest documents.
query_vector = generate_embeddings(query)
results = search_client.search(
    search_text=None,
    vector=query_vector,
    top_k=3,
    vector_fields="contentVector",
    select=["title", "content"],
)

# Print title, similarity score and text of every hit.
for result in results:
    print(f"Title: {result['title']}")
    print(f"Score: {result['@search.score']}")
    print(f"Content: {result['content']}")
    


In [None]:
# Pure Vector Search, second example: the same contentVector search as the
# previous cell, run with a different question.
query = "what're main components of semantic kernel?"

search_client = SearchClient(service_endpoint, index_name="sk-cogsrch-vector-index", credential=credential)

# search_text=None: only the query embedding is used for matching.
results = search_client.search(
    search_text=None,
    vector=generate_embeddings(query), top_k=3,
    vector_fields="contentVector",
    select=["title","content"],
)

# Print title, score and content of each returned document.
for result in results:
    print(f"Title: {result['title']}")
    print(f"Score: {result['@search.score']}")
    print(f"Content: {result['content']}")
    


## Perform a Cross-Field Vector Search

## Perform a Pure Vector Search with a filter

In [None]:
# Pure Vector Search with Filter
query = "which programming languages are supported by semantic kernel?"

# Query the index this notebook created and populated ("sk-cogsrch-vector-index").
# The AZURE_SEARCH_INDEX_NAME environment value may be unset or name another index.
search_client = SearchClient(service_endpoint, index_name="sk-cogsrch-vector-index", credential=credential)

# Vector-only search on contentVector, restricted by an OData filter on the
# filterable "title" field.
results = search_client.search(
    search_text=None,
    vector=generate_embeddings(query),
    top_k=3,
    vector_fields="contentVector",
    filter="title eq 'Semantic Kernel'",
    select=["title", "content"],
)

# Print title, score and content of each returned document.
for result in results:
    print(f"Title: {result['title']}")
    print(f"Score: {result['@search.score']}")
    print(f"Content: {result['content']}")
    

## Perform a Hybrid Search

In [None]:
# Hybrid Search: the same question is sent both as keyword text (search_text)
# and as an embedding (vector).
query = "what're main components of semantic kernel?"

# Query the index this notebook created and populated ("sk-cogsrch-vector-index").
# The AZURE_SEARCH_INDEX_NAME environment value may be unset or name another index.
# The shared `credential` wraps the same admin key as AzureKeyCredential(key).
search_client = SearchClient(service_endpoint, index_name="sk-cogsrch-vector-index", credential=credential)

results = search_client.search(
    search_text=query,
    vector=generate_embeddings(query),
    top_k=3,
    vector_fields="contentVector",
    filter="title eq 'Semantic Kernel'",
    select=["title", "content"],
    top=3,
)

# Show the type of the object returned by search().
print(type(results))

# Print title, score and content of each returned document.
for result in results:
    print(f"Title: {result['title']}")
    print(f"Score: {result['@search.score']}")
    print(f"Content: {result['content']}\n")



## Perform a Semantic Hybrid Search

In [None]:
# Semantic Hybrid Search: keyword text + embedding, with the semantic
# configuration "sk-semantic-config" and extractive captions/answers requested.
query = "what're main components of semantic kernel?"

# Query the index this notebook created and populated ("sk-cogsrch-vector-index").
# The AZURE_SEARCH_INDEX_NAME environment value may be unset or name another index.
# The shared `credential` wraps the same admin key as AzureKeyCredential(key).
search_client = SearchClient(service_endpoint, index_name="sk-cogsrch-vector-index", credential=credential)

results = search_client.search(
    search_text=query,
    vector=generate_embeddings(query),
    top_k=3,
    vector_fields="contentVector",
    select=["title", "content"],
    query_type="semantic",
    query_language="en-us",
    semantic_configuration_name='sk-semantic-config',
    query_caption="extractive",
    query_answer="extractive",
    top=3,
)

# get_answers() may yield nothing when the service returns no semantic answer;
# fall back to an empty list so the loop is simply skipped.
semantic_answers = results.get_answers() or []
for answer in semantic_answers:
    # Prefer the highlighted version of the answer when one is provided.
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"Title: {result['title']}")
    print(f"Content: {result['content']}")

    # Show the first caption, highlighted if available.
    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")
