## Introduction


#### Additional References:
* Azure Search OpenAI repo: https://github.com/Azure-Samples/azure-search-openai-demo/tree/main?tab=MIT-1-ov-file
* Azure Search OpenAI demo: https://github.com/Azure-Samples/azure-search-openai-demo/blob/main/README.md#cost-estimation
* Vector Search demo code: https://github.com/Azure/azure-search-vector-samples/tree/main/demo-python
* Cohere Embed v3 sample notebook: https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/community-integration/cohere/azure-search-cohere-embed-v3-sample.ipynb

In [7]:
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.models import (
    VectorizedQuery
)
from azure.search.documents.indexes.models import (
    SearchIndex,
    SimpleField,
    # edm,
    SearchFieldDataType,
    SearchIndexer,
    SearchIndexerDataSourceConnection,
    SearchIndexerDataSourceType,
    SearchIndexerDataContainer,
    SearchIndexerDataIdentity,
    HnswAlgorithmConfiguration,
    SearchField,
    SearchableField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile
)
from dotenv import dotenv_values
from azure.ai.inference import EmbeddingsClient, ImageEmbeddingsClient
from azure.ai.inference.models import ImageEmbeddingInput, EmbeddingItem, EmbeddingsResult
import re, os, uuid 

from embed_util import EmbedResult
import requests
from io import BytesIO

# Read every key/value pair from the local dotenv file into a plain dict.
config = dict(dotenv_values("./envs/aisearch.env"))

# Azure AI Search service settings.
search_key, search_endpoint = (
    config["AZURE_AI_SEARCH_KEY"],
    config["AZURE_AI_SEARCH_ENDPOINT"],
)

# Blob storage details.
# storage_connection_string = config["AZURE_STORAGE_CONNECTION_STRING"]
blob_container_name = config["AZURE_BLOB_CONTAINER_NAME"]

# Embedding endpoint, key, and model name.
embed_endpoint, embed_key, embed_model = (
    config["AZURE_INFERENCE_ENDPOINT"],
    config["AZURE_INFERENCE_KEY"],
    config["MODEL_NAME"],
)

# End the cell with an empty string so the notebook does not accidentally
# echo a config value as the cell output.
""


''

## Set up an indexing pipeline

In [8]:

# Azure AI Search service details.
# Name of the search index; the data source and indexer names stay disabled.
index_name = "cohere-embed-v3-index"
# data_source_name = "blob-datasource-catalog-1"
# indexer_name = "blob-indexer-demo-1"

# Key-based credential for the search service.
search_credential = AzureKeyCredential(key=search_key)

# index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)

# # Define Search Index Schema
# index = SearchIndex(
#     name=index_name,
#     fields=[
#         SimpleField(name="id", type=SearchFieldDataType.String, key=True),
#         SimpleField(name="content", type=SearchFieldDataType.String, searchable=True)
#     ]
# )
# index_client.create_or_update_index(index)

## Generate Embeddings Function

In [19]:
# Client for text embeddings on the inference endpoint.
embed_text = EmbeddingsClient(endpoint=embed_endpoint, credential=AzureKeyCredential(embed_key))

# Client for image embeddings; same endpoint and key, with its own credential object.
img_embed = ImageEmbeddingsClient(endpoint=embed_endpoint, credential=AzureKeyCredential(embed_key))

# def parse_embed_result(response: EmbeddingsResult, model=None, idx=0) -> EmbedResult:
#     """parse the embedding result for response by given index"""
#     if response.data is not None:
#         item = response.data[idx]
#         if isinstance(item, EmbeddingItem):
#             return EmbedResult({
#                 "modelVersion": model,
#                 "vector":item.embedding
#             })
#     return None

def parse_embed_result(response: EmbeddingsResult, model=None) -> list:
    """Collect the embedding vector of every item in an embeddings response.

    Args:
        response: Embeddings result whose ``data`` holds the embedding items.
        model: Accepted for call-site compatibility; not used here.

    Returns:
        One embedding per item in ``response.data``, in order, or an empty
        list when ``response.data`` is ``None``.
    """
    items = response.data
    if items is None:
        return []
    return [entry.embedding for entry in items]



In [20]:

def gen_text_embeddings(texts, model=embed_model, input_type=None) -> list:
    """Embed one or more texts with the text embeddings client.

    Args:
        texts: A single string or a list of strings to embed.
        model: Model name passed to the embeddings service.
        input_type: Optional input type forwarded to the service (for
            example ``"query"`` or ``"document"``). When ``None`` the
            argument is not sent at all, which matches the previous behavior.

    Returns:
        One embedding per input text, as produced by ``parse_embed_result``.
    """
    # The service expects a batch, so wrap a bare string in a one-item list.
    if isinstance(texts, str):
        texts = [texts]

    request = {"input": texts, "model": model}
    # Only send input_type when the caller asked for one, so existing calls
    # issue exactly the same request as before.
    if input_type is not None:
        request["input_type"] = input_type

    text_response = embed_text.embed(**request)
    return parse_embed_result(text_response, model=model)

# Directory (under the current working directory) used to cache downloaded images.
data_path = os.path.join(os.getcwd(), "data")


def gen_img_embeddings(url, image_format="jpeg", model=embed_model) -> list:
    """Download an image over HTTPS and return its embeddings.

    The downloaded bytes are written unchanged to a uniquely named cache file
    under ``data_path``; that file is then handed to the image embeddings
    client.

    Args:
        url: HTTPS URL of the image. Any other value is rejected.
        image_format: Format label passed to ``ImageEmbeddingInput.load`` and
            used as the cache file extension. The bytes are not re-encoded,
            so this should match the actual format of the downloaded image.
        model: Model name passed to the embeddings service.

    Returns:
        The embeddings produced by ``parse_embed_result``, or an empty list
        when ``url`` is not an HTTPS URL.

    Raises:
        requests.HTTPError: If the download responds with an error status.
    """
    # Only https:// URLs without whitespace are accepted.
    if not re.match(r'^https://[^\s]+$', url):
        return []

    # A timeout keeps a stalled download from hanging the notebook, and an
    # error status is surfaced instead of caching an error page as an image.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Build the cache path first: it is what the embeddings input loader needs.
    os.makedirs(data_path, exist_ok=True)
    cache_image_name = f"image_cache_{str(uuid.uuid4())}.{image_format}"
    img_file_path = os.path.join(data_path, cache_image_name)
    with open(img_file_path, "wb") as cache_file:
        cache_file.write(response.content)

    img_embed_input = ImageEmbeddingInput.load(image_file=img_file_path, image_format=image_format)
    img_response = img_embed.embed(input=[img_embed_input], model=model)
    return parse_embed_result(img_response, model=model)

## Create or Update Azure AI Search Index

This function creates or updates an Azure AI Search index to include a vector field for storing the document embeddings.

In [21]:
def create_or_update_index(client, index_name, vector_dimensions=1024):
    """Create or update a search index with id, text, and embedding fields.

    Args:
        client: Index client exposing ``create_or_update_index(index=...)``.
        index_name: Name of the index to create or update.
        vector_dimensions: Length of the vectors stored in the ``embedding``
            field. Defaults to 1024, the value that was previously
            hard-coded; it must match the output size of the embedding model.
    """
    # Shared names tie the embedding field to its profile and the profile to
    # its algorithm configuration.
    profile_name = "my-vector-config"
    algorithm_name = "my-hnsw"

    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchField(
            name="text",
            type=SearchFieldDataType.String,
            searchable=True,
        ),
        SearchField(
            name="embedding",
            # Collection of 32-bit floats. "Collection(Edm.SByte)" would be the
            # OData type for 8-bit signed integer embeddings instead.
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            vector_search_dimensions=vector_dimensions,
            vector_search_profile_name=profile_name,
            # Pass hidden=False to return the embeddings in the search results.
        ),
    ]

    vector_search = VectorSearch(
        profiles=[
            VectorSearchProfile(
                name=profile_name,
                algorithm_configuration_name=algorithm_name,
            )
        ],
        # Hierarchical navigable small world (HNSW) algorithm configuration:
        # https://learn.microsoft.com/en-gb/azure/search/vector-search-overview#nearest-neighbors-search
        algorithms=[
            HnswAlgorithmConfiguration(
                name=algorithm_name,
                kind=VectorSearchAlgorithmKind.HNSW,
            )
        ],
    )

    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
    client.create_or_update_index(index=index)

## Index Documents and Their Embeddings

Finally, this function indexes the documents along with their float embeddings into Azure AI Search. For demonstration, document IDs are generated sequentially, but in practical applications, it's essential to use meaningful identifiers like database row ID, unique filenames, or any other unique metadata associated with the document.

In [22]:
def index_documents(search_client, documents, embeddings):
    """Upload documents together with their embeddings to the search index.

    Each document is paired with the embedding at the same position and given
    a sequential string id (``"0"``, ``"1"``, ...).

    Args:
        search_client: Client exposing ``upload_documents(documents=...)``.
        documents: Iterable of document texts.
        embeddings: Iterable of embeddings, one per document, in the same order.

    Returns:
        Whatever ``search_client.upload_documents`` returns, so callers can
        inspect the outcome of the upload.

    Raises:
        ValueError: If the number of documents and embeddings differ.
    """
    # Materialize so generators are accepted and the lengths can be compared.
    documents = list(documents)
    embeddings = list(embeddings)

    # zip() would silently drop the unmatched tail; fail loudly instead.
    if len(documents) != len(embeddings):
        raise ValueError(
            f"Got {len(documents)} documents but {len(embeddings)} embeddings; "
            "each document needs exactly one embedding."
        )

    documents_to_index = [
        {"id": str(idx), "text": doc, "embedding": emb}
        for idx, (doc, emb) in enumerate(zip(documents, embeddings))
    ]
    return search_client.upload_documents(documents=documents_to_index)

## Run the workflow

In [23]:
# Sample documents to embed and index.
documents = [
    "Alan Turing  was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.",
    "Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.",
    "Isaac Newton was an English polymath active as a mathematician, physicist, astronomer, alchemist, theologian, and author who was described in his time as a natural philosopher.",
    "Marie Curie was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity"
]

# Generate one embedding per document.
embeddings = gen_text_embeddings(documents, model=embed_model)

# Initialize the Azure Search index client. It is scoped to the whole search
# service, so it takes no index name; the index is named when it is created.
search_index_client = SearchIndexClient(
    endpoint=search_endpoint,
    credential=search_credential,
)

# Create or update the search index to include the embedding field.
create_or_update_index(search_index_client, index_name)

# Initialize the SearchClient, which is bound to that one index.
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=search_credential
)

# # Index the documents and their embeddings
# index_documents(search_client, documents, embeddings)

In [None]:
# len(documents)

4

In [None]:
# len(embeddings)

4

In [27]:
# Upload each document together with its embedding to the search index.
index_documents(
    search_client=search_client,
    documents=documents,
    embeddings=embeddings,
)

In [None]:
### Perform a Vector Search

In [38]:
from azure.search.documents import SearchClient

# Natural-language query to run as a vector search.
query = "foundational figures in computer science"

# Embed the query with the same model used for the documents.
# NOTE: the guidance for query embeddings is input_type="search_query";
# this call does not set an input type.
query_embeddings = gen_text_embeddings(query, model=embed_model)

# Client bound to the index that will be queried.
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=search_credential,
)

In [39]:
# Nearest-neighbour query over the "embedding" field using the first query
# embedding; ask for the 3 closest documents.
vector_query = VectorizedQuery(
    fields="embedding",
    k_nearest_neighbors=3,
    vector=query_embeddings[0],
)

# search_text=None: no search text, so this is a pure vector search.
results = search_client.search(vector_queries=[vector_query], search_text=None)

# for result in results:
#     print(f"Text: {result['text']}")
#     print(f"Score: {result['@search.score']}\n")

A search result looks like this:
```json
{'text': 'Marie Curie was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity',
 'id': '3',
 '@search.score': 0.6064307,
 '@search.reranker_score': None,
 '@search.highlights': None,
 '@search.captions': None}
 ```

In [40]:
# Print the text and similarity score of every hit, one blank line between hits.
for result in results:
    print(
        f"Text: {result['text']}",
        f"Score: {result['@search.score']}\n",
        sep="\n",
    )

Text: Alan Turing  was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.
Score: 0.6697743

Text: Isaac Newton was an English polymath active as a mathematician, physicist, astronomer, alchemist, theologian, and author who was described in his time as a natural philosopher.
Score: 0.616247

Text: Marie Curie was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity
Score: 0.6064307

