## Introduction


#### Additional References:
* Azure Search Openai Repo: https://github.com/Azure-Samples/azure-search-openai-demo/tree/main?tab=MIT-1-ov-file
* Azure Search Openai demo: https://github.com/Azure-Samples/azure-search-openai-demo/blob/main/README.md#cost-estimation
* Vector Search demo code https://github.com/Azure/azure-search-vector-samples/tree/main/demo-python
* https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/community-integration/cohere/azure-search-cohere-embed-v3-sample.ipynb

In [1]:
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.models import (
    VectorizedQuery
)
from azure.search.documents.indexes.models import (
    SearchIndex,
    SimpleField,
    # edm,
    SearchFieldDataType,
    SearchIndexer,
    SearchIndexerDataSourceConnection,
    SearchIndexerDataSourceType,
    SearchIndexerDataContainer,
    SearchIndexerDataIdentity,
    HnswAlgorithmConfiguration,
    SearchField,
    SearchableField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile
)
from dotenv import dotenv_values
from azure.ai.inference import EmbeddingsClient, ImageEmbeddingsClient
from azure.ai.inference.models import ImageEmbeddingInput, EmbeddingItem, EmbeddingsResult
import re, os, uuid 

from embed_util import EmbedResult
import requests
from io import BytesIO
from PIL import Image

config = {
    **dotenv_values("./envs/aisearch.env")
}

search_key = config["AZURE_AI_SEARCH_KEY"]
search_endpoint = config["AZURE_AI_SEARCH_ENDPOINT"]
search_credential = AzureKeyCredential(search_key)

embed_endpoint = config["AZURE_INFERENCE_ENDPOINT"]
embed_key = config["AZURE_INFERENCE_KEY"]
embed_model = config["MODEL_NAME"]

# give a name to your index
index_name = "cohere-embed-v3-index"

# use "" to avoid acidentally output the config variable in notebook cells
""


''

## Setup an indexing pipeline

In [2]:
# give a name to your index
index_name = "cohere-embed-v3-index"

search_credential = AzureKeyCredential(search_key)


## Generate Embeddings Function

In [None]:
## create embed_text client to embed text 
embed_text = EmbeddingsClient(
    endpoint=embed_endpoint,
    credential=AzureKeyCredential(embed_key)
)

## create embed_img client to embed text 
embed_img = ImageEmbeddingsClient(
    endpoint=embed_endpoint,
    credential=AzureKeyCredential(embed_key)
)


def parse_embed_result(response: EmbeddingsResult, model=None) -> list:
    """parse the embedding result for responsex"""
    if response.data is not None:
       return [item.embedding for item in response.data]
    return []


def gen_text_embeddings(texts, model=embed_model)-> list:
    # ensure texts is a list
    if isinstance(texts, str):
        texts = [texts]
    
    text_response = embed_text.embed(
        input=texts,
        model=model,
    )
    return parse_embed_result(text_response, model=model)

data_path = os.path.join(os.getcwd(), "data")
# create data path if not exist
if not os.path.exists(data_path):
    os.makedirs(data_path)

def gen_img_embeddings(url, image_format="jpeg", model=embed_model)-> EmbedResult:   
    url_pattern = re.compile(r'^https://[^\s]+$')
    
    if url_pattern.match(url):
        # save to cache
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        
        cache_image_name = f"image_cache_{str(uuid.uuid4())}.{image_format}"
        img_file_path = os.path.join(data_path, cache_image_name)
        img.save(img_file_path)

        img_embed_input = ImageEmbeddingInput.load(image_file=img_file_path, image_format=image_format)

        img_response = embed_img.embed(
            input=[img_embed_input],
            model=model,
        )
        return parse_embed_result(img_response, model=model)

## Create or Update Azure AI Search Index

This function creates or updates an Azure AI Search Index to include a vector field for storing the document embeddings

In [4]:
def create_or_update_index(client, index_name):
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchField(
            name="text",
            type=SearchFieldDataType.String,
            searchable=True,
        ),
        SearchField(
            name="embedding",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single), # OData syntax for 32-bit floating point number
            # type="Collection(Edm.SByte)",  # OData syntax for 8-bit signed integer
            vector_search_dimensions=1024,
            vector_search_profile_name="my-vector-config",
            # hidden=False, Use hidden=False if you want to return the embeddings in the search results
        ),
    ]

    vector_search = VectorSearch(
        profiles=[
            VectorSearchProfile(
                name="my-vector-config",
                algorithm_configuration_name="my-hnsw",
            )
        ],
        # hierarchical navigable small world (HNSW) algorithm configuration
        # https://learn.microsoft.com/en-gb/azure/search/vector-search-overview#nearest-neighbors-search
        algorithms=[
            HnswAlgorithmConfiguration(
                name="my-hnsw",
                kind=VectorSearchAlgorithmKind.HNSW,
            )
        ],
    )

    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
    client.create_or_update_index(index=index)

## Index Documents and Their Embeddings

Finally, this function indexes the documents along with their float embeddings into Azure AI Search. For demonstration, document IDs are generated sequentially, but in practical applications, it's essential to use meaningful identifiers like database row ID, unique filenames, or any other unique metadata associated with the document.

In [5]:
def index_documents(search_client, documents, embeddings):
    documents_to_index = [
        {"id": str(idx), "text": doc, "embedding": emb}
        for idx, (doc, emb) in enumerate(zip(documents, embeddings))
    ]
    search_client.upload_documents(documents=documents_to_index)

## Run the workflow

In [6]:
documents = [
    "Alan Turing  was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.",
    "Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.",
    "Isaac Newton was an English polymath active as a mathematician, physicist, astronomer, alchemist, theologian, and author who was described in his time as a natural philosopher.",
    "Marie Curie was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity"
]

# Generate embeddings
embeddings = gen_text_embeddings(documents, model=embed_model)

# Initialize Azure Search Index Client
search_index_client = SearchIndexClient(
    endpoint=search_endpoint,
    credential=search_credential,
    index_name=index_name
)

# Create or update the search index to include the embedding field
create_or_update_index(search_index_client, index_name)

# Initialize the SearchClient
search_client = SearchClient(
    endpoint=search_endpoint, 
    index_name=index_name, 
    credential=search_credential
)

In [7]:
# Index the documents and their embeddings
index_documents(search_client, documents, embeddings)

### Perform a Vector Search

In [8]:
# create a logger for the 'azure' SDK
# import logging, sys
# logger = logging.getLogger("azure")
# logger.setLevel(logging.DEBUG)

# # configure a console output
# handler = logging.StreamHandler(stream=sys.stdout)
# logger.addHandler(handler)

# from azure.search.documents import SearchClient

# Query for vector search
query = "foundational figures in computer science"

# Generate query embeddings for text query 
query_embeddings = gen_text_embeddings(query, model=embed_model)

search_client = SearchClient(search_endpoint, index_name, search_credential, logging_enable=False)

In [9]:
## retrieve the ranked documents based on the vector search with 2 nearest neighbors
vector_query = VectorizedQuery(
    vector=query_embeddings[0], k_nearest_neighbors=3, fields="embedding"
)

results = search_client.search(
    search_text=None,  # No search text for pure vector search
    vector_queries=[vector_query],
)

A search result look like this.
```json
{'text': 'Marie Curie was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity',
 'id': '3',
 '@search.score': 0.6064307,
 '@search.reranker_score': None,
 '@search.highlights': None,
 '@search.captions': None}
 ```

In [10]:
for result in results:
    print(f"Text: {result['text']}")
    print(f"Score: {result['@search.score']}\n")

Text: Alan Turing  was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.
Score: 0.6697743

Text: Isaac Newton was an English polymath active as a mathematician, physicist, astronomer, alchemist, theologian, and author who was described in his time as a natural philosopher.
Score: 0.616247

Text: Marie Curie was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity
Score: 0.6064307



## Perform a keyword search

In [11]:
# create a logger for the 'azure' SDK
# import logging, sys
# logger = logging.getLogger("azure")
# logger.setLevel(logging.DEBUG)

# # configure a console output
# handler = logging.StreamHandler(stream=sys.stdout)
# logger.addHandler(handler)

# results = search_client.search(search_text="foundational figures in computer science", vector_queries=[vector_query], top=1)

my_query = "Der Wissenschaftler?" # "Science Guy?" in German
my_query_embeddings = gen_text_embeddings(query, model=embed_model)
my_vector_query = VectorizedQuery(
    vector=query_embeddings[0], k_nearest_neighbors=3, fields="embedding"
)

## While the keyword based search is not able to find any document matching the query.
results = search_client.search(search_text=my_query, top=2, logging_enable=False)
# results = search_client.search(search_text="scientist", top=2, select="text")

for result in results:
    print(f"Text: {result['text']}")
    print(f"Score: {result['@search.score']}\n")

In [12]:
## The vector search is able to find the documents matching the query.
results = search_client.search(search_text=my_query, vector_queries=[my_vector_query], top=2)

for result in results:
    print(f"Text: {result['text']}")
    print(f"Score: {result['@search.score']}\n")

Text: Alan Turing  was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.
Score: 0.01666666753590107

Text: Isaac Newton was an English polymath active as a mathematician, physicist, astronomer, alchemist, theologian, and author who was described in his time as a natural philosopher.
Score: 0.016393441706895828



In [13]:
## retrieve the document by key
json_doc = search_client.get_document(key="0")
json_doc

{'id': '0',
 'text': 'Alan Turing  was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.'}

## Search with Image Embedding

Search with the image of Albert Einstein in the vector embedding of document.

In [14]:
# too large to display in the notebook
# img_query_url = "https://upload.wikimedia.org/wikipedia/commons/3/3f/Albert_Einstein_1921_by_F_Schmutzer.jpg"
img_query_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Albert_Einstein_1921_by_F_Schmutzer.jpg/500px-Albert_Einstein_1921_by_F_Schmutzer.jpg"

# cache the image and send to get the embedding
img_query_embeddings = gen_img_embeddings(img_query_url, image_format="jpeg", model=embed_model)

img_vector_query = VectorizedQuery(
    vector=img_query_embeddings[0], k_nearest_neighbors=3, fields="embedding"
)

In [15]:
results = search_client.search(
    search_text=None,  # No search text for pure vector search
    vector_queries=[img_vector_query], top=1,
    select=["text"],  # select the fields to return
)

for result in results:
    print(f"Text: {result['text']}")
    print(f"Score: {result['@search.score']}\n")

Text: Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.
Score: 0.6954385

