In [17]:
from dotenv import dotenv_values

# Name of the .env file holding the service settings used throughout this notebook.
env_name = "azure.env" # change to your own .env file name
config = dotenv_values(env_name)

# dotenv_values returns an empty mapping when the file is missing or has no
# entries; fail here with a clear message rather than with a KeyError on the
# first config[...] lookup in a later cell.
if not config:
    raise RuntimeError(
        f"No settings were loaded from '{env_name}'. "
        "Check that the file exists in the working directory and is not empty."
    )

In [None]:
import os
from azure.identity import DefaultAzureCredential, ManagedIdentityCredential
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
    AzureOpenAIEmbeddingSkill,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    BlobIndexerImageAction,
    CorsOptions,
    HnswAlgorithmConfiguration,
    IndexingParameters,
    IndexingParametersConfiguration,
    IndexProjectionMode,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SearchIndexerIndexProjection,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    SearchIndexerSkillset,
    SimpleField,
    SplitSkill,
    VectorSearch,
    VectorSearchProfile
)

# All settings below are read from the `config` mapping loaded from the .env file.
# A missing key raises KeyError, which points at the entry to add to that file.

# Azure Multi AI Services
cogservices_key = config["cogservices_key"]
cogservices_openAI_endpoint = config["cogservices_aoai_endpoint"]
cogservices_docintel_endpoint = config["cogservices_docintel_endpoint"]

# Azure OpenAI Service details
openai_type = config["openai_api_type"]
openai_api_base = config['openai_api_base']
openai_api_version = config['openai_api_version']
openai_deployment_completion = config["openai_deployment_completion"]
openai_model_completion = config["openai_model_completion"]
openai_deployment_embedding = config["openai_deployment_embedding"]
openai_model_embedding = config["openai_model_embedding"]
# Number of dimensions of the embedding vectors; shared by the index field and the embedding skill.
EMBEDDING_LENGTH = 1536

# Azure AI Search Service details
aisearch_endpoint = config["aisearch_endpoint"]
index_name = config["aisearch_index_name"] # Desired name of index -- does not need to exist already
vectorConfigName = "contentVector_config"

# Storage Account Service details
blob_conn_string = config["BLOB_CONNECTION_STRING_MSI"]
blob_container = SearchIndexerDataContainer(name=config["BLOB_CONTAINER_NAME"])
storage_name = config["STORAGE_ACCOUNT_NAME"]

# Identity
uami_id = config["UAMI_RESOURCE_ID"]

credential = DefaultAzureCredential()
# UAMI_RESOURCE_ID is (per its name) the ARM resource ID of the user-assigned
# identity. ManagedIdentityCredential selects an identity by resource ID through
# identity_config; its client_id keyword expects the identity's client ID instead.
# NOTE(review): if the .env entry actually stores a client ID, switch back to client_id=uami_id.
managedCredential = ManagedIdentityCredential(identity_config={"resource_id": uami_id})

print(managedCredential)
print(credential)

## Create data source connection

This step creates a connection that will be used to pull data from our document container in the Storage Account.

See the [blob indexing documentation](https://learn.microsoft.com/en-us/azure/search/search-howto-indexing-azure-blob-storage) for details.

In [None]:
# Describe the source the indexer pulls from: the blob container on the storage account.
data_source_connection = SearchIndexerDataSourceConnection(
    connection_string=blob_conn_string,
    container=blob_container,
    type="azureblob",
    name="{}-storageblob-connection".format(storage_name),
)

# Authorization is role based: the client sends an OAuth token obtained through
# DefaultAzureCredential instead of a search admin key. This may later be replaced
# by a user-assigned identity or a service principal.
ds_client = SearchIndexerClient(aisearch_endpoint, credential)

# Register the connection with the search service (creates it, or updates it when it already exists).
data_source = ds_client.create_or_update_data_source_connection(data_source_connection)

print("Data source '{}' created or updated".format(data_source.name))

In [20]:
# Create (or update) the search index.

name = index_name

# Field notes:
# - chunk_id is the document key. It is a searchable string field using the "keyword"
#   analyzer, which is what index projections expect of the key of the target index.
# - parent_id is a filterable, non-key string field; the index projection in the skillset
#   writes the parent document key into it (see parent_key_field_name there).
# - contentVector is declared with SearchField because it is a vector field: it needs
#   vector_search_dimensions and vector_search_profile_name, the latter naming the profile
#   defined in the VectorSearch configuration below.
# - SearchField has no `retrievable` keyword; `hidden=False` is what makes the field retrievable.
fields = [
    SearchableField(name="chunk_id", type=SearchFieldDataType.String, filterable=True, key=True, retrievable=True, sortable=True, analyzer_name="keyword"),
    SearchableField(name="parent_id", type=SearchFieldDataType.String, retrievable=True, filterable=True, sortable=True, facetable=True),
    SearchableField(name="content", type=SearchFieldDataType.String, retrievable=True),
    SearchableField(name="title", type=SearchFieldDataType.String, retrievable=True),
    SimpleField(name="url", type=SearchFieldDataType.String, retrievable=True),
    SimpleField(name="filepath", type=SearchFieldDataType.String, retrievable=True),
    # Dimensions and profile name come from the shared settings so they cannot drift
    # from the values used by the embedding skill and the vector profile.
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), hidden=False, vector_search_dimensions=EMBEDDING_LENGTH, vector_search_profile_name=vectorConfigName)
]

vector_search=VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            kind="hnsw",
            parameters={
                "m": 4,
                "efConstruction":400,
                "efSearch":500,
                "metric":"cosine"
            }
        )
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            vectorizer_name="myOpenAI",
            kind="azureOpenAI",
            parameters=AzureOpenAIVectorizerParameters(
                resource_url=openai_api_base,
                deployment_name=openai_deployment_embedding,
                #api_key=openai_key, #Leave this blank to use system assigned managed identity
                model_name=openai_model_embedding
                )
            )
    ],
    profiles=[
        # The profile ties the HNSW algorithm and the Azure OpenAI vectorizer together;
        # its name must match vector_search_profile_name on the contentVector field.
        VectorSearchProfile(
            name=vectorConfigName,
            algorithm_configuration_name="myHnsw",
            vectorizer_name="myOpenAI"
        )
    ]
)

cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)

# pass in the name, fields and cors options and create the index
index = SearchIndex(name=name, fields=fields, cors_options=cors_options, vector_search=vector_search)
index_client = SearchIndexClient(aisearch_endpoint, credential)#AzureKeyCredential(aisearch_key))
# create_or_update_index keeps this cell re-runnable: create_index raises once the index
# already exists, whereas the other cells in this notebook all use create_or_update_*.
result = index_client.create_or_update_index(index)

## Create skillset

The skillset chains the following pre-built skills:
1. The OCR skill extracts the characters from any PDF or image file and passes the text to the next skill. (This skill is only part of the commented-out `create_ocr_skillset` variant in the code below; the active `create_skillset` function does not include it.)
2. The Split skill takes the concatenated text and divides it into chunks (to stay within the token limits of the Azure OpenAI embedding service).
3. The Azure OpenAI Embedding skill takes the outputs of the Split skill and vectorizes them individually.

Afterwards, we apply an index projection so that the final index has one item for every chunk of text.

We recommend the following resources to learn more about the process and how one can adapt it to different applications:
* [Overview of indexers](https://learn.microsoft.com/en-us/azure/search/search-indexer-overview)
* [Skillset concepts in AI Search](https://learn.microsoft.com/en-us/azure/search/cognitive-search-working-with-skillsets)
* [Reference inputs and outputs in skillsets](https://learn.microsoft.com/en-us/azure/search/cognitive-search-concept-annotations-syntax)


In [None]:
# Name under which the skillset is registered; derived from the index name.
skillset_name = "{}-skillset".format(index_name)

# def create_ocr_skillset():
#     ocr_skill = OcrSkill(
#         description="OCR skill to scan PDFs and other images with text",
#         context="/document/normalized_images/*",
#         line_ending="Space",
#         default_language_code="en",
#         should_detect_orientation=True,
#         inputs=[
#             InputFieldMappingEntry(name="image", source="/document/normalized_images/*")
#         ],
#         outputs=[
#             OutputFieldMappingEntry(name="text", target_name="text"),
#             OutputFieldMappingEntry(name="layoutText", target_name="layoutText")
#         ]
#     )

#     split_skill = SplitSkill(
#         description="Split skill to chunk documents",
#         text_split_mode="pages",
#         context="/document/text_docs/*",
#         maximum_page_length=2000,
#         page_overlap_length=500,
#         inputs=[
#             InputFieldMappingEntry(name="text", source="/document/normalized_images/*/text"),
#         ],
#         outputs=[
#             OutputFieldMappingEntry(name="textItems", target_name="pages")
#         ]
#     )

#     embedding_skill = AzureOpenAIEmbeddingSkill(
#         description="Skill to generate embeddings via Azure OpenAI",
#         context="/document/normalized_images/*/pages/*",
#         resource_url=openai_api_base,
#         deployment_name=openai_deployment_embedding,
#         api_key=openai_key,
#         model_name=openai_model_embedding,
#         dimensions=1536,
#         inputs=[
#             InputFieldMappingEntry(name="text", source="/document/normalized_images/*/pages/*"),
#         ],
#         outputs=[
#             OutputFieldMappingEntry(name="embedding", target_name="vector")
#         ]
#     )

#     index_projections = SearchIndexerIndexProjection(
#         selectors=[
#             SearchIndexerIndexProjectionSelector(
#                 target_index_name=index_name,
#                 parent_key_field_name="parent_id",
#                 source_context="/document/normalized_images/*/pages/*",
#                 mappings=[
#                     InputFieldMappingEntry(name="content", source="/document/normalized_images/*/pages/*"),
#                     InputFieldMappingEntry(name="contentVector", source="/document/normalized_images/*/pages/*/vector"),
#                     InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
#                     InputFieldMappingEntry(name="filepath", source="/document/normalized_images/*/pageNumber")
#                 ]
#             )
#         ],
#         parameters=SearchIndexerIndexProjectionsParameters(
#             projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
#         )
#     )

#     #cognitive_services_account = CognitiveServicesAccountKey(odata_type="CognitiveServicesByKey", key=cogservices_key)

#     skills = [ocr_skill, split_skill, embedding_skill]

#     return SearchIndexerSkillset(
#         name=skillset_name,
#         description="Skillset to chunk documents and generating embeddings",
#         skills=skills,
#         index_projections=index_projections,
#         #cognitive_services_account=cognitive_services_account
#     )

def create_skillset(maximum_page_length=2000, page_overlap_length=300):
    """Build the skillset that chunks documents and embeds each chunk.

    The skillset contains a SplitSkill that cuts ``/document/content`` into pages,
    an AzureOpenAIEmbeddingSkill that produces one vector per page, and an index
    projection that maps every page to its own document in ``index_name``.

    Reads the notebook-level settings ``index_name``, ``skillset_name``,
    ``openai_api_base``, ``openai_deployment_embedding``, ``openai_model_embedding``
    and ``EMBEDDING_LENGTH``.

    Args:
        maximum_page_length: Value passed to the SplitSkill as ``maximum_page_length``.
        page_overlap_length: Value passed to the SplitSkill as ``page_overlap_length``.

    Returns:
        A SearchIndexerSkillset. It is only built here, not sent to the search service.
    """
    split_skill = SplitSkill(
        name="SplitSkill",
        description="Split skill to chunk documents",
        text_split_mode="pages",
        context="/document",
        maximum_page_length=maximum_page_length,
        page_overlap_length=page_overlap_length,
        inputs=[
            InputFieldMappingEntry(name="text", source="/document/content"),
        ],
        outputs=[
            # The chunks become available to later skills as /document/pages/*.
            OutputFieldMappingEntry(name="textItems", target_name="pages")
        ]
    )

    embedding_skill = AzureOpenAIEmbeddingSkill(
        name="EmbeddingSkill",
        description="Skill to generate embeddings via Azure OpenAI",
        # Runs once per chunk produced by the split skill.
        context="/document/pages/*",
        resource_url=openai_api_base,
        deployment_name=openai_deployment_embedding,
        #api_key=openai_key, #remove this field and auth_identity to use system assigned managed identity instead
        #auth_identity=managedCredential,
        model_name=openai_model_embedding,
        # Shared constant so the skill output matches the dimensions declared on the index field.
        dimensions=EMBEDDING_LENGTH,
        inputs=[
            InputFieldMappingEntry(name="text", source="/document/pages/*"),
        ],
        outputs=[
            # Each chunk's vector is stored at /document/pages/*/vector.
            OutputFieldMappingEntry(name="embedding", target_name="vector")
        ]
    )

    # Skillsets create one document per atomic operation. We need to map the chunks
    # to each vector input and to their own content.
    index_projections = SearchIndexerIndexProjection(
        selectors=[
            SearchIndexerIndexProjectionSelector(
                target_index_name=index_name,
                parent_key_field_name="parent_id",
                source_context="/document/pages/*",
                mappings=[
                    InputFieldMappingEntry(name="content", source="/document/pages/*"),
                    InputFieldMappingEntry(name="contentVector", source="/document/pages/*/vector"),
                    #InputFieldMappingEntry(name="chunk_id", source="/document/pages/*/vector"),
                    #InputFieldMappingEntry(name="parent_id", source="/document/pages/*/vector"),
                    InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
                    InputFieldMappingEntry(name="url", source="/document/metadata_storage_path"),
                    InputFieldMappingEntry(name="filepath", source="/document/metadata_storage_path")
                ]
            )
        ],
        parameters=SearchIndexerIndexProjectionsParameters(
            # NOTE(review): the commented-out OCR variant of this skillset uses
            # SKIP_INDEXING_PARENT_DOCUMENTS. Confirm that also indexing the parent
            # documents is intended here.
            projection_mode=IndexProjectionMode.INCLUDE_INDEXING_PARENT_DOCUMENTS
        )
    )

    #cognitive_services_account = CognitiveServicesAccountKey(key=azure_ai_services_key) if use_ocr else None

    # Order matters for readability only: split first, then embed the resulting chunks.
    skills = [split_skill, embedding_skill]

    skillset = SearchIndexerSkillset(
        name=skillset_name,
        description="Skillset to chunk documents and generating embeddings",
        skills=skills,
        index_projection=index_projections,
        #cognitive_services_account=cognitive_services_account
    )

    return skillset

# Build the skillset definition. Only the non-OCR variant is active; the OCR variant
# (create_ocr_skillset) is commented out above.
skillset = create_skillset()

# Push the definition to the search service, creating it or overwriting an existing one.
client = SearchIndexerClient(aisearch_endpoint, credential)
client.create_or_update_skillset(skillset)
print("{} created".format(skillset.name))

In [None]:
# Create the indexer that ties data source, skillset and target index together, then start it.
indexer_name = "{}-indexer".format(index_name)

indexer_client = SearchIndexerClient(aisearch_endpoint, credential)

# Blob indexer settings: one normalized image is generated per page.
# NOTE(review): the active skillset only reads /document/content, and normalized images
# are referenced only by the commented-out OCR skillset. Confirm this image action is still needed.
indexer_parameters = IndexingParameters(
    configuration=IndexingParametersConfiguration(
        query_timeout=None,
        image_action=BlobIndexerImageAction.GENERATE_NORMALIZED_IMAGE_PER_PAGE,
    )
)

indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    data_source_name=data_source.name,
    skillset_name=skillset_name,
    target_index_name=index_name,
    parameters=indexer_parameters,
)

# Create the indexer on the service, or update it when it already exists.
indexer_result = indexer_client.create_or_update_indexer(indexer)

# Kick off a run immediately instead of waiting for a schedule.
indexer_client.run_indexer(indexer_name)
print(' {} is created and running. If queries return no results, please wait a bit and try again.'.format(indexer_name))