# 2. Azure AI Search: Indexing Chunked and Embedded Text

In this notebook, we will use the chunked and embedded text generated in the previous notebook to create and populate an Azure AI Search index. This process involves initializing the search client, defining the index schema, creating the index (any existing index with the same name is deleted first), and uploading the documents.

## 2.1 Import Libraries and Load Environment Variables

In [None]:
# Import necessary libraries
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv
import os
import json
import uuid
  
# Load environment variables from .env file
load_dotenv()

# Get the service name and admin key from environment variables
service_name = os.getenv('AZURE_AI_SEARCH_SERVICE_NAME')
admin_key = os.getenv('AZURE_AI_SEARCH_ADMIN_KEY')

# Fail fast when a setting is absent or empty. Without this check a missing
# service name silently yields the endpoint "https://None.search.windows.net",
# and a missing key is only rejected later with no hint about which variable
# needs to be set.
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ('AZURE_AI_SEARCH_SERVICE_NAME', service_name),
        ('AZURE_AI_SEARCH_ADMIN_KEY', admin_key),
    )
    if not setting_value
]
if missing_settings:
    raise ValueError(
        "Missing required environment variable(s): "
        + ", ".join(missing_settings)
        + ". Define them in the environment or in the .env file."
    )

# Build the service endpoint URL and the key credential shared by the
# index client and the search client created later in this notebook
endpoint = f"https://{service_name}.search.windows.net"
credential = AzureKeyCredential(admin_key)

## 2.2 Initialize Azure AI Search Client

In [None]:
# Import the SearchIndexClient from Azure SDK  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.indexes.models import (  
    SimpleField, SearchFieldDataType, SearchableField, SearchField,  
    VectorSearch, HnswAlgorithmConfiguration, VectorSearchProfile,  
    SemanticConfiguration, SemanticPrioritizedFields, SemanticField,  
    SemanticSearch, SearchIndex  
)  
  
# Create the client that this notebook uses to delete and (re)create the index
index_client = SearchIndexClient(
    endpoint=endpoint,
    credential=credential,
)

## 2.3 Define Index Schema

In [None]:
# Name of the index that the rest of the notebook creates and populates
index_name = "example-index"

# Schema: one entry per field stored for every indexed chunk
fields = [
    # Document key; must be unique across the index
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
    ),
    # Title of the source the chunk came from
    SimpleField(
        name="title",
        type=SearchFieldDataType.String,
    ),
    # Identifier of the chunk, stored as a string
    SimpleField(
        name="chunk_id",
        type=SearchFieldDataType.String,
    ),
    # Chunk text, searchable with full-text queries
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
    ),
    # Embedding of the chunk text, tied to the "myHnswProfile" vector profile.
    # NOTE(review): 1536 is assumed to match the length of the embeddings
    # produced in the previous notebook -- confirm if the model changes.
    SearchField(
        name="contentVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
]
  
# Vector search setup: a single HNSW algorithm configuration ("myHnsw") and a
# single profile ("myHnswProfile") that points at it. The profile name is the
# one the contentVector field refers to.
vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name="myHnsw")],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
        ),
    ],
)
  
# Semantic configuration: "title" is the title field and "content" is the
# only content field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="content")],
        title_field=SemanticField(field_name="title"),
    ),
)

# Wrap the configuration in the semantic search settings attached to the index
semantic_search = SemanticSearch(configurations=[semantic_config])


## 2.4 Drop Index if It Exists and Create Search Index

In [None]:
# Start from a clean slate: try to remove an index left over from an earlier
# run. Any failure is reported and then ignored so the notebook can continue.
try:
    index_client.delete_index(index_name)
except Exception as e:
    print(f"Index '{index_name}' does not exist or could not be deleted: {e}")
else:
    print(f"Index '{index_name}' deleted successfully.")

# Assemble the index definition from the schema, vector search and semantic
# search settings, then create it on the service
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f"Index '{index_name}' created successfully with vector search and semantic search configurations.")


## 2.5 Load and Upload Chunked Data

In [None]:
# Import the SearchClient from Azure SDK  
from azure.search.documents import SearchClient  
  
# Initialize the SearchClient used to push documents into the index
search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)

# Define the embeddings folder
embeddings_folder = 'embeddings'

# Number of documents sent per request. Azure AI Search caps a single indexing
# request (1000 documents / about 16 MB), and each document here carries a
# full embedding vector, so a conservative batch size keeps requests small.
upload_batch_size = 100

# Load JSON files from the embeddings folder.
# Each file is expected to hold a list of chunks with the keys 'chunk_id',
# 'chunk_text' and 'chunk_embedding'.
documents = []
for file_name in sorted(os.listdir(embeddings_folder)):
    if not file_name.endswith('.json'):
        continue
    # Drop only the trailing extension; str.replace would also remove any
    # '.json' that happens to appear in the middle of the file name.
    title = os.path.splitext(file_name)[0]
    with open(os.path.join(embeddings_folder, file_name), 'r', encoding='utf-8') as file:
        chunked_data = json.load(file)
    for chunk in chunked_data:
        documents.append({
            # A fresh random key for every chunk
            "id": str(uuid.uuid4()),
            "title": title,
            "chunk_id": str(chunk['chunk_id']),
            "content": chunk['chunk_text'],
            "contentVector": chunk['chunk_embedding'],
        })

# Upload the documents to the search index in batches, collecting the
# per-document results. Nothing is sent when no documents were loaded.
result = []
for batch_start in range(0, len(documents), upload_batch_size):
    batch = documents[batch_start:batch_start + upload_batch_size]
    result.extend(search_client.upload_documents(documents=batch))

# Per-document failures are reported in the returned results, so surface them
# explicitly instead of printing raw result objects.
failed_results = [item for item in result if not item.succeeded]
print(f"Documents uploaded: {len(result) - len(failed_results)} of {len(documents)} succeeded")
for item in failed_results:
    print(f"Failed to upload document '{item.key}': {item.error_message}")
