In [1]:
%load_ext autoreload
%autoreload 2
from configparser import ConfigParser
from urllib.parse import quote

# Read settings from the local `set-env` file. The file has no section header
# (presumably plain KEY=value lines -- verify against the actual file), and
# ConfigParser refuses section-less input, so a synthetic [DEFAULT] header is
# prepended before parsing.
# Interpolation is disabled: with the default BasicInterpolation a literal '%'
# in any value (common in generated secrets) raises InterpolationSyntaxError
# as soon as the value is read.
config = ConfigParser(interpolation=None)
with open("set-env") as stream:
    config.read_string("[DEFAULT]\n" + stream.read())

_env = config['DEFAULT']


def _url_quote(value):
    """Percent-encode a credential for the userinfo part of a connection URL.

    Characters such as '@', ':', '/' and '+' would otherwise be parsed as URL
    delimiters and corrupt the connection string. ``str()`` keeps the previous
    rendering of a missing setting (``None`` -> "None") instead of raising.
    NOTE(review): assumes the values in `set-env` are stored raw, not already
    percent-encoded -- confirm.
    """
    return quote(str(value), safe="")


# --- PostgreSQL (SQLAlchemy URL) ---
pg_host = _env.get("PG_HOST")
pg_uname = _env.get("PG_UNAME")
pg_secret = _env.get("PG_SECRET")
pg_db = _env.get("PG_DB")
conn_str_alchemy = f"postgresql://{_url_quote(pg_uname)}:{_url_quote(pg_secret)}@{pg_host}/{pg_db}"

# --- OpenAI deployments (chat + embedding REST endpoints) ---
openai_api_endpoint = _env.get("OPENAI_API_ENDPOINT")
openai_api_key = _env.get("OPENAI_API_KEY")
openai_chat_deployment = _env.get("OPENAI_CHAT_DEPLOYMENT")
openai_embedding_deployment = _env.get("OPENAI_EMBEDDING_DEPLOYMENT")
openai_chat_endpoint = f"https://{openai_api_endpoint}/openai/deployments/{openai_chat_deployment}/chat/completions?api-version=2023-12-01-preview"
openai_embedding_endpoint = f"https://{openai_api_endpoint}/openai/deployments/{openai_embedding_deployment}/embeddings?api-version=2023-08-01-preview"

# --- Document Intelligence ---
docai_api_endpoint = _env.get("DOCAI_API_ENDPOINT")
docai_api_key = _env.get("DOCAI_API_KEY")
docai_endpoint = f"https://{docai_api_endpoint}"

# --- AI Search ---
aisearch_api_endpoint = _env.get("AISEARCH_API_ENDPOINT")
aisearch_api_key = _env.get("AISEARCH_API_KEY")
aisearch_endpoint = f"https://{aisearch_api_endpoint}"

# --- Index store (MongoDB SRV connection string) ---
index_store_host = _env.get("INDEX_STORE_HOST")
index_store_uname = _env.get("INDEX_STORE_UNAME")
index_store_secret = _env.get("INDEX_STORE_SECRET")
conn_str_index_store = f"mongodb+srv://{_url_quote(index_store_uname)}:{_url_quote(index_store_secret)}@{index_store_host}/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"

# --- Cache (Redis URL, password-only auth, database 0) ---
cache_secret = _env.get("CACHE_SECRET")
cache_host = _env.get("CACHE_HOST")
cache_port = _env.get("CACHE_PORT")
conn_str_redis = f"redis://:{_url_quote(cache_secret)}@{cache_host}:{cache_port}/0"

In [2]:
import os
from typing import Optional, Dict, List, Any
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
# from azure.ai.formrecognizer import DocumentIntelligenceClient
from azure.ai.documentintelligence import DocumentIntelligenceClient 
from azure.core.credentials import AzureKeyCredential
from azure.identity import AzureCliCredential

from ai_knowledge_base.embedding.ai_search import create_index
    
@dataclass_json
@dataclass
class AISearchEmbeddingConfig:
    """
    Settings for one search-index build.

    In this notebook an instance is created with ``from_dict`` and handed to
    ``create_index``. ``data_path``, ``search_service_endpoint`` and
    ``index_name`` have no default and must be supplied; every other field
    falls back to the default declared below.
    """
    # Source of the documents to index; this notebook passes a blob-storage URL.
    data_path: str
    # Base URL of the search service (this notebook passes "https://<host>").
    search_service_endpoint: str
    # Name of the search index to build.
    index_name: str
    # NOTE(review): presumably prepended to file names to form document URLs -- confirm in create_index.
    url_prefix: Optional[str] = None
    # NOTE(review): presumably the language/analyzer for the index -- confirm in create_index.
    language: Optional[str] = None
    # NOTE(review): chunk size and overlap, presumably measured in tokens -- confirm in create_index.
    chunk_size: Optional[int] = 1024
    token_overlap: Optional[int] = 128
    # NOTE(review): presumably the names of the semantic / vector configurations on the index -- confirm.
    semantic_config_name: Optional[str] = "semantic_default"
    vector_config_name: Optional[str] = "vector_default"
    # The notebook passes use_layout=True to create_index only when this equals "Layout".
    docai_model_type: Optional[str] = "Layout"

# Build the indexing configuration; fields not listed keep the defaults
# declared on AISearchEmbeddingConfig.
# NOTE: this rebinds `config`, which the first cell used for the ConfigParser,
# so re-running the first cell requires re-running this one as well.
config = AISearchEmbeddingConfig.from_dict({
    "data_path": "https://akbadls2.blob.core.windows.net/optimus-knowledge/",
    "url_prefix": "https://akbadls2.blob.core.windows.net/optimus-knowledge/",
    "search_service_endpoint": aisearch_endpoint,
    "index_name": "ratchet-openai-embedding"
})

# Passed to create_index as `njobs`. The in-process Document Intelligence
# client below is only created when this is 1.
njobs = 1

credential = AzureCliCredential()
form_recognizer_client = None
print("Data preparation script started")

# Validate before mutating os.environ. The derived endpoint variables are
# f-strings that always start with "https://", so they are truthy even when the
# underlying setting is missing; the raw settings are tested instead.
if config.index_name and not (openai_api_endpoint and openai_embedding_deployment and openai_api_key):
    raise ValueError("ERROR: Vector search is enabled in the config, but no embedding model endpoint and key were provided. Please provide these values or disable vector search.")

# Export the settings as environment variables (presumably read by
# create_index and its helpers -- confirm in ai_knowledge_base).
os.environ["AZURE_SEARCH_ADMIN_KEY"] = aisearch_api_key
os.environ["EMBEDDING_MODEL_KEY"] = openai_api_key
os.environ["EMBEDDING_MODEL_ENDPOINT"] = openai_embedding_endpoint

# Document Intelligence is optional: configure it only when both the host and
# the key are present (testing the raw host, not the always-truthy URL).
if docai_api_endpoint and docai_api_key:
    os.environ["FORM_RECOGNIZER_ENDPOINT"] = docai_endpoint
    os.environ["FORM_RECOGNIZER_KEY"] = docai_api_key
    if njobs == 1:
        form_recognizer_client = DocumentIntelligenceClient(
            endpoint=docai_endpoint,
            credential=AzureKeyCredential(docai_api_key),
            api_version="2023-10-31-preview"
        )

print("Preparing data for index:", config.index_name)



Data preparation script started
Preparing data for index: ratchet-openai-embedding


In [3]:
# Build the index from `config`; layout parsing is requested only when the
# configured Document Intelligence model type is exactly "Layout".
create_index(
    config,
    credential,
    form_recognizer_client,
    embedding_model_endpoint=openai_embedding_endpoint,
    use_layout=(config.docai_model_type == "Layout"),
    njobs=njobs,
)
print("Data preparation for index", config.index_name, "completed")

Created search index ratchet-openai-embedding
Chunking path https://akbadls2.blob.core.windows.net/optimus-knowledge/...
Downloading https://akbadls2.blob.core.windows.net/optimus-knowledge/ to local folder
Downloaded.
Total files to process=65 out of total directory size=65
Single process to chunk and parse the files. --njobs > 1 can help performance.


  5%|▍         | 3/65 [01:38<33:29, 32.41s/it]Unable to retrieve continuation token: cannot pickle '_io.BufferedReader' object
  6%|▌         | 4/65 [02:08<32:07, 31.60s/it]

File (/var/folders/yl/gt92zxj943n1k6b_4t935wwm0000gp/T/tmpmyoa46oe/Optimus Risk Assessment.xlsx) failed with  (InternalServerError) An unexpected error occurred.
Code: InternalServerError
Message: An unexpected error occurred.
Exception Details:	(InternalServerError) An unexpected error occurred.
	Code: InternalServerError
	Message: An unexpected error occurred.
	Target: 0


100%|██████████| 65/65 [1:05:30<00:00, 60.47s/it] 


Processed 65 files
Unsupported formats: 0 files
Files with errors: 1 files
Found 3138 chunks
Uploading documents to index...


Indexing Chunks...: 100%|██████████| 63/63 [00:33<00:00,  1.90it/s]


Validating index...
Index is empty. Waiting 60 seconds to check again...
The index contains 3138 chunks.
The average chunk size of the index is 27821.827597195665 bytes.
Index validation completed
Data preparation for index ratchet-openai-embedding completed
