In [None]:
%load_ext autoreload
%autoreload 2
from configparser import ConfigParser
from urllib.parse import quote

# "set-env" is a flat list of KEY=VALUE lines without a section header, which
# ConfigParser cannot parse on its own, so a synthetic [DEFAULT] header is
# prepended before parsing.
# interpolation=None returns values verbatim: with the default
# BasicInterpolation, a secret containing "%" raises an interpolation error
# as soon as it is read.
config = ConfigParser(interpolation=None)
with open("set-env") as stream:
    config.read_string("[DEFAULT]\n" + stream.read())

# Bind the section once; the name `config` is reused further down the
# notebook for the embedding job configuration.
_defaults = config["DEFAULT"]


def _url_quote(value):
    """Percent-encode a credential for the userinfo part of a URL.

    Characters such as "@", ":", "/" or "+" in a username or password would
    otherwise be read as URL delimiters. ``None`` (setting absent) is passed
    through unchanged so a missing key still renders as the text "None"
    instead of raising.
    """
    return None if value is None else quote(value, safe="")


# --- PostgreSQL (SQLAlchemy-style URL) ---
pg_host = _defaults.get("PG_HOST")
pg_uname = _defaults.get("PG_UNAME")
pg_secret = _defaults.get("PG_SECRET")
pg_db = _defaults.get("PG_DB")
conn_str_alchemy = f"postgresql://{_url_quote(pg_uname)}:{_url_quote(pg_secret)}@{pg_host}/{pg_db}"

# --- Azure OpenAI ---
openai_api_endpoint = _defaults.get("OPENAI_API_ENDPOINT")
openai_api_key = _defaults.get("OPENAI_API_KEY")
openai_chat_deployment = _defaults.get("OPENAI_CHAT_DEPLOYMENT")
openai_embedding_deployment = _defaults.get("OPENAI_EMBEDDING_DEPLOYMENT")
# The endpoints are left as None when a component is missing: formatting a
# missing setting into the f-string would yield the non-empty string
# "https://None/...", which passes any `if endpoint:` check downstream.
openai_chat_endpoint = (
    f"https://{openai_api_endpoint}/openai/deployments/{openai_chat_deployment}/chat/completions?api-version=2023-12-01-preview"
    if openai_api_endpoint and openai_chat_deployment
    else None
)
openai_embedding_endpoint = (
    f"https://{openai_api_endpoint}/openai/deployments/{openai_embedding_deployment}/embeddings?api-version=2023-08-01-preview"
    if openai_api_endpoint and openai_embedding_deployment
    else None
)

# --- Document analysis service ---
docai_api_endpoint = _defaults.get("DOCAI_API_ENDPOINT")
docai_api_key = _defaults.get("DOCAI_API_KEY")
# None when the host is not configured, for the same truthiness reason as
# the OpenAI endpoints above.
docai_endpoint = f"https://{docai_api_endpoint}/" if docai_api_endpoint else None

# --- Index store (MongoDB SRV connection string) ---
index_store_host = _defaults.get("INDEX_STORE_HOST")
index_store_uname = _defaults.get("INDEX_STORE_UNAME")
index_store_secret = _defaults.get("INDEX_STORE_SECRET")
conn_str_index_store = f"mongodb+srv://{_url_quote(index_store_uname)}:{_url_quote(index_store_secret)}@{index_store_host}/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"

# --- Cache (Redis URL, password-only auth, database 0) ---
cache_secret = _defaults.get("CACHE_SECRET")
cache_host = _defaults.get("CACHE_HOST")
cache_port = _defaults.get("CACHE_PORT")
conn_str_redis = f"redis://:{_url_quote(cache_secret)}@{cache_host}:{cache_port}/0"

In [None]:
import os
from typing import Optional, Dict, List, Any
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.identity import AzureCliCredential

from ai_knowledge_base.embedding.nosql import create_index
    
@dataclass_json
@dataclass
class NoSQLEmbeddingConfig:
    """
    Embedding configuration

    Settings bundle for one data-preparation / indexing run. In this
    notebook an instance is built with ``from_dict`` (added by the
    ``@dataclass_json`` decorator) and passed as the first argument to
    ``create_index``. The first six fields are required; the remaining
    four have defaults.
    """
    # Location of the source data; the notebook passes a local directory path.
    data_path: str
    # Connection URI of the NoSQL store; the notebook passes a mongodb+srv:// URI.
    connection_string: str
    # Target database and collection names.
    # NOTE(review): presumably where create_index writes its output — confirm.
    database_name: str
    collection_name: str
    # Name of the index. The notebook treats a non-empty value as "vector
    # search enabled" and then requires an embedding endpoint.
    index_name: str
    # NOTE(review): presumably the document field that stores the embedding
    # vector — confirm against create_index.
    vector_field: str
    # Optional language setting; how it is interpreted is not visible here.
    language: Optional[str] = None
    # NOTE(review): presumably the chunk length and the overlap between
    # consecutive chunks used when splitting documents; units not shown here.
    chunk_size: Optional[int] = 1024
    token_overlap: Optional[int] = 128
    # Document-analysis model type. The notebook passes use_layout=True to
    # create_index only when this equals "Layout".
    docai_model_type: Optional[str] = "Layout"

# Job configuration for this run. The connection string is the index-store
# URI assembled in the settings cell; the database, collection, index and
# vector-field names are empty strings here and must be filled in per run.
config = NoSQLEmbeddingConfig.from_dict(
    dict(
        data_path="/Users/zjia/Resultant/OneDrive - Resultant/Optimus",
        connection_string=conn_str_index_store,
        database_name="",
        collection_name="",
        index_name="",
        vector_field="",
    )
)

form_recognizer_client = None
njobs = 1

credential = AzureCliCredential()

print("Data preparation script started")
if docai_endpoint and docai_api_key:
    # Publish the document-analysis settings through the process environment
    # (endpoint first, then key).
    os.environ.update(
        FORM_RECOGNIZER_ENDPOINT=docai_endpoint,
        FORM_RECOGNIZER_KEY=docai_api_key,
    )
    # A client object is only created here for single-job runs.
    if njobs == 1:
        form_recognizer_client = DocumentAnalysisClient(
            endpoint=docai_endpoint,
            credential=AzureKeyCredential(docai_api_key),
        )

# A non-empty index name means vector search is requested, which needs an
# embedding endpoint.
_vector_search_requested = bool(config.index_name)
if _vector_search_requested and not openai_embedding_endpoint:
    raise Exception("ERROR: Vector search is enabled in the config, but no embedding model endpoint and key were provided. Please provide these values or disable vector search.")
print("Preparing data for index:", config.index_name)

create_index(
    config,
    credential,
    form_recognizer_client,
    embedding_model_endpoint=openai_embedding_endpoint,
    use_layout=config.docai_model_type == "Layout",
    njobs=njobs,
)
print("Data preparation for index", config.index_name, "completed")