In [5]:
%load_ext autoreload
%autoreload 2
from configparser import ConfigParser
import logging
from urllib.parse import quote
import warnings

# Keep notebook output quiet: the root logger only emits ERROR and above,
# and all Python warnings are suppressed.
logging.basicConfig()
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")


def _url_quote(value):
    """Percent-encode a credential so it can be embedded in a connection URL.

    Reserved characters such as ``@``, ``:``, ``/``, ``#`` and ``?`` would
    otherwise be read as URL delimiters. ``safe=""`` encodes ``/`` as well, and
    spaces become ``%20``. Values made only of unreserved characters are
    returned unchanged.

    ``str()`` keeps a missing setting (``None``) rendering as ``"None"``,
    exactly as plain f-string interpolation would.
    """
    return quote(str(value), safe="")


# The file is read with a synthetic [DEFAULT] header prepended, which lets
# ConfigParser parse a file that has no section header of its own.
# interpolation=None: the values include secrets that may contain '%', which
# the default BasicInterpolation would reject when the value is fetched.
config = ConfigParser(interpolation=None)
with open("set-env") as stream:
    config.read_string("[DEFAULT]\n" + stream.read())

_env = config['DEFAULT']

# --- PostgreSQL -------------------------------------------------------------
pg_host = _env.get("DB_HOST")
pg_uname = _env.get("DB_UNAME")
pg_secret = _env.get("DB_SECRET")
pg_db = _env.get("DB_EMBEDDING_DB")
conn_str_alchemy = f"postgresql://{_url_quote(pg_uname)}:{_url_quote(pg_secret)}@{pg_host}/{pg_db}"

# --- OpenAI embedding deployment --------------------------------------------
openai_api_endpoint = _env.get("OPENAI_API_ENDPOINT")
openai_api_key = _env.get("OPENAI_API_KEY")
# openai_chat_deployment=config['DEFAULT'].get("OPENAI_CHAT_DEPLOYMENT")
openai_embedding_deployment = _env.get("OPENAI_EMBEDDING_DEPLOYMENT")
# openai_chat_endpoint=f"https://{openai_api_endpoint}/openai/deployments/{openai_chat_deployment}/chat/completions?api-version=2023-12-01-preview"
openai_embedding_endpoint = f"https://{openai_api_endpoint}/openai/deployments/{openai_embedding_deployment}/embeddings?api-version=2023-08-01-preview"

# --- Document extraction service --------------------------------------------
docai_api_endpoint = _env.get("DOCAI_API_ENDPOINT")
docai_api_key = _env.get("DOCAI_API_KEY")
docai_endpoint = f"https://{docai_api_endpoint}"

# --- Search service ---------------------------------------------------------
aisearch_api_endpoint = _env.get("AISEARCH_API_ENDPOINT")
aisearch_api_key = _env.get("AISEARCH_API_KEY")
aisearch_endpoint = f"https://{aisearch_api_endpoint}"

# --- Index store (MongoDB-style connection string) --------------------------
index_store_host = _env.get("INDEX_STORE_HOST")
index_store_uname = _env.get("INDEX_STORE_UNAME")
index_store_secret = _env.get("INDEX_STORE_SECRET")
conn_str_index_store = f"mongodb+srv://{_url_quote(index_store_uname)}:{_url_quote(index_store_secret)}@{index_store_host}/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"

# --- Redis cache (password-only auth, database 0) ---------------------------
cache_secret = _env.get("CACHE_SECRET")
cache_host = _env.get("CACHE_HOST")
cache_port = _env.get("CACHE_PORT")
conn_str_redis = f"redis://:{_url_quote(cache_secret)}@{cache_host}:{cache_port}/0"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Import only the name this notebook uses instead of a wildcard, so it is
# clear where IngestionConfig comes from and nothing else is pulled into the
# notebook namespace.
from ai_knowledge_base.config import IngestionConfig

# Ingestion settings for the OpportunityList corpus.
# NOTE: this rebinds `config`; from here on it is the IngestionConfig, no
# longer the ConfigParser used to read "set-env".
# NOTE(review): data_path and staging_path are absolute paths on one
# developer machine - adjust before running elsewhere.
config = IngestionConfig.from_dict({
    # "data_path": "https://akbadls2.blob.core.windows.net/resultant-sales-knowledge/OpportunityList/",
    "data_path": "/Users/zjia/Resultant/OneDrive - Resultant/OpportunityList/",
    "staging_path": "/Users/zjia/Workspace/gen-ai-knowledge-base/tmp/resultant-sales-knowledge/OpportunityList/",
    # "url_prefix": "https://akbadls2.blob.core.windows.net/resultant-sales-knowledge/OpportunityList/",
    # The "{}" placeholder is presumably filled with the document's relative path - TODO confirm.
    "url_prefix": "https://ksmconsulting.sharepoint.com/:w:/r/sites/SalesSupport/OpportunityList/{}?web=1&e=VfnsGl",
    "database": {
        "db": {
            "url": conn_str_alchemy
        },
        "cache": {
            "url": conn_str_redis,
            "expire_time_second": 120
        }
    },
    "retrieval_method": {
        "type": "PROPRIETARY_SEARCH",
        "index_store": {
            "index_name": "bd-navigator-embedding",
            # Search service that hosts the index.
            "index_service": {
                "endpoint": aisearch_endpoint,
                "secret": aisearch_api_key,
                "type": "azure_ai_search",
                "specs": {
                    "api_version": "2023-11-01"
                }
            },
            "semantic_config_name": "semantic_default",
            "vector_config_name": "vector_default",
            # Embedding model; api_version matches the one embedded in
            # openai_embedding_endpoint.
            "embedding_service": {
                "endpoint": openai_embedding_endpoint,
                "secret": openai_api_key,
                "type": "openai_embedding",
                "specs": {
                    "deployment": openai_embedding_deployment,
                    "api_version": "2023-08-01-preview"
                }
            },
            # Document content extraction service.
            "doc_extract_type": "DOC_ANALYSIS",
            "doc_extract_service": {
                "endpoint": docai_endpoint,
                "secret": docai_api_key,
                "type": "azure_doc_intelligence",
                "specs": {
                    "model_type": "Layout",
                    "api_version": "2023-10-31-preview"
                }
            }
        }
    }
})


In [7]:
# from azure.identity import AzureCliCredential
# from ai_knowledge_base.embedding.ai_search import create_index

# njobs = 6
# credential = AzureCliCredential()
# create_index(
#     config,
#     credential,
#     njobs=njobs
# )
# print("Data preparation completed")

In [8]:
from ai_knowledge_base.embedding.ai_search import upload_documents_to_index
from azure.identity import AzureCliCredential

# Authenticate through the Azure CLI credential, then push the documents
# described by `config` into the search index.
# NOTE: long-running - the last recorded run of this cell took about five hours.
credential = AzureCliCredential()
upload_documents_to_index(config, credential)

print("Document ingestion completed")

Updated existing search index bd-navigator-embedding


Uploading documents...: 100%|██████████| 1176/1176 [5:03:08<00:00, 15.47s/it]   


The index contains 740295 chunks.
The average chunk size of the index is 27426.610938882473 bytes.
Document ingestion completed


In [None]:
# from azure.storage.blob import ContainerClient
# container_url = f'https://akbadls2.blob.core.windows.net/test'
# container_client = ContainerClient.from_container_url(container_url, credential=credential)
# for blob in container_client.list_blobs():
#     print(blob.name)
#     print(blob.size)

In [None]:
# from ai_knowledge_base.utils.watchtower import *
# from ai_knowledge_base.model import DocumentIngestion
# from datetime import datetime

# document_ingestion = DocumentIngestion.from_dict({
#     "url": "https://blob.windows.net/test/Opportunity List/BluWave/Pearlman Group/EOA App/Pearlman Group - EOA App Development - v1.0.docx",
#     "staging_path": "/Users/zjia/Workspace/gen-ai-knowledge-base/tmp/test/Opportunity List/BluWave/Pearlman Group/EOA App/Pearlman Group - EOA App Development - v1.0.docx",
#     "created_dt": datetime.today().strftime('%Y-%m-%d %H:%M:%S')
# })

In [None]:
# file_path = "/Users/zjia/Resultant/OneDrive - Resultant/OpportunityList/City of Indianapolis/Knowledgebase Deployment/City of Indianapolis - Knowledgebase Deployment Notes.txt"
# from chardet import detect
# with open(file_path, "rb") as f:
#     binary_content = f.read()
#     encoding = detect(binary_content).get('encoding', None)
#     content = binary_content.decode(encoding if encoding else 'utf8')

In [None]:
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
import html
from ai_knowledge_base.utils.transport import xlsx2html
from ai_knowledge_base.utils.document import BytesIOWrapper
from ai_knowledge_base.utils.document import extract_xlsx_content, extract_other_content, parser_factory

# Sample workbook used to exercise the xlsx extraction path.
# NOTE(review): absolute path on one developer machine - adjust before running elsewhere.
# file_path = '/Users/zjia/Resultant/OneDrive - Resultant/OpportunityList/State of Utah/Division of Technology/Enterprise Master Person Index EMPI/Draft/Utah - Enterprise Master Person Index Resource Plan - 2023_v01.xlsx'
file_path = '/Users/zjia/Resultant/OneDrive - Resultant/OpportunityList/Leaf Home/20230301_Leaf Home_IT_Resource_Plan v2.xlsx'
content = extract_xlsx_content(file_path)

# Parse the extracted content into `doc`. The splitting cell further down
# reads `doc.content`, so this step has to run; with it commented out a fresh
# kernel hits a NameError there.
# The scratch version wrote the parser key as "html".split("_pdf")[0] (to
# handle cracked pdf converted to html); for the literal "html" that
# expression is simply "html".
# Assumes extract_xlsx_content returns content the "html" parser accepts - TODO confirm.
parser = parser_factory("html")
doc = parser.parse(content, file_name=None)

In [None]:
from langchain.text_splitter import (
    TextSplitter,
    MarkdownTextSplitter,
    RecursiveCharacterTextSplitter,
    PythonCodeTextSplitter,
)
from ai_knowledge_base.utils.document import PdfTextSplitter

# Separator candidates: sentence terminators are listed first, followed by
# the word-level breaks in reverse of the order written below.
SENTENCE_ENDINGS = [".", "!", "?"]
WORDS_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"][::-1]

# Alternative splitter, kept for reference:
# splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#                 separators=SENTENCE_ENDINGS + WORDS_BREAKS,
#                 chunk_size=256, chunk_overlap=0)
splitter = PdfTextSplitter(
    separator=SENTENCE_ENDINGS + WORDS_BREAKS,
    chunk_size=256,
    chunk_overlap=0,
)

# `doc` is not created in this cell; it must already exist in the kernel.
chunked_content_list = splitter.split_text(doc.content)