In [3]:
import os
import openai
from dotenv import load_dotenv

# Load the local .env file so the settings read below are present in os.environ.
_ = load_dotenv(dotenv_path="../.env")

# Configure the openai client for an Azure-hosted deployment; every value
# except the API type is taken from the environment.
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_type = "azure"
openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_version = os.environ["OPENAI_API_VERSION"]

import sys
sys.path.append('../')
from json_module import load_docs_from_jsonl

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch



In [4]:
# Load the documents to index from a JSONL file; the path is relative to the
# notebook's working directory.
kbs = load_docs_from_jsonl("kbs copy.jsonl")

### Create embeddings and vector store instances

In [6]:
# Embedding deployment name and Azure Cognitive Search endpoint/key, all read
# from the environment.
embedding_model = os.environ["OPENAI_EMBEDDING_MODEL"]
vector_store_address = os.environ["COGNITIVE_SEARCH_URL"]
vector_store_password = os.environ["COGNITIVE_SEARCH_KEY"]

# NOTE(review): chunk_size=1 presumably limits each embedding request to a
# single text -- confirm against the LangChain OpenAIEmbeddings docs.
embeddings = OpenAIEmbeddings(deployment=embedding_model, chunk_size=1)

# Vector store bound to the KB-articles index; queries are embedded with
# embeddings.embed_query.
index_name = "sitecore-kb-articles"
vector_store = AzureSearch(
    embedding_function=embeddings.embed_query,
    index_name=index_name,
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
)

In [7]:
# Upload the loaded documents to the vector store. The call's return value (a
# list of ID strings) is the cell's displayed output.
vector_store.add_documents(documents=kbs)

['MmQ5NWVlNzEtMjJlNC00NGIwLThkMDUtYjQ2NzMyNjdkMDg0',
 'MjQ3MGEzY2EtYjZiMS00NjgyLThjZjAtZmNmNWQ5YmZhNDk2',
 'ODFlOGIzZjEtMzA5ZC00MmViLTk1ZTUtZDQzOTY2ZjdlMTEw',
 'MDBlMzc0OTktZWVmOS00ZDU3LWFjNmUtMzA4MThiODc2YzU3',
 'YzE4ZTE0OWItYzgxYy00ZjFjLWE2YWYtNDVlODYyM2RmNmZm',
 'NTI4MTFkZmItMjIwMC00NGY0LWFkNmEtMGViNzliZWEyNzNk',
 'ZjU4MmVlMmQtYTEzNy00YmZkLWIwOGYtNWQwZGNmNmJmZjA3',
 'MWU5YWIxZGUtYjAwMS00ODc0LTg1NDUtOTY0M2RmMmRjMzIz',
 'NTQ3OWQzOGItYjk2MC00OWNiLWE0YjQtYmY5MmY2OTI3M2M5',
 'N2YxZGU3N2UtYjBhYS00NGE2LWFiOWItZTE1MDliOWRhOWY5',
 'N2NhMjIzMDUtZjM4Ni00MjQzLTgyYjktYjgyMjFlZjM5M2Mw',
 'MWJmNTg5NWEtNjBjMi00ODAyLWFkZDgtMDRjMDkyNDk4YWMw',
 'MDI5YzhkNWQtNDA5YS00OWNhLWI5ZTgtODhlNzU4YjYzYjc5',
 'MTFhNDBjZWItNDRjOS00ZDA0LThhOGUtZTM5NmY0ZDExMDI0',
 'YWJkMzEyMmEtZDkyOS00MGRlLWJkNGQtZGM2NzYxZjM2ZjZk',
 'ODIzNmExMWMtMjllOS00YzRlLThlNjctMjdkMzdkZTkxMWRh',
 'OWY0YjE3MzgtMWE3Ni00MjY1LThlOTUtMWVkODFjZjE0NGM4',
 'ZWFjMWZmY2EtMDYxZi00MzY4LWJiMzUtZGE3ZjEwZmY4ODYw',
 'YjcxOTEyYTMtODZkNy00YjRhLWIwMTAtOGRkM2I2YTUw

In [8]:
# Ask for the three closest matches to a sample question.
aks_upgrade_query = "How to upgrade my AKS cluster version to 1.25"
docs = vector_store.similarity_search(
    query=aks_upgrade_query,
    search_type="similarity",
    k=3,
)

# Show the text of the first returned document.
top_hit = docs[0]
print(top_hit.page_content)

Description
The end of life of Azure Kubernetes Service (AKS) 1.24 is July 2023. Sitecore Managed Cloud Containers solutions based on AKS 1.24 must be updated to a newer version that uses AKS 1.25. For more details on AKS's end of life and the consequences of deprecation, refer to Microsoft's FAQ.
The following article contains upgrade instructions for the Sitecore Managed Cloud Containers solution to the version that uses AKS 1.25.
Important:
Upgrade your solution only if its product version (revision number) is earlier than those of the update packages listed in the Upgrade Instructions sections below.For example, the update package revision: mcc.xp.upgrade.10.1.1-r.0.1.309915.nupkg.To locate the version and revision number of your solution, view the solution.json file in the infrastructure repository of your Sitecore Managed Cloud Containers Azure DevOps project.
If your environment is still running on AKS 1.23, it is recommended to first upgrade to AKS 1.24 and then follow the step

In [9]:
# Show the text of the second document (index 1) in `docs`.
print(docs[1].page_content)

Description
The end of life of Azure Kubernetes Service (AKS) 1.23 is March 2023. Sitecore Managed Cloud Containers solutions, based on AKS 1.23, must be updated to a newer version that uses AKS 1.24. For more details on AKS end of life and the consequences of deprecation see here.
This article contains upgrade instructions for the Sitecore Managed Cloud Containers solution to the version that uses AKS 1.24.
Note: Upgrade your solution only if the version you have is earlier than the ones listed below in the Solution section. To find out the version of your solution, view the solution.json file in the infrastructure repository of your Sitecore Managed Cloud Containers Azure DevOps project. If your environment is still running on AKS 1.22, it is recommended first to upgrade to AKS 1.23 and then follow the steps in this article.
Breaking Changes
Upgrading from AKS version 1.23 to 1.24 introduces no breaking changes to a standard installation of Sitecore Managed Cloud.
Upgrade Instruction

In [10]:
# Show the text of the third document (index 2) in `docs`.
print(docs[2].page_content)

Description
The end of life of Azure Kubernetes Service (AKS) 1.21 is July 2022. Sitecore Managed Cloud Containers solutions, based on AKS 1.21, need to be updated to a newer version that uses AKS 1.22. For more details on AKS end of life and the consequences of deprecation refer to here.
Update your solution only if the version you have is less than the ones listed below in Stage 2 of the Solutions section. To find out the version of your solution, view the solution.json file in the infrastructure repository of your Managed Cloud Containers Azure DevOps project.
This article contains upgrade instructions for the Sitecore Managed Cloud Containers solution to the version that uses AKS 1.22.
Breaking changes
Upgrading the AKS version from 1.21 to 1.22 introduces some breaking changes in Kubernetes API. These breaking changes do not lead to downtime but require additional changes to Sitecore Managed Cloud Containers environments, and should be applied to the 2-stages process defined below

In [12]:
# Load a second JSONL corpus. NOTE: this cell rebinds `kbs`, `index_name` and
# `vector_store`, so cells run after it see these values instead of the
# earlier ones.
kbs = load_docs_from_jsonl("docsites2_copy.jsonl")

# Same endpoint, key and embedding function as before, but a different index.
index_name = "sitecore-docs"
vector_store = AzureSearch(
    embedding_function=embeddings.embed_query,
    index_name=index_name,
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
)

# Upload the documents; the returned list of ID strings is the cell output.
vector_store.add_documents(documents=kbs)

['Y2M0ZDA0MjQtYTA1My00NmJjLTg2ZDgtNjM3ZTk4ZmVmOGVh',
 'ODcxYmFkNDItZDg1My00YmQ2LTgxZTItNDJlZGNjMTc5ZmM5',
 'NTVhNjJlZDgtNDgxZi00MmEwLWE1ZDctOWNmZjE1YmEyMDdk',
 'NzA0ZjUyZDMtZTlhZC00NjFjLTg3MjItN2MzNWI4MjVlOGI0',
 'YzhkZTA0YjEtNmEzYy00Zjk4LThiN2EtNjVkOGEwYmY3ZDY3',
 'ZGNmNWI1Y2YtZDQzYS00NTFmLTgyYmYtOGYwZDgwNTdjODVk',
 'YjNlYzc4YjEtZmIzNy00OGNjLTg0ZjMtNGM0MDkzMTIwOWRj',
 'MWI1YjNjNDUtYTdjYS00MDE4LWI2NTctYTEzYjQ5NWM2OTk0',
 'NGYxMjBmNDItOGRkYi00MWJmLWJjODItZTk2MDMxNjUzMDky',
 'ODkwOTQwNDEtMDY4Zi00ZDY4LTllMjgtZjBjNTc1MTdjMmU0',
 'ZTRkYzEyNTMtZTEzZC00MTgwLWJiMWYtYzE1M2FlY2RjYjg4',
 'N2E1MTg4NGQtNmY0NC00MzNiLWE4MmQtYWY2ZDdlNzFjNzA5',
 'OWYxYzcyMzctZDUzYy00NDFiLWI1MWYtZTA0YjQ5ODhlYjFm',
 'NGFhODY3NTYtYjk4Ni00NGMyLWI5M2UtYzk5NjQ5MGVkNzQ1',
 'ZmUwNDcwZjctZTI3Zi00ODI0LTliNmYtOGZjYWJiYzA0YTI4',
 'ZTJmNDJlYzAtMjdiNy00NDJmLTlkNmYtMzJjMTg5ZTYyOWMw',
 'YTZkYWY2ZjgtNzg0Zi00ZGU4LTkzMmItODEwMTI3MWRiYjA2',
 'ZjUwZDBiNWItZjYxYS00OThlLTk2MTYtMTY3YTQ3NzZjMDIy',
 'MDRlMGE5NDUtMmM2ZS00YTA3LTk5NjAtNzQ2ZWI1N2M5

In [17]:
from azure.search.documents.indexes.models import (
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    ScoringProfile,
    TextWeights,
)

def _build_kb_index_fields() -> list:
    """Return the search-index field definitions shared by every KB index.

    The vector field's dimension is obtained by embedding a short sample
    string with the notebook-level ``embeddings`` object and measuring the
    length of the result, so calling this function issues one embedding
    request.
    """
    return [
        SimpleField(
            name="id",
            type=SearchFieldDataType.String,
            key=True,
            filterable=True,
        ),
        SearchableField(
            name="content",
            type=SearchFieldDataType.String,
            searchable=True,
        ),
        SearchField(
            name="content_vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            # Size the vector field from an actual embedding rather than a
            # hard-coded number.
            vector_search_dimensions=len(embeddings.embed_query("Text")),
            vector_search_configuration="default",
        ),
        SearchableField(
            name="metadata",
            type=SearchFieldDataType.String,
            searchable=True,
        ),
        # Additional field to store the title
        SearchableField(
            name="title",
            type=SearchFieldDataType.String,
            searchable=True,
        ),
        # Additional field for filtering on document source
        SimpleField(
            name="source",
            type=SearchFieldDataType.String,
            filterable=True,
        ),
    ]


def _upload_jsonl_to_index(filename: str, index_name: str, fields: list) -> None:
    """Load the documents in ``filename`` and add them to ``index_name``.

    Uses the notebook-level ``vector_store_address``, ``vector_store_password``
    and ``embeddings`` objects to build the ``AzureSearch`` store.
    """
    documents = load_docs_from_jsonl(filename)
    store = AzureSearch(
        azure_search_endpoint=vector_store_address,
        azure_search_key=vector_store_password,
        index_name=index_name,
        embedding_function=embeddings.embed_query,
        fields=fields,
    )
    store.add_documents(documents=documents)


def upload_kbs(chunks=((500, 120), (300, 90)), chunks_encoder=(200, 150)) -> None:
    """Upload each pre-split KB JSONL file to its own search index.

    Two families of files are processed, token-sized files first:

    * ``kb_recursive_split_chunk{size}_tiktoken_encoder.jsonl`` for every
      ``size`` in ``chunks_encoder``. The index name is the file name with
      ``_`` replaced by ``-``, the ``.jsonl`` suffix removed and ``chunk``
      replaced by ``tokensize``.
    * ``kb_recursive_split_chunk{size}_chunkoverlap{overlap}.jsonl`` for every
      ``(size, overlap)`` pair in ``chunks``. The index name is the file name
      with ``_`` replaced by ``-`` and the ``.jsonl`` suffix removed.

    Args:
        chunks: Iterable of ``(chunk_size, chunk_overlap)`` pairs. Defaults to
            the values that were previously hard-coded.
        chunks_encoder: Iterable of sizes for the tiktoken-encoder files.
            Defaults to the values that were previously hard-coded.

    Returns:
        None. The document IDs returned by ``add_documents`` are discarded.
    """
    # Build the schema once; it is identical for every index.
    fields = _build_kb_index_fields()

    for token_size in chunks_encoder:
        encoder_filename = "kb_recursive_split_chunk{chunksize}_tiktoken_encoder.jsonl".format(chunksize=token_size)
        encoder_index_name = (
            encoder_filename.replace("_", "-").replace(".jsonl", "").replace("chunk", "tokensize")
        )
        _upload_jsonl_to_index(encoder_filename, encoder_index_name, fields)

    for chunk_size, chunk_overlap in chunks:
        filename = "kb_recursive_split_chunk{chunksize}_chunkoverlap{chunkoverlap}.jsonl".format(chunksize=chunk_size, chunkoverlap=chunk_overlap)
        index_name = filename.replace("_", "-").replace(".jsonl", "")
        _upload_jsonl_to_index(filename, index_name, fields)

# Run the upload for every chunk variant. This calls the embedding service,
# which may retry when the service is temporarily unavailable.
upload_kbs()
        


Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..


### Upload docs to Cognitive Search

In [None]:
# from azure.search.documents.indexes.models import (
#     SearchableField,
#     SearchField,
#     SearchFieldDataType,
#     SimpleField,
#     ScoringProfile,
#     TextWeights,
# )

# def upload_docs():
#     # chunks = [[500,120], [300,70], [200, 50], [100,20]]
#     chunks = [[500,120]]
#     chunks_encoder = [500]

#     fields = [
#     SimpleField(
#         name="id",
#         type=SearchFieldDataType.String,
#         key=True,
#         filterable=True,
#     ),
#     SearchableField(
#         name="content",
#         type=SearchFieldDataType.String,
#         searchable=True,
#     ),
#     SearchField(
#         name="content_vector",
#         type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
#         searchable=True,
#         vector_search_dimensions=len(embeddings.embed_query("Text")),
#         vector_search_configuration="default",
#     ),
#     SearchableField(
#         name="metadata",
#         type=SearchFieldDataType.String,
#         searchable=True,
#     ),
#     # Additional field to store the title
#     SearchableField(
#         name="title",
#         type=SearchFieldDataType.String,
#         searchable=True,
#     ),
#     # Additional field for filtering on document source
#     SimpleField(
#         name="source",
#         type=SearchFieldDataType.String,
#         filterable=True,
#     ),
# ]

#     for i in chunks:
#         filename = "kb_recursive_split_chunk{chunksize}_chunkoverlap{chunkoverlap}.jsonl".format(chunksize=i[0], chunkoverlap=i[1])

#         kbs = load_docs_from_jsonl(filename)
#         index_name: str = filename.replace("_", "-").replace(".jsonl", "")
#         vector_store: AzureSearch = AzureSearch(
#             azure_search_endpoint=vector_store_address,
#             azure_search_key=vector_store_password,
#             index_name=index_name,
#             embedding_function=embeddings.embed_query,
#             fields=fields
#         )

#         vector_store.add_documents(documents=kbs)

#     # for i in chunks_encoder:
#     #     encoder_filename = "kb_recursive_split_chunk{chunksize}_tiktoken_encoder.jsonl".format(chunksize=i)
#     #     encoder_kbs = load_docs_from_jsonl(encoder_filename)

#     #     index_name: str = encoder_filename.replace("_", "-").replace(".jsonl", "")
#     #     vector_store: AzureSearch = AzureSearch(
#     #         azure_search_endpoint=vector_store_address,
#     #         azure_search_key=vector_store_password,
#     #         index_name=index_name,
#     #         embedding_function=embeddings.embed_query,
#     #         fields=fields
#     #     )

#     #     vector_store.add_documents(documents=encoder_kbs)

# upload_docs()
        
