# 1. Install required library

In [None]:
# Install the Semantic Kernel SDK; the %pip magic installs into the running kernel's environment
%pip install semantic-kernel

# 2. Load env variables

In [2]:
# load the environment variables file
from dotenv import load_dotenv
import os
import openai

# Read the .env file so its values are visible to the os.getenv() calls below.
# Without this call the `load_dotenv` import has no effect and only variables
# already exported in the process environment would be found.
load_dotenv()

# Environment variables obtained for Azure Cosmos DB for MongoDB vCore
AZCOSMOS_CONNSTR = os.getenv("AZCOSMOS_CONNSTR")
AZCOSMOS_API = os.getenv("AZCOSMOS_API")
AZCOSMOS_DATABASE_NAME = "dbm"
AZCOSMOS_CONTAINER_NAME = "dbm_saro"

# Environment variables obtained for Azure OpenAI.
# The values are stored as attributes on the `openai` module; later cells read
# them back from there (api_key, api_base, api_embeddings_deployment_name).
openai.api_type = "azure"
openai.api_key = os.getenv("AZURE_OPENAI_KEY")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT")
openai.api_embeddings_deployment_name = os.getenv("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT")
openai.api_version = "2023-07-01-preview"

# The collection name is used multiple times in the notebook, so keep it in one variable
collection_name = AZCOSMOS_CONTAINER_NAME

# Vector search index parameters (passed to the memory store when it is created)
index_name = "VectorSearchIndex"
vector_dimensions = 1536  # text-embedding-ada-002 uses a 1536-dimensional embedding vector
num_lists = 1  # NOTE(review): presumably the number of IVF lists for the index - confirm
similarity = "COS"  # cosine distance

# 3. Create Helper Function

This function takes a JSON file of NoSQL records and, for each record, uses the record's id to check whether it already exists in the database. Existing records are skipped; for new records it generates embeddings and uploads the record along with its embedding.

The `save_information` function does two things: generate embeddings + upload the data to your database.

In [4]:
import json
from semantic_kernel.memory.semantic_text_memory import SemanticTextMemory
from semantic_kernel.memory.memory_store_base import MemoryStoreBase


async def upsert_data_to_memory_store(
    memory: SemanticTextMemory, store: MemoryStoreBase, data_file_path: str
) -> None:
    """
    Load records from a JSON file and save the ones not yet present in the store.

    For every record, the store is queried by the record's "id". Records that
    are found are skipped (they are NOT updated); records that are not found
    are passed to `memory.save_information`, which generates the embedding
    from the record's "content" and writes the record to the collection.

    The target collection is the notebook-level `collection_name` variable.

    Args:
        memory (SemanticTextMemory): Memory used to embed and save new records.
        store (MemoryStoreBase): Store queried to check whether an id already exists.
        data_file_path (str): Path to a UTF-8 JSON file holding an iterable of
            records, each with "id", "content" and "title" keys.

    Returns:
        None. Progress is printed on a single, continuously overwritten line.
    """
    # Read the whole file first so the handle is closed before any awaited
    # store or embedding call is made.
    with open(file=data_file_path, mode="r", encoding="utf-8") as f:
        data = json.load(f)

    total = len(data)
    for n, item in enumerate(data, start=1):
        # Check if the item already exists in the memory store.
        # If the id doesn't exist, the lookup throws an exception, so any
        # failure here is deliberately treated as "not created yet".
        try:
            already_created = bool(
                await store.get(collection_name, item["id"], with_embedding=True)
            )
        except Exception:
            already_created = False

        if already_created:
            print("Skipping item already exists:", n, "/", total, end="\r")
            continue

        # The record doesn't exist: generate embeddings and save it to the database
        await memory.save_information(
            collection=collection_name,
            id=item["id"],
            # the embedding is generated from the text field
            text=item["content"],
            description=item["title"],
        )
        print(
            "Generating embeddings and saving new item:",
            n,
            "/",
            total,
            end="\r",
        )

# 4. Prepare Semantic Kernel, Generate embeddings for Azure Cosmos DB

In [7]:
import semantic_kernel as sk
from semantic_kernel.connectors.ai.open_ai import (
    AzureChatCompletion,
    AzureTextEmbedding,
)
from semantic_kernel.connectors.memory.azure_cosmosdb import (
    AzureCosmosDBMemoryStore,
)
from semantic_kernel.memory.semantic_text_memory import SemanticTextMemory
from semantic_kernel.core_plugins.text_memory_plugin import TextMemoryPlugin

# Intialize the kernel
kernel = sk.Kernel()

# Load the embeddings deployment name and initialize the text embedding with the required parameters, 
# Add the created embedding service to the semantic kernel instance.
kernel.add_service(
    AzureTextEmbedding(
        service_id="text_embedding",
        deployment_name=openai.api_embeddings_deployment_name,
        endpoint=openai.api_base,
        api_key=openai.api_key,
    )
)

# create azure cosmos db for mongo db vcore api store and collection with vector ivf
# currently, semantic kernel only supports the ivf vector kind
store = await AzureCosmosDBMemoryStore.create(
    cosmos_connstr=AZCOSMOS_CONNSTR,
    cosmos_api=AZCOSMOS_API,
    database_name=AZCOSMOS_DATABASE_NAME,
    collection_name=AZCOSMOS_CONTAINER_NAME,
    index_name=index_name,
    vector_dimensions=vector_dimensions,
    num_lists=num_lists,
    similarity=similarity,
)

# Add the created memory store to the semantic kernel instance.
memory = SemanticTextMemory(storage=store, embeddings_generator=kernel.get_service("text_embedding"))
#kernel.import_plugin_from_object(TextMemoryPlugin(memory), "TextMemoryPluginACDB")

#Call the helper function with the JSON data file to generate embeddings and create or update the database records.
# If the records already exit it will skip it.
# Records are identified by their ids. 
# **Note that you need to specify id, text, and description fields.The text field is what gets converted to embeddings.**
print("Upserting data to Azure Cosmos DB Memory Store...")
await upsert_data_to_memory_store(memory, store, "./data/dbm-saro.json")

Upserting data to Azure Cosmos DB Memory Store...
Skipping item already exists: 50 / 50

# 5. Test the Vector Database

In [11]:
# each time it calls the embedding model to generate embeddings from your query
query_term = "Is there a SARO for irrigation systems in Bicol?"
result = await memory.search(collection_name, query_term)

print(
    f"Result is: {result[0].text}\nRelevance Score: {result[0].relevance}\nFull Record: {result[0].additional_metadata}"
)


Result is: SARO Number: SARO-ROVI-24-0006778; Amount: 2,400,000.00; Approved Date: 09/03/2024 14:20:55;  Release Date: 09/03/2024 14:20:55; Department: 07 - Department of Agriculture; Agency: 021-National Irrigation Administration; Operating Unit: 070005 - Bicol; Purpose: Release of allotment for the construction of irrigation systems in Bicol.
Relevance Score: 0.8803885961273783
Full Record: {"text": "SARO Number: SARO-ROVI-24-0006778; Amount: 2,400,000.00; Approved Date: 09/03/2024 14:20:55;  Release Date: 09/03/2024 14:20:55; Department: 07 - Department of Agriculture; Agency: 021-National Irrigation Administration; Operating Unit: 070005 - Bicol; Purpose: Release of allotment for the construction of irrigation systems in Bicol.", "description": "SARO-ROVI-24-0006778", "additional_metadata": null}


In [22]:
# each time it calls the embedding model to generate embeddings from your query
query_term = "How many SAROs are there for Central Visayas?"
result = await memory.search(collection_name, query_term, 5, 0.75, False)

concatenated_result = ""

for item in result:
    #print(f"Result is: {item.text}")
    #print(f"Relevance Score: {item.relevance}")
    #print(f"Full Record: {item.additional_metadata}")
    concatenated_result += item.additional_metadata + ", "

print(concatenated_result.strip())  # Use strip() to remove any trailing whitespace

{"text": "SARO Number: SARO-ROVI-24-0006771; Amount: 3,600,000.00; Approved Date: 08/27/2024 16:25:10;  Release Date: 08/27/2024 16:25:10; Department: 16 - Department of Transportation; Agency: 014-Land Transportation Office; Operating Unit: 160003 - Central Visayas; Purpose: Release of allotment for the improvement of land transportation services in Central Visayas.", "description": "SARO-ROVI-24-0006771", "additional_metadata": null}, {"text": "SARO Number: SARO-ROVI-24-0006801; Amount: 3,500,000.00; Approved Date: 09/26/2024 14:04:30;  Release Date: 09/26/2024 14:04:30; Department: 16 - Department of Transportation; Agency: 044-Philippine Ports Authority; Operating Unit: 160010 - Central Visayas; Purpose: Release of allotment for the construction of new port facilities in Central Visayas.", "description": "SARO-ROVI-24-0006801", "additional_metadata": null}, {"text": "SARO Number: SARO-ROVI-24-0006790; Amount: 4,400,000.00; Approved Date: 09/15/2024 15:24:55;  Release Date: 09/15/20