# Prepare LangChain Documents

In [1]:
import os
import json
from pathlib import Path

def loadJSONsAtRuntime(filepath=None) -> dict:
    """Load every extracted chapter JSON file into memory.

    Each regular file directly inside ``filepath`` is parsed as JSON and stored
    under the value of its top-level ``"Chapter Number"`` key. Sub-directories
    are skipped.

    Args:
        filepath: Directory to read (str or path-like). When omitted, defaults
            to ``../extract_data/extracted_data`` relative to the current
            working directory.

    Returns:
        A dict mapping each file's ``"Chapter Number"`` value to the parsed
        content of that file.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        json.JSONDecodeError: If a file does not contain valid JSON.
        KeyError: If a file has no ``"Chapter Number"`` key.
    """
    print("Loading jsons into memory")

    if filepath is None:
        # Built from parts instead of a backslash literal: the old literal
        # relied on the unrecognized escape "\e" and only resolved on Windows.
        filepath = Path("..") / "extract_data" / "extracted_data"
    json_dicts = {}

    # Sorted so the load order (and which file wins when two files share a
    # chapter number) does not depend on the OS directory listing order.
    for f in sorted(Path(filepath).iterdir()):
        if f.is_file():
            print("Loading json @ " + str(f))
            # Decode explicitly as UTF-8 rather than the platform locale.
            json_dict = json.loads(f.read_text(encoding="utf-8"))
            chapterNumber = json_dict["Chapter Number"]
            json_dicts[chapterNumber] = json_dict

    print("Loading jsons into memory completed.")
    return json_dicts

In [2]:
import sys
sys.path.append('../pdfplumber')

import shutil

print("Starting vectorstore creation...")

# Step 1: read the extracted chapter JSON files into a dictionary.
print("Loading json files into memory...")
json_dicts = loadJSONsAtRuntime()


# Step 2: turn every tariff item into a LangChain Document.
from langchain_core.documents import Document


def _build_document(chapter_name, entry):
    """Build one Document from a chapter name and a single item dict.

    The page content concatenates the chapter name, HS heading name, prefix
    and description; the HS code is kept as metadata only.
    """
    prefix = entry["Prefix"]
    heading_name = entry["HS Hdg Name"]
    hs_code = entry["HS Code"]
    description = entry["Description"]

    # The irregular spacing around the separators is intentional here: it is
    # reproduced exactly so the stored text stays unchanged.
    text = "".join([
        "Chapter Name: ", chapter_name,
        " , HS Heading Name:", heading_name,
        " ,Prefix: ", prefix,
        " , Description:", description,
    ])
    return Document(page_content=text, metadata={"HS Code": hs_code})


docs = []
print("Filtering data...")

# Only the chapter payloads are needed; the chapter-number keys are not used.
for chapter in json_dicts.values():
    chapter_items = chapter["Items"]
    chapter_name = chapter["Chapter Name"]
    for entry in chapter_items:
        docs.append(_build_document(chapter_name, entry))

Starting vectorstore creation...
Loading json files into memory...
Loading jsons into memory
Loading json @ ..\extract_data\extracted_data\1.json
Loading json @ ..\extract_data\extracted_data\28.json
Loading jsons into memory completed.
Filtering data...


# Cosmos Prep

In [3]:
# The vector index and the embedding policy both refer to the same document
# field, so the path is spelled once.
_EMBEDDING_PATH = "/embedding"

# Indexing policy: include every path, exclude the "_etag" path, and declare a
# quantizedFlat vector index on the embedding field.
indexing_policy = dict(
    indexingMode="consistent",
    includedPaths=[dict(path="/*")],
    excludedPaths=[dict(path='/"_etag"/?')],
    vectorIndexes=[dict(path=_EMBEDDING_PATH, type="quantizedFlat")],
)

# Embedding policy: float32 vectors compared by cosine distance.
# NOTE(review): 1536 presumably matches the output size of the embedding model
# used later in the notebook; confirm if the model changes.
vector_embedding_policy = dict(
    vectorEmbeddings=[
        dict(
            path=_EMBEDDING_PATH,
            dataType="float32",
            distanceFunction="cosine",
            dimensions=1536,
        ),
    ],
)

The cell below took about 1 min 4 s to complete with 2 tariff PDFs.

In [4]:
from azure.cosmos import CosmosClient, PartitionKey
from langchain_community.vectorstores.azure_cosmos_db_no_sql import (
    AzureCosmosDBNoSqlVectorSearch,
)
import openai

# All secrets come from the environment; a missing variable raises KeyError.
openai.api_key = os.environ['OPENAI_API_KEY']
HOST = os.environ["COSMOS_ENDPOINT"]
KEY = os.environ["COSMOS_KEY"]
cosmos_client = CosmosClient(HOST, KEY)

# Target database and container; the container is partitioned on "/id".
database_name = "langchain_python_db"
container_name = "langchain_python_container"
partition_key = PartitionKey(path="/id")
cosmos_container_properties = dict(partition_key=partition_key)
cosmos_database_properties = dict(id=database_name)

# Disabled alternative kept for reference:
#   from langchain_openai import AzureOpenAIEmbeddings
#   openai_embeddings = AzureOpenAIEmbeddings(
#       openai_api_key=os.environ["OPENAI_API_KEY"],
#   )
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

# Cosmos-specific arguments for the vector store, gathered in one place.
_cosmos_store_options = dict(
    cosmos_client=cosmos_client,
    database_name=database_name,
    container_name=container_name,
    vector_embedding_policy=vector_embedding_policy,
    indexing_policy=indexing_policy,
    cosmos_container_properties=cosmos_container_properties,
    cosmos_database_properties=cosmos_database_properties,
)

# Insert the documents into Azure Cosmos DB NoSQL together with their embeddings.
vector_search = AzureCosmosDBNoSqlVectorSearch.from_documents(
    documents=docs,
    embedding=embedding,
    **_cosmos_store_options,
)

  embedding = OpenAIEmbeddings()


### Test using the vectorstore

In [8]:
# Smoke test: run one similarity search against the vector store and show the
# text of the top-ranked document.
query = "bison"
results = vector_search.similarity_search(query)

# The search returns a list that may be empty; indexing [0] unguarded would
# stop the notebook with an IndexError instead of reporting the miss.
if results:
    print(results[0].page_content)
else:
    print("No results found for query: " + query)

Chapter Name: Live animals  , HS Heading Name:Live bovine animals (+). ,Prefix: Buffalo: , Description:Pure-bred breeding animals
