In [49]:
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings
import fitz  # PyMuPDF

In [50]:
# Connect with no authentication
chroma_client = chromadb.HttpClient(host='chromadb', port=8000,)

In [51]:
# Connect with token authentication
chroma_client = chromadb.HttpClient(host='chromadb', port=8000,
    settings=Settings(
        chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
        chroma_client_auth_credentials="test-token"
    )
)

In [52]:
# # Connect with role-based authentication
# chroma_client = chromadb.HttpClient(host='chromadb', port=8000,
#     settings=Settings(
#         chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
#         chroma_server_authn_provider="chromadb.auth.simple_rbac_authz.SimpleRBACAuthorizationProvider",
#         chroma_client_auth_credentials="test-token-readonly"
#     )
# )

In [53]:
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

# Create the collection, aka vector database. Or, if database already exist, then use it. Specify the model that we want to use to do the embedding.
collection = chroma_client.get_or_create_collection(name="grc_docs", embedding_function=sentence_transformer_ef)

In [54]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text


In [55]:
import os
# Directory containing PDF documents
docs_dir = "/app/pdfs"

# Prepare documents for indexing
documents = []
metadatas = []
ids = []
id = 1

for filename in os.listdir(docs_dir):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(docs_dir, filename)
        text = extract_text_from_pdf(pdf_path)
        documents.append(text)
        metadatas.append({"filename": filename})
        ids.append(str(id))
        id += 1
        print(f"Indexed document: {filename}")


Indexed document: Schneider Electric understand the total sustainability impact of liion UPS batteries.pdf
Indexed document: Immersion_Cooling_for_High-Density_Sustainable_Computing.pdf
Indexed document: Wyoming_Use_Case_v2.pdf
Indexed document: Navigating Liquid Cooling Architectures for Data Centers with AI Workloads.pdf
Indexed document: shell-immersion-cooling-fluid-s5-x-brochure.pdf
Indexed document: Site Readiness Checklist Template JAN 2024 (2).pdf
Indexed document: Submer Thermodynamics.pdf
Indexed document: Castrol ON DC 15 - UK - EN .pdf
Indexed document: Vertiv-LiquidCooling-KIH-WP-EN-NA-SL.pdf
Indexed document: MergeIT-SustainableAppAdjacentVDIForAI&HPCWorkloads-Infographic-1280x720px-RGB-mk1.pdf
Indexed document: DC 20 - SDS .pdf
Indexed document: Five reasons to adop liquid cooling.pdf
Indexed document: Telefonica-Case-Study.pdf
Indexed document: Hypertec  Immersion-Born Trident Servers  5-13-2024.pdf
Indexed document: Capital Cost Analysis Immersion vs Air Cooled.pdf
Ind

In [56]:
# import csv

# # Load sample data (a restaurant menu of items)
# with open('./../menu_items.csv') as file:
#     lines = csv.reader(file)

#     # Store the name of the menu items in this array. In Chroma, a "document" is a string i.e. name, sentence, paragraph, etc.
#     documents = []

#     # Store the corresponding menu item IDs in this array.
#     metadatas = []

#     # Each "document" needs a unique ID. This is like the primary key of a relational database. We'll start at 1 and increment from there.
#     ids = []
#     id = 1

#     # Loop thru each line and populate the 3 arrays.
#     for i, line in enumerate(lines):
#         if i==0:
#             # Skip the first row (the column headers)
#             continue

#         documents.append(line[1])
#         metadatas.append({"item_id": line[0]})
#         ids.append(str(id))
#         id+=1

In [57]:
# Add all the data to the vector database. ChromaDB automatically converts and stores the text as vector embeddings. This may take a few minutes.
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

In [58]:
print(documents)



In [59]:
# Query the vector database

results = collection.query(
    query_texts=["power usage"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['metadatas'])

results = collection.query(
    query_texts=["sustainability impact"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['metadatas'])

results = collection.query(
    query_texts=["cooling system"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['metadatas'])


[[{'filename': 'Sustainability-With-Substance-White-Paper.pdf'}, {'filename': 'Schneider Electric understand the total sustainability impact of liion UPS batteries.pdf'}, {'filename': 'shell-immersion-cooling-fluid-s5-x-brochure.pdf'}, {'filename': 'Five reasons to adop liquid cooling.pdf'}, {'filename': 'Wyoming_Use_Case_v2.pdf'}]]
[[{'filename': 'Sustainability-With-Substance-White-Paper.pdf'}, {'filename': 'Schneider Electric understand the total sustainability impact of liion UPS batteries.pdf'}, {'filename': 'Wyoming_Use_Case_v2.pdf'}, {'filename': 'MergeIT-SustainableAppAdjacentVDIForAI&HPCWorkloads-Infographic-1280x720px-RGB-mk1.pdf'}, {'filename': 'Site Readiness Checklist Template JAN 2024 (2).pdf'}]]
[[{'filename': 'Hypertec  Immersion-Born Trident Servers  5-13-2024.pdf'}, {'filename': 'Navigating Liquid Cooling Architectures for Data Centers with AI Workloads.pdf'}, {'filename': 'GRC-iceraq-series10-data-sheet Quad - Duo.pdf'}, {'filename': 'Vertiv-LiquidCooling-KIH-WP-EN-N