In [51]:
import os
import uuid
import json
import numpy as np
from PIL import Image
from pydantic import BaseModel
from sqlalchemy import create_engine, MetaData

from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import PyPDFLoader
from pprint import pprint

from sentence_transformers import SentenceTransformer

from langchain.utilities import SQLDatabase
from langchain.agents import create_sql_agent
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.agents.agent_types import AgentType
from langchain_ollama import OllamaEmbeddings
from langchain.schema import Document
from langchain.vectorstores import Qdrant

from qdrant_client import QdrantClient, models
from qdrant_client.http.models import PointStruct, VectorParams, Distance

from dotenv import load_dotenv
load_dotenv(override=True)

import warnings
warnings.filterwarnings("ignore")

import yaml

In [52]:
# Load the hand-written database knowledge file (table descriptions, fields,
# relations). The encoding is pinned to UTF-8 so the result does not depend on
# the platform's default locale encoding (e.g. cp1252 on Windows).
with open("../docs/cm_db_knowledge.yaml", "r", encoding="utf-8") as yaml_file:
    db_docs = yaml.safe_load(yaml_file)
# Quick sanity check of the top-level structure under 'database_docs'.
pprint(db_docs['database_docs'].keys())

dict_keys(['tables'])


In [53]:
# Embedding backend used both for indexing the table docs and for embedding
# the search question. NOTE(review): the collection is created with size=1024
# further down, which presumably matches this model's output dimension — confirm.
EMBED_MODEL = "bge-m3:latest"
embeddings = OllamaEmbeddings(model=EMBED_MODEL)

In [54]:
# Build one Qdrant point per table described in the knowledge file.
# For every table we render a plain-text document (name, description, fields,
# relations), embed it, and keep the structured pieces in the payload.
points = []
for i, tbl_info in enumerate(db_docs['database_docs']['tables'].values()):
    tbl_name = tbl_info['name']
    desc = tbl_info['description']
    # An empty YAML value loads as None; treat it as "no entries" instead of
    # failing in the iteration / join below.
    fields = tbl_info['fields'] or []
    relations = tbl_info['relations'] or []

    # Each field entry is iterated as a mapping; every key/value pair becomes
    # one "key: value" line.
    field_lines = [f"{k}: {v}" for field in fields for k, v in field.items()]

    # Every field line is newline-terminated in the embedded text, exactly as
    # before, so the document that gets embedded is unchanged.
    fields_str = "".join(f"{line}\n" for line in field_lines)
    relations_str = "\n".join(relations)

    content = f"Table: {tbl_name}\nDescription: {desc}\nFields:\n{fields_str}\nRelations:\n{relations_str}\n"

    point = PointStruct(
        id=i,
        vector=embeddings.embed_query(content),
        payload={
            "name": tbl_name,
            "description": desc,
            # Store the lines directly: splitting the newline-terminated
            # string on "\n" used to append a spurious empty '' entry.
            "fields": field_lines,
            "relations": relations
        }
    )
    points.append(point)


In [55]:
# The Qdrant endpoint comes from the environment (QDRANT_URL); os.getenv
# returns None when the variable is not set.
qdrant_url = os.getenv("QDRANT_URL")
qdrant_client = QdrantClient(url=qdrant_url)

In [56]:
# Name of the collection holding the embedded table documentation.
COL_NAME = "CM-DB-KNOWLEDGE"

# Create the collection only on first run; later runs reuse the existing one.
if qdrant_client.collection_exists(collection_name=COL_NAME):
    print("Collection already created")
else:
    # 1024-dimensional vectors compared by cosine distance.
    # Optional: tune shards, replication_factor, etc.
    vector_cfg = VectorParams(size=1024, distance=Distance.COSINE)
    qdrant_client.create_collection(
        collection_name=COL_NAME,
        vectors_config=vector_cfg,
    )

Collection already created


In [57]:
# Write all table points into the collection. Kept as the cell's last
# expression so the returned UpdateResult is displayed.
qdrant_client.upsert(
    collection_name=COL_NAME,
    points=points,
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [58]:
# Sample natural-language question; it is embedded and used as the search
# query against the table-documentation collection in the next cell.
q = "Does the status of latest RFI is draft?"

In [59]:
# Embed the question with the same model used for indexing, then fetch the
# single closest table document using approximate (non-exact) HNSW search.
query_vector = embeddings.embed_query(q)
search_params = models.SearchParams(hnsw_ef=128, exact=False)

resp = qdrant_client.query_points(
    collection_name=COL_NAME,
    query=query_vector,
    search_params=search_params,
    limit=1,
)

# Display the scored hit(s).
resp.points

[ScoredPoint(id=0, version=1, score=0.28427514, payload={'name': 'document_document', 'description': 'document_document is a master table for all document types (RFI, Submittal, Inspection).\n', 'fields': ['id: [BIGINT] Primary key', 'deleted: [TIMESTAMP] Indicates when this object was deleted', 'created_at: [TIMESTAMP] Date and time when this object was created', 'code: [VARCHAR(50)] Unique code for this document', 'title: [VARCHAR(255)] Title of this document', 'type: [INTEGER] ENUM type of this document (0 = RFI, 1 = Submittal, 2 = Inspection)', 'process: [INTEGER] ENUM for the process status of each document type (0 = Draft, 1 = Review, 2 = Open, 3 = Waiting for Manager, 4 = Closed)', 'status: [INTEGER] ENUM for the task decision of each document type (0 = Invalid, 1 = Pending, 2 = In Process, 3 = Approved, 4 = Approved as Note, 5 = Rejected, 6 = Rejected as Note, 7 = Deleted)', 'due_date: [TIMESTAMP] Due date of the document (applies to RFI, Submittal, and Inspection)', 'schedule_