In [6]:
import sys
import os

root_dir = r"D:/BIN/Project/agentic_chatbot"
os.chdir(root_dir)
sys.path.append(root_dir)

from app.embeddings.embedding_manager import EmbeddingManager, POL_DOCS, PROD_DOCS
from app.retrievers.vector_store import VectorStore
from typing import List, Dict, Any, Tuple


class RAGRetriever:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
    
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        print(f"Retrieving documents for query: {query}")
        print(f"Top k: {top_k}, Score threshold: {score_threshold}")

        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    similarity_score = 1 - distance
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            
            else:
                print("No documents found")
            return retrieved_docs
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
        


In [10]:
print("------Testing Embeddings and Vector Store------")
print("Embedding for product documents")
prod_texts = [doc.page_content for doc in PROD_DOCS]
embeddings = EmbeddingManager()
prod_embed = embeddings.generate_embeddings(prod_texts)
prod_vs = VectorStore(collection_name="product_vectorstore")
prod_vs.add_documents(PROD_DOCS, prod_embed)
print("-------------")

------Testing Embeddings and Vector Store------
Embedding for product documents
Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384
Generating embeddings for 30 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 10.34it/s]

Generated embeddings with shape: (30, 384)
Vector store initialized. Collection: product_vectorstore
Existing documents in collection: 90
Adding 30 documents to vector store...
Successfully added 30 documents to the vector's storage
Total documents in collection: 120
-------------





In [16]:
prod_retriever = RAGRetriever(vector_store=prod_vs, embedding_manager=embeddings)
res = prod_retriever.retrieve("Sedan hạng B")

for i in res:
    print(i)

Retrieving documents for query: Sedan hạng B
Top k: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 21.50it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)
{'id': 'doc_0b51e223_19', 'content': 'Sedan hạng B, tiết kiệm nhiên liệu', 'metadata': {'year': 2022, 'price_vnd': 489000000, 'transmission': 'Tự động', 'brand': 'Nissan', 'id': 20, 'segment': 'Hạng B', 'body_type': 'Sedan', 'doc_index': 19, 'origin': 'Thái Lan', 'seats': 5, 'engine': '1.0L Turbo', 'fuel': 'Xăng', 'model': 'Almera', 'content_length': 34}, 'similarity_score': 0.5216912925243378, 'distance': 0.47830870747566223, 'rank': 1}
{'id': 'doc_b4d88d29_19', 'content': 'Sedan hạng B, tiết kiệm nhiên liệu', 'metadata': {'transmission': 'Tự động', 'content_length': 34, 'origin': 'Thái Lan', 'engine': '1.0L Turbo', 'body_type': 'Sedan', 'doc_index': 19, 'fuel': 'Xăng', 'brand': 'Nissan', 'model': 'Almera', 'segment': 'Hạng B', 'seats': 5, 'id': 20, 'price_vnd': 489000000, 'year': 2022}, 'similarity_score': 0.5216912925243378, 'distance': 0.47830870747566223, 'rank': 2}
{'id': 'doc_e5b02339_19', 'content




In [23]:
type(res[0])

dict

In [22]:
(res[0])

{'id': 'doc_0b51e223_19',
 'content': 'Sedan hạng B, tiết kiệm nhiên liệu',
 'metadata': {'year': 2022,
  'price_vnd': 489000000,
  'transmission': 'Tự động',
  'brand': 'Nissan',
  'id': 20,
  'segment': 'Hạng B',
  'body_type': 'Sedan',
  'doc_index': 19,
  'origin': 'Thái Lan',
  'seats': 5,
  'engine': '1.0L Turbo',
  'fuel': 'Xăng',
  'model': 'Almera',
  'content_length': 34},
 'similarity_score': 0.5216912925243378,
 'distance': 0.47830870747566223,
 'rank': 1}