In [3]:
# -*- coding: utf-8 -*-
"""Modified RAG Application using LangChain, MongoDB Atlas Vector Search, Internal Vectorizer, and Groq Cloud.

This script sets up a RAG pipeline tailored to specific requirements:
1. Retrieves ALL relevant documents from MongoDB based on vector similarity in the 'combined_resume_vector' field.
2. Extracts a predefined set of fields from the retrieved documents.
3. Passes all extracted fields as context to the LLM (Groq Cloud).
4. Instructs the LLM to score and rank ALL documents based on the query and return their '_id' with scores in JSON format.

Prerequisites:
1.  A MongoDB Atlas cluster with a vector search index named 'vector_search_index' (the value of INDEX_NAME below) configured for the 'combined_resume_vector' field.
2.  Python packages installed: langchain, langchain-mongodb, langchain-groq,
    langchain-community, pymongo, python-dotenv, sentence-transformers
    You can install them using pip:
    pip install langchain langchain-mongodb langchain-groq langchain-community pymongo python-dotenv sentence-transformers
3.  Environment variables set:
    - MONGO_URI: Your MongoDB Atlas connection string.
    - GROQ_API_KEYS: Your Groq API keys (comma-separated).
    - DB_NAME: The name of your MongoDB database.
    - COLLECTION_NAME: The name of your MongoDB collection.
    # Note: VECTOR_FIELD should match your index configuration ('combined_resume_vector').
    - VECTOR_FIELD: The name of the field containing vectors (e.g., 'combined_resume_vector').
"""

import os
import json
from typing import List, Dict, Optional, Tuple
from pymongo import MongoClient
from bson import ObjectId
from pydantic import BaseModel, Field

# Modern LangChain imports
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.documents import Document

# Core modules from the codebase
from core.custom_logger import CustomLogger
from core.config import AppConfig
from core.helpers import JSONEncoder
from dotenv import load_dotenv
from properties.mango import MONGODB_URI, DB_NAME, COLLECTION_NAME

# Import the internal vectorizer for embeddings
from embeddings.vectorizer import Vectorizer

# --- Custom Embedding Adapter for LangChain ---


class VectorizerEmbeddingAdapter:
    """Expose the internal ``Vectorizer`` through LangChain's embedding methods.

    Both ``embed_query`` and ``embed_documents`` delegate to
    ``Vectorizer.generate_embedding``; no other logic is added.
    """

    def __init__(self, vectorizer: Vectorizer):
        # Only stores the reference; nothing is computed at construction time.
        self.vectorizer = vectorizer

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a single query string."""
        return self.vectorizer.generate_embedding(text)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per input text, in input order."""
        encode = self.vectorizer.generate_embedding
        return list(map(encode, texts))


# --- Configuration ---

# Load a local .env file (python-dotenv) before the environment is read below.
load_dotenv()

# Module-wide logger used by everything in this file.
logger_instance = CustomLogger()
logger = logger_instance.get_logger("rag_application")

# Connection settings: the environment wins; otherwise fall back to AppConfig
# (MongoDB URI only) or to a recognisable placeholder string.
_default_mongo_uri = AppConfig.MONGODB_CONNECTION_STRING or "YOUR_MONGO_URI_HERE"
MONGODB_URI = os.getenv("MONGO_URI", _default_mongo_uri)
DB_NAME = os.getenv("DB_NAME", "YOUR_DB_NAME_HERE")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "YOUR_COLLECTION_NAME_HERE")
VECTOR_FIELD = os.getenv("VECTOR_FIELD", "combined_resume_vector")
INDEX_NAME = "vector_search_index"

# GROQ_API_KEYS is comma-separated; only the first entry is used.
_groq_key_entries = os.getenv("GROQ_API_KEYS", "").split(",")
GROQ_API_KEY = _groq_key_entries[0].strip()

# Fields to extract from retrieved documents and pass to the LLM.
# This list is turned into the MongoDB projection in
# RAGApplication.get_all_relevant_documents_and_context, so every name must
# exactly match the field name stored in the collection's documents.
FIELDS_TO_EXTRACT = [
    # Identity / contact
    "_id",
    "user_id",
    "username",
    "contact_details",
    # Experience, availability and compensation
    "total_experience",
    "notice_period",
    "currency",
    "pay_duration",
    "current_salary",
    "hike",
    "expected_salary",
    # These three are normalised as lists of strings before being sent to the LLM
    "skills",
    "may_also_known_skills",
    "labels",
    # Remaining profile fields (normalised as plain strings / JSON text)
    "experience",
    "academic_details",
    "source",
    "last_working_day",
    "is_tier1_mba",
    "is_tier1_engineering",
]


# Pydantic models for structured output
class CandidateScore(BaseModel):
    """One scored candidate, mirroring an entry of the "candidates" array
    that the scoring prompt asks the LLM to return.

    NOTE(review): nothing in the visible code instantiates this model; the
    LLM response is handled as plain dicts in get_all_scored_candidates.
    """
    # String form of the candidate document's MongoDB _id.
    id: str = Field(description="MongoDB ObjectId of the candidate")
    # Relevance to the user query; the prompt asks for a value in [0.0, 1.0].
    score: float = Field(description="Relevance score between 0.0 and 1.0")
    # Short free-text justification supplied with the score.
    reason: str = Field(description="Brief reason for the score")


class AllRelevantResults(BaseModel):
    """Top-level shape of the scoring response: a count plus the scored candidates.

    Matches the JSON object described in the scoring prompt
    ("total_candidates" and "candidates").
    """
    # Number of candidates the response claims to have evaluated.
    total_candidates: int = Field(description="Total number of candidates evaluated")
    # One CandidateScore per candidate.
    candidates: List[CandidateScore] = Field(description="List of all candidates with their scores, sorted by relevance")

    class Config:
        # NOTE(review): no field here declares an alias, so this setting
        # presumably has no visible effect - confirm before relying on it.
        populate_by_name = True


# Basic configuration checks with logging
# Warn (without failing) about settings that are empty or still hold the
# placeholder default, in the same order as they are defined above.
_placeholder_checks = (
    (
        MONGODB_URI,
        "YOUR_MONGO_URI_HERE",
        "MONGO_URI environment variable not set or using placeholder.",
    ),
    (
        DB_NAME,
        "YOUR_DB_NAME_HERE",
        "DB_NAME environment variable not set or using placeholder.",
    ),
    (
        COLLECTION_NAME,
        "YOUR_COLLECTION_NAME_HERE",
        "COLLECTION_NAME environment variable not set or using placeholder.",
    ),
)
for _setting, _placeholder, _warning in _placeholder_checks:
    if not _setting or _setting == _placeholder:
        logger.warning(_warning)

# A missing Groq key is logged as an error because the LLM cannot be created
# without it (RAGApplication._initialize_llm skips initialisation).
if not GROQ_API_KEY:
    logger.error(
        "GROQ_API_KEYS environment variable not set. Please set it to use Groq Cloud models."
    )

# --- Initialize Components ---

logger.info("Initializing RAG components...")


class RAGApplication:
    """Modern RAG Application class with improved error handling and logging"""

    def __init__(self):
        self.embeddings = None
        self.vector_store = None
        self.llm = None
        self.retrieval_chain = None
        self.client = None
        self.collection = None
        self.vectorizer = None

        self._initialize_components()

    def _initialize_components(self):
        """Run every initialisation step in dependency order.

        Order: embeddings, database, vector store, LLM, retrieval chain.
        The first failing step is logged and its exception re-raised, so the
        remaining steps do not run.
        """
        try:
            steps = (
                self._initialize_embeddings,
                self._initialize_database,
                self._initialize_vector_store,
                self._initialize_llm,
                self._setup_retrieval_chain,
            )
            for step in steps:
                step()
            logger.info("RAG Application initialized successfully")
        except Exception as exc:
            logger.error(f"Failed to initialize RAG Application: {exc}")
            raise

    def _initialize_embeddings(self):
        """Create the internal ``Vectorizer`` and its LangChain adapter.

        Sets ``self.vectorizer`` and ``self.embeddings``. Any failure is
        logged together with an installation hint and then re-raised.
        """
        try:
            # Same vectorizer class that the rest of the codebase uses.
            vectorizer = Vectorizer()
            self.vectorizer = vectorizer
            # Wrap it so LangChain can call embed_query / embed_documents.
            self.embeddings = VectorizerEmbeddingAdapter(vectorizer)
            logger.info(
                "Internal vectorizer (SentenceTransformer all-MiniLM-L6-v2) initialized successfully"
            )
        except Exception as exc:
            logger.error(f"Error initializing internal vectorizer: {exc}")
            logger.info(
                "Make sure sentence-transformers is installed: pip install sentence-transformers"
            )
            raise

    def _initialize_database(self):
        """Connect to MongoDB, verify the server answers, and bind the collection.

        On success ``self.client`` and ``self.collection`` are set. On any
        failure the error is logged, the partially created client is closed,
        both attributes are reset to ``None`` and the exception is re-raised.

        Raises:
            Exception: whatever ``pymongo`` raised while creating the client,
                resolving the database/collection, or running ``ping``.
        """
        client = None
        try:
            client = MongoClient(MONGODB_URI)
            # Test the connection before publishing any handle on self.
            client.admin.command("ping")
            self.client = client
            self.collection = client[DB_NAME][COLLECTION_NAME]
            logger.info(
                f"Connected to MongoDB database '{DB_NAME}' and collection '{COLLECTION_NAME}'"
            )
        except Exception as e:
            logger.error(f"Error connecting to MongoDB: {e}")
            # Do not leak the client, and do not leave handles behind that
            # would make health_check report a connection that never worked.
            if client is not None:
                client.close()
            self.client = None
            self.collection = None
            raise

    def _initialize_vector_store(self):
        """Initialize MongoDB Atlas Vector Search"""
        if self.embeddings is None or self.collection is None:
            raise ValueError("Embeddings and collection must be initialized first")

        try:
            self.vector_store = MongoDBAtlasVectorSearch(
                collection=self.collection,
                embedding=self.embeddings,
                index_name=INDEX_NAME,
                text_key="combined_resume",  # Use the full resume text field
                embedding_key=VECTOR_FIELD,
            )
            logger.info(
                f"MongoDB Atlas Vector Search initialized with index '{INDEX_NAME}' on field '{VECTOR_FIELD}'"
            )
        except Exception as e:
            logger.error(f"Error initializing MongoDBAtlasVectorSearch: {e}")
            raise

    def _initialize_llm(self):
        """Create the Groq chat model, or skip when no API key is configured.

        Without ``GROQ_API_KEY`` a warning is logged and ``self.llm`` stays
        ``None``. Constructor errors are logged and re-raised.
        """
        if not GROQ_API_KEY:
            logger.warning("Skipping LLM initialization due to missing Groq API key")
            return

        try:
            llm_settings = {
                "api_key": GROQ_API_KEY,
                # Same model as the internal GroqcloudLLM.
                "model": "gemma2-9b-it",
                # Slightly above zero for some scoring variety.
                "temperature": 0.1,
            }
            self.llm = ChatGroq(**llm_settings)
            logger.info("Groq Cloud LLM (gemma2-9b-it) initialized")
        except Exception as exc:
            logger.error(f"Error initializing Groq LLM: {exc}")
            raise

    def _setup_retrieval_chain(self):
        """Setup the modern LangChain retrieval chain with LCEL for scoring all candidates"""
        if not self.llm:
            logger.warning("LLM not available, skipping chain setup")
            return
            
        # Define the prompt template for scoring all candidates
        prompt_template_text = """You are an expert HR assistant. You must analyze and score EVERY SINGLE candidate provided below.

CRITICAL INSTRUCTIONS:
- You MUST score ALL candidates provided in the context
- Do NOT return just one candidate - return ALL of them
- Each candidate has an "_id" field - you must include ALL these IDs in your response
- Score each candidate from 0.0 to 1.0 based on relevance to the query

SCORING RULES:
- 0.9-1.0: Perfect match (meets all key requirements)
- 0.7-0.8: Strong match (meets most requirements) 
- 0.5-0.6: Good match (meets some requirements)
- 0.3-0.4: Moderate match (some relevance)
- 0.1-0.2: Weak match (limited relevance)
- 0.0: No match

REQUIRED OUTPUT FORMAT:
You must return a JSON object with this exact structure:
{{
  "total_candidates": <number_of_candidates_you_analyzed>,
  "candidates": [
    {{
      "id": "<candidate_id_1>",
      "score": <score_between_0_and_1>,
      "reason": "<brief_explanation>"
    }},
    {{
      "id": "<candidate_id_2>",
      "score": <score_between_0_and_1>,
      "reason": "<brief_explanation>"
    }}
    // ... continue for ALL candidates
  ]
}}

CANDIDATE DATA TO ANALYZE:
{context}

USER QUERY: {question}

Now analyze EVERY candidate above and return ALL their scores in the required JSON format: """

    def _normalize_field_value(self, value) -> str:
        """Normalize field values for consistent processing"""
        if value is None:
            return ""
        if isinstance(value, (list, dict)):
            return json.dumps(value, cls=JSONEncoder)
        return str(value).strip()

    def _normalize_list_field(self, value) -> List[str]:
        """Normalize list fields for consistent processing"""
        if not value:
            return []
        if isinstance(value, str):
            return [value.strip()] if value.strip() else []
        if isinstance(value, list):
            return [self._normalize_field_value(item) for item in value if item]
        return [str(value)]

    def get_all_relevant_documents_and_context(
        self, question: str, similarity_threshold: float = 0.1, max_docs: int = 50
    ) -> Tuple[List[ObjectId], Optional[str]]:
        """Run the vector search and build the LLM context for the hits.

        Steps: query the vector store for up to ``max_docs`` hits, keep those
        whose score is at least ``similarity_threshold``, collect their
        ``_id`` values, re-fetch those documents from MongoDB restricted to
        ``FIELDS_TO_EXTRACT``, normalise every field and join the resulting
        JSON documents into one context string.

        Args:
            question: Query text used for the similarity search.
            similarity_threshold: Minimum score a hit needs to be kept.
            max_docs: ``k`` passed to the vector search.

        Returns:
            ``(ids, context)`` where ``ids`` are the ObjectIds taken from the
            kept hits (in hit order) and ``context`` is the formatted text.
            ``([], None)`` when the store is missing, nothing qualifies, no
            valid ``_id`` can be extracted, or any exception occurs (the
            exception is logged, not raised).
        """
        if not self.vector_store:
            logger.error("Vector store not initialized. Cannot retrieve documents.")
            return [], None

        logger.info(f"Retrieving all relevant documents for question: {question}")
        logger.info(f"Using similarity threshold: {similarity_threshold}, max docs: {max_docs}")

        # Fields that are normalised to a list of strings instead of a string.
        list_valued_fields = ("skills", "may_also_known_skills", "labels")

        def normalize(field_name, raw_value):
            """Pick the normaliser that matches the field."""
            if field_name == "_id":
                return str(raw_value)
            if field_name in list_valued_fields:
                return self._normalize_list_field(raw_value)
            return self._normalize_field_value(raw_value)

        try:
            # similarity_search_with_score returns (document, score) pairs.
            hits = self.vector_store.similarity_search_with_score(
                query=question,
                k=max_docs,
            )
            if not hits:
                logger.warning("No relevant documents found.")
                return [], None

            # Keep only the hits at or above the threshold.
            kept_hits = [
                (hit, hit_score) for hit, hit_score in hits
                if hit_score >= similarity_threshold
            ]
            logger.info(f"Found {len(kept_hits)} documents above similarity threshold {similarity_threshold}")
            if not kept_hits:
                logger.warning("No documents above similarity threshold.")
                return [], None

            # Collect the _id of every kept hit from its metadata.
            doc_ids = []
            for hit, hit_score in kept_hits:
                if not (hasattr(hit, "metadata") and "_id" in hit.metadata):
                    logger.warning(f"Retrieved document missing _id in metadata: {hit}")
                    continue

                raw_id = hit.metadata["_id"]
                if isinstance(raw_id, ObjectId):
                    object_id = raw_id
                else:
                    # The id may arrive in another form (e.g. a string).
                    try:
                        object_id = ObjectId(raw_id)
                    except Exception:
                        logger.warning(
                            f"Could not convert metadata _id '{raw_id}' to ObjectId. Skipping."
                        )
                        continue

                doc_ids.append(object_id)
                logger.info(f"Document {object_id} has similarity score: {hit_score}")

            if not doc_ids:
                logger.error("Could not extract valid _ids from retrieved documents.")
                return [], None

            logger.info(f"Retrieved document _ids: {doc_ids}")

            # Re-fetch the documents with only the configured fields.
            projection = dict.fromkeys(FIELDS_TO_EXTRACT, 1)
            projection.setdefault("_id", 1)
            cursor = self.collection.find({"_id": {"$in": doc_ids}}, projection)

            # One normalised JSON document per fetched record.
            context_parts = []
            logger.info("=== DOCUMENTS FETCHED FROM MONGODB ===")
            for position, record in enumerate(cursor, start=1):
                logger.info(f"Document {position} raw data:")
                logger.info(json.dumps(record, indent=2, cls=JSONEncoder))

                normalized = {
                    name: normalize(name, raw) for name, raw in record.items()
                }

                logger.info(f"Document {position} normalized data:")
                rendered = json.dumps(normalized, indent=2, cls=JSONEncoder)
                logger.info(rendered)
                context_parts.append(rendered)
            logger.info("=== END OF DOCUMENTS ===")

            context_string = "\n\n---\n\n".join(context_parts)
            logger.info("Formatted context for LLM")

            # Log the final context exactly as it will be sent to the LLM.
            logger.info("=== CONTEXT RETRIEVED FROM MONGODB ===")
            logger.info(f"Number of documents retrieved: {len(context_parts)}")
            logger.info("Context content:")
            logger.info(context_string)
            logger.info("=== END OF CONTEXT ===")

            return doc_ids, context_string

        except Exception as exc:
            logger.error(f"Error during retrieval or context preparation: {exc}")
            return [], None

    def get_all_scored_candidates(
        self, 
        question: str, 
        similarity_threshold: float = 0.1, 
        max_docs: int = 50
    ) -> Optional[Dict]:
        """Processes a question, retrieves ALL relevant context, asks LLM to score all candidates, and returns all IDs with scores in JSON format."""
        if not self.retrieval_chain:
            logger.error(
                "Retrieval chain is not available. Cannot process the question."
            )
            return None

        logger.info(f"Processing question for all scored candidates: {question}")
        retrieved_ids, context = self.get_all_relevant_documents_and_context(
            question, similarity_threshold, max_docs
        )

        if context is None or not retrieved_ids:
            logger.error("Failed to get context or retrieve IDs.")
            return None

        try:
            logger.info("Invoking retrieval chain for scoring all candidates...")
            logger.info(f"Total candidates to be scored: {len(retrieved_ids)}")
            
            # Use the modern LCEL chain
            raw_result = self.retrieval_chain.invoke(
                {"context": context, "question": question}
            )
            
            logger.info(f"Raw LLM Response: {raw_result}")
            
            # Parse the response manually
            if hasattr(raw_result, 'content'):
                response_text = raw_result.content
            else:
                response_text = str(raw_result)
            
            logger.info(f"LLM Response Text: {response_text}")
            
            # Try to extract JSON from the response
            try:
                # Look for JSON in the response
                import re
                json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
                if json_match:
                    json_str = json_match.group()
                    result = json.loads(json_str)
                    logger.info(f"Parsed JSON successfully: {result}")
                else:
                    raise ValueError("No JSON found in response")
                    
            except (json.JSONDecodeError, ValueError) as e:
                logger.warning(f"Failed to parse JSON from LLM response: {e}")
                logger.info("Creating fallback response with all candidates")
                
                # Fallback: create a structured response for all candidates
                result = {
                    "total_candidates": len(retrieved_ids),
                    "candidates": []
                }
                
                # Give basic scores to all candidates
                for i, doc_id in enumerate(retrieved_ids):
                    # Assign decreasing scores based on retrieval order
                    score = max(0.1, 0.9 - (i * 0.1))
                    result["candidates"].append({
                        "id": str(doc_id),
                        "score": round(score, 2),
                        "reason": f"Candidate {i+1} - retrieved by vector similarity"
                    })
            
            # Validate the result has the expected structure
            if isinstance(result, dict) and "candidates" in result:
                # Ensure all candidates are included
                result_ids = {candidate.get("id") for candidate in result.get("candidates", [])}
                retrieved_ids_str = {str(doc_id) for doc_id in retrieved_ids}
                
                missing_ids = retrieved_ids_str - result_ids
                if missing_ids:
                    logger.warning(f"LLM missed {len(missing_ids)} candidates. Adding them with default scores.")
                    for missing_id in missing_ids:
                        result["candidates"].append({
                            "id": missing_id,
                            "score": 0.3,
                            "reason": "Added automatically - missed by LLM analysis"
                        })
                
                # Sort by score (highest first)
                result["candidates"] = sorted(
                    result["candidates"], 
                    key=lambda x: x.get("score", 0), 
                    reverse=True
                )
                
                result["total_candidates"] = len(result["candidates"])
                
                logger.info("Successfully processed question and scored all candidates")
                logger.info(f"Total candidates scored: {result.get('total_candidates', 0)}")
                logger.info(f"Final Answer (JSON): {json.dumps(result, indent=2)}")
                return result
            else:
                logger.warning(f"Unexpected result format: {result}")
                # Fallback: return all IDs with default scores
                fallback_result = {
                    "total_candidates": len(retrieved_ids),
                    "candidates": [
                        {
                            "id": str(doc_id),
                            "score": 0.5,
                            "reason": "Fallback scoring due to unexpected LLM response format"
                        }
                        for doc_id in retrieved_ids
                    ]
                }
                return fallback_result

        except Exception as e:
            logger.error(f"Error during LLM processing: {e}")
            # Fallback: return all retrieved IDs with default scores
            fallback_result = {
                "total_candidates": len(retrieved_ids),
                "candidates": [
                    {
                        "id": str(doc_id),
                        "score": 0.5,
                        "reason": f"Fallback scoring due to error: {str(e)[:100]}"
                    }
                    for doc_id in retrieved_ids
                ]
            }
            return fallback_result

    # Keep backward compatibility methods
    def ask_resume_question_and_get_id(self, question: str) -> Optional[Dict]:
        """Backward compatibility: Returns the best scoring candidate"""
        all_results = self.get_all_scored_candidates(question)
        if all_results and "candidates" in all_results and all_results["candidates"]:
            best_candidate = all_results["candidates"][0]  # First one is highest scored
            return {"_id": best_candidate["id"], "score": best_candidate["score"]}
        return None

    def get_top_k_candidate_ids(self, question: str, k: int = 10) -> Optional[Dict]:
        """
        Returns top k candidates with their scores
        """
        all_results = self.get_all_scored_candidates(question)
        if all_results and "candidates" in all_results:
            top_k_candidates = all_results["candidates"][:k]
            result = {
                "total_evaluated": all_results.get("total_candidates", 0),
                "returned_count": len(top_k_candidates)
            }
            
            for i, candidate in enumerate(top_k_candidates):
                result[f"id{i+1}"] = candidate["id"]
                result[f"score{i+1}"] = candidate["score"]
                result[f"reason{i+1}"] = candidate["reason"]
            
            return result
        return None

    def get_relevant_ids_and_context(
        self, question: str, k: int = 3
    ) -> Tuple[List[ObjectId], Optional[str]]:
        """Backward compatibility function"""
        return self.get_all_relevant_documents_and_context(question, max_docs=k)

    def health_check(self) -> Dict[str, bool]:
        """Check the health of all components"""
        return {
            "embeddings_initialized": self.embeddings is not None,
            "vectorizer_initialized": self.vectorizer is not None,
            "vector_store_initialized": self.vector_store is not None,
            "llm_initialized": self.llm is not None,
            "retrieval_chain_initialized": self.retrieval_chain is not None,
            "mongodb_connection": self.client is not None,
        }


# --- Main Execution Example ---

# Shared, lazily created instance used by the module-level helper functions.
rag_app = None


def initialize_rag_application() -> RAGApplication:
    """Return the shared ``RAGApplication``, constructing it on first use."""
    global rag_app
    if rag_app is not None:
        return rag_app
    rag_app = RAGApplication()
    return rag_app


def get_all_scored_candidates(
    question: str,
    similarity_threshold: float = 0.1,
    max_docs: int = 50
) -> Optional[Dict]:
    """Score all relevant candidates using the shared application instance.

    ``initialize_rag_application`` returns the existing instance or creates
    it on first use, so no separate ``None`` check is needed here.
    """
    return initialize_rag_application().get_all_scored_candidates(
        question, similarity_threshold, max_docs
    )


# Backward compatibility functions
def ask_resume_question_and_get_id(question: str) -> Optional[Dict]:
    """Backward-compatible helper: best candidate via the shared instance.

    ``initialize_rag_application`` returns the existing instance or creates
    it on first use.
    """
    return initialize_rag_application().ask_resume_question_and_get_id(question)


def get_relevant_ids_and_context(
    question: str, k: int = 3
) -> Tuple[List[ObjectId], Optional[str]]:
    """Backward-compatible helper: retrieval step via the shared instance.

    ``initialize_rag_application`` returns the existing instance or creates
    it on first use.
    """
    return initialize_rag_application().get_relevant_ids_and_context(question, k)


def get_top_k_candidate_ids(question: str, k: int = 10) -> Optional[Dict]:
    """Backward-compatible helper: top ``k`` candidates via the shared instance.

    ``initialize_rag_application`` returns the existing instance or creates
    it on first use.
    """
    return initialize_rag_application().get_top_k_candidate_ids(question, k)


if __name__ == "__main__":
    try:
        # The demo only runs with real settings: no placeholder values and a
        # Groq API key.
        placeholder_in_use = (
            MONGODB_URI == "YOUR_MONGO_URI_HERE"
            or DB_NAME == "YOUR_DB_NAME_HERE"
            or COLLECTION_NAME == "YOUR_COLLECTION_NAME_HERE"
        )
        if placeholder_in_use or not GROQ_API_KEY:
            logger.error("Configuration Incomplete:")
            logger.error(
                "Please ensure MONGO_URI, GROQ_API_KEYS, DB_NAME, COLLECTION_NAME are set."
            )
            logger.info("Run the script (e.g., python rag.py) to test.")
        else:
            # Build (or reuse) the shared application instance.
            logger.info("--- Initializing RAG Application ---")
            rag_application = initialize_rag_application()

            # Report which components were set up.
            health_status = rag_application.health_check()
            logger.info(f"Health Check: {health_status}")

            # Sample query: score ALL relevant candidates.
            sample_question = "Find candidates with 5 years above experience in python"
            logger.info("--- Running Sample Query for ALL Scored Candidates ---")

            all_results = rag_application.get_all_scored_candidates(
                sample_question,
                similarity_threshold=0.2,  # Adjust threshold as needed
                max_docs=100,  # Maximum documents to consider
            )

            if not all_results:
                logger.warning("--- Query Finished --- No candidates obtained.")
            else:
                logger.info("--- Query Finished --- All Scored Candidates Result ---")
                logger.info(f"Total candidates evaluated: {all_results.get('total_candidates', 0)}")
                logger.info(f"Result: {json.dumps(all_results, indent=2)}")

            # Exercise the backward-compatibility helpers as well.
            logger.info("--- Running Top 10 Candidates Query ---")
            top_10_result = rag_application.get_top_k_candidate_ids(sample_question, k=10)
            if top_10_result:
                logger.info("--- Top 10 Result ---")
                logger.info(f"Top 10: {json.dumps(top_10_result, indent=2)}")

            logger.info("--- Running Best Match Query ---")
            best_match_result = rag_application.ask_resume_question_and_get_id(sample_question)
            if best_match_result:
                logger.info("--- Best Match Result ---")
                logger.info(f"Best Match: {json.dumps(best_match_result, indent=2)}")

    except Exception as exc:
        logger.error(f"Error in main execution: {exc}")
        raise

# Logged on every import/run, after all definitions above have been executed.
logger.info("RAG Application setup complete. Ready for execution or function calls.")

2025-06-02 18:32:16 - rag_application - INFO - Initializing RAG components...
2025-06-02 18:32:16 - rag_application - INFO - --- Initializing RAG Application ---
2025-06-02 18:32:20 - rag_application - INFO - Internal vectorizer (SentenceTransformer all-MiniLM-L6-v2) initialized successfully
  _crypto.X509.from_cryptography(_load_der_x509_certificate(cert))
2025-06-02 18:32:22 - rag_application - INFO - Connected to MongoDB database 'resume_db' and collection 'resumes'
2025-06-02 18:32:22 - rag_application - INFO - MongoDB Atlas Vector Search initialized with index 'vector_search_index' on field 'combined_resume_vector'
2025-06-02 18:32:23 - rag_application - INFO - Groq Cloud LLM (gemma2-9b-it) initialized
2025-06-02 18:32:23 - rag_application - INFO - RAG Application initialized successfully
2025-06-02 18:32:23 - rag_application - INFO - Health Check: {'embeddings_initialized': True, 'vectorizer_initialized': True, 'vector_store_initialized': True, 'llm_initialized': True, 'retrieval

In [6]:
# -*- coding: utf-8 -*-
"""Modified RAG Application using LangChain, MongoDB Atlas Vector Search, Internal Vectorizer, and Groq Cloud.

This script sets up a RAG pipeline tailored to specific requirements:
1. Retrieves ALL relevant documents from MongoDB based on vector similarity in the 'combined_resume_vector' field.
2. Extracts a predefined set of fields from the retrieved documents.
3. Passes all extracted fields as context to the LLM (Groq Cloud).
4. Instructs the LLM to score and rank ALL documents based on the query and return their '_id' with scores in JSON format.

Prerequisites:
1.  A MongoDB Atlas cluster with a vector search index named 'vector_search_index' (the value of INDEX_NAME below) configured for the 'combined_resume_vector' field.
2.  Python packages installed: langchain, langchain-mongodb, langchain-groq,
    langchain-community, pymongo, python-dotenv, sentence-transformers
    You can install them using pip:
    pip install langchain langchain-mongodb langchain-groq langchain-community pymongo python-dotenv sentence-transformers
3.  Environment variables set:
    - MONGO_URI: Your MongoDB Atlas connection string.
    - GROQ_API_KEYS: Your Groq API keys (comma-separated).
    - DB_NAME: The name of your MongoDB database.
    - COLLECTION_NAME: The name of your MongoDB collection.
    # Note: VECTOR_FIELD should match your index configuration ('combined_resume_vector').
    - VECTOR_FIELD: The name of the field containing vectors (e.g., 'combined_resume_vector').
"""

import os
import json
from typing import List, Dict, Optional, Tuple
from pymongo import MongoClient
from bson import ObjectId
from pydantic import BaseModel, Field

# Modern LangChain imports
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.documents import Document

# Core modules from the codebase
from core.custom_logger import CustomLogger
from core.config import AppConfig
from core.helpers import JSONEncoder
from dotenv import load_dotenv
from properties.mango import MONGODB_URI, DB_NAME, COLLECTION_NAME

# Import the internal vectorizer for embeddings
from embeddings.vectorizer import Vectorizer

# --- Custom Embedding Adapter for LangChain ---


class VectorizerEmbeddingAdapter:
    """Expose the internal Vectorizer through embed_documents / embed_query."""

    def __init__(self, vectorizer: Vectorizer):
        # The wrapped object does the real work through generate_embedding().
        self.vectorizer = vectorizer

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per input text, in input order."""
        vectors = []
        for text in texts:
            vectors.append(self.vectorizer.generate_embedding(text))
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a single query string."""
        vector = self.vectorizer.generate_embedding(text)
        return vector


# --- Configuration ---

# Load variables from a local .env file (if any) into the process environment
# so the os.environ lookups below can see them.
load_dotenv()

# Initialize logger
logger_instance = CustomLogger()
logger = logger_instance.get_logger("rag_application")

# Configuration with fallbacks using AppConfig pattern.
# NOTE: MONGODB_URI, DB_NAME and COLLECTION_NAME are also imported from
# properties.mango at the top of this file; the assignments below rebind those
# names, so the imported values are not used past this point.
MONGODB_URI = os.environ.get(
    "MONGO_URI", AppConfig.MONGODB_CONNECTION_STRING or "YOUR_MONGO_URI_HERE"
)
DB_NAME = os.environ.get("DB_NAME", "YOUR_DB_NAME_HERE")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME", "YOUR_COLLECTION_NAME_HERE")
VECTOR_FIELD = os.environ.get("VECTOR_FIELD", "combined_resume_vector")
# Name of the Atlas vector search index. Hard-coded: unlike the settings above
# it cannot be overridden through the environment.
INDEX_NAME = "vector_search_index"
# GROQ_API_KEYS is a comma-separated list; only its first entry is used here.
# When the variable is unset this evaluates to "" (falsy).
GROQ_API_KEY = (
    os.environ.get("GROQ_API_KEYS", "").split(",")[0].strip()
)  # Get first Groq API key

# Fields to extract from retrieved documents and pass to LLM.
# This list becomes the MongoDB projection in
# RAGApplication.get_all_relevant_documents_and_context, so only these fields
# end up in the context string.
# IMPORTANT: Ensure these field names exactly match your MongoDB document structure.
FIELDS_TO_EXTRACT = [
    "_id",
    "user_id",
    "username",
    "contact_details",
    "total_experience",
    "notice_period",
    "currency",
    "pay_duration",
    "current_salary",
    "hike",
    "expected_salary",
    # The next three are normalized as lists of strings when the context is built.
    "skills",
    "may_also_known_skills",
    "labels",
    "experience",
    "academic_details",
    "source",
    "last_working_day",
    "is_tier1_mba",
    "is_tier1_engineering",
]


# Pydantic models for structured output
# One entry of the "candidates" array that the scoring prompt asks the LLM to
# return: the candidate's id, a relevance score and a short justification.
class CandidateScore(BaseModel):
    """Individual candidate with score"""
    # String form of the candidate document's _id.
    id: str = Field(description="MongoDB ObjectId of the candidate")
    # The model itself does not enforce the 0.0-1.0 range; it is only described.
    score: float = Field(description="Relevance score between 0.0 and 1.0")
    reason: str = Field(description="Brief reason for the score")


# Top-level shape of the JSON object the scoring prompt asks for; it is handed
# to JsonOutputParser in RAGApplication._setup_retrieval_chain.
class AllRelevantResults(BaseModel):
    """Pydantic model for the LLM output containing all relevant candidates with scores"""
    total_candidates: int = Field(description="Total number of candidates evaluated")
    candidates: List[CandidateScore] = Field(description="List of all candidates with their scores, sorted by relevance")

    class Config:
        # NOTE(review): populate_by_name presumably only matters for fields
        # that declare an alias, and none do here - confirm whether it is needed.
        populate_by_name = True


# Basic configuration checks with logging.
# These run at import time and only log: a missing or placeholder value does
# not stop the module from loading.
if not MONGODB_URI or MONGODB_URI == "YOUR_MONGO_URI_HERE":
    logger.warning("MONGO_URI environment variable not set or using placeholder.")
if not DB_NAME or DB_NAME == "YOUR_DB_NAME_HERE":
    logger.warning("DB_NAME environment variable not set or using placeholder.")
if not COLLECTION_NAME or COLLECTION_NAME == "YOUR_COLLECTION_NAME_HERE":
    logger.warning("COLLECTION_NAME environment variable not set or using placeholder.")
# A missing Groq key is logged as an error; RAGApplication._initialize_llm then
# skips LLM creation instead of raising.
if not GROQ_API_KEY:
    logger.error(
        "GROQ_API_KEYS environment variable not set. Please set it to use Groq Cloud models."
    )

# --- Initialize Components ---

# Logged at import time; the components themselves are built when a
# RAGApplication instance is created.
logger.info("Initializing RAG components...")


class RAGApplication:
    """Modern RAG Application class with improved error handling and logging"""

    def __init__(self):
        self.embeddings = None
        self.vector_store = None
        self.llm = None
        self.retrieval_chain = None
        self.client = None
        self.collection = None
        self.vectorizer = None

        self._initialize_components()

    def _initialize_components(self):
        """Initialize all RAG components"""
        try:
            self._initialize_embeddings()
            self._initialize_database()
            self._initialize_vector_store()
            self._initialize_llm()
            self._setup_retrieval_chain()
            logger.info("RAG Application initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize RAG Application: {e}")
            raise

    def _initialize_embeddings(self):
        """Create the internal vectorizer and wrap it for LangChain.

        Sets ``self.vectorizer`` and ``self.embeddings``. Any failure is logged
        together with an installation hint and re-raised.
        """
        try:
            # Same vectorizer class the rest of the codebase uses.
            vectorizer = Vectorizer()
            self.vectorizer = vectorizer
            # LangChain talks to the adapter, not to the vectorizer directly.
            self.embeddings = VectorizerEmbeddingAdapter(vectorizer)
            logger.info(
                "Internal vectorizer (SentenceTransformer all-MiniLM-L6-v2) initialized successfully"
            )
        except Exception as e:
            logger.error(f"Error initializing internal vectorizer: {e}")
            logger.info(
                "Make sure sentence-transformers is installed: pip install sentence-transformers"
            )
            raise

    def _initialize_database(self):
        """Connect to MongoDB and bind the configured collection.

        Opens a client for MONGODB_URI, selects DB_NAME / COLLECTION_NAME and
        sends a ``ping`` admin command so that an unreachable server fails
        here rather than on the first query.

        Raises:
            Exception: any driver error is logged and re-raised. In that case
                the client is closed and ``self.client`` / ``self.collection``
                are reset to None, so no half-initialized state is left behind.
        """
        try:
            self.client = MongoClient(MONGODB_URI)
            db = self.client[DB_NAME]
            self.collection = db[COLLECTION_NAME]
            # Test connection: constructing the client does not by itself
            # prove the server is reachable.
            self.client.admin.command("ping")
            logger.info(
                f"Connected to MongoDB database '{DB_NAME}' and collection '{COLLECTION_NAME}'"
            )
        except Exception as e:
            logger.error(f"Error connecting to MongoDB: {e}")
            # Release the client's resources and drop the dead handles before
            # propagating the error.
            if self.client is not None:
                self.client.close()
            self.client = None
            self.collection = None
            raise

    def _initialize_vector_store(self):
        """Initialize MongoDB Atlas Vector Search"""
        if self.embeddings is None or self.collection is None:
            raise ValueError("Embeddings and collection must be initialized first")

        try:
            self.vector_store = MongoDBAtlasVectorSearch(
                collection=self.collection,
                embedding=self.embeddings,
                index_name=INDEX_NAME,
                text_key="combined_resume",  # Use the full resume text field
                embedding_key=VECTOR_FIELD,
            )
            logger.info(
                f"MongoDB Atlas Vector Search initialized with index '{INDEX_NAME}' on field '{VECTOR_FIELD}'"
            )
        except Exception as e:
            logger.error(f"Error initializing MongoDBAtlasVectorSearch: {e}")
            raise

    def _initialize_llm(self):
        """Create the Groq chat model, or skip it when no API key is configured.

        Without a key ``self.llm`` is left untouched and only a warning is
        logged. Constructor errors are logged and re-raised.
        """
        if GROQ_API_KEY:
            try:
                self.llm = ChatGroq(
                    api_key=GROQ_API_KEY,
                    model="gemma2-9b-it",  # Using same model as internal GroqcloudLLM
                    temperature=0.1,
                )
                logger.info("Groq Cloud LLM (gemma2-9b-it) initialized")
            except Exception as e:
                logger.error(f"Error initializing Groq LLM: {e}")
                raise
        else:
            logger.warning("Skipping LLM initialization due to missing Groq API key")

    def _setup_retrieval_chain(self):
        """Setup the modern LangChain retrieval chain with LCEL for scoring all candidates"""
        if not self.llm:
            logger.warning("LLM not available, skipping chain setup")
            return
            
        try:
            # Define the prompt template for scoring all candidates
            prompt_template_text = """You are an expert HR assistant. You must analyze and score EVERY SINGLE candidate provided below.

CRITICAL INSTRUCTIONS:
- You MUST score ALL candidates provided in the context
- Do NOT return just one candidate - return ALL of them
- Each candidate has an "_id" field - you must include ALL these IDs in your response
- Score each candidate from 0.0 to 1.0 based on relevance to the query

SCORING RULES:
- 0.9-1.0: Perfect match (meets all key requirements)
- 0.7-0.8: Strong match (meets most requirements) 
- 0.5-0.6: Good match (meets some requirements)
- 0.3-0.4: Moderate match (some relevance)
- 0.1-0.2: Weak match (limited relevance)
- 0.0: No match

REQUIRED OUTPUT FORMAT:
You must return a JSON object with this exact structure:
{{
  "total_candidates": <number_of_candidates_you_analyzed>,
  "candidates": [
    {{
      "id": "<candidate_id_1>",
      "score": <score_between_0_and_1>,
      "reason": "<brief_explanation>"
    }},
    {{
      "id": "<candidate_id_2>",
      "score": <score_between_0_and_1>,
      "reason": "<brief_explanation>"
    }}
  ]
}}

CANDIDATE DATA TO ANALYZE:
{context}

USER QUERY: {question}

Now analyze EVERY candidate above and return ALL their scores in the required JSON format:"""

            # Create the prompt template
            self.prompt = PromptTemplate(
                input_variables=["context", "question"],
                template=prompt_template_text
            )
            
            # Create the output parser
            self.output_parser = JsonOutputParser(pydantic_object=AllRelevantResults)
            
            # Create the retrieval chain using LCEL
            self.retrieval_chain = (
                {"context": RunnablePassthrough(), "question": RunnablePassthrough()}
                | self.prompt
                | self.llm
                | self._parse_llm_response
            )
            
            logger.info("Retrieval chain with LCEL initialized successfully")
            
        except Exception as e:
            logger.error(f"Error setting up retrieval chain: {e}")
            raise

    def _parse_llm_response(self, response):
        """Parse LLM response and handle fallback"""
        try:
            # Get the content from the response
            if hasattr(response, 'content'):
                response_text = response.content
            else:
                response_text = str(response)
            
            logger.info(f"LLM Response Text: {response_text}")
            
            # Try to extract JSON from the response
            import re
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if json_match:
                json_str = json_match.group()
                result = json.loads(json_str)
                logger.info(f"Parsed JSON successfully: {result}")
                return result
            else:
                raise ValueError("No JSON found in response")
                
        except (json.JSONDecodeError, ValueError) as e:
            logger.warning(f"Failed to parse JSON from LLM response: {e}")
            # Return the raw response for further processing
            return response_text

    def _normalize_field_value(self, value) -> str:
        """Normalize field values for consistent processing"""
        if value is None:
            return ""
        if isinstance(value, (list, dict)):
            return json.dumps(value, cls=JSONEncoder)
        return str(value).strip()

    def _normalize_list_field(self, value) -> List[str]:
        """Normalize list fields for consistent processing"""
        if not value:
            return []
        if isinstance(value, str):
            return [value.strip()] if value.strip() else []
        if isinstance(value, list):
            return [self._normalize_field_value(item) for item in value if item]
        return [str(value)]

    def get_all_relevant_documents_and_context(
        self, question: str, similarity_threshold: float = 0.1, max_docs: int = 50
    ) -> Tuple[List[ObjectId], Optional[str]]:
        """Retrieves ALL relevant documents above similarity threshold, fetches specified fields, and formats them as context.

        Args:
            question: query text handed to the vector store.
            similarity_threshold: results whose score is below this value are dropped.
            max_docs: passed to the vector search as ``k``.

        Returns:
            ``(doc_ids, context_string)``: the ``_id`` values of the retained
            search hits in search-result order, and the JSON-formatted
            documents joined with a ``---`` separator. ``([], None)`` is
            returned when the vector store is missing, nothing qualifies, or
            any exception occurs (exceptions are logged, not raised).
        """
        if not self.vector_store:
            logger.error("Vector store not initialized. Cannot retrieve documents.")
            return [], None

        logger.info(f"Retrieving all relevant documents for question: {question}")
        logger.info(f"Using similarity threshold: {similarity_threshold}, max docs: {max_docs}")

        try:
            # Use similarity_search_with_score to get similarity scores
            retrieved_docs_with_scores = self.vector_store.similarity_search_with_score(
                query=question,
                k=max_docs
            )

            if not retrieved_docs_with_scores:
                logger.warning("No relevant documents found.")
                return [], None

            # Filter documents by similarity threshold.
            # The comparison keeps scores >= threshold, i.e. it treats a higher
            # score as a better match.
            filtered_docs = [
                (doc, score) for doc, score in retrieved_docs_with_scores
                if score >= similarity_threshold
            ]

            logger.info(f"Found {len(filtered_docs)} documents above similarity threshold {similarity_threshold}")

            if not filtered_docs:
                logger.warning("No documents above similarity threshold.")
                return [], None

            # Extract _ids from metadata
            doc_ids = []
            for doc, score in filtered_docs:
                if hasattr(doc, "metadata") and "_id" in doc.metadata:
                    doc_id = doc.metadata["_id"]
                    # The _id may arrive as something other than an ObjectId
                    # (e.g. a string); convert it so the $in query below can match.
                    if not isinstance(doc_id, ObjectId):
                        try:
                            doc_id = ObjectId(doc_id)
                        except Exception:
                            logger.warning(
                                f"Could not convert metadata _id '{doc_id}' to ObjectId. Skipping."
                            )
                            continue
                    doc_ids.append(doc_id)
                    logger.info(f"Document {doc_id} has similarity score: {score}")
                else:
                    logger.warning(f"Retrieved document missing _id in metadata: {doc}")

            if not doc_ids:
                logger.error("Could not extract valid _ids from retrieved documents.")
                return [], None

            logger.info(f"Retrieved document _ids: {doc_ids}")

            # Fetch the full documents with specified fields from MongoDB
            projection = {field: 1 for field in FIELDS_TO_EXTRACT}
            if "_id" not in projection:
                projection["_id"] = 1

            # NOTE(review): the cursor yields documents in whatever order
            # MongoDB returns them for the $in query, presumably not the
            # similarity order of doc_ids - confirm if context order matters.
            fetched_docs_cursor = self.collection.find(
                {"_id": {"$in": doc_ids}}, projection
            )

            # Format the context string with normalization
            # NOTE(review): the loop below logs every fetched record in full at
            # INFO level, including fields such as contact_details and salary.
            context_parts = []
            logger.info("=== DOCUMENTS FETCHED FROM MONGODB ===")
            for i, doc in enumerate(fetched_docs_cursor):
                logger.info(f"Document {i+1} raw data:")
                logger.info(json.dumps(doc, indent=2, cls=JSONEncoder))

                # Normalize document fields
                normalized_doc = {}
                for field, value in doc.items():
                    if field == "_id":
                        # The LLM is asked to echo this id back, so pass it as a plain string.
                        normalized_doc[field] = str(value)
                    elif field in ["skills", "may_also_known_skills", "labels"]:
                        normalized_doc[field] = self._normalize_list_field(value)
                    else:
                        normalized_doc[field] = self._normalize_field_value(value)

                logger.info(f"Document {i+1} normalized data:")
                logger.info(json.dumps(normalized_doc, indent=2, cls=JSONEncoder))

                context_parts.append(
                    json.dumps(normalized_doc, indent=2, cls=JSONEncoder)
                )
            logger.info("=== END OF DOCUMENTS ===")

            context_string = "\n\n---\n\n".join(context_parts)
            logger.info("Formatted context for LLM")

            # Print the context retrieved from MongoDB
            logger.info("=== CONTEXT RETRIEVED FROM MONGODB ===")
            logger.info(f"Number of documents retrieved: {len(context_parts)}")
            logger.info("Context content:")
            logger.info(context_string)
            logger.info("=== END OF CONTEXT ===")

            # doc_ids come from the vector search, not from the find() above,
            # so they are returned even if a document was not fetched.
            return doc_ids, context_string

        except Exception as e:
            logger.error(f"Error during retrieval or context preparation: {e}")
            return [], None

    def get_all_scored_candidates(
        self, 
        question: str, 
        similarity_threshold: float = 0.1, 
        max_docs: int = 50
    ) -> Optional[Dict]:
        """Processes a question, retrieves ALL relevant context, asks LLM to score all candidates, and returns all IDs with scores in JSON format."""
        if not self.retrieval_chain:
            logger.error(
                "Retrieval chain is not available. Cannot process the question."
            )
            return None

        logger.info(f"Processing question for all scored candidates: {question}")
        retrieved_ids, context = self.get_all_relevant_documents_and_context(
            question, similarity_threshold, max_docs
        )

        if context is None or not retrieved_ids:
            logger.error("Failed to get context or retrieve IDs.")
            return None

        try:
            logger.info("Invoking retrieval chain for scoring all candidates...")
            logger.info(f"Total candidates to be scored: {len(retrieved_ids)}")
            
            # Use the modern LCEL chain
            raw_result = self.retrieval_chain.invoke({
                "context": context, 
                "question": question
            })
            
            logger.info(f"Raw LLM Response: {raw_result}")
            
            # Process the result
            if isinstance(raw_result, dict) and "candidates" in raw_result:
                result = raw_result
            else:
                # Handle string response or parsing failure
                logger.warning("LLM response needs fallback processing")
                result = {
                    "total_candidates": len(retrieved_ids),
                    "candidates": []
                }
                
                # Give basic scores to all candidates
                for i, doc_id in enumerate(retrieved_ids):
                    # Assign decreasing scores based on retrieval order
                    score = max(0.1, 0.9 - (i * 0.1))
                    result["candidates"].append({
                        "id": str(doc_id),
                        "score": round(score, 2),
                        "reason": f"Candidate {i+1} - retrieved by vector similarity"
                    })
            
            # Validate the result has the expected structure
            if isinstance(result, dict) and "candidates" in result:
                # Ensure all candidates are included
                result_ids = {candidate.get("id") for candidate in result.get("candidates", [])}
                retrieved_ids_str = {str(doc_id) for doc_id in retrieved_ids}
                
                missing_ids = retrieved_ids_str - result_ids
                if missing_ids:
                    logger.warning(f"LLM missed {len(missing_ids)} candidates. Adding them with default scores.")
                    for missing_id in missing_ids:
                        result["candidates"].append({
                            "id": missing_id,
                            "score": 0.3,
                            "reason": "Added automatically - missed by LLM analysis"
                        })
                
                # Sort by score (highest first)
                result["candidates"] = sorted(
                    result["candidates"], 
                    key=lambda x: x.get("score", 0), 
                    reverse=True
                )
                
                result["total_candidates"] = len(result["candidates"])
                
                logger.info("Successfully processed question and scored all candidates")
                logger.info(f"Total candidates scored: {result.get('total_candidates', 0)}")
                logger.info(f"Final Answer (JSON): {json.dumps(result, indent=2)}")
                return result
            else:
                logger.warning(f"Unexpected result format: {result}")
                # Fallback: return all IDs with default scores
                fallback_result = {
                    "total_candidates": len(retrieved_ids),
                    "candidates": [
                        {
                            "id": str(doc_id),
                            "score": 0.5,
                            "reason": "Fallback scoring due to unexpected LLM response format"
                        }
                        for doc_id in retrieved_ids
                    ]
                }
                return fallback_result

        except Exception as e:
            logger.error(f"Error during LLM processing: {e}")
            # Fallback: return all retrieved IDs with default scores
            fallback_result = {
                "total_candidates": len(retrieved_ids),
                "candidates": [
                    {
                        "id": str(doc_id),
                        "score": 0.5,
                        "reason": f"Fallback scoring due to error: {str(e)[:100]}"
                    }
                    for doc_id in retrieved_ids
                ]
            }
            return fallback_result

    # Keep backward compatibility methods
    def ask_resume_question_and_get_id(self, question: str) -> Optional[Dict]:
        """Backward compatibility: Returns the best scoring candidate"""
        all_results = self.get_all_scored_candidates(question)
        if all_results and "candidates" in all_results and all_results["candidates"]:
            best_candidate = all_results["candidates"][0]  # First one is highest scored
            return {"_id": best_candidate["id"], "score": best_candidate["score"]}
        return None

    def get_top_k_candidate_ids(self, question: str, k: int = 5) -> Optional[Dict]:
        """
        Returns top k candidates with their scores
        """
        all_results = self.get_all_scored_candidates(question)
        if all_results and "candidates" in all_results:
            top_k_candidates = all_results["candidates"][:k]
            result = {
                "total_evaluated": all_results.get("total_candidates", 0),
                "returned_count": len(top_k_candidates)
            }
            
            for i, candidate in enumerate(top_k_candidates):
                result[f"id{i+1}"] = candidate["id"]
                result[f"score{i+1}"] = candidate["score"]
                result[f"reason{i+1}"] = candidate["reason"]
            
            return result
        return None

    def get_relevant_ids_and_context(
        self, question: str, k: int = 3
    ) -> Tuple[List[ObjectId], Optional[str]]:
        """Backward compatibility function"""
        return self.get_all_relevant_documents_and_context(question, max_docs=k)

    def health_check(self) -> Dict[str, bool]:
        """Check the health of all components"""
        return {
            "embeddings_initialized": self.embeddings is not None,
            "vectorizer_initialized": self.vectorizer is not None,
            "vector_store_initialized": self.vector_store is not None,
            "llm_initialized": self.llm is not None,
            "retrieval_chain_initialized": self.retrieval_chain is not None,
            "mongodb_connection": self.client is not None,
        }


# --- Main Execution Example ---

# Global instance for backward compatibility
rag_app = None


def initialize_rag_application() -> RAGApplication:
    """Return the shared RAGApplication, creating it on the first call."""
    global rag_app
    if rag_app is not None:
        return rag_app
    rag_app = RAGApplication()
    return rag_app


def get_all_scored_candidates(
    question: str,
    similarity_threshold: float = 0.1,
    max_docs: int = 50
) -> Optional[Dict]:
    """Score all relevant candidates using the shared RAGApplication.

    The shared instance is created on demand; the arguments are forwarded
    unchanged to RAGApplication.get_all_scored_candidates.
    """
    global rag_app
    app = rag_app
    if app is None:
        app = rag_app = initialize_rag_application()
    return app.get_all_scored_candidates(question, similarity_threshold, max_docs)


# Backward compatibility functions
def ask_resume_question_and_get_id(question: str) -> Optional[Dict]:
    """Backward compatibility wrapper: best match via the shared RAGApplication."""
    global rag_app
    app = rag_app
    if app is None:
        app = rag_app = initialize_rag_application()
    return app.ask_resume_question_and_get_id(question)


def get_relevant_ids_and_context(
    question: str, k: int = 3
) -> Tuple[List[ObjectId], Optional[str]]:
    """Backward compatibility wrapper: ids and context via the shared RAGApplication."""
    global rag_app
    app = rag_app
    if app is None:
        app = rag_app = initialize_rag_application()
    return app.get_relevant_ids_and_context(question, k)


def get_top_k_candidate_ids(question: str, k: int = 10) -> Optional[Dict]:
    """Backward compatibility wrapper: top *k* candidates via the shared RAGApplication."""
    global rag_app
    app = rag_app
    if app is None:
        app = rag_app = initialize_rag_application()
    return app.get_top_k_candidate_ids(question, k)


# Demo / smoke test: runs three sample queries against the configured services
# when the file is executed as a script. Each query repeats the retrieval and
# the LLM call.
if __name__ == "__main__":
    try:
        # Check configuration: every placeholder must have been replaced and a
        # Groq key must be present.
        if (
            MONGODB_URI != "YOUR_MONGO_URI_HERE"
            and GROQ_API_KEY
            and DB_NAME != "YOUR_DB_NAME_HERE"
            and COLLECTION_NAME != "YOUR_COLLECTION_NAME_HERE"
        ):
            # Initialize RAG application
            logger.info("--- Initializing RAG Application ---")
            rag_application = initialize_rag_application()

            # Health check
            health_status = rag_application.health_check()
            logger.info(f"Health Check: {health_status}")

            # Test query - Get ALL relevant candidates with scores
            sample_question = "Find candidates with 5 years above experience in python"
            logger.info("--- Running Sample Query for ALL Scored Candidates ---")

            # Get all relevant candidates with scores
            all_results = rag_application.get_all_scored_candidates(
                sample_question,
                similarity_threshold=0.2,  # Adjust threshold as needed
                max_docs=100  # Maximum documents to consider
            )

            if all_results:
                logger.info("--- Query Finished --- All Scored Candidates Result ---")
                logger.info(f"Total candidates evaluated: {all_results.get('total_candidates', 0)}")
                logger.info(f"Result: {json.dumps(all_results, indent=2)}")
            else:
                logger.warning("--- Query Finished --- No candidates obtained.")

            # Also test the backward compatibility methods.
            # The labels are derived from top_k so they always match the number
            # of candidates actually requested.
            top_k = 5
            logger.info(f"--- Running Top {top_k} Candidates Query ---")
            top_k_result = rag_application.get_top_k_candidate_ids(sample_question, k=top_k)
            if top_k_result:
                logger.info(f"--- Top {top_k} Result ---")
                logger.info(f"Top {top_k}: {json.dumps(top_k_result, indent=2)}")

            logger.info("--- Running Best Match Query ---")
            best_match_result = rag_application.ask_resume_question_and_get_id(sample_question)
            if best_match_result:
                logger.info("--- Best Match Result ---")
                logger.info(f"Best Match: {json.dumps(best_match_result, indent=2)}")
        else:
            logger.error("Configuration Incomplete:")
            logger.error(
                "Please ensure MONGO_URI, GROQ_API_KEYS, DB_NAME, COLLECTION_NAME are set."
            )
            logger.info("Run the script (e.g., python rag.py) to test.")

    except Exception as e:
        logger.error(f"Error in main execution: {e}")
        raise

# Logged on import as well as on script execution.
logger.info("RAG Application setup complete. Ready for execution or function calls.")

2025-06-02 18:43:42 - rag_application - INFO - Initializing RAG components...
2025-06-02 18:43:42 - rag_application - INFO - --- Initializing RAG Application ---
2025-06-02 18:43:45 - rag_application - INFO - Internal vectorizer (SentenceTransformer all-MiniLM-L6-v2) initialized successfully
  _crypto.X509.from_cryptography(_load_der_x509_certificate(cert))
2025-06-02 18:43:46 - rag_application - INFO - Connected to MongoDB database 'resume_db' and collection 'resumes'
2025-06-02 18:43:46 - rag_application - INFO - MongoDB Atlas Vector Search initialized with index 'vector_search_index' on field 'combined_resume_vector'
2025-06-02 18:43:47 - rag_application - INFO - Groq Cloud LLM (gemma2-9b-it) initialized
2025-06-02 18:43:47 - rag_application - INFO - Retrieval chain with LCEL initialized successfully
2025-06-02 18:43:47 - rag_application - INFO - RAG Application initialized successfully
2025-06-02 18:43:47 - rag_application - INFO - Health Check: {'embeddings_initialized': True, 've

In [None]:
from fastapi import APIRouter, HTTPException, Query
from typing import List, Dict, Any, Optional
from Rag.runner import initialize_rag_app, ask_resume_question_enhanced
from core.custom_logger import CustomLogger

# Initialize the module-level logger used by the route handlers below
logger = CustomLogger().get_logger("rag_search")

# Create router instance: every route registered on it is served under the
# "/rag" prefix and grouped under the "RAG Search" tag; 404 is declared as a
# shared response for all of them.
router = APIRouter(
    prefix="/rag",
    tags=["RAG Search"],
    responses={404: {"description": "Not found"}},
)


@router.post(
    "/vector-similarity-search",
    response_model=Dict[str, Any],
    summary="Vector Similarity Search",
    description="""
    Perform vector similarity search on resumes using the RAG system.
    
    **Parameters:**
    - query: The search query text
    - limit: Maximum number of results to return (default: 50)
    
    **Returns:**
    Dictionary containing:
    - total_found: Total number of matches found
    - results: List of matching resumes with similarity scores
    """,
    responses={
        200: {
            "description": "Successful search results",
            "content": {
                "application/json": {
                    "example": {
                        "total_found": 10,
                        "results": [
                            {
                                "_id": "resume123",
                                "contact_details": {
                                    "name": "John Doe",
                                    "current_city": "Mumbai",
                                },
                                "skills": ["Python", "React", "AWS"],
                                "total_experience": "5 years",
                                "similarity_score": 0.85,
                            }
                        ],
                    }
                }
            },
        },
        400: {"description": "Bad Request"},
        500: {"description": "Internal Server Error"},
    },
)
async def vector_similarity_search(
    query: str = Query(..., description="Search query text"),
    limit: int = Query(
        default=50, description="Maximum number of results to return", ge=1, le=100
    ),
):
    """
    Perform vector similarity search on resumes.

    Args:
        query: search text, read from the query string.
        limit: maximum number of results (1-100), read from the query string.

    Returns:
        The dictionary produced by the RAG application's
        ``vector_similarity_search``.

    Raises:
        HTTPException: status 500 when the result contains an ``"error"`` key
            (its value becomes the detail) or when any other exception occurs
            (detail ``"Search failed: <message>"``).
    """
    try:
        # Initialize RAG application
        rag_app = initialize_rag_app()

        # Perform vector similarity search
        result = rag_app.vector_similarity_search(query, limit)

        if "error" in result:
            logger.error(f"Vector similarity search failed: {result['error']}")
            raise HTTPException(status_code=500, detail=result["error"])

        return result

    except HTTPException:
        # Already the response we want to send. Without this clause the
        # generic handler below would catch it and wrap its text in a second
        # HTTPException, mangling the detail.
        raise
    except Exception as e:
        logger.error(f"Vector similarity search failed: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}") from e


@router.post(
    "/llm-context-search",
    response_model=Dict[str, Any],
    summary="LLM Context Search",
    description="""
    Perform LLM-powered context search on resumes using the RAG system.
    
    **Parameters:**
    - query: The search query text
    - context_size: Number of documents to analyze (default: 5)
    
    **Returns:**
    Dictionary containing:
    - total_found: Total number of matches found
    - total_analyzed: Number of documents analyzed
    - statistics: Search statistics
    - results: List of matching resumes with relevance scores and match reasons
    """,
    responses={
        200: {
            "description": "Successful search results",
            "content": {
                "application/json": {
                    "example": {
                        "total_found": 10,
                        "total_analyzed": 5,
                        "statistics": {
                            "avg_relevance": 0.85,
                            "match_distribution": {"high": 3, "medium": 2, "low": 0},
                        },
                        "results": [
                            {
                                "_id": "resume123",
                                "contact_details": {
                                    "name": "John Doe",
                                    "current_city": "Mumbai",
                                },
                                "skills": ["Python", "React", "AWS"],
                                "total_experience": "5 years",
                                "relevance_score": 0.92,
                                "match_reason": "Strong match in Python and AWS skills",
                            }
                        ],
                    }
                }
            },
        },
        400: {"description": "Bad Request"},
        500: {"description": "Internal Server Error"},
    },
)
async def llm_context_search(
    query: str = Query(..., description="Search query text"),
    context_size: int = Query(
        default=5, description="Number of documents to analyze", ge=1, le=20
    ),
):
    """
    Perform LLM-powered context search on resumes.

    Args:
        query: Search query text.
        context_size: Number of documents to analyze (validated to 1-20).

    Returns:
        Whatever ``rag_app.llm_context_search(query, context_size)`` returns,
        passed through unchanged when it carries no ``"error"`` key.

    Raises:
        HTTPException: Status 500 with the backend's ``"error"`` value as the
            detail when the result contains an ``"error"`` key, or status 500
            with a ``"Search failed: ..."`` detail for any other exception.
    """
    try:
        # Initialize RAG application
        rag_app = initialize_rag_app()

        # Perform LLM context search
        result = rag_app.llm_context_search(query, context_size)

        if "error" in result:
            raise HTTPException(status_code=500, detail=result["error"])

        return result

    except HTTPException:
        # Raised on purpose just above; let it through untouched. Without this
        # clause the generic handler below would catch it (HTTPException is an
        # Exception) and replace the backend's detail with a re-wrapped string.
        raise
    except Exception as e:
        logger.error(f"LLM context search failed: {str(e)}")
        # Chain the cause so the original traceback is preserved.
        raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}") from e
