# In [None]:
# ===================================================================
# PHASE 5B: RAG SYSTEM WITH VECTOR DATABASE
# Retrieval Augmented Generation for intelligent job search
# ===================================================================
# Purpose: Semantic search and intelligent data retrieval for jobs
# Dependencies: chromadb, sentence-transformers, pandas
# ===================================================================

import os
import json
import logging
from typing import List, Dict, Optional, Any, Tuple
from dataclasses import dataclass, field
import pandas as pd
import numpy as np
from datetime import datetime

# ===================================================================
# LOGGING CONFIGURATION
# ===================================================================
# Configure root logging once: INFO and above go to both a log file and the
# console. The file handler is opened as UTF-8 explicitly because the log
# messages in this module contain non-ASCII symbols, which the platform
# default encoding (e.g. cp1252 on Windows) may be unable to encode.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("rag_system.log", encoding="utf-8"),
        logging.StreamHandler()
    ]
)
# Module-level logger for code that runs outside the classes below.
logger = logging.getLogger("RAGSystem")

# ===================================================================
# VECTOR DATABASE MANAGER
# ===================================================================

class VectorDatabaseManager:
    """
    Manages ChromaDB vector database for semantic search.
    Stores job embeddings for fast similarity search.

    Wraps one persistent ChromaDB client and one collection. Documents are
    handed to the collection as raw text; this class computes no embeddings
    itself.
    """
    
    def __init__(
        self,
        persist_directory: str = "./vector_db",
        collection_name: str = "job_postings"
    ):
        """
        Initialize vector database.
        
        Args:
            persist_directory: Directory to persist the database
            collection_name: Name of the collection

        Raises:
            ImportError: If the ``chromadb`` package is not installed.
            Exception: Any error raised while creating the client or the
                collection is logged and re-raised unchanged.
        """
        self.logger = logging.getLogger("VectorDB")
        self.persist_directory = persist_directory
        self.collection_name = collection_name
        
        self.logger.info("="*70)
        self.logger.info("üóÑÔ∏è  INITIALIZING VECTOR DATABASE")
        self.logger.info("="*70)
        
        # Import ChromaDB lazily so the module can be imported without it.
        try:
            import chromadb
            from chromadb.config import Settings
            self.chromadb = chromadb
        except ImportError:
            self.logger.error("‚ùå chromadb not installed!")
            self.logger.error("Install with: pip install chromadb")
            raise
        
        # Create persist directory if not exists
        os.makedirs(persist_directory, exist_ok=True)
        
        # Initialize ChromaDB client
        try:
            self.client = chromadb.PersistentClient(
                path=persist_directory,
                settings=Settings(
                    anonymized_telemetry=False,
                    allow_reset=True
                )
            )
            self.logger.info(f"‚úÖ ChromaDB client initialized at {persist_directory}")
        except Exception as e:
            self.logger.error(f"‚ùå Failed to initialize ChromaDB: {e}")
            raise
        
        # Get or create collection
        try:
            self.collection = self.client.get_or_create_collection(
                name=collection_name,
                metadata={"description": "Job postings for semantic search"}
            )
            self.logger.info(f"‚úÖ Collection '{collection_name}' ready")
            self.logger.info(f"üìä Current documents: {self.collection.count()}")
        except Exception as e:
            self.logger.error(f"‚ùå Failed to create collection: {e}")
            raise
        
        self.logger.info("="*70 + "\n")
    
    def add_documents(
        self,
        documents: List[str],
        metadatas: List[Dict[str, Any]],
        ids: List[str]
    ) -> bool:
        """
        Add documents to vector database.
        
        Args:
            documents: List of text documents to embed
            metadatas: List of metadata dictionaries
            ids: List of unique document IDs
            
        Returns:
            True if successful (including the no-op case of an empty batch),
            False if the inputs are inconsistent or the collection rejects
            them.
        """
        # The three lists are parallel; reject a mismatch up front with a
        # clear message instead of relying on the backend's error.
        if not (len(documents) == len(metadatas) == len(ids)):
            self.logger.error(
                "‚ùå Error adding documents: mismatched input lengths "
                f"(documents={len(documents)}, metadatas={len(metadatas)}, ids={len(ids)})"
            )
            return False
        
        # Nothing to add is not a failure.
        if not documents:
            self.logger.info("No documents to add; skipping")
            return True
        
        try:
            self.logger.info(f"üìù Adding {len(documents)} documents to vector DB...")
            
            # Add to collection (ChromaDB handles embedding automatically)
            self.collection.add(
                documents=documents,
                metadatas=metadatas,
                ids=ids
            )
            
            self.logger.info(f"‚úÖ Successfully added {len(documents)} documents")
            self.logger.info(f"üìä Total documents: {self.collection.count()}")
            
            return True
            
        except Exception as e:
            self.logger.error(f"‚ùå Error adding documents: {e}")
            return False
    
    def search(
        self,
        query: str,
        n_results: int = 10,
        filters: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Semantic search for similar documents.
        
        Args:
            query: Search query text
            n_results: Number of results to return
            filters: Optional metadata filters
            
        Returns:
            Dictionary with keys ``query``, ``n_results``, ``documents``,
            ``metadatas``, ``distances`` and ``ids``. On any error the same
            keys are returned with ``n_results`` 0 and empty lists, so
            callers can index the result without checking which path ran.
        """
        try:
            self.logger.info(f"üîç Searching for: '{query}' (top {n_results})")
            
            # Perform semantic search. An empty filter dict is normalised to
            # None so that "no filters" is always expressed the same way.
            results = self.collection.query(
                query_texts=[query],
                n_results=n_results,
                where=filters or None
            )
            
            # Only one query text is sent, so take the first (only) row of
            # each per-query result list.
            hit_ids = results['ids'][0]
            formatted_results = {
                'query': query,
                'n_results': len(hit_ids),
                'documents': results['documents'][0],
                'metadatas': results['metadatas'][0],
                'distances': results['distances'][0],
                'ids': hit_ids
            }
            
            self.logger.info(f"‚úÖ Found {len(results['ids'][0])} results")
            
            return formatted_results
            
        except Exception as e:
            self.logger.error(f"‚ùå Search error: {e}")
            return {
                'query': query,
                'n_results': 0,
                'documents': [],
                'metadatas': [],
                'distances': [],
                'ids': []
            }
    
    def delete_collection(self):
        """Delete the entire collection; errors are logged, not raised."""
        try:
            self.client.delete_collection(name=self.collection_name)
            self.logger.info(f"üóëÔ∏è  Deleted collection '{self.collection_name}'")
        except Exception as e:
            self.logger.error(f"‚ùå Error deleting collection: {e}")
    
    def reset_collection(self):
        """
        Reset collection (delete and recreate).

        ``self.collection`` is rebound to the recreated collection. Errors
        are logged, not raised.
        """
        try:
            self.delete_collection()
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "Job postings for semantic search"}
            )
            self.logger.info(f"üîÑ Collection '{self.collection_name}' reset")
        except Exception as e:
            self.logger.error(f"‚ùå Error resetting collection: {e}")

# ===================================================================
# JOB DATA INDEXER
# ===================================================================

class JobDataIndexer:
    """
    Indexes job data into vector database.
    Creates searchable embeddings from job postings.

    Each job row is turned into one text document plus one metadata
    dictionary, and the rows are sent to the vector database in batches.
    Missing values (None / NaN) are replaced by defaults so they never show
    up as the text "nan" or as a float NaN in what gets indexed.
    """
    
    def __init__(self, vector_db: "VectorDatabaseManager"):
        """
        Initialize job indexer.
        
        Args:
            vector_db: VectorDatabaseManager instance
        """
        self.vector_db = vector_db
        self.logger = logging.getLogger("JobIndexer")
    
    @staticmethod
    def _clean(job: pd.Series, key: str, default: Any) -> Any:
        """Return ``job[key]``, or ``default`` when it is absent, None or NaN."""
        value = job.get(key, default)
        if value is None:
            return default
        try:
            if pd.isna(value):
                return default
        except (TypeError, ValueError):
            # Non-scalar values (e.g. a list) have no single truth value
            # here; keep them as they are.
            pass
        return value
    
    @staticmethod
    def _number(job: pd.Series, key: str, default: float = 0.0) -> float:
        """Return ``job[key]`` as a float, or ``default`` if missing or not numeric."""
        value = JobDataIndexer._clean(job, key, default)
        try:
            number = float(value)
        except (TypeError, ValueError):
            return default
        # float("nan") can still come out of a string such as "nan".
        return default if pd.isna(number) else number
    
    def create_job_document(self, job: pd.Series) -> str:
        """
        Create searchable document text from job data.
        Combines all relevant fields into a rich text representation.
        
        Args:
            job: Job data as pandas Series
            
        Returns:
            Formatted document string: the individual "Label: value" parts
            joined with ". ".
        """
        doc_parts = []
        
        # Position and company
        doc_parts.append(f"Position: {self._clean(job, 'position_title', 'Unknown')}")
        doc_parts.append(f"Company: {self._clean(job, 'company_name', 'Unknown')}")
        
        # Location and work mode
        location = self._clean(job, 'location_city', 'Unknown')
        work_mode = self._clean(job, 'work_mode', 'On-site')
        doc_parts.append(f"Location: {location} ({work_mode})")
        
        # Skills (omitted when empty)
        skills = self._clean(job, 'skills_required', '')
        if skills:
            doc_parts.append(f"Required Skills: {skills}")
        
        # Experience: show the year range only when a positive maximum exists.
        exp_type = self._clean(job, 'experience_type', 'Not specified')
        exp_min = self._clean(job, 'experience_min_years', 0)
        exp_max = self._clean(job, 'experience_max_years', 0)
        if self._number(job, 'experience_max_years') > 0:
            doc_parts.append(f"Experience: {exp_min}-{exp_max} years ({exp_type})")
        else:
            doc_parts.append(f"Experience: {exp_type}")
        
        # Education (omitted when empty)
        education = self._clean(job, 'education_required', '')
        if education:
            doc_parts.append(f"Education: {education}")
        
        # Salary: the stored value is divided by 100000 for display as LPA.
        salary_max = self._number(job, 'salary_max')
        if salary_max > 0:
            salary_lpa = salary_max / 100000
            doc_parts.append(f"Salary: ‚Çπ{salary_lpa:.1f} LPA")
        
        # Priority score
        priority = self._number(job, 'final_priority_score')
        doc_parts.append(f"Priority Score: {priority:.1f}/100")
        
        return ". ".join(doc_parts)
    
    def create_job_metadata(self, job: pd.Series) -> Dict[str, Any]:
        """
        Create metadata dictionary for filtering and retrieval.
        
        Args:
            job: Job data as pandas Series
            
        Returns:
            Metadata dictionary with string, float and int values only;
            missing values become '' / 0.0 / 0 (work_mode defaults to
            'On-site').
        """
        return {
            'job_id': str(self._clean(job, 'job_id', '')),
            'company_name': str(self._clean(job, 'company_name', '')),
            'position_title': str(self._clean(job, 'position_title', '')),
            'location_city': str(self._clean(job, 'location_city', '')),
            'work_mode': str(self._clean(job, 'work_mode', 'On-site')),
            'salary_max': self._number(job, 'salary_max'),
            'priority_score': self._number(job, 'final_priority_score'),
            'experience_type': str(self._clean(job, 'experience_type', '')),
            'skill_count': int(self._number(job, 'skills_count'))
        }
    
    def index_jobs(self, jobs_df: pd.DataFrame, batch_size: int = 100) -> bool:
        """
        Index all jobs into vector database.
        
        Args:
            jobs_df: DataFrame with job postings
            batch_size: Number of jobs to index per batch
            
        Returns:
            True if every batch was indexed, False as soon as one batch
            fails (earlier batches stay indexed).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # A non-positive batch size would either divide by zero or produce
        # an empty loop that falsely reports success.
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        
        self.logger.info(f"\n{'='*70}")
        self.logger.info(f"üìö INDEXING {len(jobs_df)} JOBS INTO VECTOR DATABASE")
        self.logger.info(f"{'='*70}\n")
        
        total_jobs = len(jobs_df)
        total_batches = (total_jobs + batch_size - 1) // batch_size  # ceiling division
        
        for batch_num in range(0, total_jobs, batch_size):
            batch_jobs = jobs_df.iloc[batch_num:batch_num + batch_size]
            current_batch = batch_num // batch_size + 1
            
            self.logger.info(f"üì¶ Processing batch {current_batch}/{total_batches} ({len(batch_jobs)} jobs)...")
            
            try:
                # Prepare the three parallel lists for this batch.
                documents = []
                metadatas = []
                ids = []
                
                for idx, job in batch_jobs.iterrows():
                    documents.append(self.create_job_document(job))
                    metadatas.append(self.create_job_metadata(job))
                    
                    # Fall back to a row-based ID when job_id is missing, so
                    # several rows never share the ID "nan".
                    job_id = self._clean(job, 'job_id', f"job_{idx}")
                    ids.append(str(job_id))
                
                # Add batch to vector DB
                success = self.vector_db.add_documents(
                    documents=documents,
                    metadatas=metadatas,
                    ids=ids
                )
                
                if success:
                    self.logger.info(f"‚úÖ Batch {current_batch}/{total_batches} indexed successfully\n")
                else:
                    self.logger.error(f"‚ùå Failed to index batch {current_batch}")
                    return False
                
            except Exception as e:
                self.logger.error(f"‚ùå Error indexing batch {current_batch}: {e}")
                return False
        
        self.logger.info(f"{'='*70}")
        self.logger.info(f"‚úÖ INDEXING COMPLETE - {total_jobs} JOBS INDEXED")
        self.logger.info(f"{'='*70}\n")
        
        return True


# ===================================================================
# INTELLIGENT RETRIEVAL SYSTEM
# ===================================================================

class IntelligentRetriever:
    """
    Intelligent retrieval system using RAG.
    Combines semantic search with filtering and ranking.

    Semantic hits come from the vector database and are joined back to the
    full rows of ``jobs_df`` through the string form of the ``job_id``
    column.
    """
    
    def __init__(
        self,
        vector_db: "VectorDatabaseManager",
        jobs_df: pd.DataFrame
    ):
        """
        Initialize retriever.
        
        Args:
            vector_db: VectorDatabaseManager instance
            jobs_df: DataFrame with full job data (must have a ``job_id``
                column)
        """
        self.vector_db = vector_db
        self.jobs_df = jobs_df
        self.logger = logging.getLogger("IntelligentRetriever")
        
        # Map str(job_id) -> positional row index for .iloc lookups. If an
        # ID occurs more than once, the last occurrence wins.
        self.job_index_map = {
            str(job_id): idx 
            for idx, job_id in enumerate(jobs_df['job_id'])
        }
    
    @staticmethod
    def _contains_any(frame: pd.DataFrame, column: str, terms: List[str]) -> pd.DataFrame:
        """Keep rows whose ``column`` contains any of ``terms`` (literal, case-insensitive)."""
        import re
        
        # Terms are user text, not regexes: escape them so values such as
        # "c++" or ".net" match literally instead of breaking the pattern.
        pattern = '|'.join(re.escape(str(term)) for term in terms)
        return frame[frame[column].str.contains(pattern, case=False, na=False)]
    
    def retrieve_jobs(
        self,
        query: str,
        n_results: int = 10,
        filters: Optional[Dict[str, Any]] = None,
        rerank: bool = True
    ) -> List[Dict[str, Any]]:
        """
        Retrieve relevant jobs using semantic search.
        
        Args:
            query: Natural language query
            n_results: Number of results to return
            filters: Optional metadata filters
            rerank: Whether to rerank results by priority score
            
        Returns:
            List of job dictionaries with relevance scores. Each dictionary
            is the full job row plus ``semantic_distance``, ``search_rank``
            and ``relevance_score`` (and ``combined_score`` when ``rerank``
            is true). An empty list is returned when nothing matches or on
            error.
        """
        self.logger.info(f"\n{'‚îÄ'*70}")
        self.logger.info("üîç RETRIEVING JOBS")
        self.logger.info(f"Query: {query}")
        self.logger.info(f"Filters: {filters}")
        self.logger.info(f"{'‚îÄ'*70}")
        
        try:
            # Step 1: Semantic search. Fetch twice as many candidates as
            # requested so reranking has something to reorder.
            search_results = self.vector_db.search(
                query=query,
                n_results=n_results * 2,
                filters=filters
            )
            
            if search_results['n_results'] == 0:
                self.logger.warning("‚ö†Ô∏è  No results found")
                return []
            
            # Step 2: Enrich with full job data
            enriched_results = []
            
            for i, job_id in enumerate(search_results['ids']):
                try:
                    # Hits whose ID is not in jobs_df are skipped.
                    if job_id not in self.job_index_map:
                        continue
                    
                    idx = self.job_index_map[job_id]
                    job_data = self.jobs_df.iloc[idx].to_dict()
                    distance = search_results['distances'][i]
                    
                    # Add search metadata
                    job_data['semantic_distance'] = distance
                    job_data['search_rank'] = i + 1
                    
                    # Lower distance = higher relevance. The mapping assumes
                    # distances of roughly 0-2 and clamps at 0.
                    # NOTE(review): the actual range depends on the
                    # collection's distance metric - confirm.
                    job_data['relevance_score'] = max(0, 100 - (distance * 50))
                    
                    enriched_results.append(job_data)
                    
                except Exception as e:
                    self.logger.error(f"Error enriching job {job_id}: {e}")
                    continue
            
            # Step 3: Rerank by priority score if enabled
            if rerank and enriched_results:
                self.logger.info("üìä Reranking by combined score...")
                
                for job in enriched_results:
                    # A missing or NaN priority counts as 0; a NaN combined
                    # score would make the sort order meaningless.
                    priority = job.get('final_priority_score')
                    if priority is None or pd.isna(priority):
                        priority = 0.0
                    
                    # Combined score: 60% relevance + 40% priority
                    job['combined_score'] = (
                        job['relevance_score'] * 0.6 + 
                        priority * 0.4
                    )
                
                enriched_results.sort(key=lambda x: x['combined_score'], reverse=True)
            
            # Step 4: Limit to requested number
            final_results = enriched_results[:n_results]
            
            self.logger.info(f"‚úÖ Retrieved {len(final_results)} relevant jobs")
            self.logger.info(f"{'‚îÄ'*70}\n")
            
            return final_results
            
        except Exception as e:
            self.logger.error(f"‚ùå Retrieval error: {e}")
            return []
    
    def retrieve_by_filters(
        self,
        skills: Optional[List[str]] = None,
        locations: Optional[List[str]] = None,
        companies: Optional[List[str]] = None,
        min_salary: Optional[float] = None,
        work_mode: Optional[str] = None,
        n_results: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Retrieve jobs using structured filters.

        Within one list any term may match; the different filters are
        combined with AND. Text matches are case-insensitive substring
        matches.
        
        Args:
            skills: List of required skills
            locations: List of preferred locations
            companies: List of preferred companies
            min_salary: Minimum salary in INR
            work_mode: Work mode preference
            n_results: Number of results
            
        Returns:
            List of filtered job dictionaries, highest
            ``final_priority_score`` first; an empty list on error.
        """
        self.logger.info(f"\n{'‚îÄ'*70}")
        self.logger.info("üîß FILTERING JOBS")
        self.logger.info(f"Skills: {skills}")
        self.logger.info(f"Locations: {locations}")
        self.logger.info(f"Companies: {companies}")
        self.logger.info(f"{'‚îÄ'*70}")
        
        try:
            # Start with all jobs
            filtered_df = self.jobs_df.copy()
            
            # Apply the text filters that were actually supplied.
            text_filters = (
                ('skills_required', skills),
                ('location_city', locations),
                ('company_name', companies),
                ('work_mode', [work_mode] if work_mode else None),
            )
            for column, terms in text_filters:
                if terms:
                    filtered_df = self._contains_any(filtered_df, column, terms)
            
            # A falsy minimum (None or 0) means "no salary filter"; rows
            # without a salary are then kept.
            if min_salary:
                filtered_df = filtered_df[filtered_df['salary_max'] >= min_salary]
            
            # Sort by priority score
            filtered_df = filtered_df.sort_values('final_priority_score', ascending=False)
            
            # Limit results
            filtered_df = filtered_df.head(n_results)
            
            # Convert to list of dictionaries
            results = filtered_df.to_dict('records')
            
            self.logger.info(f"‚úÖ Filtered to {len(results)} jobs")
            self.logger.info(f"{'‚îÄ'*70}\n")
            
            return results
            
        except Exception as e:
            self.logger.error(f"‚ùå Filtering error: {e}")
            return []
    
    def get_job_by_id(self, job_id: str) -> Optional[Dict[str, Any]]:
        """
        Get specific job by ID.
        
        Args:
            job_id: Job identifier (compared by its string form, so an
                integer ID finds the same row as its string)
            
        Returns:
            Job dictionary or None
        """
        try:
            # The map is keyed by str(job_id); normalise the lookup key too.
            idx = self.job_index_map.get(str(job_id))
            if idx is None:
                return None
            return self.jobs_df.iloc[idx].to_dict()
        except Exception as e:
            self.logger.error(f"Error getting job {job_id}: {e}")
            return None


# ===================================================================
# RAG SYSTEM ORCHESTRATOR
# ===================================================================

class RAGSystem:
    """
    Complete RAG system orchestrator.
    Combines vector database, indexing, and retrieval.
    """
    
    def __init__(
        self,
        jobs_csv_path: str = "prioritized_jobs.csv",
        vector_db_path: str = "./vector_db",
        rebuild_index: bool = False
    ):
        """
        Initialize complete RAG system.

        Loads the jobs CSV, opens the vector database and builds the index
        when it is empty or when a rebuild is requested.
        
        Args:
            jobs_csv_path: Path to jobs CSV
            vector_db_path: Path to vector database
            rebuild_index: Whether to rebuild the index. An existing,
                non-empty collection is cleared first so the rebuilt index
                reflects only the current CSV.

        Raises:
            Exception: Any error from reading the CSV or from creating the
                vector database is re-raised.
        """
        self.logger = logging.getLogger("RAGSystem")
        
        self.logger.info("\n" + "="*70)
        self.logger.info("üöÄ INITIALIZING RAG SYSTEM")
        self.logger.info("="*70 + "\n")
        
        # Load jobs data
        try:
            self.jobs_df = pd.read_csv(jobs_csv_path)
            self.logger.info(f"‚úÖ Loaded {len(self.jobs_df)} jobs from {jobs_csv_path}")
        except Exception as e:
            self.logger.error(f"‚ùå Failed to load jobs: {e}")
            raise
        
        # Initialize vector database
        self.vector_db = VectorDatabaseManager(
            persist_directory=vector_db_path
        )
        
        # Initialize indexer and retriever. Both hold the manager, not the
        # collection, so they keep working after the collection is reset.
        self.indexer = JobDataIndexer(self.vector_db)
        self.retriever = IntelligentRetriever(self.vector_db, self.jobs_df)
        
        # Build or rebuild index
        existing_documents = self.vector_db.collection.count()
        if rebuild_index and existing_documents > 0:
            # Without clearing first, a rebuild would add on top of the
            # previously indexed documents instead of replacing them.
            self.vector_db.reset_collection()
            existing_documents = self.vector_db.collection.count()
        
        if rebuild_index or existing_documents == 0:
            self.logger.info("üî® Building vector index...")
            if not self.build_index():
                self.logger.error("Vector index build failed; semantic search may return incomplete results")
        else:
            self.logger.info(f"‚úÖ Using existing index ({existing_documents} documents)")
        
        self.logger.info("\n" + "="*70)
        self.logger.info("‚úÖ RAG SYSTEM READY")
        self.logger.info("="*70 + "\n")
    
    def build_index(self) -> bool:
        """
        Build vector index from jobs data.

        Returns:
            True if all jobs were indexed, False otherwise.
        """
        return self.indexer.index_jobs(self.jobs_df)
    
    def search(
        self,
        query: str,
        n_results: int = 10,
        filters: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for jobs using natural language.
        
        Args:
            query: Natural language query
            n_results: Number of results
            filters: Optional filters
            
        Returns:
            List of relevant jobs
        """
        return self.retriever.retrieve_jobs(query, n_results, filters)
    
    def filter_jobs(
        self,
        skills: Optional[List[str]] = None,
        locations: Optional[List[str]] = None,
        companies: Optional[List[str]] = None,
        min_salary: Optional[float] = None,
        work_mode: Optional[str] = None,
        n_results: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Filter jobs using structured criteria.

        Args:
            skills: List of required skills
            locations: List of preferred locations
            companies: List of preferred companies
            min_salary: Minimum salary in INR
            work_mode: Work mode preference
            n_results: Number of results

        Returns:
            List of matching job dictionaries
        """
        return self.retriever.retrieve_by_filters(
            skills, locations, companies, min_salary, work_mode, n_results
        )


# ===================================================================
# TESTING & EXAMPLES
# ===================================================================

def test_rag_system():
    """
    Smoke-test the RAG system end to end.

    Builds a RAGSystem from "prioritized_jobs.csv" with a forced index
    rebuild, then logs the results of one semantic search and one
    structured filter query. Returns early (after logging the error) when
    the system cannot be initialised.
    """
    rule = "="*70
    logger.info("\n" + rule)
    logger.info("üß™ TESTING RAG SYSTEM")
    logger.info(rule + "\n")
    
    # Bring the whole system up; nothing else can run without it.
    try:
        system = RAGSystem(
            jobs_csv_path="prioritized_jobs.csv",
            rebuild_index=True  # Set to True first time
        )
    except Exception as init_error:
        logger.error(f"Failed to initialize RAG system: {init_error}")
        return
    
    # Scenario 1: free-text semantic query
    logger.info("\nüìù TEST 1: Semantic Search")
    semantic_hits = system.search(
        query="Python developer with machine learning experience in Bangalore",
        n_results=5
    )
    
    logger.info("\nTop 5 Results:")
    for position, hit in enumerate(semantic_hits, start=1):
        logger.info(f"\n{position}. {hit['position_title']} at {hit['company_name']}")
        logger.info(f"   Location: {hit['location_city']}")
        # Only the first 60 characters of the skills text are shown.
        logger.info(f"   Skills: {str(hit['skills_required'])[:60]}...")
        logger.info(f"   Relevance: {hit['relevance_score']:.1f}/100")
        logger.info(f"   Priority: {hit['final_priority_score']:.1f}/100")
    
    # Scenario 2: structured filter query
    logger.info("\n\nüìù TEST 2: Structured Filtering")
    filtered_hits = system.filter_jobs(
        skills=["python", "sql"],
        locations=["Bangalore", "Remote"],
        min_salary=500000,
        n_results=5
    )
    
    logger.info(f"\nFiltered Results ({len(filtered_hits)} jobs):")
    for position, hit in enumerate(filtered_hits, start=1):
        logger.info(f"\n{position}. {hit['position_title']} at {hit['company_name']}")
        logger.info(f"   Salary: ‚Çπ{hit['salary_max']/100000:.1f} LPA")
        logger.info(f"   Priority: {hit['final_priority_score']:.1f}/100")


# ===================================================================
# MAIN EXECUTION
# ===================================================================

if __name__ == "__main__":
    # Run tests
    test_rag_system()
    
    # Interactive search (optional): read queries from stdin until the user
    # types 'quit'/'exit', presses Ctrl+C, or stdin is closed.
    print("\n" + "="*70)
    print("üîç INTERACTIVE SEMANTIC SEARCH")
    print("="*70)
    print("Enter search queries (or 'quit' to exit)\n")
    
    try:
        # The RAG_JOBS_CSV environment variable overrides the CSV location;
        # without it the original machine-specific path is used.
        default_csv_path = r"D:\Projects By Month\November 2025\Placement Mail Analysis System\.venv\Phase_scripts\Phase 4\prioritized_jobs.csv"
        csv_path = os.environ.get("RAG_JOBS_CSV", default_csv_path)
        rag = RAGSystem(jobs_csv_path=csv_path)
        
        while True:
            query = input("\nüîé Search: ").strip()
            
            if query.lower() in ['quit', 'exit']:
                break
            
            # Ignore empty input and prompt again.
            if not query:
                continue
            
            results = rag.search(query, n_results=3)
            
            print(f"\nüìä Found {len(results)} results:\n")
            for i, job in enumerate(results, 1):
                print(f"{i}. {job['position_title']} at {job['company_name']}")
                print(f"   üìç {job['location_city']} | üí∞ ‚Çπ{job['salary_max']/100000:.1f} LPA")
                print(f"   ‚≠ê Relevance: {job['relevance_score']:.1f}/100\n")
    
    except (KeyboardInterrupt, EOFError):
        # EOFError: stdin was closed or is not interactive; leave the loop
        # as cleanly as on Ctrl+C instead of logging it as an error.
        print("\n\nüëã Goodbye!")
    except Exception as e:
        logger.error(f"Error: {e}")