# In [None]:
# ===================================================================
# PHASE 5C: PDF INTEGRATION & MANAGEMENT SYSTEM
# Complete PDF handling for job attachments
# ===================================================================
# Purpose: Link PDFs to jobs, extract text, summarize, and manage
# Dependencies: PyPDF2 (or pdfplumber as a fallback), google-generativeai, pandas
# ===================================================================

import os
import json
import logging
import shutil
from pathlib import Path
from typing import List, Dict, Optional, Any, Tuple
from dataclasses import dataclass, field
import pandas as pd
from datetime import datetime
import hashlib

# ===================================================================
# LOGGING CONFIGURATION
# ===================================================================
# Route every record at INFO and above to both a log file and the console,
# then expose the module-wide logger used by the helpers below.
logging.basicConfig(
    handlers=[logging.FileHandler("pdf_manager.log"), logging.StreamHandler()],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("PDFManager")

# ===================================================================
# PDF METADATA STRUCTURE
# ===================================================================

@dataclass
class PDFMetadata:
    """
    Record describing a single PDF attachment.

    Holds the file's identity and size, the optional job / email it is
    linked to, and flags that track how far processing has progressed.
    """
    pdf_id: str
    file_path: str
    file_name: str
    file_size: int  # size in bytes
    job_id: Optional[str] = None
    message_id: Optional[str] = None
    email_subject: Optional[str] = None
    page_count: int = 0
    text_extracted: bool = False
    summary_generated: bool = False
    # ISO-8601 timestamp captured when the record is instantiated
    created_at: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dict, including a derived ``file_size_mb``."""
        bytes_per_mb = 1024 * 1024
        size_in_mb = round(self.file_size / bytes_per_mb, 2)
        return dict(
            pdf_id=self.pdf_id,
            file_path=self.file_path,
            file_name=self.file_name,
            file_size=self.file_size,
            file_size_mb=size_in_mb,
            job_id=self.job_id,
            message_id=self.message_id,
            email_subject=self.email_subject,
            page_count=self.page_count,
            text_extracted=self.text_extracted,
            summary_generated=self.summary_generated,
            created_at=self.created_at,
        )


# ===================================================================
# PDF TEXT EXTRACTOR
# ===================================================================

class PDFTextExtractor:
    """
    Extract text content from PDF files.

    The backend is picked once, at construction time: PyPDF2 when it can be
    imported, otherwise pdfplumber. When neither is importable
    ``pdf_library`` stays ``None`` and every extraction returns ``("", 0)``.
    """

    def __init__(self):
        """Select the first available PDF backend and remember its module."""
        self.logger = logging.getLogger("PDFExtractor")

        # Name of the backend in use; None means no backend could be imported.
        self.pdf_library = None

        # Preferred backend.
        try:
            import PyPDF2
            self.PyPDF2 = PyPDF2
            self.pdf_library = "PyPDF2"
            self.logger.info("‚úÖ Using PyPDF2 for PDF extraction")
        except ImportError:
            pass

        # Fallback backend, only tried when PyPDF2 is missing.
        if not self.pdf_library:
            try:
                import pdfplumber
                self.pdfplumber = pdfplumber
                self.pdf_library = "pdfplumber"
                self.logger.info("‚úÖ Using pdfplumber for PDF extraction")
            except ImportError:
                pass

        if not self.pdf_library:
            self.logger.warning("‚ö†Ô∏è No PDF library available!")
            self.logger.warning("Install with: pip install PyPDF2 or pip install pdfplumber")

    def extract_text(self, pdf_path: str) -> Tuple[str, int]:
        """
        Extract text from a PDF file with the selected backend.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Tuple of (extracted_text, page_count). ``("", 0)`` is returned when
            no backend is available, the backend name is not recognised, or
            extraction fails; this method never returns ``None``.
        """
        if not self.pdf_library:
            self.logger.error("‚ùå No PDF library available for extraction")
            return "", 0

        backends = {
            "PyPDF2": self._extract_with_pypdf2,
            "pdfplumber": self._extract_with_pdfplumber,
        }
        backend = backends.get(self.pdf_library)
        if backend is None:
            # Previously this case fell off the end of the function and
            # returned None, breaking callers that unpack the result.
            self.logger.error(
                f"‚ùå Error extracting text from {pdf_path}: "
                f"unsupported PDF library {self.pdf_library!r}"
            )
            return "", 0

        try:
            return backend(pdf_path)
        except Exception as e:
            self.logger.error(f"‚ùå Error extracting text from {pdf_path}: {e}")
            return "", 0

    def _collect_page_text(self, pages) -> str:
        """Join the text of every page that yields some, skipping pages that fail."""
        text_parts = []
        for page_num, page in enumerate(pages):
            try:
                text = page.extract_text()
            except Exception as e:
                # One unreadable page should not discard the rest of the document.
                self.logger.warning(f"‚ö†Ô∏è Error on page {page_num}: {e}")
                continue
            if text:
                text_parts.append(text)
        return "\n\n".join(text_parts)

    def _extract_with_pypdf2(self, pdf_path: str) -> Tuple[str, int]:
        """Extract text using PyPDF2; returns ``("", 0)`` if the file cannot be read."""
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = self.PyPDF2.PdfReader(file)
                page_count = len(pdf_reader.pages)
                # Pages must be read while the underlying file is still open.
                full_text = self._collect_page_text(pdf_reader.pages)

            self.logger.info(f"‚úÖ Extracted {len(full_text)} chars from {page_count} pages")
            return full_text, page_count

        except Exception as e:
            self.logger.error(f"‚ùå PyPDF2 extraction failed: {e}")
            return "", 0

    def _extract_with_pdfplumber(self, pdf_path: str) -> Tuple[str, int]:
        """Extract text using pdfplumber; returns ``("", 0)`` if the file cannot be read."""
        try:
            with self.pdfplumber.open(pdf_path) as pdf:
                page_count = len(pdf.pages)
                full_text = self._collect_page_text(pdf.pages)

            self.logger.info(f"‚úÖ Extracted {len(full_text)} chars from {page_count} pages")
            return full_text, page_count

        except Exception as e:
            self.logger.error(f"‚ùå pdfplumber extraction failed: {e}")
            return "", 0


# ===================================================================
# PDF SUMMARIZER (using Gemini)
# ===================================================================

class PDFSummarizer:
    """
    Generate structured summaries of PDF text using Gemini.

    When no API key is available or the model cannot be initialised,
    ``self.model`` is ``None`` and ``summarize_job_pdf`` returns a placeholder
    dictionary instead of calling the API.
    """

    def __init__(self, gemini_api_key: Optional[str] = None):
        """
        Initialize the Gemini model used for summarization.

        Args:
            gemini_api_key: Gemini API key; falls back to the
                ``GEMINI_API_KEY`` environment variable when omitted.

        Raises:
            ImportError: If ``google-generativeai`` is not installed.
        """
        self.logger = logging.getLogger("GeminiClient")

        # The import is deferred so the rest of the module loads without it.
        try:
            import google.generativeai as genai
            self.genai = genai
        except ImportError:
            self.logger.error("‚ùå google-generativeai not installed!")
            raise

        # Explicit argument wins over the environment variable.
        self.api_key = gemini_api_key or os.getenv('GEMINI_API_KEY')
        if not self.api_key:
            self.logger.warning("‚ö†Ô∏è No Gemini API key - summaries unavailable")
            self.model = None
            return

        # Configure and initialize model
        try:
            self.genai.configure(api_key=self.api_key)
            self.model = self.genai.GenerativeModel('gemini-1.5-flash')
            self.logger.info("‚úÖ Gemini summarizer initialized")
        except Exception as e:
            self.logger.error(f"‚ùå Failed to initialize Gemini: {e}")
            self.model = None

    @staticmethod
    def _fallback_summary(message: str) -> Dict[str, Any]:
        """Build the placeholder result returned when no real summary exists."""
        return {
            'summary': message,
            'key_points': [],
            'requirements': [],
            'benefits': []
        }

    @staticmethod
    def _parse_summary_json(raw_text: str) -> Dict[str, Any]:
        """
        Pull the JSON object out of a model reply.

        Takes everything between the first ``{`` and the last ``}``, so code
        fences and any text before or after the object are ignored.

        Raises:
            ValueError: If the reply contains no JSON object (this includes
                ``json.JSONDecodeError`` for malformed JSON).
        """
        start = raw_text.find('{')
        end = raw_text.rfind('}')
        if start == -1 or end < start:
            raise ValueError("no JSON object found in model response")

        parsed = json.loads(raw_text[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError("model response is not a JSON object")
        return parsed

    def summarize_job_pdf(self, pdf_text: str, max_length: int = 500) -> Dict[str, Any]:
        """
        Generate structured summary of job PDF.

        Args:
            pdf_text: Extracted PDF text
            max_length: Maximum summary length. NOTE: accepted for interface
                compatibility but not currently applied to the prompt or the
                result.

        Returns:
            Dictionary that always contains ``summary``, ``key_points``,
            ``requirements`` and ``benefits`` (plus whatever other keys the
            model returned). On failure ``summary`` holds a placeholder
            message and the lists are empty.
        """
        if not self.model:
            self.logger.warning("‚ö†Ô∏è Summarizer not available")
            return self._fallback_summary('Summary unavailable - Gemini not configured')

        try:
            # Only the first 15000 characters are sent (Gemini has token limits)
            text_to_summarize = pdf_text[:15000]

            prompt = f"""Analyze this job description PDF and provide a structured summary.

PDF Content:
{text_to_summarize}

Provide a JSON response with:
{{
  "summary": "Brief 2-3 sentence overview",
  "key_points": ["Main point 1", "Main point 2", ...],
  "requirements": ["Requirement 1", "Requirement 2", ...],
  "benefits": ["Benefit 1", "Benefit 2", ...],
  "application_process": "How to apply (if mentioned)"
}}

Focus on:
- Job role and responsibilities
- Required skills and qualifications
- Salary and benefits (if mentioned)
- Application deadline and process
- Company culture (if mentioned)

Return ONLY valid JSON, no markdown or other text."""

            response = self.model.generate_content(prompt)
            summary_data = self._parse_summary_json(response.text)

            # Guarantee the same keys the fallback results provide, so callers
            # can index the result without checking which path produced it.
            summary_data.setdefault('summary', '')
            for list_key in ('key_points', 'requirements', 'benefits'):
                summary_data.setdefault(list_key, [])

            self.logger.info("‚úÖ PDF summary generated successfully")
            return summary_data

        except Exception as e:
            self.logger.error(f"‚ùå Error generating summary: {e}")
            return self._fallback_summary('Error generating summary')


# ===================================================================
# PDF MANAGER
# ===================================================================

class PDFManager:
    """
    Complete PDF management system.

    Keeps a registry of PDF files (``self.metadata``, persisted as JSON),
    links them to jobs, and extracts / summarizes their text with in-memory
    caches for both.
    """

    # Texts PDFSummarizer puts in the 'summary' field when it could not
    # produce a real summary. Such results must not be cached or flagged as
    # generated, otherwise the PDF would never be retried.
    _PLACEHOLDER_SUMMARIES = frozenset({
        'Summary unavailable - Gemini not configured',
        'Error generating summary',
    })

    def __init__(
        self,
        pdf_directory: str = "./pdfs",
        metadata_file: str = "pdf_metadata.json",
        gemini_api_key: Optional[str] = None
    ):
        """
        Initialize PDF manager.

        Args:
            pdf_directory: Directory containing PDF files (created, including
                missing parents, if it does not exist)
            metadata_file: Path to metadata JSON file
            gemini_api_key: Gemini API key for summarization
        """
        self.logger = logging.getLogger("PDFManager")
        self.pdf_directory = Path(pdf_directory)
        self.metadata_file = Path(metadata_file)

        self.logger.info("="*70)
        self.logger.info("üìÑ INITIALIZING PDF MANAGER")
        self.logger.info("="*70)

        # parents=True so a nested, not-yet-existing directory also works.
        self.pdf_directory.mkdir(parents=True, exist_ok=True)
        self.logger.info(f"‚úÖ PDF directory: {self.pdf_directory}")

        # Initialize components
        self.extractor = PDFTextExtractor()
        self.summarizer = PDFSummarizer(gemini_api_key)

        # Registry of known PDFs, keyed by pdf_id
        self.metadata: Dict[str, "PDFMetadata"] = {}
        self.load_metadata()

        # PDF text cache (in-memory)
        self.text_cache: Dict[str, str] = {}

        # PDF summaries cache
        self.summary_cache: Dict[str, Dict[str, Any]] = {}

        self.logger.info("="*70 + "\n")

    def load_metadata(self):
        """
        Load PDF metadata from the JSON file into ``self.metadata``.

        Keys in a stored record that are not ``PDFMetadata`` fields (such as
        the derived ``file_size_mb`` written by ``to_dict``) are ignored, so a
        file produced by ``save_metadata`` can be read back. Errors are logged
        and leave whatever was loaded so far in place.
        """
        if not self.metadata_file.exists():
            self.logger.info("üìù No existing metadata found - starting fresh")
            return

        from dataclasses import fields as dataclass_fields

        try:
            known_fields = {f.name for f in dataclass_fields(PDFMetadata)}

            with open(self.metadata_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            for pdf_id, metadata_dict in data.items():
                init_kwargs = {
                    key: value
                    for key, value in metadata_dict.items()
                    if key in known_fields
                }
                self.metadata[pdf_id] = PDFMetadata(**init_kwargs)

            self.logger.info(f"‚úÖ Loaded metadata for {len(self.metadata)} PDFs")
        except Exception as e:
            self.logger.error(f"‚ùå Error loading metadata: {e}")

    def save_metadata(self):
        """Write ``self.metadata`` to the JSON file; failures are logged, not raised."""
        try:
            data = {
                pdf_id: metadata.to_dict()
                for pdf_id, metadata in self.metadata.items()
            }
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
            self.logger.info(f"üíæ Saved metadata for {len(self.metadata)} PDFs")
        except Exception as e:
            self.logger.error(f"‚ùå Error saving metadata: {e}")

    def scan_directory(self) -> List["PDFMetadata"]:
        """
        Scan PDF directory (recursively) and register all PDF files.

        Files are matched by a case-insensitive ``.pdf`` suffix and visited in
        sorted path order. A file whose generated ID is already registered is
        skipped.

        Returns:
            List of PDFMetadata objects for new files; an empty list if the
            scan fails.
        """
        self.logger.info(f"\n{'‚îÄ'*70}")
        self.logger.info("üîç SCANNING PDF DIRECTORY")
        self.logger.info(f"{'‚îÄ'*70}")

        new_pdfs = []

        try:
            # Suffix comparison is lower-cased so "JD.PDF" is found on
            # case-sensitive filesystems too.
            pdf_files = sorted(
                path for path in self.pdf_directory.rglob("*")
                if path.is_file() and path.suffix.lower() == ".pdf"
            )
            self.logger.info(f"üìÇ Found {len(pdf_files)} PDF files")

            for pdf_path in pdf_files:
                pdf_id = self._generate_pdf_id(pdf_path)

                # Skip if already registered
                if pdf_id in self.metadata:
                    continue

                metadata = PDFMetadata(
                    pdf_id=pdf_id,
                    file_path=str(pdf_path),
                    file_name=pdf_path.name,
                    file_size=pdf_path.stat().st_size
                )

                self.metadata[pdf_id] = metadata
                new_pdfs.append(metadata)

                self.logger.info(f"  ‚úÖ Registered: {pdf_path.name}")

            # Persist only when something changed
            if new_pdfs:
                self.save_metadata()
                self.logger.info(f"\n‚úÖ Registered {len(new_pdfs)} new PDFs")
            else:
                self.logger.info(f"\n‚úÖ All PDFs already registered")

            self.logger.info(f"{'‚îÄ'*70}\n")

            return new_pdfs

        except Exception as e:
            self.logger.error(f"‚ùå Error scanning directory: {e}")
            return []

    def _generate_pdf_id(self, pdf_path: Path) -> str:
        """
        Generate an ID for a PDF from the MD5 of its content.

        The ID is ``PDF_`` plus the first 12 hex digits of the digest. If the
        file cannot be read (or MD5 is unavailable), falls back to
        ``PDF_<file stem>``.
        """
        try:
            digest = hashlib.md5()
            with open(pdf_path, 'rb') as f:
                # Hash in 1 MiB chunks so large files are not loaded whole.
                for chunk in iter(lambda: f.read(1024 * 1024), b''):
                    digest.update(chunk)
            return f"PDF_{digest.hexdigest()[:12]}"
        except (OSError, ValueError):
            # Fallback to filename-based ID
            return f"PDF_{pdf_path.stem}"

    def link_pdf_to_job(
        self,
        pdf_id: str,
        job_id: str,
        message_id: Optional[str] = None,
        email_subject: Optional[str] = None
    ) -> bool:
        """
        Link a PDF to a job posting and persist the change.

        Args:
            pdf_id: PDF identifier
            job_id: Job posting identifier
            message_id: Original email message ID
            email_subject: Email subject line

        Returns:
            True if successful, False if the PDF is unknown or an error occurs
        """
        try:
            if pdf_id not in self.metadata:
                self.logger.error(f"‚ùå PDF {pdf_id} not found")
                return False

            metadata = self.metadata[pdf_id]
            metadata.job_id = job_id
            metadata.message_id = message_id
            metadata.email_subject = email_subject

            self.save_metadata()
            self.logger.info(f"‚úÖ Linked PDF {pdf_id} to job {job_id}")

            return True

        except Exception as e:
            self.logger.error(f"‚ùå Error linking PDF: {e}")
            return False

    def extract_pdf_text(self, pdf_id: str, force: bool = False) -> Optional[str]:
        """
        Extract text from PDF, using the in-memory cache when possible.

        Args:
            pdf_id: PDF identifier
            force: Force re-extraction even if cached

        Returns:
            Extracted text, or None if the PDF is unknown, yields no text, or
            extraction fails
        """
        try:
            # Check cache first
            if not force and pdf_id in self.text_cache:
                self.logger.info(f"üìã Using cached text for {pdf_id}")
                return self.text_cache[pdf_id]

            if pdf_id not in self.metadata:
                self.logger.error(f"‚ùå PDF {pdf_id} not found")
                return None

            metadata = self.metadata[pdf_id]

            self.logger.info(f"üìÑ Extracting text from {metadata.file_name}...")
            text, page_count = self.extractor.extract_text(metadata.file_path)

            if not text:
                self.logger.warning(f"‚ö†Ô∏è No text extracted from {metadata.file_name}")
                return None

            # Record the outcome, persist it, then cache the text itself
            metadata.page_count = page_count
            metadata.text_extracted = True
            self.save_metadata()

            self.text_cache[pdf_id] = text

            self.logger.info(f"‚úÖ Extracted {len(text)} characters from {page_count} pages")
            return text

        except Exception as e:
            self.logger.error(f"‚ùå Error extracting text: {e}")
            return None

    def generate_pdf_summary(self, pdf_id: str, force: bool = False) -> Optional[Dict[str, Any]]:
        """
        Generate AI summary of PDF content.

        A placeholder result from the summarizer (Gemini not configured, or an
        error while summarizing) is returned to the caller but is neither
        cached nor recorded as ``summary_generated``, so a later call retries.

        Args:
            pdf_id: PDF identifier
            force: Force regeneration even if cached

        Returns:
            Summary dictionary, or None if no text is available or an error
            occurs
        """
        try:
            # Check cache
            if not force and pdf_id in self.summary_cache:
                self.logger.info(f"üìã Using cached summary for {pdf_id}")
                return self.summary_cache[pdf_id]

            # Extract text if needed
            text = self.extract_pdf_text(pdf_id)
            if not text:
                return None

            self.logger.info(f"ü§ñ Generating AI summary for {pdf_id}...")
            summary = self.summarizer.summarize_job_pdf(text)

            if not summary:
                return None

            if isinstance(summary, dict) and summary.get('summary') in self._PLACEHOLDER_SUMMARIES:
                self.logger.warning(
                    f"Summary for {pdf_id} is a placeholder - not cached, will be retried"
                )
                return summary

            # Update metadata
            if pdf_id in self.metadata:
                self.metadata[pdf_id].summary_generated = True
                self.save_metadata()

            # Cache summary
            self.summary_cache[pdf_id] = summary

            self.logger.info(f"‚úÖ Summary generated successfully")
            return summary

        except Exception as e:
            self.logger.error(f"‚ùå Error generating summary: {e}")
            return None

    def get_pdfs_for_job(self, job_id: str) -> List["PDFMetadata"]:
        """
        Get all PDFs linked to a job.

        Args:
            job_id: Job identifier

        Returns:
            List of PDFMetadata objects whose ``job_id`` equals ``job_id``
        """
        return [
            metadata for metadata in self.metadata.values()
            if metadata.job_id == job_id
        ]

    def get_pdf_info(self, pdf_id: str) -> Optional[Dict[str, Any]]:
        """
        Get complete information about a PDF.

        Args:
            pdf_id: PDF identifier

        Returns:
            The metadata dict extended with ``has_text`` (and ``text_preview``
            when text is cached) and ``has_summary`` (and ``summary`` when one
            is cached); None if the PDF is unknown
        """
        if pdf_id not in self.metadata:
            return None

        info = self.metadata[pdf_id].to_dict()

        # Add a preview of the cached text, with an ellipsis only when the
        # text was actually cut off.
        if pdf_id in self.text_cache:
            cached_text = self.text_cache[pdf_id]
            info['has_text'] = True
            info['text_preview'] = cached_text[:200] + ("..." if len(cached_text) > 200 else "")
        else:
            info['has_text'] = False

        # Add summary if available
        if pdf_id in self.summary_cache:
            info['has_summary'] = True
            info['summary'] = self.summary_cache[pdf_id]
        else:
            info['has_summary'] = False

        return info

    def process_all_pdfs(self, extract_text: bool = True, generate_summaries: bool = True):
        """
        Process every registered PDF.

        Args:
            extract_text: Whether to extract text for PDFs not yet flagged
                ``text_extracted``
            generate_summaries: Whether to generate summaries for PDFs not yet
                flagged ``summary_generated``
        """
        self.logger.info(f"\n{'='*70}")
        self.logger.info("‚öôÔ∏è  PROCESSING ALL PDFs")
        self.logger.info(f"{'='*70}\n")

        total_pdfs = len(self.metadata)
        processed = 0

        # Iterate over a snapshot so the registry can be touched while looping.
        for pdf_id, metadata in list(self.metadata.items()):
            self.logger.info(f"üìÑ Processing {metadata.file_name}...")

            try:
                if extract_text and not metadata.text_extracted:
                    self.extract_pdf_text(pdf_id)

                if generate_summaries and not metadata.summary_generated:
                    self.generate_pdf_summary(pdf_id)

                processed += 1
                self.logger.info(f"‚úÖ Processed {processed}/{total_pdfs}\n")

            except Exception as e:
                self.logger.error(f"‚ùå Error processing {metadata.file_name}: {e}\n")
                continue

        self.logger.info(f"{'='*70}")
        self.logger.info(f"‚úÖ PROCESSING COMPLETE - {processed}/{total_pdfs} PDFs")
        self.logger.info(f"{'='*70}\n")


# ===================================================================
# PDF-JOB LINKER
# ===================================================================

class PDFJobLinker:
    """
    Automatically link PDFs to jobs based on email MessageId.

    A PDF is matched to a job when the job's ``email_id`` occurs in the PDF's
    file name.
    """

    def __init__(
        self,
        pdf_manager: "PDFManager",
        emails_csv: str = "placement_emails.csv"
    ):
        """
        Initialize PDF-Job linker.

        Args:
            pdf_manager: PDFManager instance
            emails_csv: Path to emails CSV. If it cannot be read, an empty
                DataFrame is used and linking proceeds without subjects.
        """
        self.pdf_manager = pdf_manager
        self.logger = logging.getLogger("PDFJobLinker")

        # Load emails data
        try:
            self.emails_df = pd.read_csv(emails_csv)
            self.logger.info(f"‚úÖ Loaded {len(self.emails_df)} emails")
        except Exception as e:
            self.logger.error(f"‚ùå Failed to load emails: {e}")
            self.emails_df = pd.DataFrame()

    @staticmethod
    def _is_null(value) -> bool:
        """Return True for null scalars (None, NaN, NaT, pd.NA)."""
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Non-scalar values cannot be reduced to one flag; treat as present.
            return False

    @staticmethod
    def _as_match_key(value) -> str:
        """Render an ID as the string to look for in file names."""
        # An integer column that contains blanks is read as float (123.0);
        # drop the spurious ".0" so it can still match "123_doc.pdf".
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value)

    def _build_message_map(self, jobs_df: pd.DataFrame) -> Dict[str, Any]:
        """Map each usable ``email_id`` (as a string) to its ``job_id``."""
        message_to_job: Dict[str, Any] = {}
        for _, job in jobs_df.iterrows():
            email_id = job.get('email_id', '')
            job_id = job.get('job_id', '')
            if self._is_null(email_id) or self._is_null(job_id):
                continue
            if not email_id or not job_id:
                continue
            message_to_job[self._as_match_key(email_id)] = job_id
        return message_to_job

    def _build_subject_index(self) -> Dict[str, Optional[str]]:
        """
        Map each ``MessageId`` in the emails data to its ``Subject``.

        Returns an empty mapping when the emails data has no ``MessageId``
        column (for example because the CSV failed to load). The first row
        wins when a MessageId is repeated; a null subject becomes None.
        """
        if 'MessageId' not in self.emails_df.columns:
            return {}

        subjects: Dict[str, Optional[str]] = {}
        for _, email in self.emails_df.iterrows():
            raw_id = email['MessageId']
            if self._is_null(raw_id):
                continue
            key = self._as_match_key(raw_id)
            if key in subjects:
                continue
            subject = email.get('Subject', '')
            if self._is_null(subject):
                subjects[key] = None
            else:
                subjects[key] = subject if isinstance(subject, str) else str(subject)
        return subjects

    def auto_link_pdfs(self, jobs_df: pd.DataFrame) -> int:
        """
        Automatically link PDFs to jobs based on MessageId.

        PDFs that already have a ``job_id`` are left alone. When several
        message IDs occur in one file name, the longest one is used.

        Args:
            jobs_df: DataFrame with job postings; the ``email_id`` and
                ``job_id`` columns are read

        Returns:
            Number of PDFs linked
        """
        self.logger.info(f"\n{'='*70}")
        self.logger.info("üîó AUTO-LINKING PDFs TO JOBS")
        self.logger.info(f"{'='*70}\n")

        linked_count = 0

        try:
            message_to_job = self._build_message_map(jobs_df)
            subject_by_message = self._build_subject_index()

            # Longest IDs first, so "MSG123" is preferred over "MSG12" for
            # "MSG123_offer.pdf". sorted() is stable, so ties keep row order.
            candidates = sorted(
                message_to_job.items(),
                key=lambda item: len(item[0]),
                reverse=True,
            )

            for pdf_id, metadata in list(self.pdf_manager.metadata.items()):
                if metadata.job_id:
                    continue  # Already linked

                filename = metadata.file_name

                for message_id, job_id in candidates:
                    if message_id not in filename:
                        continue

                    success = self.pdf_manager.link_pdf_to_job(
                        pdf_id=pdf_id,
                        job_id=job_id,
                        message_id=message_id,
                        email_subject=subject_by_message.get(message_id)
                    )

                    if success:
                        linked_count += 1
                        self.logger.info(f"  ‚úÖ Linked {filename} ‚Üí {job_id}")
                    # Only the best match is attempted for each PDF
                    break

            self.logger.info(f"\n{'='*70}")
            self.logger.info(f"‚úÖ LINKING COMPLETE - {linked_count} PDFs linked")
            self.logger.info(f"{'='*70}\n")

            return linked_count

        except Exception as e:
            self.logger.error(f"‚ùå Error in auto-linking: {e}")
            return linked_count


# ===================================================================
# TESTING & EXAMPLES
# ===================================================================

def test_pdf_system(pdf_dir: Optional[str] = None):
    """
    Smoke-test the PDF management system against a directory of PDFs.

    Scans the directory, then extracts text from and summarizes the first
    registered PDF, logging the outcome of each step.

    Args:
        pdf_dir: Directory to scan. Defaults to the original development
            machine's attachments folder, so pass a path explicitly when
            running anywhere else.
    """
    logger.info("\n" + "="*70)
    logger.info("üß™ TESTING PDF SYSTEM")
    logger.info("="*70 + "\n")

    if pdf_dir is None:
        pdf_dir = r"D:\Projects By Month\November 2025\Placement Mail Analysis System\.venv\Phase_scripts\Phase 1\attachments"

    # Initialize PDF manager
    pdf_manager = PDFManager(
        pdf_directory=pdf_dir,
        gemini_api_key=os.getenv('GEMINI_API_KEY')
    )

    # Register whatever is in the directory; the total is read from the registry
    pdf_manager.scan_directory()
    logger.info(f"‚úÖ Found {len(pdf_manager.metadata)} total PDFs")

    if not pdf_manager.metadata:
        return

    # Process one PDF as example
    pdf_id = next(iter(pdf_manager.metadata))
    logger.info(f"\nüìÑ Testing with PDF: {pdf_id}")

    # Extract text
    text = pdf_manager.extract_pdf_text(pdf_id)
    if text:
        logger.info(f"‚úÖ Text extracted: {len(text)} characters")

    # Generate summary
    summary = pdf_manager.generate_pdf_summary(pdf_id)
    if summary:
        logger.info(f"‚úÖ Summary generated")
        logger.info(f"Summary: {summary.get('summary', 'N/A')}")


# ===================================================================
# MAIN EXECUTION
# ===================================================================

# Run the smoke test only when this file is executed directly, not on import.
if __name__ == "__main__":
    test_pdf_system()