# In [1]:
"""
Phase 3: Entity Structuring Pipeline with Incremental Processing
Refactored to use configuration file and support incremental processing.
"""

from __future__ import annotations
import os
import json
import re
import logging
from datetime import datetime, timedelta
from typing import List, Dict, Set, Any, Optional, Tuple
from collections import defaultdict, Counter
from dataclasses import dataclass, field, asdict
import pandas as pd
import numpy as np

from config_manager import ConfigurationManager

# ===================================================================
# LOGGING CONFIGURATION
# ===================================================================
# Module-level logger named "EntityStructuring". No handlers or levels are
# configured in this part of the file; the classes below each create their own
# named logger instead of using this one.
logger = logging.getLogger("EntityStructuring")


# ===================================================================
# DATA CLASSES FOR STRUCTURED ENTITIES
# ===================================================================

@dataclass
class Company:
    """Company entity: the raw name, its canonical form and a confidence score."""
    name: str
    canonical_name: str
    confidence: float = 0.0

    def __post_init__(self):
        """Reject empty names, fill a missing canonical name, clamp confidence to [0, 1].

        Raises:
            ValueError: if ``name`` is empty.
        """
        if not self.name:
            raise ValueError("Company name cannot be empty")
        if not self.canonical_name:
            # No canonical form supplied: fall back to the trimmed raw name.
            self.canonical_name = self._normalize_name()
        upper_bounded = min(1.0, self.confidence)
        self.confidence = max(0.0, upper_bounded)

    def _normalize_name(self) -> str:
        """Return the raw name with surrounding whitespace removed."""
        # Will be loaded from config
        trimmed = self.name.strip()
        return trimmed


@dataclass
class Position:
    """Job position: a title, a seniority level and an extraction confidence."""
    title: str
    level: str = "Not Specified"
    confidence: float = 0.0

    def __post_init__(self):
        """Reject empty titles, title-case the title, clamp confidence to [0, 1].

        Raises:
            ValueError: if ``title`` is empty.
        """
        if not self.title:
            raise ValueError("Position title cannot be empty")
        # str.title() capitalises each word and lowercases the rest ("SDE" -> "Sde").
        title_cased = self.title.title()
        self.title = title_cased
        upper_bounded = min(1.0, self.confidence)
        self.confidence = max(0.0, upper_bounded)


@dataclass
class Requirements:
    """Skills, education and experience range required by a job."""
    skills: List[str] = field(default_factory=list)
    education: List[str] = field(default_factory=list)
    experience_min: int = 0
    experience_max: int = 0
    experience_type: str = "Not Specified"

    def __post_init__(self):
        """Trim every entry, drop blank ones, title-case skills and upper-case education."""
        cleaned_skills = []
        for raw_skill in self.skills:
            trimmed = raw_skill.strip()
            if trimmed:
                # Title-casing also lowercases inner capitals ("SQL" -> "Sql").
                cleaned_skills.append(trimmed.title())
        self.skills = cleaned_skills

        cleaned_education = []
        for raw_degree in self.education:
            trimmed = raw_degree.strip()
            if trimmed:
                cleaned_education.append(trimmed.upper())
        self.education = cleaned_education


@dataclass
class Location:
    """Job location: city, state, work mode and an extraction confidence."""
    city: str
    state: str = "Not Specified"
    work_mode: str = "On-site"
    confidence: float = 0.0

    def __post_init__(self):
        """Reject an empty city and clamp confidence to [0, 1].

        Raises:
            ValueError: if ``city`` is empty.
        """
        city_missing = not self.city
        if city_missing:
            raise ValueError("City cannot be empty")
        upper_bounded = min(1.0, self.confidence)
        self.confidence = max(0.0, upper_bounded)


@dataclass
class Compensation:
    """Salary range with currency, period, the matched raw text and a confidence."""
    salary_min: int = 0
    salary_max: int = 0
    currency: str = "INR"
    period: str = "annual"
    raw_text: str = ""
    confidence: float = 0.0

    def __post_init__(self):
        """Put a reversed salary range in order and clamp confidence to [0, 1]."""
        low, high = self.salary_min, self.salary_max
        # A maximum of 0 means "no maximum given", so it is never swapped.
        if high > 0 and low > high:
            self.salary_min = high
            self.salary_max = low
        upper_bounded = min(1.0, self.confidence)
        self.confidence = max(0.0, upper_bounded)


@dataclass
class Application:
    """Application details including deadline.

    All fields are optional and default to ``None``.
    """
    # Deadline as a string; RelationshipExtractor fills it from
    # DeadlineParser.parse(), which returns 'YYYY-MM-DD' or None.
    deadline: Optional[str] = None
    # Not populated anywhere in the visible code; always None unless a caller sets it.
    apply_link: Optional[str] = None
    # Not populated anywhere in the visible code; always None unless a caller sets it.
    contact_email: Optional[str] = None


@dataclass
class JobPosting:
    """Complete structured job posting assembled from the individual entities."""
    job_id: str
    email_id: str
    company: Company
    position: Position
    requirements: Requirements
    location: Location
    compensation: Compensation
    application: Application
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict; nested entities are expanded with ``asdict``."""
        return {
            'job_id': self.job_id,
            'email_id': self.email_id,
            'company': asdict(self.company),
            'position': asdict(self.position),
            'requirements': asdict(self.requirements),
            'location': asdict(self.location),
            'compensation': asdict(self.compensation),
            'application': asdict(self.application),
            'metadata': self.metadata
        }

    def calculate_completeness(self) -> float:
        """Score how much of the posting is filled in.

        Each present field earns its weight and the total is divided by the
        number of checks (7), not by the sum of the weights. The result
        therefore ranges from 0.0 to 4.4 / 7 (about 0.63), not up to 1.0.
        That scale is kept so existing thresholds and stored scores stay
        comparable.

        A city equal to the "Not Specified" placeholder is treated as missing,
        so a posting without a real location does not earn the location weight.
        """
        city = str(self.location.city or "").strip()
        has_city = bool(city) and city.lower() != "not specified"

        # (weight, field is present) in the original scoring order.
        checks = (
            (1.0, bool(self.company.name)),
            (1.0, bool(self.position.title)),
            (0.8, bool(self.requirements.skills)),
            (0.6, has_city),
            (0.5, self.compensation.salary_max > 0),
            (0.3, bool(self.requirements.education)),
            (0.2, bool(self.application.deadline)),
        )
        earned = sum(weight for weight, present in checks if present)
        return earned / len(checks)


# ===================================================================
# ENTITY NORMALIZER (Config-Driven)
# ===================================================================

class EntityNormalizer:
    """Normalize extracted entities using lookup tables from the config.

    Reads ``skill_map``, ``degree_map``, ``city_map`` and ``company_suffixes``
    from the ``"normalization"`` section. Each defaults to empty when absent.
    Map lookups use the lower-cased, trimmed input as the key.
    """

    def __init__(self, config: Dict[str, Any]):
        """Load the normalization tables from ``config["normalization"]``."""
        self.logger = logging.getLogger("EntityNormalizer")
        norm_config = config.get("normalization", {})

        self.skill_map = norm_config.get("skill_map", {})
        self.degree_map = norm_config.get("degree_map", {})
        self.city_map = norm_config.get("city_map", {})
        self.company_suffixes = norm_config.get("company_suffixes", [])

    def normalize_skill(self, skill: str) -> str:
        """Return the mapped skill, or the trimmed lower-case skill when unmapped."""
        skill_lower = skill.lower().strip()
        return self.skill_map.get(skill_lower, skill_lower)

    def normalize_degree(self, degree: str) -> str:
        """Return the mapped degree, or the trimmed upper-case degree when unmapped."""
        # Trim before the fallback too, so " mba " and "mba" normalize identically.
        degree_clean = degree.strip()
        return self.degree_map.get(degree_clean.lower(), degree_clean.upper())

    def normalize_city(self, city: str) -> str:
        """Return the mapped city, or the trimmed title-cased city when unmapped."""
        city_clean = city.strip()
        return self.city_map.get(city_clean.lower(), city_clean.title())

    def normalize_company_name(self, name: str) -> str:
        """Upper-case the name and strip configured legal suffixes from its end.

        Suffixes are matched case-insensitively and only as whole trailing
        tokens, so "INCEDO INC" becomes "INCEDO" while the "INC" inside
        "INCEDO" is left alone. Stacked suffixes are removed repeatedly. A
        name that consists only of a suffix is returned unchanged rather than
        reduced to an empty string.
        """
        canonical = name.upper().strip()

        # Longest first so "PVT LTD" wins over "LTD"; ties are sorted for determinism.
        suffixes = sorted(
            {
                suffix.upper().strip()
                for suffix in self.company_suffixes
                if isinstance(suffix, str) and suffix.strip()
            },
            key=lambda suffix: (-len(suffix), suffix),
        )

        trimmed = True
        while trimmed:
            trimmed = False
            for suffix in suffixes:
                if not canonical.endswith(suffix):
                    continue
                head = canonical[:-len(suffix)]
                # Require a separator before the suffix: whole-token match only.
                if not head or head[-1].isalnum():
                    continue
                candidate = head.rstrip().rstrip(",").rstrip()
                if not candidate:
                    continue
                canonical = candidate
                trimmed = True
                break
        return canonical

    def normalize_skills_list(self, skills: List[str]) -> List[str]:
        """Normalize each skill and drop duplicates, keeping first-seen order."""
        return list(dict.fromkeys(self.normalize_skill(skill) for skill in skills))

    def normalize_degrees_list(self, degrees: List[str]) -> List[str]:
        """Normalize each degree and drop duplicates, keeping first-seen order."""
        return list(dict.fromkeys(self.normalize_degree(degree) for degree in degrees))


# ===================================================================
# PARSERS (Config-Driven)
# ===================================================================

class SalaryParser:
    """Parse salary text into a ``Compensation`` using regex patterns from config.

    Patterns come from ``config["salary_parsing"]["patterns"]``. Each entry
    provides ``pattern`` (regex), ``name`` (how to interpret the match) and
    ``confidence``. Recognised names are ``lpa_range``, ``lpa_single``, ``ctc``
    and ``monthly``. A match under any other name produces no result.
    """

    # One lakh: LPA/CTC figures are multiplied by this to get currency units.
    _LAKH = 100000

    def __init__(self, config: Dict[str, Any]):
        """Load patterns and the default currency/period from ``config``."""
        self.logger = logging.getLogger("SalaryParser")
        salary_config = config.get("salary_parsing", {})

        self.patterns = []
        for pattern_config in salary_config.get("patterns", []):
            self.patterns.append((
                pattern_config.get("pattern", ""),
                pattern_config.get("name", ""),
                pattern_config.get("confidence", 0.8)
            ))

        self.default_currency = salary_config.get("default_currency", "INR")
        self.default_period = salary_config.get("default_period", "annual")

    def parse(self, text: str) -> Optional[Compensation]:
        """Return the first successfully parsed salary in ``text``, else ``None``.

        Patterns are tried in config order against the lower-cased text. A
        pattern that is empty, invalid, or whose match cannot be converted is
        skipped so that later patterns still get a chance.
        """
        if not text or not isinstance(text, str):
            return None

        text_lower = text.lower()

        for pattern, pattern_type, confidence in self.patterns:
            if not pattern:
                # An empty regex matches everything and would shadow later patterns.
                continue
            try:
                match = re.search(pattern, text_lower, re.IGNORECASE)
            except (re.error, TypeError) as e:
                self.logger.error(f"Error parsing salary '{text}': {e}")
                continue
            if not match:
                continue

            compensation = self._parse_match(match, pattern_type, confidence)
            if compensation is not None:
                return compensation

        return None

    def _parse_match(self, match, pattern_type: str, confidence: float) -> Optional[Compensation]:
        """Convert a regex match into a ``Compensation``.

        Returns ``None`` when ``pattern_type`` is not recognised or the
        captured groups are missing or not numeric.
        """
        try:
            if pattern_type == 'lpa_range':
                salary_min = self._to_amount(match.group(1), self._LAKH)
                salary_max = self._to_amount(match.group(2), self._LAKH)
            elif pattern_type in ('lpa_single', 'ctc'):
                salary_min = salary_max = self._to_amount(match.group(1), self._LAKH)
            elif pattern_type == 'monthly':
                monthly = self._to_number(match.group(1))
                # Figures below 1000 are treated as thousands per month
                # (e.g. "25" -> 25,000); larger figures are taken as-is.
                multiplier = 12000 if monthly < 1000 else 12
                salary_min = salary_max = round(monthly * multiplier)
            else:
                return None
        except (IndexError, ValueError, TypeError, AttributeError) as e:
            self.logger.error(f"Error parsing match: {e}")
            return None

        return Compensation(
            salary_min=salary_min,
            salary_max=salary_max,
            currency=self.default_currency,
            period=self.default_period,
            raw_text=match.group(0),
            confidence=confidence
        )

    @staticmethod
    def _to_number(raw: str) -> float:
        """Convert captured digits to float, ignoring thousands separators."""
        return float(raw.replace(",", ""))

    @staticmethod
    def _to_amount(raw: str, multiplier: int = 1) -> int:
        """Scale a captured number and round to whole currency units.

        ``round`` is used instead of ``int`` because float products can land
        just below the intended value (4.6 * 100000 == 459999.99999999994).
        """
        return round(SalaryParser._to_number(raw) * multiplier)


class ExperienceParser:
    """Parse experience requirements (in years) using regex patterns from config.

    Patterns come from ``config["experience_parsing"]["patterns"]`` and are
    expected to capture one number (exact years) or two (a range). Fresher
    keywords come from ``config["experience_types"]["fresher_keywords"]``.
    """

    def __init__(self, config: Dict[str, Any]):
        """Load the patterns and fresher keywords from ``config``."""
        self.logger = logging.getLogger("ExperienceParser")
        exp_config = config.get("experience_parsing", {})
        self.patterns = exp_config.get("patterns", [])

        exp_types = config.get("experience_types", {})
        self.fresher_keywords = exp_types.get("fresher_keywords", [])

    def parse(self, text: str) -> Tuple[int, int]:
        """Return ``(min_years, max_years)`` found in ``text``.

        Returns ``(0, 0)`` for empty or non-string input, when a fresher
        keyword is present, or when no pattern yields a number. Patterns are
        tried in config order. An invalid pattern or a non-numeric capture is
        logged and skipped instead of aborting the remaining patterns.
        Optional groups that did not participate in the match are ignored,
        and a reversed range is returned in ascending order.
        """
        if not text or not isinstance(text, str):
            return (0, 0)

        text_lower = text.lower()

        # Keywords are lower-cased so mixed-case config entries still match.
        if any(word and word.lower() in text_lower for word in self.fresher_keywords):
            return (0, 0)

        for pattern in self.patterns:
            try:
                match = re.search(pattern, text_lower)
            except (re.error, TypeError) as e:
                self.logger.error(f"Error parsing experience '{text}': {e}")
                continue
            if not match:
                continue

            try:
                years = [int(group) for group in match.groups() if group is not None]
            except ValueError as e:
                self.logger.error(f"Error parsing experience '{text}': {e}")
                continue

            if len(years) == 2:
                low, high = sorted(years)
                return (low, high)
            if len(years) == 1:
                return (years[0], years[0])
            # Any other group count: this pattern is unusable, try the next.

        return (0, 0)


class DeadlineParser:
    """Parse application deadlines using date patterns from config.

    ``config["deadline_parsing"]["date_patterns"]`` is a list of entries with
    a ``pattern`` (regex locating the date) and a ``format`` (``strptime``
    format for the matched text). ``relative_keywords`` maps a phrase to a
    day offset from the current date.
    """

    def __init__(self, config: Dict[str, Any]):
        """Load date patterns and relative keywords from ``config``."""
        self.logger = logging.getLogger("DeadlineParser")
        deadline_config = config.get("deadline_parsing", {})

        self.date_patterns = []
        for pattern_config in deadline_config.get("date_patterns", []):
            self.date_patterns.append((
                pattern_config.get("pattern", ""),
                pattern_config.get("format", "")
            ))

        self.relative_keywords = deadline_config.get("relative_keywords", {})

    def parse(self, text: str) -> Optional[str]:
        """Return the deadline in ``text`` as ``'YYYY-MM-DD'``, or ``None``.

        Absolute date patterns are tried first, in config order. If none
        yields a valid date, the first relative keyword found in the
        lower-cased text is resolved against ``datetime.now()``. Invalid
        regexes, incomplete entries and matches that do not fit their format
        are skipped so that later entries are still tried.
        """
        if not text or not isinstance(text, str):
            return None

        for pattern, fmt in self.date_patterns:
            if not pattern or not fmt:
                # An empty regex with an empty format would "parse" to 1900-01-01.
                continue
            try:
                match = re.search(pattern, text)
            except (re.error, TypeError) as e:
                self.logger.error(f"Error parsing deadline '{text}': {e}")
                continue
            if not match:
                continue
            try:
                return datetime.strptime(match.group(0), fmt).strftime('%Y-%m-%d')
            except (ValueError, TypeError):
                # Matched text is not a real date in this format; try the next one.
                continue

        text_lower = text.lower()
        for keyword, days_offset in self.relative_keywords.items():
            if not keyword or keyword.lower() not in text_lower:
                continue
            try:
                future_date = datetime.now() + timedelta(days=days_offset)
            except (TypeError, OverflowError) as e:
                self.logger.error(f"Error parsing deadline '{text}': {e}")
                continue
            return future_date.strftime('%Y-%m-%d')

        return None



# ===================================================================
# INCREMENTAL PROCESSING STATE MANAGER
# ===================================================================

class StateManager:
    """Track which message IDs have already been processed.

    State is a text file with one message ID per line. IDs are always handled
    as trimmed strings, so numeric IDs from a DataFrame compare equal to the
    IDs read back from the file.
    """

    def __init__(self, config: Dict[str, Any]):
        """Read ``config["incremental_processing"]`` and load any existing state.

        The state directory is created if missing. Existing state is loaded
        only when incremental processing is enabled and a full reprocess is
        not forced.
        """
        self.logger = logging.getLogger("StateManager")
        inc_config = config.get("incremental_processing", {})

        self.enabled = inc_config.get("enabled", True)
        self.state_dir = inc_config.get("state_directory", "state")
        self.state_file = inc_config.get("state_file", "processed_message_ids.txt")
        self.checkpoint_interval = inc_config.get("checkpoint_interval", 50)
        self.force_full_reprocess = inc_config.get("force_full_reprocess", False)

        # Create state directory if it doesn't exist
        os.makedirs(self.state_dir, exist_ok=True)

        self.state_path = os.path.join(self.state_dir, self.state_file)
        self.processed_ids: Set[str] = set()

        if self.enabled and not self.force_full_reprocess:
            self.load_state()

    @staticmethod
    def _as_id(message_id: Any) -> str:
        """Return the canonical (trimmed string) form of a message ID."""
        return str(message_id).strip()

    def load_state(self) -> Set[str]:
        """Load processed IDs from the state file and return them.

        A missing or unreadable file results in an empty set.
        """
        if os.path.exists(self.state_path):
            try:
                with open(self.state_path, 'r', encoding='utf-8') as f:
                    self.processed_ids = {line.strip() for line in f if line.strip()}
                self.logger.info(f"Loaded {len(self.processed_ids)} processed message IDs from state")
            except (OSError, UnicodeDecodeError) as e:
                self.logger.error(f"Error loading state: {e}")
                self.processed_ids = set()
        else:
            self.logger.info("No existing state file found, starting fresh")
            self.processed_ids = set()

        return self.processed_ids

    def save_state(self, new_ids: Set[str]) -> None:
        """Merge ``new_ids`` into the known IDs and persist them.

        The file is written to a temporary sibling and then moved into place
        with ``os.replace``, so an interrupted write cannot truncate the
        previous state. I/O errors are logged, not raised.
        """
        self.processed_ids.update(
            canonical for canonical in map(self._as_id, new_ids) if canonical
        )
        tmp_path = f"{self.state_path}.tmp"
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                f.writelines(f"{msg_id}\n" for msg_id in sorted(self.processed_ids))
            os.replace(tmp_path, self.state_path)
            self.logger.info(f"Saved {len(self.processed_ids)} processed message IDs to state")
        except OSError as e:
            self.logger.error(f"Error saving state: {e}")

    def is_processed(self, message_id: str) -> bool:
        """Return True if ``message_id`` (compared as a trimmed string) is known."""
        return self._as_id(message_id) in self.processed_ids

    def get_unprocessed_emails(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the rows of ``df`` whose ``MessageId`` has not been processed.

        The whole frame is returned when incremental processing is disabled,
        a full reprocess is forced, or ``df`` has no ``MessageId`` column, in
        which case there is nothing to filter on.
        """
        if not self.enabled or self.force_full_reprocess:
            self.logger.info("Incremental processing disabled, processing all emails")
            return df

        if 'MessageId' not in df.columns:
            self.logger.warning("No 'MessageId' column found, processing all emails")
            return df

        # Compare as trimmed strings: the state file only ever holds strings.
        message_ids = df['MessageId'].astype(str).str.strip()
        already_done = message_ids.isin(self.processed_ids)
        unprocessed = df[~already_done]

        self.logger.info(
            f"Total emails: {len(df)}, "
            f"Already processed: {int(already_done.sum())}, "
            f"New to process: {len(unprocessed)}"
        )

        return unprocessed



# ===================================================================
# RELATIONSHIP EXTRACTOR
# ===================================================================

class RelationshipExtractor:
    """Turn one row of extracted entities into structured job postings."""

    def __init__(self, config: Dict[str, Any]):
        """Build the normalizer and parsers and read the per-email limits."""
        self.logger = logging.getLogger("RelationshipExtractor")
        self.config = config
        self.normalizer = EntityNormalizer(config)
        self.salary_parser = SalaryParser(config)
        self.experience_parser = ExperienceParser(config)
        self.deadline_parser = DeadlineParser(config)

        # Get processing limits from config
        limits = config.get("processing", {})
        self.max_companies = limits.get("max_companies_per_email", 3)
        self.max_positions = limits.get("max_positions_per_email", 3)
        self.max_jobs = limits.get("max_jobs_per_email", 5)

    def extract_job_postings(self, row: pd.Series, email_id: str) -> List[JobPosting]:
        """Build one posting per company/position pair found in ``row``.

        Returns an empty list when the row has no companies or no positions.
        A pair that fails to build is logged and skipped. Any other error is
        logged and whatever was built so far is returned.
        """
        postings = []

        try:
            # Comma-separated entity columns, read in a fixed order.
            entity_columns = (
                'companies_extracted',
                'positions_extracted',
                'skills_extracted',
                'locations_extracted',
                'salary_info',
                'experience_required',
                'degrees_required',
            )
            parsed = {
                column: self._parse_list(row.get(column, ''))
                for column in entity_columns
            }
            companies = parsed['companies_extracted']
            positions = parsed['positions_extracted']

            # Both a company and a position are required to form a posting.
            if not (companies and positions):
                self.logger.debug(f"No companies or positions found in {email_id}")
                return []

            skills = self.normalizer.normalize_skills_list(parsed['skills_extracted'])
            degrees = self.normalizer.normalize_degrees_list(parsed['degrees_required'])

            pairs = self._create_pairs(companies, positions)
            for number, (company_name, position_title) in enumerate(pairs, start=1):
                job_id = f"{email_id}_JOB_{number}"

                try:
                    posting = self._build_posting(
                        row=row,
                        email_id=email_id,
                        job_id=job_id,
                        idx=number - 1,
                        company_name=company_name,
                        position_title=position_title,
                        skills=skills,
                        degrees=degrees,
                        locations=parsed['locations_extracted'],
                        salary_texts=parsed['salary_info'],
                        experience_texts=parsed['experience_required'],
                    )
                    postings.append(posting)
                except Exception as e:
                    self.logger.error(f"Error creating job posting {job_id}: {e}")

        except Exception as e:
            self.logger.error(f"Error extracting job postings from {email_id}: {e}")

        return postings

    def _build_posting(self, *, row, email_id, job_id, idx, company_name, position_title,
                       skills, degrees, locations, salary_texts, experience_texts) -> JobPosting:
        """Assemble a single posting; entity validation errors propagate to the caller."""
        company = Company(
            name=company_name,
            canonical_name=self.normalizer.normalize_company_name(company_name),
            confidence=0.85
        )
        position = Position(title=position_title, confidence=0.85)

        # Only the first experience text is used, for every posting of the email.
        if experience_texts:
            exp_min, exp_max = self.experience_parser.parse(experience_texts[0])
        else:
            exp_min, exp_max = 0, 0
        requirements = Requirements(
            skills=skills,
            education=degrees,
            experience_min=exp_min,
            experience_max=exp_max
        )

        # Location by posting index, else the first location, else a placeholder.
        if idx < len(locations):
            location_city = locations[idx]
        elif locations:
            location_city = locations[0]
        else:
            location_city = "Not Specified"
        location = Location(
            city=self.normalizer.normalize_city(location_city),
            confidence=0.8
        )

        # Only the first salary text is used; an unparsable one leaves the defaults.
        parsed_salary = self.salary_parser.parse(salary_texts[0]) if salary_texts else None
        compensation = parsed_salary if parsed_salary else Compensation()

        deadline = None
        deadline_col = row.get('deadline', '')
        if deadline_col:
            deadline = self.deadline_parser.parse(str(deadline_col))
        application = Application(deadline=deadline)

        posting = JobPosting(
            job_id=job_id,
            email_id=email_id,
            company=company,
            position=position,
            requirements=requirements,
            location=location,
            compensation=compensation,
            application=application,
            metadata={
                'extraction_timestamp': datetime.now().isoformat(),
                'completeness_score': 0.0,
                'source_subject': str(row.get('Subject', ''))[:100]
            }
        )
        # The score needs the finished posting, so it is filled in afterwards.
        posting.metadata['completeness_score'] = posting.calculate_completeness()
        return posting

    def _parse_list(self, text: str) -> List[str]:
        """Split a comma-separated value into trimmed, non-empty items.

        Falsy values, NaN and the empty string all yield an empty list.
        """
        is_blank = not text or pd.isna(text) or text == ''
        if is_blank:
            return []
        pieces = (piece.strip() for piece in str(text).split(','))
        return [piece for piece in pieces if piece]

    def _create_pairs(self, companies: List[str], positions: List[str]) -> List[Tuple[str, str]]:
        """Pair companies with positions.

        Equal-length lists are paired positionally. Otherwise the cross
        product of the first ``max_companies`` companies and the first
        ``max_positions`` positions is built and cut to ``max_jobs`` pairs.
        """
        if len(companies) == len(positions):
            return list(zip(companies, positions))

        # Create combinations but limit to avoid explosion
        combos = [
            (company, position)
            for company in companies[:self.max_companies]
            for position in positions[:self.max_positions]
        ]
        return combos[:self.max_jobs]



# ===================================================================
# MAIN STRUCTURING PIPELINE
# ===================================================================

class EntityStructuringPipeline:
    """Main pipeline for entity structuring with incremental processing."""
    
    def __init__(self, config_path: str = "config.json"):
        """Load the configuration and build the pipeline components.

        Args:
            config_path: Path handed to ``ConfigurationManager``.

        Input/output paths come from the ``"input_output"`` config section and
        the completeness threshold and analytics switch from ``"processing"``.
        Each has the default shown below when absent.
        """
        self.logger = logging.getLogger("StructuringPipeline")

        # Load configuration
        self.config_manager = ConfigurationManager(config_path)
        self.config = self.config_manager.load_config()

        # Initialize components
        self.state_manager = StateManager(self.config)
        self.relationship_extractor = RelationshipExtractor(self.config)

        # Get config values
        io_config = self.config.get("input_output", {})
        self.input_file = io_config.get("input_file", "../Phase 2/relevant_placement_emails.csv")
        self.output_csv = io_config.get("output_csv", "structured_job_postings.csv")
        self.output_json = io_config.get("output_json", "structured_job_postings.json")

        proc_config = self.config.get("processing", {})
        self.min_completeness = proc_config.get("min_completeness_score", 0.3)
        self.enable_analytics = proc_config.get("enable_analytics", True)

        self.logger.info("="*70)
        # The emoji had been stored as mojibake (UTF-8 bytes read as Mac Roman).
        self.logger.info("🏗️  ENTITY STRUCTURING PIPELINE INITIALIZED")
        self.logger.info("="*70)
    
    def process_dataset(self) -> Tuple[pd.DataFrame, List[Dict]]:
        """Process the emails not yet recorded in state and persist the results.

        Returns:
            ``(jobs_df, all_job_postings)``: the flattened DataFrame and the
            list of posting dicts, covering previously saved and new postings.

        Raises:
            Exception: whatever ``pd.read_csv`` raises if the input cannot be
            loaded (logged, then re-raised).

        Persistence order matters. Outputs are written before the processed-ID
        state, both at checkpoints and at the end. An interruption can then
        cause an email to be processed again, but never to be marked processed
        while its postings are missing from the outputs. ``_save_outputs``
        reports failures only through the log, so this narrows the window
        rather than closing it.
        """
        self.logger.info(f"\nLoading dataset: {self.input_file}")

        try:
            df = pd.read_csv(self.input_file)
            total_emails = len(df)
            self.logger.info(f"Loaded {total_emails} emails")
        except Exception as e:
            self.logger.error(f"Failed to load dataset: {e}")
            raise

        # Get unprocessed emails
        unprocessed_df = self.state_manager.get_unprocessed_emails(df)

        if len(unprocessed_df) == 0:
            self.logger.info("No new emails to process!")
            return self._load_existing_results()

        # Load existing results
        existing_jobs = self._load_existing_jobs()

        # Process new emails
        self.logger.info(f"\nProcessing {len(unprocessed_df)} new emails...")

        new_job_postings = []
        processed_message_ids = set()
        stats = {
            'total_new_emails': len(unprocessed_df),
            'emails_with_jobs': 0,
            'total_new_jobs': 0
        }

        # A zero, negative or non-integer interval disables checkpoints
        # instead of raising on the modulo below.
        checkpoint_interval = self.state_manager.checkpoint_interval
        checkpointing = isinstance(checkpoint_interval, int) and checkpoint_interval > 0
        last_checkpoint_size = 0

        for idx, row in unprocessed_df.iterrows():
            # IDs are kept as trimmed strings so they match the state file;
            # missing or blank IDs fall back to a row-based ID.
            raw_id = row.get('MessageId')
            email_id = "" if raw_id is None or pd.isna(raw_id) else str(raw_id).strip()
            if not email_id:
                email_id = f"EMAIL_{idx}"

            try:
                # Extract job postings from this email
                job_postings = self.relationship_extractor.extract_job_postings(row, email_id)

                # Filter by completeness score
                job_postings = [
                    job for job in job_postings
                    if job.calculate_completeness() >= self.min_completeness
                ]

                if job_postings:
                    stats['emails_with_jobs'] += 1
                    stats['total_new_jobs'] += len(job_postings)
                    new_job_postings.extend(job.to_dict() for job in job_postings)

                # Mark as processed
                processed_message_ids.add(email_id)

            except Exception as e:
                self.logger.error(f"Error processing email {email_id}: {e}")
                continue

            # Checkpoint every N distinct emails: outputs first, then state.
            done = len(processed_message_ids)
            if checkpointing and done % checkpoint_interval == 0 and done != last_checkpoint_size:
                last_checkpoint_size = done
                self._save_outputs(
                    self._merge_job_postings(existing_jobs, new_job_postings, processed_message_ids)
                )
                self.state_manager.save_state(processed_message_ids)
                self.logger.info(
                    f"   Checkpoint: {done}/{len(unprocessed_df)} emails | "
                    f"Jobs extracted: {stats['total_new_jobs']}"
                )

        # Combine with existing jobs, replacing postings of re-processed emails
        all_job_postings = self._merge_job_postings(
            existing_jobs, new_job_postings, processed_message_ids
        )

        # Log results
        self.logger.info(f"\n{'='*70}")
        self.logger.info(f"PROCESSING COMPLETE")
        self.logger.info(f"{'='*70}")
        self.logger.info(f"New Emails Processed: {stats['total_new_emails']}")
        self.logger.info(f"New Emails with Jobs: {stats['emails_with_jobs']}")
        self.logger.info(f"New Job Postings: {stats['total_new_jobs']}")
        self.logger.info(f"Total Job Postings: {len(all_job_postings)}")
        self.logger.info(f"{'='*70}\n")

        # Save outputs, then record the emails as processed
        self._save_outputs(all_job_postings)
        self.state_manager.save_state(processed_message_ids)

        # Create DataFrame
        jobs_df = self._create_dataframe(all_job_postings)

        return jobs_df, all_job_postings

    @staticmethod
    def _merge_job_postings(existing_jobs, new_jobs, reprocessed_ids) -> List[Dict]:
        """Return existing postings plus new ones, without stale duplicates.

        Existing postings whose ``email_id`` was processed again in this run
        are dropped in favour of the fresh ones. Without this, a forced or
        non-incremental run would append a second copy of every posting.
        """
        if not isinstance(existing_jobs, list):
            existing_jobs = []
        kept = [
            job for job in existing_jobs
            if not (isinstance(job, dict) and str(job.get('email_id')) in reprocessed_ids)
        ]
        return kept + list(new_jobs)

    
    def _load_existing_jobs(self) -> List[Dict]:
        """Load existing job postings from JSON file."""
        if os.path.exists(self.output_json):
            try:
                with open(self.output_json, 'r', encoding='utf-8') as f:
                    existing = json.load(f)
                self.logger.info(f"Loaded {len(existing)} existing job postings")
                return existing
            except Exception as e:
                self.logger.error(f"Error loading existing jobs: {e}")
                return []
        return []
    
    def _load_existing_results(self) -> Tuple[pd.DataFrame, List[Dict]]:
        """Load existing results when no new emails to process."""
        existing_jobs = self._load_existing_jobs()
        jobs_df = self._create_dataframe(existing_jobs)
        return jobs_df, existing_jobs
    
    def _save_outputs(self, job_postings: List[Dict]) -> None:
        """Save structured job postings to CSV and JSON."""
        try:
            # Save JSON
            self.logger.info(f"Saving JSON: {self.output_json}")
            with open(self.output_json, 'w', encoding='utf-8') as f:
                json.dump(job_postings, f, indent=2, ensure_ascii=False)
            self.logger.info(f"Saved {len(job_postings)} jobs to JSON")
            
            # Save CSV
            self.logger.info(f"Saving CSV: {self.output_csv}")
            df = self._create_dataframe(job_postings)
            df.to_csv(self.output_csv, index=False)
            self.logger.info(f"Saved {len(df)} jobs to CSV\n")
            
        except Exception as e:
            self.logger.error(f"Error saving outputs: {e}")
    
    def _create_dataframe(self, job_postings: List[Dict]) -> pd.DataFrame:
        """Create flattened DataFrame from structured job postings."""
        flattened = []
        
        for job in job_postings:
            try:
                flat_job = {
                    'job_id': job['job_id'],
                    'email_id': job['email_id'],
                    'company_name': job['company']['name'],
                    'company_canonical': job['company']['canonical_name'],
                    'company_confidence': job['company']['confidence'],
                    'position_title': job['position']['title'],
                    'position_level': job['position']['level'],
                    'position_confidence': job['position']['confidence'],
                    'skills_required': ', '.join(job['requirements']['skills']),
                    'skills_count': len(job['requirements']['skills']),
                    'education_required': ', '.join(job['requirements']['education']),
                    'experience_min_years': job['requirements']['experience_min'],
                    'experience_max_years': job['requirements']['experience_max'],
                    'experience_type': job['requirements']['experience_type'],
                    'location_city': job['location']['city'],
                    'location_state': job['location']['state'],
                    'work_mode': job['location']['work_mode'],
                    'location_confidence': job['location']['confidence'],
                    'salary_min': job['compensation']['salary_min'],
                    'salary_max': job['compensation']['salary_max'],
                    'salary_currency': job['compensation']['currency'],
                    'salary_period': job['compensation']['period'],
                    'salary_raw_text': job['compensation']['raw_text'],
                    'salary_confidence': job['compensation']['confidence'],
                    'application_deadline': job['application']['deadline'],
                    'apply_link': job['application']['apply_link'],
                    'contact_email': job['application']['contact_email'],
                    'completeness_score': job['metadata']['completeness_score'],
                    'extraction_timestamp': job['metadata']['extraction_timestamp'],
                    'source_subject': job['metadata']['source_subject']
                }
                
                flattened.append(flat_job)
                
            except Exception as e:
                self.logger.error(f"Error flattening job {job.get('job_id')}: {e}")
                continue
        
        return pd.DataFrame(flattened)

    
    def generate_analytics(self) -> None:
        """Generate analytics report on structured data."""
        if not self.enable_analytics:
            self.logger.info("Analytics disabled in configuration")
            return
        
        self.logger.info(f"\n{'='*70}")
        self.logger.info(f"ANALYTICS REPORT")
        self.logger.info(f"{'='*70}\n")
        
        try:
            df = pd.read_csv(self.output_csv)
            
            # Basic statistics
            self.logger.info(f"BASIC STATISTICS:")
            self.logger.info(f"   Total Job Postings: {len(df)}")
            self.logger.info(f"   Unique Companies: {df['company_name'].nunique()}")
            self.logger.info(f"   Unique Positions: {df['position_title'].nunique()}")
            self.logger.info(f"   Unique Locations: {df['location_city'].nunique()}")
            
            # Top companies
            self.logger.info(f"\nTOP 10 HIRING COMPANIES:")
            top_companies = df['company_name'].value_counts().head(10)
            for company, count in top_companies.items():
                self.logger.info(f"   {company}: {count} positions")
            
            # Top positions
            self.logger.info(f"\nTOP 10 JOB POSITIONS:")
            top_positions = df['position_title'].value_counts().head(10)
            for position, count in top_positions.items():
                self.logger.info(f"   {position}: {count} openings")
            
            # Top locations
            self.logger.info(f"\nTOP 10 LOCATIONS:")
            top_locations = df['location_city'].value_counts().head(10)
            for location, count in top_locations.items():
                self.logger.info(f"   {location}: {count} jobs")
            
            # Salary statistics
            salary_data = df[df['salary_max'] > 0]['salary_max']
            if len(salary_data) > 0:
                self.logger.info(f"\nSALARY STATISTICS:")
                self.logger.info(f"   Jobs with Salary Info: {len(salary_data)} ({len(salary_data)/len(df)*100:.1f}%)")
                self.logger.info(f"   Average Salary: ‚Çπ{salary_data.mean()/100000:.2f} LPA")
                self.logger.info(f"   Median Salary: ‚Çπ{salary_data.median()/100000:.2f} LPA")
            
            # Completeness
            self.logger.info(f"\nDATA COMPLETENESS:")
            avg_completeness = df['completeness_score'].mean()
            self.logger.info(f"   Average Completeness: {avg_completeness:.2%}")
            
            self.logger.info(f"\n{'='*70}\n")
            
        except Exception as e:
            self.logger.error(f"Error generating analytics: {e}")


# ===================================================================
# MAIN EXECUTION
# ===================================================================

def main():
    """Run the entity structuring pipeline end to end.

    Configures logging to both a log file and the console, builds an
    ``EntityStructuringPipeline``, processes the dataset, generates the
    analytics report and logs where the output files were written.

    Returns:
        The ``(jobs_df, jobs_list)`` pair returned by
        ``pipeline.process_dataset()``.

    Raises:
        Exception: Any error raised by the pipeline is logged and then
            re-raised unchanged.
    """
    log_file = "entity_structuring.log"

    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[
            # Explicit UTF-8: the pipeline logs non-ASCII text (e.g. a
            # currency symbol), which the platform default encoding may
            # not be able to represent.
            logging.FileHandler(log_file, encoding="utf-8"),
            logging.StreamHandler(),
        ],
    )

    banner = "=" * 70
    logger.info("\n" + banner)
    logger.info("STARTING ENTITY STRUCTURING PIPELINE")
    logger.info(banner + "\n")

    try:
        # Initialize pipeline
        pipeline = EntityStructuringPipeline()

        # Process dataset
        jobs_df, jobs_list = pipeline.process_dataset()

        # Generate analytics
        pipeline.generate_analytics()

        logger.info(banner)
        logger.info("PIPELINE COMPLETED SUCCESSFULLY")
        logger.info(banner)
        logger.info("Output Files:")
        logger.info(f"   - CSV: {pipeline.output_csv}")
        logger.info(f"   - JSON: {pipeline.output_json}")
        logger.info(f"   - Log: {log_file}")
        logger.info(banner + "\n")

        return jobs_df, jobs_list

    except Exception as e:
        # Record the failure in the log, then let the caller see it.
        logger.error(f"PIPELINE FAILED: {e}")
        raise


if __name__ == "__main__":
    # Run the full pipeline; main() hands back the flattened DataFrame and
    # the list of structured postings.
    jobs_df, jobs_json = main()

    # Display sample: dump the first row as "column: value" lines between
    # two separator rules. With no rows only the header and rules print.
    print("\nSAMPLE STRUCTURED JOB POSTING:", "=" * 70, sep="\n")
    if len(jobs_df) > 0:
        first_row = jobs_df.iloc[0]
        sample = first_row.to_dict()
        for key, value in sample.items():
            print(f"{key}: {value}")
    print("=" * 70)


2025-12-17 23:18:16,035 [INFO] 
2025-12-17 23:18:16,036 [INFO] STARTING ENTITY STRUCTURING PIPELINE

2025-12-17 23:18:16,038 [INFO] Configuration loaded from config.json
2025-12-17 23:18:16,038 [INFO] CONFIGURATION LOADED
2025-12-17 23:18:16,038 [INFO] Incremental Processing:
2025-12-17 23:18:16,044 [INFO]   Enabled: True
2025-12-17 23:18:16,045 [INFO]   State Directory: state
2025-12-17 23:18:16,046 [INFO]   State File: processed_message_ids.txt
2025-12-17 23:18:16,047 [INFO]   Checkpoint Interval: 50
2025-12-17 23:18:16,048 [INFO]   Force Full Reprocess: False
2025-12-17 23:18:16,049 [INFO] Input/Output:
2025-12-17 23:18:16,050 [INFO]   Input File: ../Phase 2/relevant_placement_emails.csv
2025-12-17 23:18:16,050 [INFO]   Output CSV: structured_job_postings.csv
2025-12-17 23:18:16,051 [INFO]   Output JSON: structured_job_postings.json
2025-12-17 23:18:16,052 [INFO] Processing:
2025-12-17 23:18:16,053 [INFO]   Max Jobs Per Email: 5
2025-12-17 23:18:16,055 [INFO]   Max Companies Per Ema


SAMPLE STRUCTURED JOB POSTING:
job_id: 192127511fd4e321_JOB_1
email_id: 192127511fd4e321
company_name: Glassdoor
company_canonical: GLASSDOOR
company_confidence: 0.85
position_title: Intern
position_level: Intern
position_confidence: 0.85
skills_required: Machine Learning
skills_count: 1
education_required: B.E, PHD
experience_min_years: 0
experience_max_years: 0
experience_type: Fresher
location_city: Gurgaon
location_state: Not Specified
work_mode: On-site
location_confidence: 0.8
salary_min: 2500000
salary_max: 2500000
salary_currency: INR
salary_period: annual
salary_raw_text: 25lpa
salary_confidence: 0.85
application_deadline: None
apply_link: None
contact_email: None
completeness_score: 0.6
extraction_timestamp: 2025-12-09T23:10:12.261894
source_subject: God bless you
